Commit 80d9bca5 authored by mateuszg's avatar mateuszg

Initial commit

parents
[main]
saper_cfg = /home/anaaa/REPOS/saper/cfg/entailment_bestn.ini
# An optional parameter, defaults to False
overwrite_ccl = False
tagset = nkjp
features =
[mead]
# An optional parameter, defaults to "/usr/local/share/mead/bin/"
mead_bin_dir = /usr/local/share/mead/bin/
# An optional MEAD's configuration file. If not provided a default one will
# be created in a cluster directory.
mead_cfg_file =
[main]
saper_cfg = /home/anaaa/REPOS/saper/cfg/entailment_bestn.ini
tagset = nkjp
features = QueryNE1 1 QueryNE2 3
# An optional parameter, defaults to False
overwrite_ccl = False
[mead]
# An optional parameter, defaults to "/usr/local/share/mead/bin/"
mead_bin_dir = /usr/local/share/mead/bin/
# An optional MEAD's configuration file. If not provided a default one will
# be created in a cluster directory.
# Don't fill it in! Not supported yet.
mead_cfg_file =
[QueryNE1]
type = external
class = NEQueryPhraseMatchFeature
feature_name = QueryNE1
out_file = ne1
channels =
weighting_function = 0.2
[QueryNE2]
type = external
class = NEQueryPhraseMatchFeature
feature_name = QueryNE2
out_file = ne2
channels =
weighting_function = frequency
;[Centroid]
;type = mead
;feature = Centroid
;mead_class = Centroid
[main]
saper_cfg = /home/ajanz/repo/saper/cfg/saper.ini
tagset = nkjp
features = QueryNE1 1 QueryNE2 3
# An optional parameter, defaults to False
overwrite_ccl = False
[mead]
# An optional parameter, defaults to "/usr/local/share/mead/bin/"
mead_bin_dir = /usr/local/share/mead/bin/
# An optional MEAD's configuration file. If not provided a default one will
# be created in a cluster directory.
# Don't fill it in! Not supported yet.
mead_cfg_file =
[QueryNE1]
type = external
class = NEQueryPhraseMatchFeature
feature_name = QueryNE1
out_file = ne1
channels =
weighting_function = 0.2
[QueryNE2]
type = external
class = NEQueryPhraseMatchFeature
feature_name = QueryNE2
out_file = ne2
channels =
weighting_function = frequency
;[Centroid]
;type = mead
;feature = Centroid
;mead_class = Centroid
; WORKER CONFIGURATION FILE
; This file contains the configuration of both the web service API and the tool.
;
; Author: Tomasz Walkowiak
; email: tomasz.walkowiak@pwr.edu.pl
; --------- SERVICE PART ---------
[service]
root = /samba/requests/
tool = summarize
rabbit_host =10.17.0.85
rabbit_user =clarin
rabbit_password =clarin123
; --------- TOOL PART ---------
[tool]
workers_number = 1
sum_config=cfg/basic.ini
# -*- encoding: utf-8 -*-
import os
import logging, logging.handlers
import traceback
# Module-level aliases for the standard logging levels, so client code can
# call e.g. logger.log(INFO, ...) without importing ``logging`` itself.
DEBUG = logging.DEBUG
INFO = logging.INFO
WARNING = logging.WARNING
ERROR = logging.ERROR
CRITICAL= logging.CRITICAL
class Logger(object):
    """Adapter for the standard logging library providing both service-wide
    logging and task-wise logging. The interface is very similar to the
    standard logging package."""

    def __init__(self, lvl, main_log_path, task_logs_path, log_to_console):
        """Constructor

        lvl - logging level constant, e.g. DEBUG, INFO, WARNING, ERROR, CRITICAL
        main_log_path - path to the service-wise logfile
        task_logs_path - path to the directory containing task-specific logfiles
        log_to_console - whether to mirror the main log on the console
        """
        self._task_logs_path = task_logs_path
        self._main_log_path = main_log_path
        self._log_to_console = log_to_console
        self.default_logging_lvl = lvl
        logger_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        self._default_formatter = logging.Formatter(logger_format)
        # The main logger is created lazily, on first use.
        self._main_logger = None

    def log(self, lvl, exception):
        """Log a message in the main logfile.

        lvl - message level, as in the constructor
        exception - string message or exception to log
        If the level is WARNING, ERROR or CRITICAL, a traceback is logged too.
        """
        self._ensure_main_logger()
        if lvl in (logging.WARNING, logging.ERROR, logging.CRITICAL):
            self._log_traceback(self._main_logger, lvl, exception)
        self._log(self._main_logger, lvl, exception)

    # FIX: the four shortcuts below were missing the ``self`` parameter and
    # would have raised NameError when called on an instance.
    def debug(self, exception):
        self.log(logging.DEBUG, exception)

    def info(self, exception):
        self.log(logging.INFO, exception)

    def warning(self, exception):
        self.log(logging.WARNING, exception)

    def error(self, exception):
        self.log(logging.ERROR, exception)

    def task_log(self, task_token, lvl, exception):
        """Log a message in the task-specific logfile.

        task_token - identifier of the task (also the logfile name)
        lvl - message level, as in the constructor
        exception - string message or exception to log
        If the level is WARNING, ERROR or CRITICAL, a traceback is logged
        with the main logger as well.
        """
        if lvl in (logging.WARNING, logging.ERROR, logging.CRITICAL):
            # FIX: create the main logger on demand; previously it could
            # still be None here, crashing the traceback logging.
            self._ensure_main_logger()
            self._log_traceback(self._main_logger, lvl, exception)
        task_logger, _descriptor = self._get_task_logger(task_token)
        self._log(task_logger, lvl, exception)

    # FIX: ``self`` was missing from these task shortcuts as well.
    def task_debug(self, task_token, exception):
        self.task_log(task_token, logging.DEBUG, exception)

    def task_info(self, task_token, exception):
        self.task_log(task_token, logging.INFO, exception)

    def task_warning(self, task_token, exception):
        self.task_log(task_token, logging.WARNING, exception)

    def task_error(self, task_token, exception):
        self.task_log(task_token, logging.ERROR, exception)

    def shutdown(self):
        """Flush and close all handlers opened by the logging module."""
        logging.shutdown()

    @staticmethod
    def str2logging_lvl(str_level):
        """Translate a logging level given as a string (case-insensitive)
        into the corresponding logging module constant."""
        return {"debug": logging.DEBUG,
                "info": logging.INFO,
                "warning": logging.WARNING,
                "error": logging.ERROR,
                "critical": logging.CRITICAL}[str_level.lower()]

    def _ensure_main_logger(self):
        """Create the main logger on first use (lazy initialization)."""
        if self._main_logger is None:
            self._main_logger = self._get_main_logger(self._main_log_path,
                                                      self._log_to_console)

    def _get_main_logger(self, main_log_path, log_to_console):
        """Create/retrieve the main logger - global for the whole service and
        a parent of all task loggers - writing to the main logfile and,
        optionally, to the console."""
        # Check BEFORE _get_logger() attaches the file handler, so we can
        # tell whether the underlying logging.Logger is brand new.
        was_inited = self._check_logger_inited('service')
        logger, descriptor = self._get_logger(self.default_logging_lvl,
                                              'service', main_log_path)
        self.main_logfile_descr = descriptor
        # FIX: attach the console handler only when the logger is created for
        # the first time. The old check ran after the file handler had been
        # added and was therefore always true, which could stack duplicate
        # console handlers when several Logger instances were created.
        if log_to_console and not was_inited:
            console_handler = logging.StreamHandler()
            console_handler.setLevel(self.default_logging_lvl)
            console_handler.setFormatter(self._default_formatter)
            logger.addHandler(console_handler)
        return logger

    def _get_task_logger(self, task_token):
        """Create/retrieve a logging.Logger for the task with the given token."""
        task_logger_name = 'service.task-' + task_token
        task_log_file = os.path.join(self._task_logs_path, task_token)
        # delay=True: the task logfile is only created on the first message.
        return self._get_logger(self.default_logging_lvl, task_logger_name,
                                task_log_file, delay=True)

    def _get_logger(self, logging_lvl, name, filepath, delay=False):
        """Create/retrieve a logging.Logger with the given level and name that
        writes into a rotating file at ``filepath``. Returns the logger and
        the file descriptor of its logfile (None when not opened yet)."""
        logger = logging.getLogger(name)
        if not self._check_logger_inited(name):
            handler = logging.handlers.RotatingFileHandler(
                filepath, delay=delay, maxBytes=1024000, backupCount=10)
            handler.setLevel(logging_lvl)
            handler.setFormatter(self._default_formatter)
            logger.setLevel(logging_lvl)
            logger.addHandler(handler)
        else:
            handler = logger.handlers[0]
        descriptor = handler.stream.fileno() if handler.stream else None
        return logger, descriptor

    def _log(self, logger, lvl, exception):
        """Log the given message/exception to the given logger and level."""
        logger.log(lvl, str(exception))

    def _log_traceback(self, logger, lvl, exception):
        """Log the exception's traceback (when one can be extracted) with the
        main logger. Plain string messages are ignored."""
        # NOTE(review): the ``logger`` parameter is accepted but the message
        # goes to the main logger, as in the original design.
        if isinstance(exception, Exception):
            if hasattr(exception, 'traceback'):
                traceback_str = exception.traceback
            else:
                traceback_str = traceback.format_exc()
            if traceback_str is not None:
                self._log(self._main_logger, lvl, traceback_str)

    def _check_logger_inited(self, name):
        """Return True when the named logger already has handlers attached."""
        logger = logging.getLogger(name)
        return len(logger.handlers) != 0
"""
Main summarization script responsible for creating summarization of plain text
files. It uses MEAD tool with some extensions for processing plain texts in
polish language.
"""
import argparse
import os
import summary.utils.basicutils as basicutils
from summary.utils.basicutils import save_files_list
from summary.summarizer.summarizer import Summarizer
try:
import argcomplete
except ImportError:
argcomplete = None
def create_argparser(argv=None, description=None):
    """!
    Create a parser and return the parsed arguments.

    @param argv a list of parser arguments (e.g. ['option','value',...] or only
    an option if it doesn't need any value - like flags).
    @type argv: list
    @param description that will be added to the parser and shown as a help
    message.
    @type description: str
    @return parsed arguments.
    @rtype: Namespace
    """
    # Combine "show defaults" and "keep raw description" behaviours in a
    # single dynamically-built formatter class.
    formatter_cls = type(
        "CustomFormatter",
        (argparse.ArgumentDefaultsHelpFormatter,
         argparse.RawDescriptionHelpFormatter),
        {})
    arg_parser = argparse.ArgumentParser(description=description,
                                         formatter_class=formatter_cls)
    arg_parser.add_argument(
        'summary_config',
        help="An ini configuration file for summarization.")
    arg_parser.add_argument(
        'output_summary',
        help="The path to a file where the final summary will be written to.")
    # NOTE: a '-b/--batch_mode' flag (inputs treated as lists of files to
    # summarize) used to be declared here and is kept disabled.
    arg_parser.add_argument(
        '-r', '--recursive',
        action='store_true',
        default=False,
        help="Whether to search an input directory recursively for txt files.")
    arg_parser.add_argument(
        'inputs',
        metavar="input",
        nargs='+',
        help=("A plain txt file, or a directory with such files, to make the "
              "summary from. If the input is a file then it doesn't have to "
              "have a txt extension. However, in case of directory only files "
              "with txt extension are taken into account. Make sure that all"
              "files have different names."))
    arg_parser.add_argument(
        '-c', '--ccl_mode',
        action='store_true',
        default=False,
        help="Use directly ccl files instead of plain txt files.")
    # Enable shell tab-completion when argcomplete is installed.
    if argcomplete:
        argcomplete.autocomplete(arg_parser)
    return arg_parser.parse_args(args=argv)
def check_args(args):
    """!
    Check the correctness of input arguments, especially if all files exist.

    @param args parsed arguments.
    @type args: Namespace
    @return a list of found errors (as messages); empty when everything is fine.
    @rtype list
    """
    errors = []
    # The configuration file is mandatory and must exist.
    if not os.path.exists(args.summary_config):
        errors.append(("The given summary configuration file {:} "
                       "doesn't exist").format(args.summary_config))
    # Collect every input path that cannot be found.
    missing = [path for path in args.inputs if not os.path.exists(path)]
    if missing:
        errors.append("The following input files don't exist: "
                      "\n{:}".format("\n".join(missing)))
    return errors
def run(argv=None):
    """!
    Get correct input parameters and run the summary process.

    @param argv a list of parser arguments (e.g. ['option','value',...] or only
    an option if it doesn't need any value - like flags).
    @type argv: list
    """
    args = create_argparser(argv, description=__doc__)
    errors = check_args(args)
    if errors:
        # Show errors and stop immediately.
        # FIX: use print as a function (instead of the Python-2-only print
        # statement) so the script also runs under Python 3.
        print("\n".join(errors))
        import sys
        sys.exit(2)
    else:
        summarizer = Summarizer(args.summary_config)
        # Create a list of files to make the summary of.
        files_list = basicutils.get_files(args.inputs, args.recursive)
        if files_list:
            tmp_filein = save_files_list(files_list)
            # process() is the core function creating the summary.
            summarizer.process(tmp_filein, args.output_summary)
            # Remove the temporary file listing the inputs.
            os.unlink(tmp_filein)
        else:
            print("No valid input files provided (e.g. directories may not "
                  "contain any .txt file).")


if __name__ == "__main__":
    run()
#!/usr/bin/python
# -*- coding: utf-8 -*-
from worker import NLPWorker,NLPService
from logger import *
from utils.basicutils import save_files_list
from utils.basicutils import get_files
from summarizer import Summarizer
import os,shutil,tempfile
class SummarizeWorker(NLPWorker):
    """Worker that produces a summary for a single input CCL/XML file."""

    def init(self):
        # Build the summarizer once per worker, from the path configured
        # under the ``sum_config`` key.
        self.summarizer = Summarizer(self.config['sum_config'])

    def process(self, inputFile, taskOptions, outputFile):
        """Summarize ``inputFile`` and write the result to ``outputFile``.

        ``taskOptions`` is accepted for interface compatibility but unused.
        """
        # Work on a private copy with an .xml suffix, as the summarizer
        # pipeline is driven by file names.
        tf = tempfile.NamedTemporaryFile(suffix='.xml')
        shutil.copy2(inputFile, tf.name)
        filein = save_files_list([tf.name])
        try:
            try:
                self.summarizer.process(filein, outputFile)
            except IndexError:
                # The summarizer raises IndexError for too-short texts;
                # produce an empty summary instead of failing the task.
                print("Text too short")
                open(outputFile, 'a').close()
        finally:
            # FIX: always remove the temporary files, even when the
            # summarizer fails with an unexpected exception (previously
            # they leaked on any error other than IndexError).
            os.unlink(filein)
            os.remove(tf.name)


if __name__ == '__main__':
    service = NLPService()
    service.start(SummarizeWorker)
This diff is collapsed.
import os
import tempfile
import re
# Extensions and regex patterns used to recognize (rel-)ccl and txt files.
CCL_EXTENSION = ".xml"
REL_CCL_EXTENSION = ".rel.xml"
# FIX: raw strings for the regex patterns, so "\." is not parsed as a (now
# deprecated) string escape sequence; the resulting values are unchanged.
CCL_PATTERN_EXTENSION = r"(\.rel)?\.xml$"
# NOTE(review): the leading dot here is an unescaped regex "any char".
TXT_PATTERN_EXTENSION = r".txt$"
def inherit_docs(cls):
    """! Inherit docstrings of superclass functions if they are not defined. """
    # FIX: use items() instead of the Python-2-only iteritems() so the
    # decorator also works under Python 3 (identical behaviour on 2.x).
    for name, elem in vars(cls).items():
        if not elem.__doc__:
            for parent in cls.__bases__:
                # Does the parent define this attribute?
                p_elem = getattr(parent, name, None)
                if p_elem and getattr(p_elem, '__doc__'):
                    # Copy the parent's docstring onto the child's attribute.
                    elem.__doc__ = p_elem.__doc__
                    # Found one - don't look further up the bases.
                    break
    return cls
def get_only_ccl_files(ccl_dir):
    """!
    Return only ccl files (not rel-ccl) in the given directory.

    @param ccl_dir: a directory with ccl files (having .xml or .rel.xml
    extensions). It must have a flat structure.
    @type ccl_dir: str
    @return a list of ccl files.
    @rtype: list
    """
    plain_ccls = []
    for entry in os.listdir(ccl_dir):
        # Keep .xml entries, but drop the .rel.xml ones.
        if entry.endswith(CCL_EXTENSION) and not entry.endswith(REL_CCL_EXTENSION):
            plain_ccls.append(entry)
    return plain_ccls
def get_ccls_without_extensions(ccl_dir):
    """!
    Return all entries of the ccl directory.

    NOTE(review): despite the name, extensions are NOT stripped here - the
    raw directory listing is returned; confirm the intent against callers.
    """
    return list(os.listdir(ccl_dir))
def get_ccl_relccl_files(ccl_dir):
    """!
    Return paired ccl and rel-ccl files in the given directory.

    @param ccl_dir: a directory with ccl files (having .xml or .rel.xml
    extensions). It must have a flat structure.
    @type ccl_dir: str
    @return a list of tuples (ccl, rel-ccl) files.
    @rtype: list
    """
    # Find ccl and rel-ccl files.
    ccl_files = get_only_ccl_files(ccl_dir)
    relccl_files = get_files([ccl_dir], extension=REL_CCL_EXTENSION)
    # FIX: get_files() returns absolute paths while get_only_ccl_files()
    # returns bare file names, so keys built from the full path could never
    # match and no pair was ever found. Strip the directory part before
    # removing the extension.
    relccl_map = dict(
        (re.sub(CCL_PATTERN_EXTENSION, "", os.path.basename(relccl)), relccl)
        for relccl in relccl_files)
    # Match the two lists by the extension-less base name.
    pairs = []
    for ccl in ccl_files:
        key = re.sub(CCL_PATTERN_EXTENSION, "", ccl)
        if key in relccl_map:
            pairs.append((ccl, relccl_map[key]))
    return pairs
def filter_files(files_list, ccl_dir):
    """!
    Filter a list of files by removing files that have been already preprocessed.

    The method checks whether the provided ccl directory contains files from
    the input list. A file from the input list is compared
    with a set of files (without extensions) in the ccl directory.

    @param files_list a list of files.
    @type files_list: list
    @param ccl_dir: a directory with ccl files (having .xml or .rel.xml
    extensions). It must have a flat structure.
    @type ccl_dir: str
    @return a list of files that haven't been preprocessed yet (don't exist in
    the ccl directory)
    @rtype: list
    """
    # Names already present in the ccl directory, with the (rel-)ccl
    # extension stripped.
    already_done = set()
    for entry in os.listdir(ccl_dir):
        if re.search(CCL_PATTERN_EXTENSION, entry):
            already_done.add(re.sub(CCL_PATTERN_EXTENSION, "", entry))
    # Keep only input files whose base name has no ccl counterpart yet.
    remaining = []
    for path in files_list:
        if os.path.basename(path) not in already_done:
            remaining.append(path)
    return remaining
def save_files_list(files_list):
    """!
    Save a list of files to a temporary file and return the name of this file.

    SAPER, in the batch mode, needs a file with a list of files to process.
    Each file has to be written in a separate line. Thus, this method writes
    the input list of files to a temporary file and returns the name of this
    file.

    @param files_list a list of files.
    @type files_list: list
    @return the name of a temporary file containing the list.
    @rtype: str
    """
    # FIX: open the file in text mode ('w') so writing str works under both
    # Python 2 and Python 3 (the default is binary mode, which breaks on 3.x)
    # and write through the public wrapper instead of the ``file`` attribute.
    tmp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
    try:
        tmp_file.write("\n".join(files_list))
    finally:
        tmp_file.close()
    return tmp_file.name
def load_files_list(files_list_path):
    """!
    Load and return a list of files from a file.

    The input file has to contain a list of files, each in a separate line.

    @param files_list_path the name of a file that contains a list of files.
    @type files_list_path: str
    @return the loaded list
    @rtype: list
    """
    # Strip whitespace (including the trailing newline) from every line.
    with open(files_list_path) as files_list_file:
        return [line.strip() for line in files_list_file]
def get_full_path(path):
    """!
    Return the absolute path of the input path.

    @param path an input path.
    @type path: str
    @return the absolute path of the input path.
    @rtype str
    """
    # Absolute paths are returned untouched; relative ones are resolved
    # against the current working directory.
    if os.path.isabs(path):
        return path
    return os.path.abspath(path)
def get_files(inputs, recursive=False, extension='.txt'):
    """!
    Return a list of files extracted from the inputs list.

    Inputs may have both files and directories. If a single input is a file
    then it is added to the output list. If an input is a directory then all
    files with the defined extension are added to the output list. If the
    recursive flag is set to True then files in directories are searched for
    recursively.

    @param inputs a list of files and directories
    @type inputs: list
    @param recursive whether to search directories recursively.
    @type recursive: bool
    @param extension the extension of files that are looked for in directories.
    @type extension: str
    @return a list of absolute paths of files.
    @rtype: list
    """
    output = []
    for single_input in inputs:
        # Silently skip inputs that don't exist.
        if not os.path.exists(single_input):
            continue
        if os.path.isfile(single_input):
            # A plain file is taken as-is, whatever its extension.
            output.append(get_full_path(single_input))
        elif os.path.isdir(single_input) and not recursive:
            # FIX: join the directory with the entry name before testing and
            # resolving it; the previous code checked the bare file name
            # against the current working directory, so it only worked when
            # the CWD happened to be the input directory.
            for filename in os.listdir(single_input):
                full = os.path.join(single_input, filename)
                if os.path.isfile(full) and filename.endswith(extension):
                    output.append(os.path.abspath(full))
        elif os.path.isdir(single_input):
            # Walk the whole tree and keep files with the wanted extension.
            for dirpath, _, filenames in os.walk(single_input):
                output.extend(os.path.abspath(os.path.join(dirpath, filename))
                              for filename in filenames
                              if filename.endswith(extension))
    return output
This diff is collapsed.
from ConfigParser import SafeConfigParser
from collections import OrderedDict
import re
import logging
import basicutils
import features
# Section names used only when loading a config file.
SEC_MAIN = "main"
SEC_MEAD = "mead"
# Option names used both when loading a config file and as keys of the inner
# options dictionary. The names may be changed; the inner dictionary is
# created automatically.
SAPER_CFG = "saper_cfg"
OVERWRITE_CCL = "overwrite_ccl"
TAGSET = "tagset"
MEAD_BIN_DIR = "mead_bin_dir"
MEAD_CFG_FILE = "mead_cfg_file"
FEATURES = "features"
# Options required in every feature section of the config file.
FEATURE_TYPE = "type"
FEATURE_CLASS = "class"
FEATURE_NAME = "feature_name"
FEATURE_FILE = "out_file"
# Recognized values of the feature "type" option.
FEATURE_TYPE_EXTERNAL = "external"
FEATURE_TYPE_TOOL = "tool"
FEATURE_TYPE_MEAD = "mead"
class ConfigOptions(object):
"""!
A configuration options object that reads parameters from a configuration
file.
"""
    def __init__(self, cfg_file_path):
        """!
        Create an instance of ConfigObject based on a configuration file.

        @param cfg_file_path the path of a configuration file.
        @type cfg_file_path: str
        """
        # All options are kept in a flat dictionary filled by _parse_cfg().
        self._options = dict()
        self._parse_cfg(cfg_file_path)
    @property
    def saper_cfg(self):
        """! The absolute path of a SAPER configuration file. """
        # This option is mandatory: an instance of the class won't be created
        # without it, so a direct indexed lookup is safe here.
        return self._options[SAPER_CFG]
    @property
    def overwrite_ccl(self):
        """! Whether to run SAPER again and overwrite ccl files. """
        # The default value has been set while parsing the cfg file, so
        # don't use get() with a default value here.
        return self._options[OVERWRITE_CCL]
    @property
    def mead_bin_dir(self):
        """! The absolute path of MEAD's bin directory. """
        # The default value has been set while parsing the cfg file, so
        # don't use get() with a default value nor abspath() again.
        return self._options[MEAD_BIN_DIR]
    @property
    def mead_cfg_file(self):
        """! The absolute path of MEAD's configuration file. """
        # The default value has been set while parsing the cfg file, so
        # don't use get() with a default value nor abspath() again.
        return self._options[MEAD_CFG_FILE]
@property
def features(self):
"""! Return a list of dictionaries, each contains options for a feature. """
return [f_vals[1] for f_vals in self._options[FEATURES].itervalues()]
@property
def features_weights(self):
"""! Return features and their weights. """
return [(f_vals[1][FEATURE_NAME], f_vals[0])
for f_vals in self._options[FEATURES].itervalues()]
    @property
    def tagset(self):
        """! Return the name of a tagset (e.g. "nkjp"). """
        return self._options[TAGSET]
def _parse_cfg(self, cfg_file_path):
"""!
Parse a configuration file and store options.
@param cfg_file_path tha path of a configuration file.
@type cfg_file_path: str
"""
parser = SafeConfigParser()
parser.read(cfg_file_path)
# Read main section
saper_cfg_opt = parser.get(SEC_MAIN, SAPER_CFG).strip()
if saper_cfg_opt:
self._options[SAPER_CFG] = basicutils.get_full_path(saper_cfg_opt)
else:
msg = "Missing {:} value in section {:}".format(SAPER_CFG, SEC_MAIN)
logging.error(msg)
raise ValueError(msg)
self._options[OVERWRITE_CCL] = False
if parser.has_option(SEC_MAIN, OVERWRITE_CCL):
try:
self._options[OVERWRITE_CCL] = \