Commit 80d9bca5 authored by mateuszg's avatar mateuszg

Initial commit

parents
[main]
saper_cfg = /home/anaaa/REPOS/saper/cfg/entailment_bestn.ini
# An optional parameter, default to False
overwrite_ccl = False
tagset = nkjp
features =
[mead]
# An optional parameter, default to "/usr/local/share/mead/bin/"
mead_bin_dir = /usr/local/share/mead/bin/
# An optional MEAD's configuration file. If not provided a default one will
# be created in a cluster directory.
mead_cfg_file =
[main]
saper_cfg = /home/anaaa/REPOS/saper/cfg/entailment_bestn.ini
tagset = nkjp
features = QueryNE1 1 QueryNE2 3
# An optional parameter, default to False
overwrite_ccl = False
[mead]
# An optional parameter, default to "/usr/local/share/mead/bin/"
mead_bin_dir = /usr/local/share/mead/bin/
# An optional MEAD's configuration file. If not provided a default one will
# be created in a cluster directory.
# Don't fill it in! Not supported yet.
mead_cfg_file =
[QueryNE1]
type = external
class = NEQueryPhraseMatchFeature
feature_name = QueryNE1
out_file = ne1
channels =
weighting_function = 0.2
[QueryNE2]
type = external
class = NEQueryPhraseMatchFeature
feature_name = QueryNE2
out_file = ne2
channels =
weighting_function = frequency
;[Centroid]
;type = mead
;feature = Centroid
;mead_class = Centroid
[main]
saper_cfg = /home/ajanz/repo/saper/cfg/saper.ini
tagset = nkjp
features = QueryNE1 1 QueryNE2 3
# An optional parameter, default to False
overwrite_ccl = False
[mead]
# An optional parameter, default to "/usr/local/share/mead/bin/"
mead_bin_dir = /usr/local/share/mead/bin/
# An optional MEAD's configuration file. If not provided a default one will
# be created in a cluster directory.
# Don't fill it in! Not supported yet.
mead_cfg_file =
[QueryNE1]
type = external
class = NEQueryPhraseMatchFeature
feature_name = QueryNE1
out_file = ne1
channels =
weighting_function = 0.2
[QueryNE2]
type = external
class = NEQueryPhraseMatchFeature
feature_name = QueryNE2
out_file = ne2
channels =
weighting_function = frequency
;[Centroid]
;type = mead
;feature = Centroid
;mead_class = Centroid
; PLIK KONFIGURACYJNY WORKERA
; Plik zawiera konfigurację zarówno Api usługi sieciowej jak i narzędzia.
;
; Autor: Tomasz Walkowiak
; email: tomasz.walkowiak@pwr.edu.pl
; --------- CZĘŚĆ DLA Serwisu ---------
[service]
root = /samba/requests/
tool = summarize
rabbit_host =10.17.0.85
rabbit_user =clarin
rabbit_password =clarin123
; --------- CZĘŚĆ DLA Narzędzia ---------
[tool]
workers_number = 1
sum_config=cfg/basic.ini
# -*- encoding: utf-8 -*-
import os
import logging, logging.handlers
import traceback
DEBUG = logging.DEBUG
INFO = logging.INFO
WARNING = logging.WARNING
ERROR = logging.ERROR
CRITICAL = logging.CRITICAL


class Logger(object):
    """Adapter for the standard logging library - it has functions for both
    service-wise logging and task-wise logging.  The interface is very
    similar to the standard logging package."""

    def __init__(self, lvl, main_log_path, task_logs_path, log_to_console):
        """Constructor
        lvl - logging level constant (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        main_log_path - path to service-wise logfile
        task_logs_path - path to directory containing task specific logfiles
        log_to_console - if True, the main logger also echoes to the console
        """
        self._task_logs_path = task_logs_path
        self._main_log_path = main_log_path
        self._log_to_console = log_to_console
        self.default_logging_lvl = lvl
        logger_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        self._default_formatter = logging.Formatter(logger_format)
        # Created lazily on first use, so constructing a Logger does no I/O.
        self._main_logger = None

    def log(self, lvl, exception):
        """Log message in main logfile
        lvl - message level, as in constructor
        exception - string message or exception to log
        If the level is WARNING, ERROR or CRITICAL, logs a traceback as well.
        """
        self._ensure_main_logger()
        if lvl in (WARNING, ERROR, CRITICAL):
            self._log_traceback(self._main_logger, lvl, exception)
        self._log(self._main_logger, lvl, exception)

    # BUG FIX: all of the convenience wrappers below were missing the
    # ``self`` parameter, so every call (e.g. ``logger.debug(msg)``)
    # raised a TypeError.
    def debug(self, exception):
        self.log(DEBUG, exception)

    def info(self, exception):
        self.log(INFO, exception)

    def warning(self, exception):
        self.log(WARNING, exception)

    def error(self, exception):
        self.log(ERROR, exception)

    def task_log(self, task_token, lvl, exception):
        """Log message in task specific logfile
        task_token - token identifying the task (also names its logfile)
        lvl - message level, as in constructor
        exception - string message or exception to log
        If the level is WARNING, ERROR or CRITICAL, logs a traceback with
        the main logger as well.
        """
        if lvl in (WARNING, ERROR, CRITICAL):
            # BUG FIX: the main logger must exist before a traceback can be
            # written to it (previously this could pass None).
            self._ensure_main_logger()
            self._log_traceback(self._main_logger, lvl, exception)
        task_logger, task_logger_file_descriptor = self._get_task_logger(task_token)
        self._log(task_logger, lvl, exception)

    def task_debug(self, task_token, exception):
        self.task_log(task_token, DEBUG, exception)

    def task_info(self, task_token, exception):
        self.task_log(task_token, INFO, exception)

    def task_warning(self, task_token, exception):
        self.task_log(task_token, WARNING, exception)

    def task_error(self, task_token, exception):
        self.task_log(task_token, ERROR, exception)

    def shutdown(self):
        """Flush and close all logging handlers."""
        logging.shutdown()

    @staticmethod
    def str2logging_lvl(str_level):
        """Translates logging level in string into logging module const"""
        str_level = str_level.lower()
        return {"debug": DEBUG,
                "info": INFO,
                "warning": WARNING,
                "error": ERROR,
                "critical": CRITICAL}[str_level]

    def _ensure_main_logger(self):
        """Create the main logger on first use (lazy initialization)."""
        if self._main_logger is None:
            self._main_logger = self._get_main_logger(
                self._main_log_path, self._log_to_console)

    def _get_main_logger(self, main_log_path, log_to_console):
        """Creates/retrieves the main logger - global for the whole service
        and a parent to all task loggers - which writes to the main logfile
        and, optionally, to the console."""
        logger, descriptor = self._get_logger(
            self.default_logging_lvl, 'service', main_log_path)
        self.main_logfile_descr = descriptor
        # NOTE(review): _get_logger above guarantees the 'service' logger has
        # a handler, so this check is effectively always true here; kept for
        # behavioral compatibility.
        if log_to_console and self._check_logger_inited('service'):
            console_handler = logging.StreamHandler()
            console_handler.setLevel(self.default_logging_lvl)
            console_handler.setFormatter(self._default_formatter)
            logger.addHandler(console_handler)
        return logger

    def _get_task_logger(self, task_token):
        """Creates/retrieves a logging.Logger instance for the task with the
        supplied token; its logfile lives in the task logs directory."""
        task_logger_name = 'service.task-' + task_token
        task_log_file = os.path.join(self._task_logs_path, task_token)
        return self._get_logger(
            self.default_logging_lvl, task_logger_name, task_log_file,
            delay=True)

    def _get_logger(self, logging_lvl, name, filepath, delay=False):
        """Creates/retrieves a logging.Logger instance with the given level
        and name that writes into rotating files given by filepath.
        Returns the logger and its file descriptor (None if the file has not
        been opened yet, e.g. when delay=True)."""
        logger = logging.getLogger(name)
        if not self._check_logger_inited(name):
            handler = logging.handlers.RotatingFileHandler(
                filepath, delay=delay, maxBytes=1024000, backupCount=10)
            handler.setLevel(logging_lvl)
            handler.setFormatter(self._default_formatter)
            logger.setLevel(logging_lvl)
            logger.addHandler(handler)
        else:
            handler = logger.handlers[0]
        descriptor = handler.stream.fileno() if handler.stream else None
        return logger, descriptor

    def _log(self, logger, lvl, exception):
        """Logs given exception to given logger with given level."""
        logger.log(lvl, str(exception))

    def _log_traceback(self, logger, lvl, exception):
        """If the exception's traceback can be extracted, logs this traceback
        to the given logger."""
        if isinstance(exception, Exception):
            if hasattr(exception, 'traceback'):
                traceback_str = exception.traceback
            else:
                traceback_str = traceback.format_exc()
            if traceback_str is not None:
                # BUG FIX: honour the ``logger`` argument instead of always
                # writing to the (possibly uninitialized) main logger.
                self._log(logger, lvl, traceback_str)

    def _check_logger_inited(self, name):
        """Return True when the named logger already has handlers attached."""
        logger = logging.getLogger(name)
        return len(logger.handlers) != 0
"""
Main summarization script responsible for creating summarization of plain text
files. It uses MEAD tool with some extensions for processing plain texts in
polish language.
"""
import argparse
import os
import summary.utils.basicutils as basicutils
from summary.utils.basicutils import save_files_list
from summary.summarizer.summarizer import Summarizer
try:
    import argcomplete
except ImportError:
    # argcomplete is optional; tab completion is simply disabled without it.
    argcomplete = None


def create_argparser(argv=None, description=None):
    """!
    Create a parser and return the parsed arguments.
    @param argv a list of parser arguments (e.g. ['option','value',...] or only
    an option if it doesn't need any value - like flags).
    @type argv: list
    @param description that will be added to the parser and shown as a help
    message.
    @type description: str
    @return parsed arguments.
    @rtype: Namespace
    """
    arg_parser = argparse.ArgumentParser(
        description=description,
        # Combine the two formatters so defaults are shown AND the
        # description keeps its original line breaks.
        formatter_class=type(
            "CustomFormatter",
            (argparse.ArgumentDefaultsHelpFormatter,
             argparse.RawDescriptionHelpFormatter),
            {})
    )
    arg_parser.add_argument(
        'summary_config',
        help="An ini configuration file for summarization."
    )
    arg_parser.add_argument(
        'output_summary',
        help="The path to a file where the final summary will be written to."
    )
    arg_parser.add_argument(
        '-r',
        '--recursive',
        action='store_true',
        default=False,
        help="Whether to search an input directory recursively for txt files."
    )
    arg_parser.add_argument(
        'inputs',
        metavar="input",
        nargs='+',
        # BUG FIX: there was a missing space between "all" and "files" in the
        # help text (it rendered as "allfiles").
        help=("A plain txt file, or a directory with such files, to make the "
              "summary from. If the input is a file then it doesn't have to "
              "have a txt extension. However, in case of directory only files "
              "with txt extension are taken into account. Make sure that all "
              "files have different names.")
    )
    arg_parser.add_argument(
        '-c',
        '--ccl_mode',
        action='store_true',
        default=False,
        help="Use directly ccl files instead of plain txt files."
    )
    if argcomplete:
        argcomplete.autocomplete(arg_parser)
    return arg_parser.parse_args(args=argv)
def check_args(args):
    """!
    Check the correctness of input arguments, especially if all files exist.
    @param args parsed arguments.
    @type args: Namespace
    @return a list of found errors (as messages).
    @rtype list
    """
    found_errors = []
    config_path = args.summary_config
    if not os.path.exists(config_path):
        found_errors.append(("The given summary configuration file {:} "
                             "doesn't exist").format(config_path))
    # Collect all missing inputs so they can be reported in one message.
    missing = []
    for input_path in args.inputs:
        if not os.path.exists(input_path):
            missing.append(input_path)
    if missing:
        found_errors.append(("The following input files don't exist: "
                             "\n{:}").format("\n".join(missing)))
    return found_errors
def run(argv=None):
    """!
    Get correct input parameters and run summary process.
    @param argv a list of parser arguments (e.g. ['option','value',...] or only
    an option if it doesn't need any value - like flags).
    @type argv: list
    """
    args = create_argparser(argv, description=__doc__)
    errors = check_args(args)
    if errors:
        # Show errors and stop immediately.
        # BUG FIX: use the function-call form of print so the module also
        # parses under Python 3 (the original used a Python-2-only print
        # statement); the form below behaves identically under Python 2.
        print("\n".join(errors))
        import sys
        sys.exit(2)
    else:
        summarizer = Summarizer(args.summary_config)
        # Create a list of files to make the summary of
        files_list = basicutils.get_files(args.inputs, args.recursive)
        if files_list:
            tmp_filein = save_files_list(files_list)
            # process - the core function creating summaries
            summarizer.process(tmp_filein, args.output_summary)
            os.unlink(tmp_filein)
        else:
            print("No valid input files provided (e.g. directories may not "
                  "contain any .txt file).")


if __name__ == "__main__":
    run()
#!/usr/bin/python
# -*- coding: utf-8 -*-
from worker import NLPWorker,NLPService
from logger import *
from utils.basicutils import save_files_list
from utils.basicutils import get_files
from summarizer import Summarizer
import os,shutil,tempfile
class SummarizeWorker(NLPWorker):
    """NLP worker that produces a summary for a single input file."""

    def init(self):
        # One Summarizer instance is reused for every task this worker
        # handles; configured from the [tool] section's sum_config entry.
        self.summarizer = Summarizer(self.config['sum_config'])

    def process(self, inputFile, taskOptions, outputFile):
        """Summarize inputFile and write the result to outputFile.

        taskOptions is accepted for interface compatibility; it is not used.
        """
        # The summarizer expects an .xml path, so work on a private copy of
        # the input file.
        # BUG FIX: use delete=False instead of removing the file behind
        # NamedTemporaryFile's back - with the default delete=True the
        # implicit close tried to unlink an already-removed file.
        tf = tempfile.NamedTemporaryFile(suffix='.xml', delete=False)
        tf.close()
        shutil.copy2(inputFile, tf.name)
        filein = save_files_list([tf.name])
        try:
            self.summarizer.process(filein, outputFile)
        except IndexError:
            # Input too short to summarize - emit an empty summary instead
            # of failing the task.
            print("Text too short")
            open(outputFile, 'a').close()
        finally:
            # BUG FIX: always clean up the temporary files, even when
            # summarization raises an unexpected error.
            os.unlink(filein)
            os.remove(tf.name)


if __name__ == '__main__':
    service = NLPService()
    service.start(SummarizeWorker)
import os
import logging
import tempfile
import shutil
import errno
from summary.utils.configuration import ConfigOptions
from summary.utils.basicutils import save_files_list
from saper.saper_run import load_config as saper_load_cfg
from saper.saper_run import batch_processing as saper_preprocessing
import summary.utils.basicutils as basicutils
import summary.utils.meadutils as meadutils
import summary.utils.configuration as uconfig
import summary.features as features
class Summarizer(object):
def __init__(self, cfg=None, mode=True, tagset="nkjp"):
    """Create a summarizer.

    cfg - path to an ini configuration file, handed to ConfigOptions
    mode - ccl-mode flag: True means inputs are already ccl files
    tagset - name of the tagset used during preprocessing
    """
    super(Summarizer, self).__init__()
    self._config = ConfigOptions(cfg)
    self._ccl_mode = mode
    self._tagset = tagset
    # Working directories inside the cluster; populated later by
    # _init_dirs() once a cluster directory is known.
    self._cluster_dir = None
    self._ccl_dir = None
    self._query_dir = None
    self._docsent_dir = None
@property
def config(self):
    # Summarization configuration; set from an ini path in __init__.
    return self._config

@config.setter
def config(self, cfg):
    # NOTE(review): the setter stores the value as-is - unlike __init__,
    # which wraps the path in ConfigOptions; confirm callers pass a
    # ready-made configuration object here.
    self._config = cfg
@property
def ccl_mode(self):
    # True when inputs are already ccl files (read-only; set in __init__).
    return self._ccl_mode
@property
def cluster_dir(self):
    # Path to the cluster working directory, or None until one is set.
    return self._cluster_dir

@cluster_dir.setter
def cluster_dir(self, directory):
    # NOTE(review): unlike _init_dirs(), this does not derive the
    # ccl/query/docsent sub-directories - confirm callers expect that.
    self._cluster_dir = directory
@property
def initialized(self):
if self._config: #and self._cluster_dir:
return True
return False
def _init_dirs(self, cluster_dir):
    """Resolve the cluster directory and derive the working sub-directory
    paths used by later processing steps.

    If the resolved cluster directory does not exist, the cluster directory
    attribute is reset to None (the sub-directory attributes are left
    untouched, as in the original flow).
    """
    resolved = basicutils.get_full_path(cluster_dir)
    if not os.path.exists(resolved):
        self._cluster_dir = None
        return
    self._cluster_dir = resolved
    self._ccl_dir = os.path.join(resolved, meadutils.CLUSTER_CCL_DIR)
    self._query_dir = os.path.join(resolved, meadutils.CLUSTER_QUERY_DIR)
    self._docsent_dir = os.path.join(resolved, meadutils.CLUSTER_SENT_DIR)
def _update_cluster_file(self, cluster_path, cluster_name, files_list_path):
    """!
    Update a cluster file by adding to it files to be summarized.
    @param cluster_path the path where a cluster directory will be created.
    @type cluster_path: str
    @param cluster_name the name of the cluster directory.
    @type cluster_name: str
    @param files_list_path the path of a file that contains a list of files
    to make the summary of.
    @type files_list_path: str
    """
    input_files = basicutils.load_files_list(files_list_path)
    # Document identifiers must refer to ccl files; in plain-text mode the
    # ccl extension still has to be appended to each path.
    if self._ccl_mode:
        document_ids = [meadutils.get_did(path) for path in input_files]
    else:
        document_ids = [
            meadutils.get_did(
                "{:}{:}".format(path, basicutils.CCL_EXTENSION))
            for path in input_files
        ]
    cluster_file = meadutils.get_cluster_file_path(cluster_path, cluster_name)
    meadutils.add_files_to_cluster(cluster_file, document_ids)
def _run_preprocessing(self, files_list_path, cluster_dir):
    """!
    Preprocess input files and write the results to the ccl directory of
    the cluster (taken from the attributes set up by _init_dirs).
    @param files_list_path the path of a file that contains a list of files
    to make the summary of.
    @type files_list_path: str
    @param cluster_dir the path to the cluster directory.
    @type cluster_dir: str
    """
    ccl_dir = self._ccl_dir
    preprocess_list_path = files_list_path
    skip_preprocessing = False
    if not self._config.overwrite_ccl:
        # Drop files whose ccl output already exists in the cluster so they
        # are not preprocessed twice.
        remaining = basicutils.filter_files(
            basicutils.load_files_list(files_list_path),
            ccl_dir
        )
        if remaining:
            preprocess_list_path = save_files_list(remaining)
        else:
            skip_preprocessing = True
    if not skip_preprocessing:
        saper_preprocessing(
            self._tagset,
            preprocess_list_path,
            "plain",
            ccl_dir,
            saper_load_cfg(self._config.saper_cfg),
            exc_cont=False
        )
    # Remove the temporary list that was created for the filtered subset.
    if preprocess_list_path != files_list_path:
        os.unlink(preprocess_list_path)
def _make_features(self):
    """!
    Build feature objects from the configured feature options.

    Reads everything it needs from instance attributes: the configuration
    (mead_bin_dir, features), the cluster/ccl/query/docsent directories
    and the tagset; it takes no parameters.

    For each Query feature a query file is generated in the query directory
    and the feature's xml tag is collected; plain Feature and
    MeadFeatureClass kinds are not implemented yet and raise.

    NOTE(review): this method may continue beyond the visible part of the
    file; presumably ``features_xml_tags`` is returned at the end - confirm
    against the full source.
    """
    mead_bin_dir = self._config.mead_bin_dir
    features_options = self._config.features
    ccl_dir = self._ccl_dir
    query_dir = self._query_dir
    # cluster_dir is bound here but not used in the visible portion.
    cluster_dir = self._cluster_dir
    tagset = self._tagset
    docsent_dir = self._docsent_dir
    # Resolve each option's class name to a class in the features module
    # and instantiate it from its option dictionary.
    features_objs = [
        getattr(
            features,
            f_opt[uconfig.FEATURE_CLASS]
        ).instantiate_from_options(f_opt, mead_bin_dir)
        for f_opt in features_options
    ]
    # Make queries/features
    features_xml_tags = []
    query_counter = 1
    for feature_obj in features_objs:
        if isinstance(feature_obj, features.Query):
            feature_obj.make_query(
                ccl_dir,
                query_dir,
                "",  # TODO: how to make a query id?
                query_counter,
                tagset=tagset,
                docsent_dir=docsent_dir
            )
            features_xml_tags.append(feature_obj.xml_feature)
            query_counter += 1
        elif isinstance(feature_obj, features.Feature):
            # TODO
            raise NotImplementedError("Sorry, this part hasn't been implemented yet.")
        elif isinstance(feature_obj, features.MeadFeatureClass):
            # TODO
            raise NotImplementedError("Sorry, this part hasn't been implemented yet.")