Commit cfcad47f authored by Szymon Ciombor's avatar Szymon Ciombor Committed by Mateusz Gniewkowski

added Dockerfile, CI, fixed several bugs, escaped dependency hell

parent 80d9bca5
image: 'clarinpl/python:2.7'
build_image:
image: 'docker:18.09.7'
only:
- master
services:
- 'docker:18.09.7-dind'
script:
- docker build -t clarinpl/summarize .
- echo $DOCKER_PASSWORD > pass.txt
- cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
- rm pass.txt
- docker push clarinpl/summarize
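As an aside, the pass.txt round trip could likely be collapsed into a single pipe, since docker login reads the secret from stdin; a sketch of the same script step:

script:
  - docker build -t clarinpl/summarize .
  - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USERNAME" --password-stdin
  - docker push clarinpl/summarize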
FROM clarinpl/python:2.7
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
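# CLARIN-PL NLP stack (corpus2, WCCL and morphological-analyser bindings for
# Python 2.7) plus the CRF++ runtime; these packages are assumed to come from
# apt repositories already configured in the clarinpl/python:2.7 base image.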
RUN apt-get update && apt-get install -y \
pwrutils \
crfpp \
corpus2-python2.7 \
corpus2mwe-python2.7 \
wccl-python2.7 \
morphanalyser-python2.7 \
expat
WORKDIR /home/deps
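# Build the CRF++ Python bindings from source, fetch the MEAD toolkit and
# install MEAD's Perl dependencies via cpanminus.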
RUN wget https://minio.clarin-pl.eu/public/share/CRF++-0.58.tar.gz && tar -xzf CRF++-0.58.tar.gz
RUN cd CRF++-0.58/python && python setup.py build && python setup.py install && cd /home/deps
RUN git clone https://github.com/juzefwt/mead-pl.git mead && mv mead /usr/local/share
RUN cpan App::cpanminus
RUN cpanm XML::Parser && \
cpanm XML::Writer && \
cpanm XML::TreeBuilder && \
cpanm Text::Iconv
WORKDIR /home/worker
COPY ./src ./src
COPY ./main.py .
COPY ./requirements.txt .
COPY ./saper-0.1.0-py2-none-any.whl .
RUN python2.7 -m pip install ./saper-0.1.0-py2-none-any.whl
RUN python2.7 -m pip install -r requirements.txt
CMD ["python2.7", "main.py", "service"]
\ No newline at end of file
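For local testing, the image would presumably be built and run along these lines (assuming a RabbitMQ instance is reachable under the hostname configured in config.ini):

docker build -t clarinpl/summarize .
docker run --rm clarinpl/summarize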
; WORKER CONFIGURATION FILE
; This file contains the configuration of both the web service API and the tool.
;
; Author: Tomasz Walkowiak
; email: tomasz.walkowiak@pwr.edu.pl
; --------- SERVICE SECTION ---------
[service]
root = /samba/requests/
tool = summarize
-rabbit_host =10.17.0.85
-rabbit_user =clarin
-rabbit_password =clarin123
+rabbit_host = rabbitmq
+rabbit_user = test
+rabbit_password = test
; --------- TOOL SECTION ---------
[tool]
-workers_number = 1
-sum_config=cfg/basic.ini
+workers_number = 2
+sum_config=src/cfg/basic.ini
[logging]
port = 9999
local_log_level = INFO
\ No newline at end of file
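For reference, a minimal Python 2 sketch of reading this file with the standard-library ConfigParser (mirroring the worker's _read_config further below):

from ConfigParser import RawConfigParser
parser = RawConfigParser()
parser.read("config.ini")
print parser.get("tool", "sum_config")  # -> src/cfg/basic.ini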
"""Implementation of summarize service."""
import argparse
import nlp_ws
from src.summarize_worker import SummarizeWorker
def get_args():
"""Gets command line arguments."""
parser = argparse.ArgumentParser(description="summarize implementation")
subparsers = parser.add_subparsers(dest="algorithm")
subparsers.required = True
subparsers.add_parser("service", help="Run as a service")
return parser.parse_args()
def main():
"""Runs the program."""
args = get_args()
generators = {
"service": lambda: nlp_ws.NLPService.main(SummarizeWorker),
}
gen_fn = generators.get(args.algorithm, lambda: None)
gen_fn()
if __name__ == "__main__":
main()
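The service entry point is then invoked exactly as in the Dockerfile's CMD:

python2.7 main.py service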
nlp_ws
lxml
saper
\ No newline at end of file
@@ -3,13 +3,11 @@ saper_cfg = /home/anaaa/REPOS/saper/cfg/entailment_bestn.ini
# An optional parameter, defaults to False
overwrite_ccl = False
tagset = nkjp
features =
[mead]
# An optional parameter, defaults to "/usr/local/share/mead/bin/"
mead_bin_dir = /usr/local/share/mead/bin/
# An optional MEAD configuration file. If not provided, a default one will
# be created in the cluster directory.
mead_cfg_file =
from summary.features.feature import Query
import summary.utils.basicutils as basicutils
import stdmods.cclutils as cclutils
import summary.utils.meadutils as meadutils
from collections import defaultdict
import summary.utils.configuration as uconfig
def _func_weight(str_representation):
"""!
    Return a weighting function given its string representation.
    @param str_representation one of the defined string representations.
@type str_representation: str
@return a function object
@rtype function
"""
# Make sure that all returned functions take exactly one input parameter
# which is a list of the number of occurrences
if str_representation == "frequency":
return _freq_weight
else:
try:
constant_val = float(str_representation)
return lambda occ: [constant_val]*len(occ)
        except ValueError:
raise AttributeError("Wrong weighting function representation.")
def _freq_weight(occurrences):
"""!
    Divide each element's occurrence count by the sum of all counts.
    @param occurrences a list of occurrence counts.
    @type occurrences: list
    @return the normalized occurrences.
@rtype: list
"""
sum_ = float(sum(occurrences))
return [occ / sum_ for occ in occurrences]
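# For illustration (derived directly from the two functions above):
#   _func_weight("frequency")([2, 3, 5])  returns [0.2, 0.3, 0.5]
#   _func_weight("0.5")([1, 2, 3])        returns [0.5, 0.5, 0.5]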
ATTRIBUTES_FUNCS = {
"channels": lambda s: s.strip().split(),
"weighting_function": _func_weight
}
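# e.g. ATTRIBUTES_FUNCS["channels"]("  nam_liv nam_loc ") returns ['nam_liv', 'nam_loc']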
@basicutils.inherit_docs
class NEQueryPhraseMatchFeature(Query):
"""!
Integration of Named Entities (NE) and MEAD's QueryPhraseMatch feature.
    The class needs ccl files with NE channels. It is effectively a proxy
    class: it only creates a keyword file and builds the fragment of the
    MEAD command that will be invoked later (by a different part of the
    application).
    By defining a different set of channels or choosing a different
    weighting function, the final query output may vary. Define these in
    a configuration file.
"""
def __init__(self, query_class_path, query_feature_name, query_file,
title="", description="", narrative="", keywords=None,
*args, **kwargs):
# Extract required kwargs
for required_key in NEQueryPhraseMatchFeature.required_init_kwargs():
# Check kwargs
if required_key not in kwargs:
raise AttributeError(
"Missing {:} in keyword arguments".format(required_key)
)
attribute_name = "_{:}".format(required_key)
            # Check that the class doesn't already use this attribute name
if hasattr(self, attribute_name):
raise AttributeError(
("Attribute name {:} exists, use a different name (the prefix '_' "
"is added automatically)").format(attribute_name)
)
else:
                # So far it's feasible to set these attributes dynamically.
                # If this grows rapidly or becomes too difficult to
                # maintain, it should be changed.
setattr(
self,
attribute_name,
ATTRIBUTES_FUNCS[required_key](kwargs[required_key])
)
        # The Query class will receive all kwargs, so the ones required by
        # this class must be removed first.
super_kwargs = dict(kwargs)
for key in NEQueryPhraseMatchFeature.required_init_kwargs():
del super_kwargs[key]
super(NEQueryPhraseMatchFeature, self).__init__(
query_class_path,
query_feature_name,
query_file,
title,
description,
narrative,
keywords,
*args,
**super_kwargs
)
@staticmethod
def mead_class_name():
"""! Return the name of MEAD's class the feature refers to. """
return "QueryPhraseMatch"
@staticmethod
def required_kwargs():
return "tagset", "docsent_dir"
@staticmethod
def required_init_kwargs():
return "channels", "weighting_function"
def make_query(self, ccl_dir, query_dir, query_id, query_no, **kwargs):
"""!
Make a query by selecting named entities as keywords.
        It requires a tagset name (or object) as an additional keyword
        argument. The method merges in any keywords provided at
        instantiation time before saving the query file.
@param ccl_dir a ccl directory.
@type ccl_dir: str
@param query_dir a directory where a query will be saved.
@type query_dir: str
@param query_id an identifier of a query.
@type query_id: str
@param query_no the number of a query.
@type query_no: int
"""
# Get tagset - an exception is raised if it's missing
tagset = cclutils.get_tagset(kwargs["tagset"])
extracted_ne = defaultdict(float)
# Load ccl files
documents = cclutils.read_ccl_from_dir(ccl_dir)
for doc in documents:
# Create NE extractor and get NE
ne_extractor = cclutils.NamedEntitiesExtractor(doc, tagset)
for ne in ne_extractor.extracted_ne(self._channels):
extracted_ne[ne] += 1.
# NE are keywords - weight them
# Keep the same order in next steps
keywords, occurrences = zip(*extracted_ne.items())
keyword_dict = dict(zip(keywords, self._weighting_function(occurrences)))
keyword_dict.update(self._keywords)
output_query_file = meadutils.save_query(
query_dir,
self._query_file,
self._title,
self._description,
self._narrative,
keyword_dict,
query_id=query_id,
query_no=query_no
)
# Update args
self._single_args.append(basicutils.get_full_path(output_query_file))
self._single_args.append(basicutils.get_full_path(kwargs['docsent_dir']))
@classmethod
def instantiate_from_options(cls, options_dict, mead_bin_dir=None):
query_class_path = \
meadutils.get_mead_class_path(cls.mead_class_name(), mead_bin_dir)
query_feature_name = options_dict[uconfig.FEATURE_NAME]
query_file = options_dict[uconfig.FEATURE_FILE]
kwargs = dict(options_dict)
# Remove keys invalid for query class
del kwargs[uconfig.FEATURE_TYPE]
del kwargs[uconfig.FEATURE_CLASS]
del kwargs[uconfig.FEATURE_NAME]
del kwargs[uconfig.FEATURE_FILE]
return cls(query_class_path, query_feature_name, query_file, **kwargs)
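To make the kwargs contract concrete, a hypothetical instantiation (the class path, feature name, query file and channel names below are made up for illustration):

feature = NEQueryPhraseMatchFeature(
    "/usr/local/share/mead/bin/feature-scripts/QueryPhraseMatch.pl",  # hypothetical class path
    "NEQuery",                       # query_feature_name (made up)
    "query.xml",                     # query_file (made up)
    channels="nam_liv nam_loc",      # NE channels, split on whitespace
    weighting_function="frequency",  # or a numeric constant such as "0.5"
)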
@@ -5,9 +5,9 @@ Polish language.
"""
import argparse
import os
-import summary.utils.basicutils as basicutils
-from summary.utils.basicutils import save_files_list
-from summary.summarizer.summarizer import Summarizer
+import utils.basicutils as basicutils
+from utils.basicutils import save_files_list
+from summarizer import Summarizer
try:
import argcomplete
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
-from worker import NLPWorker,NLPService
+import nlp_ws
from logger import *
from utils.basicutils import save_files_list
from utils.basicutils import get_files
from summarizer import Summarizer
import os,shutil,tempfile
-class SummarizeWorker(NLPWorker):
-    def init(self):
-        self.summarizer=Summarizer(self.config['sum_config'])
+class SummarizeWorker(nlp_ws.NLPWorker):
+    @classmethod
+    def static_init(self, config):
+        self.summarizer=Summarizer(config['tool']['sum_config'])
def process(self, inputFile, taskOptions, outputFile):
@@ -30,9 +31,3 @@ class SummarizeWorker(NLPWorker):
open(outputFile, 'a').close()
os.unlink(filein)
os.remove(tf.name)
-if __name__ == '__main__':
-    service= NLPService();
-    service.start(SummarizeWorker);
@@ -4,14 +4,14 @@ import tempfile
import shutil
import errno
-from summary.utils.configuration import ConfigOptions
-from summary.utils.basicutils import save_files_list
+from utils.configuration import ConfigOptions
+from utils.basicutils import save_files_list
from saper.saper_run import load_config as saper_load_cfg
from saper.saper_run import batch_processing as saper_preprocessing
-import summary.utils.basicutils as basicutils
-import summary.utils.meadutils as meadutils
-import summary.utils.configuration as uconfig
-import summary.features as features
+import utils.basicutils as basicutils
+import utils.meadutils as meadutils
+import utils.configuration as uconfig
+import features
class Summarizer(object):
......
@@ -3,7 +3,7 @@ from collections import OrderedDict
import re
import logging
import basicutils
-import features
+import src.features
# Only for loading a config file
SEC_MAIN = "main"
......
@@ -2,8 +2,8 @@ from ConfigParser import SafeConfigParser
from collections import OrderedDict
import re
import logging
-import summary.utils.basicutils as basicutils
-import summary.features as features
+import basicutils as basicutils
+import src.features as features
# Only for loading a config file
SEC_MAIN = "main"
......
@@ -8,8 +8,8 @@ MEAD's structures, files and scripts.
import os
import lxml.etree as lxmltree
import subprocess
-import stdmods.cclutils as cclutils
-import summary.utils.basicutils as basicutils
+import cclutils as cclutils
+import basicutils
import shutil
ENCODING = "UTF-8"
@@ -623,9 +623,9 @@ def extract_summary_sentences(cluster_dir, cluster_name, tagset):
for did in dids:
ccl_doc_path = os.path.join(cluster_dir, CLUSTER_CCL_DIR, did)
ccl_sentences[did] = [
-            cclutils.sentence2str(sent)
+            cclutils.sentence2str(sent[0])
             for sent
-            in cclutils.get_sentences(ccl_doc_path, tagset_obj, flat=True)
+            in cclutils.get_sentences(ccl_doc_path, tagset_obj)
]
# Make the final result
summary_list = [ccl_sentences[did][sno] for did, sno
......
#!/usr/bin/env python
import pika
import ConfigParser
import os
import json
import shutil,time
from multiprocessing import Process
from logger import *
class NLPWorker(object):
"""Class for communication with service engine. Has functions for retrieving new task (if available)
and finishing completed tasks."""
def __init__(self,connection,logger,config):
"""Constructor
config - config given in the form of dictionary containing required paths and request data.
logger - logger instance (as in logger.py)
"""
self.logger=logger
self.connection=connection;
self.config=config
self.queue_name="nlp_"+self.config["tool"];
self.init();
def init(self):
return
@staticmethod
def static_init(config,logger):
return
def start(self):
        credentials = pika.PlainCredentials(self.config['rabbit_user'], self.config['rabbit_password'])
        self.connection = pika.BlockingConnection(pika.ConnectionParameters(
            host=self.config['rabbit_host'], credentials=credentials))
channel = self.connection.channel()
channel.queue_declare(queue=self.queue_name)
#self.channel=channel;
channel.basic_qos(prefetch_count=1)
channel.basic_consume(self.on_request, queue=self.queue_name)
self.logger.log(INFO, "Worker started with queue: " +self.queue_name )
channel.start_consuming()
def on_request(self,ch, method, props, body):
result={};
try:
data=json.loads(body);
outputFile=self.config["root"]+self.config["tool"]+"/"+props.correlation_id;
result["file"]=outputFile;
start_time = time.time()
self.process(str(data["file"]),data["options"],outputFile);
stop_time = time.time()
self.logger.log(INFO, "Finished processing of task: " + props.correlation_id+ " in "+str(-start_time+stop_time));
result["error"]="";
result["time"]=-start_time+stop_time;
except Exception as e:
self.logger.log(ERROR, "Unable to process a task "+props.correlation_id+" with message: "+body)
result["error"]=str(e);
self.logger.log(ERROR, e)
ch.basic_publish(exchange='',
routing_key=props.reply_to,
properties=pika.BasicProperties(correlation_id = \
props.correlation_id),
body=str(json.dumps(result)))
ch.basic_ack(delivery_tag = method.delivery_tag)
self.logger.log(INFO, "Finished task: " + props.correlation_id)
def process(self, inputFile, taskOptions, outputFile):
"""Processing function. This is specific to service and should be implemented in the subclass."""
raise NotImplementedError("`process` method should be implemented in the subclass.")
class NLPService(object):
def __init__(self):
run_as_daemon=False;
self.config = self._read_config("config.ini")
self.connection=None
logging_lvl = Logger.str2logging_lvl("debug")
logfile_path = os.path.join("", 'service.log')
self.logger = Logger(logging_lvl, logfile_path, "", True)
path=self.config["root"]+self.config["tool"]+'/';
d = os.path.dirname(path)
if not os.path.exists(d):
os.makedirs(d)
def p(self,workerClass):
worker=workerClass(self.connection,self.logger,self.config);
worker.start()
def start(self,workerClass):
self.logger.log(INFO, "Starting "+self.config["tool"] );
workerClass.static_init(self.config,self.logger);
processes=[];
for id in range(self.config['workers']):
p = Process(target=self.p, args=(workerClass,))
p.start()
processes.append(p)
for p in processes:
p.join()
def _read_config(self, config_path):
"""Read config file and create dictionary of option values"""
S_SERVICE = 'service'
S_TOOL = 'tool'
config = dict()
with open(config_path) as config_file:
config_parser = ConfigParser.RawConfigParser()
config_parser.readfp(config_file)
for name, value in config_parser.items(S_SERVICE):
config[name] = value
for name, value in config_parser.items(S_TOOL):
config[name] = value
config['workers'] = int(config['workers_number']) if 'workers_number' in config else 1
return config
#worker= Worker();
#worker.start();