Commit 00180cd7 authored by Mateusz Gniewkowski's avatar Mateusz Gniewkowski

Merge branch 'OMWN_to_docker' into 'master'

Omwn to docker

See merge request !1
parents f62a61f3 c510bbaf
Pipeline #746 passed with stages
in 1 minute and 56 seconds
image: 'clarinpl/python:3.6'
cache:
paths:
- .tox
stages:
- check_style
- build
before_script:
- pip install tox==2.9.1
pep8:
stage: check_style
script:
- tox -v -e pep8
docstyle:
stage: check_style
script:
- tox -v -e docstyle
build_image:
  stage: build
  image: 'docker:18.09.7'
  only:
    - master
  services:
    - 'docker:18.09.7-dind'
  # Overrides the global before_script (pip install tox), which this job
  # does not use.
  before_script:
    - ''
  script:
    - docker build -t clarinpl/omwn .
    # Pipe the secret straight into docker login: no password file is left in
    # the workspace if the login fails, and quoting keeps special characters
    # in the credentials intact.
    - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USERNAME" --password-stdin
    - docker push clarinpl/omwn
FROM clarinpl/python:3.6
WORKDIR /home/worker
# Install the dependencies before copying the sources, so this layer is reused
# from the build cache until requirements.txt itself changes.
COPY ./requirements.txt .
RUN python3.6 -m pip install --no-cache-dir -r requirements.txt
COPY ./src ./src
COPY ./main.py .
CMD ["python3.6", "main.py", "service"]
version: '3'
services:
stat:
container_name: clarin_omwn
build: ./
working_dir: /home/worker
entrypoint:
- python3.6
- main.py
- service
environment:
- PYTHONUNBUFFERED=0
volumes:
- '/samba:/samba'
- './config.ini:/home/worker/config.ini'
- './src:/home/worker/src'
- './main.py:/home/worker/main.py'
from ._service import *
from ._worker import *
This diff is collapsed.
from __future__ import absolute_import, unicode_literals, division
from abc import ABCMeta, abstractmethod
import logging
import six
__all__ = 'LexWorker',
@six.add_metaclass(ABCMeta)
class LexWorker(object):
    """
    Abstract base class that every worker must derive from.

    It defines hooks that subclasses may override to initialize and clean up
    the worker, plus the abstract :meth:`process` method that is called for
    every request made to the worker.
    """

    @classmethod
    def static_init(cls, config):
        """
        One-time initialization hook, called exactly once when the worker
        class is loaded and the service is starting.

        Use it to load resources that can be shared across processes, so they
        are not loaded many times. Every variable this method adds to the
        class is pickled and sent to all started processes, so it has to be
        picklable.

        The default implementation does nothing.

        :param dict config: The whole service configuration dictionary, in
            case the worker wants to look at some of the parameters.
        """

    @classmethod
    def static_close(cls):
        """
        Cleanup hook, called after all processes have stopped and the service
        is shutting down.

        Override it when shared resources allocated by :meth:`static_init`
        need to be released. The default implementation does nothing.
        """

    @staticmethod
    def logging_init(log_socket_handler, log_levels):
        """
        Set up the loggers used by the worker.

        Called in each subprocess before :meth:`init`. Every key of
        ``log_levels`` is treated as a logger name: that logger gets the
        associated level and ``log_socket_handler`` as a handler, so it can
        log to the centralized logging system.

        Nothing happens when ``log_socket_handler`` is ``None``.

        Subclasses normally have no reason to override this method, unless
        some loggers require very special treatment.

        :param log_socket_handler: The socket handler created for the process.
            May be ``None`` if the central logger has not been set up.
        :type log_socket_handler: Optional[logging.handlers.SocketHandler]
        :param log_levels: Mapping of logger names to their levels. Normally
            taken from the ``logging_levels`` section of the config
            dictionary, after textual level names are resolved.
        :type log_levels: Mapping[str,int]
        """
        if log_socket_handler is not None:
            for logger_name, logger_level in six.iteritems(log_levels):
                worker_logger = logging.getLogger(logger_name)
                worker_logger.setLevel(logger_level)
                worker_logger.addHandler(log_socket_handler)

    def init(self):
        """
        Per-instance initialization hook, called after an instance of this
        class has been constructed in the process it will run in.

        It runs once for each instance (and therefore each process) and
        should load all resources that can't be pickled and shared. The
        default implementation does nothing.
        """

    def close(self):
        """
        Per-process cleanup hook, called when the process is being shut down.

        Override it when the worker allocates per-process resources that need
        to be released. The default implementation does nothing.
        """

    @abstractmethod
    def process(self, task_options):
        """
        Handle a single request made to the worker.

        This is where the task the service exists for is performed, so every
        subclass must override it.

        :param dict task_options: Options for the current processing task.
            Subclasses should describe which options they can handle (or
            require). The dictionary may contain any values that can be
            JSON-encoded.
        """
from __future__ import absolute_import, unicode_literals, division
import logging
from logging.handlers import RotatingFileHandler
from threading import Thread
from struct import Struct
from six.moves import socketserver, cPickle
__all__ = 'LogServer', 'configure_loggers', 'parse_loglevel'
_DEFAULT_LOGFORMAT = ('%(processName)s>>> [%(asctime)s] (%(name)s:%(lineno)d) '
'%(levelname)s: %(message)s')
# Since logging module doesn't standardize name-to-level conversion, here's a
# dict with the standard levels.
_NAME2LVL = {
'CRITICAL': 50,
'ERROR': 40,
'WARNING': 30,
'INFO': 20,
'DEBUG': 10,
}
_logger = logging.getLogger(__name__)
# Main logger is configured for loggers in this library themselves. It gets the
# same handler as worker loggers.
_main_logger = logging.getLogger(__name__.split('.', 1)[0])
# The logger that handles logs from the remote loggers. It stays apart from the
# loggers of this library, accepts everything, logs nothing on its own and only
# handles received log records.
_service_logger = logging.getLogger('<service-remote>')
_handler = None
def configure_loggers(logfile_name='service.log',
                      logfile_maxsize=1024**2,
                      logfile_maxbackups=10,
                      log_format=_DEFAULT_LOGFORMAT,
                      local_log_level='WARNING'):
    """
    Configure the logger used by :class:`LogServer` instances.

    This function must be called before any logging servers are started (not
    that there should ever be need for more than one) and cannot be called
    again after it has succeeded. This is to ensure thread safety.

    All arguments are validated before any module state is changed, so a call
    that fails (for example because of an unknown level name) leaves the
    module unconfigured and may simply be repeated with corrected arguments.

    :param str logfile_name: Name of the file to which logs are written.
    :param int logfile_maxsize: Maximal size in bytes of a single rotating
        log file. Default is 1 MiB.
    :param int logfile_maxbackups: Maximal number of backup log files kept.
        Default is 10.
    :param str log_format: Format string for log records output by this
        server. A reasonable default is provided.
    :param str local_log_level: Name of the log level for loggers in *this*
        library. Refer to standard documentation for possible names. This
        setting does not affect loggers from workers.
    :raise RuntimeError: When the loggers have already been configured.
    :raise KeyError: When ``local_log_level`` is neither an integer literal
        nor the name of a standard logging level.
    """
    global _handler

    if _handler is not None:
        raise RuntimeError('Cannot configure logger twice')

    # Build everything that can fail first. Publishing ``_handler`` before
    # this point would make a failed call look like a successful one and
    # reject every retry with "Cannot configure logger twice".
    local_level = parse_loglevel(local_log_level)
    fmter = logging.Formatter(log_format)

    # delay=True: the log file is not opened until the first record arrives.
    file_handler = RotatingFileHandler(logfile_name,
                                       maxBytes=logfile_maxsize,
                                       backupCount=logfile_maxbackups,
                                       encoding='utf-8',
                                       delay=True)
    file_handler.setFormatter(fmter)

    # Also log to stderr (usually will be screen). This does not require any
    # fussing.
    stdhandler = logging.StreamHandler()
    stdhandler.setFormatter(fmter)

    _main_logger.addHandler(file_handler)
    _main_logger.addHandler(stdhandler)
    _main_logger.setLevel(local_level)

    _service_logger.addHandler(file_handler)
    _service_logger.addHandler(stdhandler)
    # Make sure this will accept everything. Worker loggers should do
    # filtering.
    _service_logger.setLevel(logging.NOTSET)

    _handler = file_handler
def parse_loglevel(log_level):
    """
    Convert a textual log level into the number used by ``logging``.

    An integer literal is returned as an integer. Anything else is looked up,
    case-insensitively, among the standard level names.

    :param str log_level: String naming the log level, to be parsed.
    :return: Integer value of the log level, as used by ``logging`` module.
    :rtype: int
    :raise KeyError: When ``log_level`` is neither an integer literal nor
        the name of a standard logging level.
    """
    try:
        return int(log_level)
    except ValueError:
        # Not an integer literal, so it has to be a standard level name.
        return _NAME2LVL[log_level.upper()]
class LogServer(object):
    """
    Creates and starts a logging server thread. This thread waits for
    ``LogRecord`` pickles on a given port on localhost and logs them to a
    rotating file handler.

    The thread is meant to run while waiting for subprocesses to end, so it
    should not impact efficiency. The thread will also spend most of its time
    listening on socket.

    The logging server can be told to shutdown at any time.
    """

    # str cast is for python2 compatibility (this module enables
    # unicode_literals).
    HOST = str('localhost')
    # Passed to serve_forever() as its poll interval, in seconds.
    SHUTDOWN_POLL_INTERVAL = 2.

    def __init__(self, port):
        """
        :param int port: Number of TCP port the server will be listening on.
        """
        self._port = port
        self._sserver = _LogSocketServer((self.HOST, port), _LogRequestHandler)
        self._sthread = Thread(target=self._sserver.serve_forever,
                               args=(self.SHUTDOWN_POLL_INTERVAL,),
                               name='logging')

    @property
    def socket_address(self):
        """The address tuple for socket handlers to connect to this server."""
        return self.HOST, self._port

    def start(self):
        """
        Start the logging thread and return immediately.

        :raise RuntimeError: If :func:`configure_loggers` has not been
            called before this method.
        """
        if _handler is None:
            raise RuntimeError('configure_loggers() has not been '
                               'called before starting')
        self._sthread.start()

    def shutdown(self):
        """
        Shutdown the logging server and thread, then release its socket.

        If the thread is not alive, silently do nothing.
        """
        if not self._sthread.is_alive():
            return
        self._sserver.shutdown()
        self._sthread.join()
        # shutdown() only makes serve_forever() return; the listening socket
        # stays bound until server_close() is called.
        self._sserver.server_close()
class _LogSocketServer(socketserver.TCPServer):
    """
    TCP server whose request errors are reported through ``logging``.

    The stock server prints such errors itself; this subclass exists so they
    are integrated with the logging system instead.
    """

    def handle_error(self, request, client_address):
        """
        Log the exception raised while serving ``client_address``.

        socketserver calls this hook from inside an ``except`` block, so the
        active exception is available to ``exception()``.
        """
        _logger.exception(
            'Error while handling message from %r', client_address
        )
class _LogRequestHandler(socketserver.StreamRequestHandler):
    """
    Reads one length-prefixed, pickled log record per request and hands it to
    the service logger.

    Based on the stdlib example:
    https://docs.python.org/2/howto/logging-cookbook.html#sending-and-receiving-logging-events-across-a-network

    As in that example, the record is preceded by an unsigned long holding
    its length. The size of that prefix is taken from the struct definition
    instead of the 4 bytes hard-coded in the example.
    """

    # str cast is for python2 compatibility.
    __PREFIX_STRUCT = Struct(str('!L'))

    def handle(self):
        """
        Receive a single log record and pass it to ``_service_logger``.

        :raise RuntimeError: When the stream ends before the whole length
            prefix or the whole payload has been read.
        """
        prefix_size = self.__PREFIX_STRUCT.size

        # Read the length prefix.
        header = self.rfile.read(prefix_size)
        if len(header) < prefix_size:
            # Must be malformed, we got EOF before reading the struct.
            raise RuntimeError(
                'Bad length prefix in message: expected {} bytes '
                'but only got {}'
                .format(prefix_size, len(header))
            )

        # The prefix holds the length of the pickled record that follows.
        (payload_size,) = self.__PREFIX_STRUCT.unpack(header)
        payload = self.rfile.read(payload_size)
        if len(payload) < payload_size:
            # Again with the malformed.
            raise RuntimeError(
                'Bad payload in message: expected {} bytes '
                'but only got {}'
                .format(payload_size, len(payload))
            )

        # NOTE(review): unpickling can execute arbitrary code. LogServer binds
        # to localhost, so this presumably only sees data from the service's
        # own subprocesses -- confirm no untrusted local process can connect.
        record_attrs = cPickle.loads(payload)
        _service_logger.handle(logging.makeLogRecord(record_attrs))
"""Implementation of hask service."""
import argparse
import lex_ws
from src.omwn_worker import OMWNWorker
def get_args():
    """Parse the command line arguments.

    The chosen sub-command is stored in the ``algorithm`` attribute of the
    returned namespace. ``service`` is the only sub-command defined.

    :return: The parsed arguments.
    :rtype: argparse.Namespace
    """
    # The description is shown by --help, so it has to name this service.
    parser = argparse.ArgumentParser(
        description="Open Multilingual Wordnet (OMWN) service"
    )
    subparsers = parser.add_subparsers(dest="algorithm")
    # Make argparse reject a command line without a sub-command.
    subparsers.required = True
    subparsers.add_parser("service", help="Run as a service")
    return parser.parse_args()
def main():
    """Run the sub-command selected on the command line.

    A sub-command without a registered handler is silently ignored.
    """
    command = get_args().algorithm
    handlers = {
        "service": lambda: lex_ws.LexService.main(OMWNWorker),
    }
    if command in handlers:
        handlers[command]()


if __name__ == "__main__":
    main()
#!/usr/bin/python
import logging
import lex_ws
import nltk
from nltk.corpus import wordnet as wn
my_logger = logging.getLogger(__name__)
_log = logging.getLogger(__name__)
languages={"pl":"pol","en":"eng","es":"spa"};
class OMWNWorker(lex_ws.LexWorker):
    """Worker answering Open Multilingual Wordnet queries through NLTK."""

    @classmethod
    def static_init(cls, config):
        """
        Run one wordnet lookup for every supported language.

        Presumably this makes NLTK load its data before the first request
        arrives -- the lookup results themselves are discarded.

        :param dict config: The service configuration (unused).
        """
        my_logger.info('Loading models...')
        for lang in languages:
            wn.lemmas("test", lang=languages[lang])
        my_logger.info('Loading finished.')

    def process(self, input):
        """
        Handle one request.

        :param dict input: Task options. When it has a ``function`` key, the
            request is passed to :meth:`evaluate_function`.
        :return: The result of :meth:`evaluate_function`, or an empty dict
            when no function was requested.
        """
        my_logger.info('Doing work!')
        res = {}
        if "function" in input:
            res = self.evaluate_function(input["function"], input)
        my_logger.info('Work done!')
        return res

    def evaluate_function(self, function_type, input):
        """
        Execute the requested function.

        * ``list`` -- returns the available formats and the URL for
          ``input["element"]["lemma"]`` when wordnet knows that lemma.
        * ``get`` -- returns the synsets of the lemma, see
          :meth:`getDatabyLemma`.
        * ``getInfo`` -- returns the description of this resource.

        :param str function_type: Name of the function to execute.
        :param dict input: Task options; ``list`` and ``get`` read the
            ``element`` entry with its ``lang`` and ``lemma`` keys.
        :return: The function result. An empty dict is returned when the
            language or the lemma is missing, the language is not supported
            or the function is unknown.
        :raise KeyError: When ``list`` or ``get`` is requested without an
            ``element`` entry.
        """
        response = {}
        if function_type == 'list':
            element = input["element"]
            url = "http://compling.hss.ntu.edu.sg/omw/cgi-bin/wn-gridx.cgi?"
            if "lang" not in element or element["lang"] not in languages:
                return response
            if "lemma" in element:
                # Logged instead of printed: the ``print`` statement is a
                # syntax error on Python 3 and bypassed the logging setup.
                my_logger.debug('Looking up lemma %r', element["lemma"])
                res = wn.lemmas(element["lemma"].encode('utf-8').decode('utf-8'), lang=languages[element["lang"]])
                if len(res) > 0:
                    formats = ["json"]
                    url = url + "lemma=" + element["lemma"] + "&lang=" + languages[element["lang"]]
                    response = {"formats": formats, "url": url}
            return response
        elif function_type == 'get':
            element = input["element"]
            if "lang" not in element or element["lang"] not in languages:
                return response
            if "lemma" in element:
                return self.getDatabyLemma(element["lemma"], languages[element["lang"]])
            return {}
        elif function_type == 'getInfo':
            response = {
                'pl': {
                    'name': "Inne języki",
                    'fullName': "Open Multilingual Wordnet",
                    'description': 'Słowosieć (z ang. wordnet) to sieć semantyczna, która odzwierciedla system leksykalny języka naturalnego. Węzłami Słowosieci są jednostki leksykalne, czyli wyrazy i ich znaczenia, różnorako połączone relacjami semantycznymi ze ściśle określonego repertuaru. Na przykład kot jest hiponimem (podklasą) zwierzęcia, pazur i łapa są w relacji meronimii (część/całość), a wchodzić i wychodzić są antonimami. Jednostka leksykalna uzyskuje znaczenie przez odniesienie do innych jednostek leksykalnych w obrębie systemu, a możemy o niej wnioskować na podstawie przypisanych jej relacji. Na przykład kota definiuje się jako rodzaj zwierzęcia, łapę jako całość, której częścią jest pazur, a czynności wchodzenia i wychodzenia jako przeciwieństwa. <br> Struktura wordnetu jest dostosowana do potrzeb automatycznej analizy tekstów. Jest to w istocie podstawowy zasób językowy, ważny w badaniach nad sztuczną inteligencją.'
                    + '<br><a target="_blank" href="http://compling.hss.ntu.edu.sg/omw/">więcej...</a>',
                    'copyright': 'Utrzymanie: <a href="http://www3.ntu.edu.sg/home/fcbond/">Francis Bond</a>&lt;<a href="mailto:bond@ieee.org">bond@ieee.org</a>&gt;',
                },
                'en': {
                    'name': "Other languages",
                    'fullName': "Open Multilingual Wordnet",
                    'description': 'Open wordnets in a variety of languages, all linked to the Princeton Wordnet of English (PWN). The goal is to make it easy to use wordnets in multiple languages. The individual wordnets have been made by many different projects and vary greatly in size and accuracy'
                    + '<br><a target="_blank" href="http://compling.hss.ntu.edu.sg/omw/">more...</a>',
                    'copyright': '<a href="http://www3.ntu.edu.sg/home/fcbond/">Francis Bond</a>&lt;<a href="mailto:bond@ieee.org">bond@ieee.org</a>&gt;',
                },
            }
            return response
        # Unknown function: answer with an empty dict like every other
        # "nothing to report" path, instead of falling through to None.
        return response

    def getDatabyLemma(self, lemma, language):
        """
        Describe every synset the lemma belongs to.

        :param lemma: The lemma to look up.
        :param str language: Wordnet code of the lemma's language (a value
            of ``languages``).
        :return: One dict per lemma found, holding the synset ``name``, its
            ``definition``, its ``offset`` (zero-padded to 8 digits, followed
            by ``-`` and the part of speech) and ``translate``: the lemma
            names in the other supported languages, keyed by language.
        :rtype: list
        """
        result = []
        wnlemmas = wn.lemmas(lemma, lang=language)
        for wnlem in wnlemmas:
            wnsynset = wnlem.synset()
            trans = dict()
            for lang in languages:
                if languages[lang] != language:
                    trans[lang] = wnsynset.lemma_names(languages[lang])
            synset = {
                'name': wnsynset.name(),
                'definition': wnsynset.definition(),
                'offset': str(wnsynset.offset()).zfill(8) + '-' + wnsynset.pos(),
                'translate': trans,
            }
            result.append(synset)
        return result
if __name__ == '__main__':
lex_ws.LexService.main(OMWNWorker)
lex_ws
nltk
\ No newline at end of file
{
"pl":{
"name":"Inne języki",
"fullName":"Open Multilingual Wordnet",
"description":"Słowosieć (z ang. wordnet) to sieć semantyczna, która odzwierciedla system leksykalny języka naturalnego. Węzłami Słowosieci są jednostki leksykalne, czyli wyrazy i ich znaczenia, różnorako połączone relacjami semantycznymi ze ściśle określonego repertuaru. Na przykład kot jest hiponimem (podklasą) zwierzęcia, pazur i łapa są w relacji meronimii (część/całość), a wchodzić i wychodzić są antonimami. Jednostka leksykalna uzyskuje znaczenie przez odniesienie do innych jednostek leksykalnych w obrębie systemu, a możemy o niej wnioskować na podstawie przypisanych jej relacji. Na przykład kota definiuje się jako rodzaj zwierzęcia, łapę jako całość, której częścią jest pazur, a czynności wchodzenia i wychodzenia jako przeciwieństwa. <br> Struktura wordnetu jest dostosowana do potrzeb automatycznej analizy tekstów. Jest to w istocie podstawowy zasób językowy, ważny w badaniach nad sztuczną inteligencją.<br> <a target=_blank href=http://compling.hss.ntu.edu.sg/omw/>więcej...</a>",
"copyright":"Utrzymanie: <a href=http://www3.ntu.edu.sg/home/fcbond/>Francis Bond</a>&lt;<a href=mailto:bond@ieee.org>bond@ieee.org</a>&gt;"
},
"en":{
"name":"Other languages",
"fullName":"Open Multilingual Wordnet",
"description":"Open wordnets in a variety of languages, all linked to the Princeton Wordnet of English (PWN). The goal is to make it easy to use wordnets in multiple languages. The individual wordnets have been made by many different projects and vary greatly in size and accuracy <br><a target=_blank href=http://compling.hss.ntu.edu.sg/omw/ >more...</a>",
"copyright":"<a href=http://www3.ntu.edu.sg/home/fcbond/>Francis Bond</a>&lt;<a href=mailto:bond@ieee.org>bond@ieee.org</a>&gt;"
}
}
\ No newline at end of file
"""Implementation of omwn_worker."""
import json
import logging
from urllib.parse import urlencode

import lex_ws
from nltk.corpus import wordnet as wn
my_logger = logging.getLogger(__name__)
_log = logging.getLogger(__name__)
languages = {"pl": "pol", "en": "eng", "es": "spa"}
class OMWNWorker(lex_ws.LexWorker):
    """Implements omwn worker."""

    @classmethod
    def static_init(cls, config):
        """Initialize worker.

        Runs one wordnet lookup for every supported language; presumably
        this makes NLTK load its data before the first request arrives.
        The lookup results are discarded.
        """
        my_logger.info("Loading models...")
        for lang_code in languages.values():
            wn.lemmas("test", lang=lang_code)
        my_logger.info("Loading finished.")

    def process(self, input):
        """Handle one request.

        When ``input`` has a ``function`` key the request is passed to
        :meth:`evaluate_function` and its result is returned; otherwise an
        empty dict is returned.
        """
        my_logger.info("Doing work!")
        result = {}
        if "function" in input:
            result = self.evaluate_function(input["function"], input)
        my_logger.info("Work done!")
        return result

    def evaluate_function(self, function_type, input):
        """Perform the requested function and return its response.

        * ``list`` -- the available formats and the URL for the lemma in
          ``input["element"]``, when wordnet knows that lemma.
        * ``get`` -- the synsets of the lemma in ``input["element"]``.
        * ``getInfo`` -- the content of ``info.json``.

        An empty dict is returned when the language or the lemma is missing,
        the language is not supported, the lemma is unknown (``list``) or
        the function itself is unknown. ``list`` and ``get`` raise
        ``KeyError`` when ``input`` has no ``element`` entry.
        """
        if function_type == "getInfo":
            # NOTE(review): relative path, resolved against the working
            # directory of the process -- confirm info.json is deployed
            # there.
            with open("info.json", "rt", encoding="utf8") as f:
                return json.load(f)
        if function_type not in ("list", "get"):
            return {}

        element = input["element"]
        if "lang" not in element or element["lang"] not in languages:
            return {}
        if "lemma" not in element:
            return {}
        lemma = element["lemma"]
        lang_code = languages[element["lang"]]

        if function_type == "get":
            return self._get_data_by_lemma(lemma, lang_code)

        if not wn.lemmas(lemma, lang=lang_code):
            return {}
        url = "http://compling.hss.ntu.edu.sg/omw/cgi-bin/wn-gridx.cgi?"
        # The lemma comes from the request: escape it, so characters such
        # as "&", "#" or spaces cannot break or extend the query string.
        query = urlencode([("lemma", lemma), ("lang", lang_code)])
        return {"formats": ["json"], "url": url + query}

    def _get_data_by_lemma(self, lemma, language):
        """Describe every synset the lemma belongs to.

        Returns one dict per lemma found, holding the synset ``name``, its
        ``definition``, its ``offset`` (zero-padded to 8 digits, followed
        by ``-`` and the part of speech) and ``translate``: the lemma names
        in the other supported languages, keyed by language.
        """
        # The set of target languages does not depend on the synset.
        other_languages = {
            short: code
            for short, code in languages.items()
            if code != language
        }
        result = []
        for wn_lemma in wn.lemmas(lemma, lang=language):
            synset = wn_lemma.synset()
            result.append({
                "name": synset.name(),
                "definition": synset.definition(),
                "offset": str(synset.offset()).zfill(8) +
                "-" + synset.pos(),
                "translate": {
                    short: synset.lemma_names(code)
                    for short, code in other_languages.items()
                },
            })
        return result
[tox]
envlist = pep8,docstyle
skipsdist = True
[testenv:pep8]
deps =
flake8
basepython = python3
commands =
flake8 {posargs}
[testenv:docstyle]
deps =
pydocstyle
basepython = python3
commands =
pydocstyle --verbose {posargs}
[flake8]
# W504 skipped because it is overeager and unnecessary
ignore = W504
show-source = True
exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
import-order-style = pep8
max-line-length = 80
[pydocstyle]
# D104 Missing docstring in public package
# D203 1 blank line required before class docstring
# D213 Multi-line docstring summary should start at the second line
# D214 Section is over-indented
# D215 Section underline is over-indented
# D401 First line should be in imperative mood; try rephrasing
# D405 Section name should be properly capitalized
# D406 Section name should end with a newline
# D407 Missing dashed underline after section
# D408 Section underline should be in the line following the section’s name
# D409 Section underline should match the length of its name
# D410 Missing blank line after section
# D411 Missing blank line before section
ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
match-dir = ^(?!\.tox|venv).*
match = ^(?!setup).*\.py
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment