Commit 8d1e87d8 authored by Leszek Szymczak's avatar Leszek Szymczak Committed by Mateusz Gniewkowski

Plwordnet to rancher

parent 12185e6b
image: 'clarinpl/python:3.6'
cache:
paths:
- .tox
stages:
- check_style
- build
before_script:
- pip install tox==2.9.1
pep8:
stage: check_style
script:
- tox -v -e pep8
docstyle:
stage: check_style
script:
- tox -v -e docstyle
# Builds the worker image and publishes it to the registry (master only).
build_image:
  stage: build
  image: 'docker:18.09.7'
  only:
    - master
  services:
    - 'docker:18.09.7-dind'
  # Override the global before_script: this job does not need tox.
  before_script:
    - ''
  script:
    - docker build -t clarinpl/plwordnet .
    # Feed the secret to docker login through a pipe instead of a temporary
    # file, so the password never touches the disk and is not left behind
    # when the login step fails. Quoting keeps special characters intact.
    - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USERNAME" --password-stdin
    - docker push clarinpl/plwordnet
FROM clarinpl/python:2.7
WORKDIR /tmp/
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY src/PLWN_API-0.9 PLWN_API-0.9
RUN cd PLWN_API-0.9 && \
pip install .
FROM clarinpl/python:3.6
WORKDIR /home/worker
COPY ./src ./src
COPY ./main.py .
COPY ./requirements.txt .
RUN python3.6 -m pip install -r requirements.txt
CMD ["python3.6", "main.py", "service"]
1. Get the model
wget -O model/plwn_dump_27-03-2018.sqlite http://minio.clarin-pl.eu/public/models/plwn_dump_27-03-2018.sqlite
2. Build
docker-compose build
version: '3'
services:
services:
plwordnet:
container_name: clarin_plwordnet
build: ./
working_dir: /home/worker
entrypoint:
# - sleep
# - 1d
- python2
- plwordnet_worker.py
- python3.6
- main.py
- service
environment:
- PYTHONUNBUFFERED=0
volumes:
- /samba:/samba
- ./src/plwordnet_worker.py:/home/worker/plwordnet_worker.py
- ./config.ini:/home/worker/config.ini
- ./model/:/home/worker/model/
\ No newline at end of file
- '/samba:/samba'
- './config.ini:/home/worker/config.ini'
- './src:/home/worker/src'
- './main.py:/home/worker/main.py'
"""Entry point of the plWordNet worker service."""
import argparse
import lex_ws
from src.plwordnet_worker import PLWordnetWorker
def get_args():
    """Parse the command line and return the resulting namespace.

    Exactly one sub-command is required; its name is stored in the
    ``algorithm`` attribute of the returned namespace. The only sub-command
    registered here is ``service``.
    """
    cli = argparse.ArgumentParser(description="Topic Modeling")
    modes = cli.add_subparsers(dest="algorithm")
    # Without this flag argparse would accept an empty command line.
    modes.required = True
    modes.add_parser("service", help="Run as a service")
    return cli.parse_args()
def main():
    """Parse the command line and start the selected mode of operation."""
    args = get_args()

    def run_service():
        # Hand the worker class over to the lex_ws service runner.
        return lex_ws.LexService.main(PLWordnetWorker)

    dispatch = {
        "service": run_service,
    }
    handler = dispatch.get(args.algorithm)
    # An unregistered mode is a silent no-op.
    if handler is not None:
        handler()
if __name__ == "__main__":
main()
lex-ws
pika==0.12
\ No newline at end of file
pika==0.12
plwn_api
\ No newline at end of file
include README-pl-beta.txt
Metadata-Version: 1.0
Name: PLWN_API
Version: 0.9
Summary: Python API to access plWordNet lexicon
Home-page: UNKNOWN
Author: Michał Kaliński
Author-email: michal.kalinski@pwr.edu.pl
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
Metadata-Version: 1.0
Name: PLWN-API
Version: 0.9
Summary: Python API to access plWordNet lexicon
Home-page: UNKNOWN
Author: Michał Kaliński
Author-email: michal.kalinski@pwr.edu.pl
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
MANIFEST.in
README-pl-beta.txt
setup.py
PLWN_API.egg-info/PKG-INFO
PLWN_API.egg-info/SOURCES.txt
PLWN_API.egg-info/dependency_links.txt
PLWN_API.egg-info/requires.txt
PLWN_API.egg-info/top_level.txt
plwn/__init__.py
plwn/_loading.py
plwn/bases.py
plwn/enums.py
plwn/exceptions.py
plwn/relation_aliases.tsv
plwn/relresolver.py
plwn/readers/__init__.py
plwn/readers/comments.py
plwn/readers/nodes.py
plwn/readers/ubylmf.py
plwn/readers/wndb.py
plwn/readers/wnxml.py
plwn/storages/__init__.py
plwn/storages/objects.py
plwn/storages/sqlite.py
plwn/utils/__init__.py
plwn/utils/graphmlout.py
plwn/utils/sorting.py
plwn/utils/tupwrap.py
\ No newline at end of file
******************
**** PlWN API ****
******************
PlWN API umożliwia:
- wyszukiwanie synsetów i jednostek leksykalnych w Słowosieci;
- dostęp do własności synsetów i jednostek leksykalnych, oraz ich relacji;
- eksport całości bądź części Słowosieci do grafu.
To README jest krótką, tymczasową instrukcją do wersji beta PlWN API.
Interfejs oraz funkcjonalność mogą ulec zmianie.
=============
Inicjalizacja
=============
>>> import plwn
>>> wn = plwn.load('plwn-3.0.db', 'sqlite3')
=================
Zrzuty baz danych
=================
Na ten moment, zalecany jest dostęp do bazy danych Słowosieci poprzez zrzuty do
plików SQLite, "plwn-X.db" (gdzie X to wersja Słowosieci). Zrzuty powinny
były zostać udostępnione razem z API.
==============
Funkcjonalność
==============
Opis funkcjonalności jest dostępny poprzez docstringi modułu plwn/bases.py
$ pydoc plwn.bases
Dodatkowo, w plwn/enums.py znajduje się lista stałych wartości part-of-speech
używanych przez API.
Zgodnie z konwencją przyjętą przez Python 3, większość metod zwracających
kolekcje obiektów zwraca je w postaci generatorów.
>>> wn.lexical_units(lemma=u'pies')
TupWrapper(<generator object <genexpr> at 0x7f1048583410>)
Jeśli celem zapytania jest iteracja po wynikach, nie potrzeba niczego więcej.
>>> for lu in wn.lexical_units(lemma=u'pies'):
...     print(lu.id)
5563
52245
...
Aby otrzymać listę (albo inną kolekcję), należy rzutować otrzymany obiekt.
>>> list(wn.lexical_units(lemma=u'pies'))
[<LexicalUnit id=5563 lemma=u'pies' pos=u'noun' variant=1>,
<LexicalUnit id=52245 lemma=u'pies' pos=u'noun' variant=2>,
...
]
Dla wygody w trybie interaktywnym Pythona, generatory są opakowane w obiekty
"TupWrapper", które umożliwiają rzutowanie generatora do typu krotki
poprzez "wywołanie" obiektu.
>>> wn.lexical_units(lemma=u'pies')()
(<LexicalUnit id=5563 lemma=u'pies' pos=u'noun' variant=1>,
<LexicalUnit id=52245 lemma=u'pies' pos=u'noun' variant=2>,
...
)
Jednak w przypadku pisania programów odwołujących się do API zalecane jest
jawne rzutowanie zwracanych generatorów. "Explicit is better than implicit."
from ._loading import *
from .enums import PoS
# Attach a no-op handler so the package itself does not configure any logging output
import logging as _logging
_logging.getLogger('plwn').addHandler(_logging.NullHandler())
"""Defines user-facing functions that allow simple construction of
:class:`PLWordnetBase` instances, with selected storages and readers.
"""
from __future__ import absolute_import, division, print_function
from collections import namedtuple
from importlib import import_module
import textwrap as tw
import six
# Public names exported by ``from ._loading import *``.
__all__ = 'read', 'load', 'show_source_formats', 'show_storage_formats'


# Describes one supported format: ``desc`` is the human-readable text printed
# by the ``show_*`` functions, ``modname`` is the name of the submodule that
# implements the format.
_Info = namedtuple('_Info', ('desc', 'modname'))

# Source formats, keyed by the name accepted as ``source_format``; the
# ``modname`` values are resolved relative to ``plwn.readers``.
_READERS = {
    'uby-lmf': _Info('Discontinued XML-based format', 'ubylmf'),
    'database': _Info(
        'MySQL database of plWordNet. Only works on python 2 and requires '
        'certain additional libraries. This is meant for internal use only '
        'and will not work for most users. The file should contain one line '
        'with SQLAlchemy URL to the database.',
        'wndb',
    ),
    'xml': _Info('The official PLWN XML format', 'wnxml'),
}
# Storage formats, keyed by the name accepted as ``storage_format``; the
# ``modname`` values are resolved relative to ``plwn.storages``.
_STORAGES = {
    'sqlite3': _Info(
        'File database format, with a compact schema (compared to internal '
        'PLWN database).',
        'sqlite',
    ),
    'objects': _Info(
        'Stores data in plain python objects, dumping them in pickle format. '
        'Quick to construct, but querying and memory efficiency is not '
        'guaranteed.',
        'objects',
    ),
}

# Defaults for this version
# The ``None`` key is what a caller gets when no format name is passed.
_READERS[None] = _READERS['xml']
_STORAGES[None] = _STORAGES['sqlite3']
def _imp_reader(modname):
    """Import ``plwn.readers.<modname>`` and return its ``_this_reader_``."""
    # The parent package is imported up front; the original author suspected
    # python 3 needs this before a relative import_module call.
    import plwn.readers  # noqa
    reader_module = import_module('.' + modname, 'plwn.readers')
    return reader_module._this_reader_
def _imp_storage(modname):
    """Import ``plwn.storages.<modname>`` and return its ``_this_storage_``."""
    # The parent package is imported up front; the original author suspected
    # python 3 needs this before a relative import_module call.
    import plwn.storages  # noqa
    storage_module = import_module('.' + modname, 'plwn.storages')
    return storage_module._this_storage_
def read(source_file,
         source_format=None,
         storage_file=None,
         storage_format=None):
    """Build a :class:`PLWordnetBase` instance from a plWordNet source file.

    The defaults mentioned below may change with each minor version of PLWN
    API; a long-running program that depends on a particular format should
    name it explicitly.

    :param str source_file: Path to the file the plWordNet data is read from.
        What the file must contain depends on ``source_format``.

    :param str source_format: Name of the format of ``source_file``. ``None``
        selects the default of the current version.

    :param str storage_file: Path where the internal representation is
        dumped so that it can later be opened with :func:`load`. ``None``
        means nothing is dumped.

    :param str storage_format: Name of the in-memory storage format. The
        access methods should be the same for every format, but their
        efficiency may differ. ``None`` selects the default of the current
        version.

    :rtype: PLWordnetBase
    """
    storage_info = _STORAGES[storage_format]
    storage_class = _imp_storage(storage_info.modname)
    reader_info = _READERS[source_format]
    make_reader = _imp_reader(reader_info.modname)
    return storage_class.from_reader(make_reader(source_file), storage_file)
def load(storage_file, storage_format=None):
    """Open a file holding the cached, internal PLWN API representation.

    When such a file is available this is much faster than :func:`read`.

    :param str storage_file: Path to the file with the cached data.

    :param str storage_format: Name of the format the data is stored in. It
        has to match the actual format and schema version of the file.
        ``None`` selects the default of the current version.

    :rtype: PLWordnetBase
    """
    storage_info = _STORAGES[storage_format]
    storage_class = _imp_storage(storage_info.modname)
    return storage_class.from_dump(storage_file)
def show_source_formats():
    """Write the name and a short description of every source format to
    ``stdout``.

    Intended mainly as a help aid for the interactive shell.
    """
    return _show(_READERS)
def show_storage_formats():
    """Write the name and a short description of every storage format to
    ``stdout``.

    Intended mainly as a help aid for the interactive shell.
    """
    return _show(_STORAGES)
def _show(dict_):
    """Print each named entry of a format table as an underlined heading
    followed by its wrapped description and a blank line.
    """
    # The ``None`` key only marks the default format and is not listed.
    named_entries = (
        (key, entry)
        for key, entry in six.iteritems(dict_)
        if key is not None
    )
    for fmt_name, fmt_info in named_entries:
        underline = '-' * len(fmt_name)
        print(fmt_name)
        print(underline)
        print(tw.fill(fmt_info.desc), end='\n\n')
This diff is collapsed.
# coding: utf8
"""
Enumerated values used in plWordNet
"""
from __future__ import absolute_import, division
import re
from enum import Enum
import six
__all__ = (
'PoS',
'VerbAspect',
'EmotionMarkedness',
'EmotionName',
'EmotionValuation',
'Domain',
'make_values_tuple',
)
# Helper function for making dictionaries translating enum instances into
# numbers used to denote them in plWN database.
def _fill_numtrans(enumclass, num2enum, enum2num):
    """Number the members of ``enumclass`` from 1 in iteration order and
    record the mapping in both directions in the two given dictionaries.
    """
    number = 0
    for member in enumclass:
        number += 1
        num2enum[number] = member
        enum2num[member] = number
def _get_from_numtrans(numtrans, num, optional):
    """Look ``num`` up in ``numtrans``.

    A missing key yields ``None`` when ``optional`` is true; otherwise the
    ``KeyError`` raised by the lookup propagates to the caller.
    """
    if not optional:
        return numtrans[num]
    try:
        found = numtrans[num]
    except KeyError:
        found = None
    return found
# Explicit ordering is needed only in python 2.
_POS_ORDER = 'verb noun adverb adjective'
# Translation tables between PoS members and the numbers denoting them in the
# plWN database. Both start empty and are populated by the _fill_numtrans
# call that follows the class definition.
_POS_NUM2ENUM = {}
_POS_ENUM2NUM = {}


class PoS(Enum):
    """
    Defines **Part of Speech** values used by plWN.

    The order in which the members are defined matters: the database numbers
    are assigned from 1 in that order by the ``_fill_numtrans`` call below
    the class.
    """

    if six.PY2:
        __order__ = _POS_ORDER

    verb = u'verb'
    noun = u'noun'
    adverb = u'adverb'
    adjective = u'adjective'

    # Short aliases: they reuse the values above, so they refer to the same
    # members and are not yielded when the enum is iterated.
    v = verb
    n = noun
    adv = adverb
    adj = adjective

    @staticmethod
    def by_db_number(number, optional=False):
        """
        Return the member registered under ``number`` in the number table.

        When no member has that number, ``None`` is returned if ``optional``
        is true; otherwise ``KeyError`` propagates.
        """
        return _get_from_numtrans(_POS_NUM2ENUM, number, optional)

    @property
    def db_number(self):
        """
        The number registered for this member in the number table.
        """
        return _POS_ENUM2NUM[self]


# Populate both translation tables now that the class exists.
_fill_numtrans(PoS, _POS_NUM2ENUM, _POS_ENUM2NUM)
# Member order for python 2, consumed by ``__order__`` inside the class.
_VA_ORDER = 'perfective imperfective predicative two_aspect'
# Translation tables between VerbAspect members and their numbers. Both start
# empty and are populated by the _fill_numtrans call that follows the class
# definition.
_VA_NUM2ENUM = {}
_VA_ENUM2NUM = {}


class VerbAspect(Enum):
    """
    Defines aspect values used by verbs in plWN.

    The order in which the members are defined matters: the database numbers
    are assigned from 1 in that order by the ``_fill_numtrans`` call below
    the class.
    """

    if six.PY2:
        __order__ = _VA_ORDER

    perfective = u'perf'
    imperfective = u'imperf'
    predicative = u'pred'
    two_aspect = u'imperf.perf'

    # Short aliases: they reuse the values above, so they refer to the same
    # members and are not yielded when the enum is iterated.
    perf = perfective
    imperf = imperfective
    pred = predicative
    two = two_aspect

    # Additionally, some Polish abbreviations
    dk = perfective
    ndk = imperfective

    @staticmethod
    def by_db_number(number, optional=False):
        """
        Return the member registered under ``number`` in the number table.

        When no member has that number, ``None`` is returned if ``optional``
        is true; otherwise ``KeyError`` propagates.
        """
        return _get_from_numtrans(_VA_NUM2ENUM, number, optional)

    @property
    def db_number(self):
        """
        The number registered for this member in the number table.
        """
        return _VA_ENUM2NUM[self]


# Populate both translation tables now that the class exists.
_fill_numtrans(VerbAspect, _VA_NUM2ENUM, _VA_ENUM2NUM)
class EmotionMarkedness(Enum):
    """
    Defines markedness of emotions associated with some lexical units.

    The canonical string values consist of a sign and a letter separated by
    exactly one space (for example ``u'+ m'``), or are the literal ``u'amb'``.
    """

    strong_positive = u'+ m'
    strong_negative = u'- m'
    weak_positive = u'+ s'
    weak_negative = u'- s'
    ambiguous = u'amb'

    # Aliases spelling out the string values; they refer to the same members.
    plus_m = strong_positive
    minus_m = strong_negative
    plus_s = weak_positive
    minus_s = weak_negative
    amb = ambiguous

    @classmethod
    def normalized(cls, strvalue):
        """
        Return an instance of this enum with string value normalized with
        regards to whitespace.

        Leading and trailing whitespace is ignored, and any amount of
        whitespace (including none) is accepted between the sign and the
        letter.

        :param strvalue: Text form of the markedness, e.g. ``u'+m'``.

        :raises ValueError: If ``strvalue`` is not a whitespace variant of
            one of the enum values.
        """
        strvalue = strvalue.strip()

        # Try the one value that won't require matching
        if strvalue == cls.ambiguous.value:
            return cls.ambiguous

        # The pattern is anchored at the end: without ``\Z`` a string with
        # trailing junk such as u'+ more' would be accepted as u'+ m'.
        match = re.match(r'([+-])\s*([sm])\Z', strvalue, re.U)

        if not match:
            # This can't be a valid string, so let the built-in exception
            # raise.
            return cls(strvalue)

        return cls(match.group(1) + u' ' + match.group(2))
class EmotionName(Enum):
    """
    Possible names of emotions associated with some lexical units.

    Members carry English names; their values are Polish strings.
    """

    joy = u'radość'
    trust = u'zaufanie'
    anticipation = u'cieszenie się na coś oczekiwanego'
    surprise = u'zaskoczenie czymś nieprzewidywanym'
    sadness = u'smutek'
    anger = u'złość'
    fear = u'strach'
    disgust = u'wstręt'

    # Aliases with Polish names written without diacritics (shortened for
    # the two multi-word values); they refer to the same members as above.
    radosc = joy
    zaufanie = trust
    cieszenie_sie_na = anticipation
    zaskoczenie = surprise
    smutek = sadness
    zlosc = anger
    strach = fear
    wstret = disgust
class EmotionValuation(Enum):
"""