From 6e853db742014df589a66d5a0837da47951eae64 Mon Sep 17 00:00:00 2001 From: pwalkow <pawel.walkowiak@hotmail.com> Date: Fri, 16 Dec 2022 12:44:14 +0100 Subject: [PATCH 1/4] refactor code --- .gitlab-ci.yml | 76 +++++++++++++++++++++++++++++++++++++++++++++++ README.md | 31 +++---------------- main.py | 2 +- requirements.txt | 2 +- src/converter.py | 11 ++++++- src/lemmatizer.py | 11 +++---- src/tagger.py | 39 ++++++++++++------------ tests/test.py | 8 +++++ tox.ini | 63 +++++++++++++++++++++++++++++++++++++++ 9 files changed, 186 insertions(+), 57 deletions(-) create mode 100644 .gitlab-ci.yml create mode 100644 tests/test.py create mode 100644 tox.ini diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..407c6ed --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,76 @@ +image: clarinpl/python:3.8 + +cache: + paths: + - .tox + +stages: + - check_style + - tests + - pages + - build_master + - build_develop + +pep8: + stage: check_style + before_script: + - pip install tox==2.9.1 + script: + - tox -v -e pep8 + +docstyle: + stage: check_style + before_script: + - pip install tox==2.9.1 + script: + - tox -v -e docstyle + +tests: + stage: tests + before_script: + - pip install tox==2.9.1 + script: + - tox -v -e pytest + artifacts: + paths: + - htmlcov + expire_in: 1 week + reports: + junit: + - report.xml + +pages: + stage: pages + script: + - mkdir -p public/coverage + - cp -r htmlcov/* public/coverage/ + artifacts: + name: coverage + paths: + - public + + +build_develop: + except: + - master + stage: build_develop + image: docker:18.09.7 + services: + - 'docker:18.09.7-dind' + script: + - docker build -t $CI_REGISTRY_IMAGE:develop . + - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY + - docker push $CI_REGISTRY_IMAGE:develop + + +build_master: + stage: build_master + image: 'docker:18.09.7' + only: + - master + services: + - 'docker:18.09.7-dind' + script: + - docker build -t $CI_REGISTRY_IMAGE:latest . 
+ - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY + - docker push $CI_REGISTRY_IMAGE:latest diff --git a/README.md b/README.md index 4b43687..55901b6 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,13 @@ -## Fextor3 -Konwersja usługi fextorbis na python3 +## Pos_tagger +Wrapper for tagger services like morphodita, converting ccl output to json tagger output task_options ============ +`lang:` language of text (default = 'pl') -`lemmas-count: 'json file path'` - -Plik json zawierający informacje o lematach, częściach wielowyrazowców oraz końcowych termach, przykład dla wycinka ze słownika: - -| terms | lemmas | -|---------------------|---------------------| -| szkoła jezior | szkoła jezioro | - | szkoła literacka | szkoła literacki | -| szkoła strukturalna | szkoła strukturalny | -| szkoła sycylijska | szkoła sycylijski | -| szkoła śląska | szkoła śląski | -| szkoła ukraińska | szkoła ukraiński | - -"szkoła": [ \ -  {"lemma": "szkoła", "parts": []}, \ -  {"lemma": "szkoła jezioro", "parts": ["jezioro"], "term": "szkoła_jezior"}, \ -  {"lemma": "szkoła literacki", "parts": ["literacki"], "term": "szkoła_literacka"}, \ -  {"lemma": "szkoła strukturalny", "parts": ["strukturalny"], "term": "szkoła_strukturalna"}, \ -  {"lemma": "szkoła sycylijski", "parts": ["sycylijski"], "term": "szkoła_sycylijska"}, \ -  {"lemma": "szkoła śląski", "parts": ["śląski"], "term": "szkoła_śląska"}, \ -  {"lemma": "szkoła ukraiński", "parts": ["ukraiński"], "term": "szkoła_ukraińska"}]\ - -Element {"lemma": "szkoła", "parts": []} nie ma klucza "term", ponieważ słowo "szkoła" nie występuje jako osobny term. - -[Raport pokrycia kodu](http://nlpworkers.pages.clarin-pl.eu/fextor3/coverage/) +`output:` format of results (default = 'json', values: json, ccl, lemmas)` diff --git a/main.py b/main.py index 8cc3a2d..7cd46b4 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,4 @@ -"""Implementation of fextor3 worker.""" +"""Implementation of pos_tagger worker.""" import nlp_ws from src.tagger import TaggerWorker diff --git a/requirements.txt b/requirements.txt index af4c17c..64d57ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -nlp-ws +nlp-ws>=2.1 ccl2json==0.1.1 pyaml-env==1.2.1 \ No newline at end of file diff --git a/src/converter.py b/src/converter.py index 53b23e1..2883f1d 100644 --- a/src/converter.py +++ b/src/converter.py @@ -1,3 +1,4 @@ +"""Implementation of ccl2json usage.""" import json from xml.sax import handler, make_parser @@ -5,6 +6,13 @@ from ccl2json.parse import CCLhandler def ccl2json(path_in, path_out): + """Function converting xml ccl to json. 
+ + :param path_in: path to xml ccl file + :type path_in: str + :param path_out: path to output json file + :type path_out: str + """ parser = make_parser() parser.setFeature(handler.feature_external_ges, False) parser.setContentHandler(CCLhandler()) @@ -15,6 +23,7 @@ def ccl2json(path_in, path_out): "filename": path_in.split('/')[-1].replace('.ccl', ''), 'text': parser.getContentHandler().get_text(), 'tagset': 'nkjp', - 'tokens': [token.as_dict() for token in parser.getContentHandler().get_tokens()] + 'tokens': [token.as_dict() for token + in parser.getContentHandler().get_tokens()] } json.dump(dout, fout, ensure_ascii=False) diff --git a/src/lemmatizer.py b/src/lemmatizer.py index a314254..8a3561f 100644 --- a/src/lemmatizer.py +++ b/src/lemmatizer.py @@ -1,12 +1,11 @@ -#!/usr/bin/python3 -"""Implementation of lemmatizer from CCL""" +"""Implementation of lemmatizer from CCL.""" -from collections import defaultdict -from xml.dom import pulldom import logging +from xml.dom import pulldom _log = logging.getLogger(__name__) + def ccl_2_lemmas(input_file, output_file): """Implementation of lemmas extracting function. @@ -17,8 +16,6 @@ def ccl_2_lemmas(input_file, output_file): :param output_file: path to resulting text file with words in lemma forms :type output_file: str """ - - input_data = pulldom.parse(input_file) with open(output_file, 'wt', encoding='utf-8') as f: _log.error("here") @@ -27,6 +24,6 @@ def ccl_2_lemmas(input_file, output_file): input_data.expandNode(node) words = [base.firstChild.data for base in node.getElementsByTagName('base') - if base is not None and base.firstChild is not None] + if base is not None and base.firstChild is not None] f.write(" ".join(words)) f.write("\n") diff --git a/src/tagger.py b/src/tagger.py index 8d9ac6f..1a36054 100644 --- a/src/tagger.py +++ b/src/tagger.py @@ -1,17 +1,17 @@ -#!/usr/bin/python3 -"""Fextor3 worker implementation.""" +"""Tagger worker implementation.""" from __future__ import absolute_import, division, unicode_literals -import json import logging import shutil import nlp_ws from nlp_ws import SubTask -import src.converter as converter from pyaml_env import parse_config +import src.converter as converter +import src.lemmatizer as lemmatizer + _log = logging.getLogger(__name__) SubTask.turn_on() @@ -23,15 +23,13 @@ class TaggerWorker(nlp_ws.NLPWorker): def static_init(cls, config): """Initialize process.""" cls._taggers = {} - yaml_path = config.get('tool').get('config','pos_tagger.yaml') - yaml_config = parse_config(yaml_path) + yaml_path = config.get('tool').get('config', 'pos_tagger.yaml') + yaml_config = parse_config(yaml_path) cls._taggers = yaml_config["taggers"] _log.error(f"Config from yaml: {cls._taggers}") - - def process(self, input_path, task_options, output_path): - """Called for each request made to the worker. + """Call for each request made to the worker. Extract features defined in task option from file on input_path in xml ccl format and write json output into output_path. 
@@ -41,8 +39,8 @@ class TaggerWorker(nlp_ws.NLPWorker): :type input_path: str :param task_options: Dictionary containing options of pos_tagger - lang: language of text (deafult = 'pl') - output: format of resulrs (defualt = 'json', values: json, ccl, lemmas) + lang: language of text (default = 'pl') + output: format of results (default = 'json', values: json, ccl, lemmas) :type task_options: dict :param output_path: Path to directory where the @@ -52,22 +50,23 @@ class TaggerWorker(nlp_ws.NLPWorker): lang = "pl" if "lang" in task_options: lang = task_options["lang"] - if not lang in self._taggers: - raise Exception(f"Unsuported language: {lang}, supported {list(self._taggers.keys())}") - type = "default" - subtask = SubTask(input_path,self._taggers[lang][type]["lpmn"]) + if lang not in self._taggers: + raise Exception(f"Unsupported language: {lang}, " + f"supported {list(self._taggers.keys())}") + tagger_type = "default" + subtask = SubTask(input_path, self._taggers[lang][tagger_type]["lpmn"]) subtask.run(blocking=False) l_result = subtask.get_output_path() output = "json" if "output" in task_options: output = task_options["output"] - tager_output = self._taggers[lang][type]["output"] - if tager_output == output: + tagger_output = self._taggers[lang][tagger_type]["output"] + if tagger_output == output: shutil.copyfile(l_result, output_path) - elif tager_output=="ccl" and output == "lemmas": + elif tagger_output == "ccl" and output == "lemmas": lemmatizer.ccl_2_lemmas(l_result, output_path) - elif tager_output=="ccl" and output == "json": + elif tagger_output == "ccl" and output == "json": converter.ccl2json(l_result, output_path) else: - raise Exception(f"Unsuported format conversion") + raise Exception("Unsupported format conversion") diff --git a/tests/test.py b/tests/test.py new file mode 100644 index 0000000..da96d2b --- /dev/null +++ b/tests/test.py @@ -0,0 +1,8 @@ +import pytest + +from src.tagger import TaggerWorker + + +def test_init(): + worker = TaggerWorker() + assert type(worker).__name__ == 'TaggerWorker' \ No newline at end of file diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..25b159c --- /dev/null +++ b/tox.ini @@ -0,0 +1,63 @@ +[tox] +envlist = pep8,docstyle +skipsdist = True + +[testenv:pep8] +deps = + flake8 +basepython = python3 +commands = + flake8 {posargs} + +[testenv:docstyle] +deps = + pydocstyle +basepython = python3 +commands = + pydocstyle --verbose {posargs} + +[testenv:pytest] +deps = + pytest + pytest-pythonpath + coverage + nlp-ws>=2.1 + ccl2json==0.1.1 + pyaml-env==1.2.1 +commands = + coverage run --source=src -m pytest --junitxml=report.xml tests/test.py + coverage html + +[pytest] +python_paths = src src + +[run] +relative_files = True +branch = True + +[flake8] +# W504 skipped because it is overeager and unnecessary +ignore = W504 +show-source = True +exclude = .git,.venv,.tox,dist,doc,*egg,build,venv,tests +import-order-style = pep8 +max-line-length = 80 + + +[pydocstyle] +# D104 Missing docstring in public package +# D203 1 blank line required before class docstring +# D213 Multi-line docstring summary should start at the second line +# D214 Section is over-indented +# D215 Section underline is over-indented +# D401 First line should be in imperative mood; try rephrasing +# D405 Section name should be properly capitalized +# D406 Section name should end with a newline +# D407 Missing dashed underline after section +# D408 Section underline should be in the line following the section’s name +# D409 Section underline should match the 
length of its name +# D410 Missing blank line after section +# D411 Missing blank line before section +ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411 +match-dir = ^(?!\.tox|venv|tests).* +match = ^(?!setup).*\.py \ No newline at end of file -- GitLab From ef9f1cbb49f4a63541ac81f9ab032a5eade34a87 Mon Sep 17 00:00:00 2001 From: pwalkow <pawel.walkowiak@hotmail.com> Date: Fri, 16 Dec 2022 13:14:53 +0100 Subject: [PATCH 2/4] Update tox version --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 407c6ed..cbd1ab9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -14,21 +14,21 @@ stages: pep8: stage: check_style before_script: - - pip install tox==2.9.1 + - pip install tox==3.18.1 script: - tox -v -e pep8 docstyle: stage: check_style before_script: - - pip install tox==2.9.1 + - pip install tox==3.18.1 script: - tox -v -e docstyle tests: stage: tests before_script: - - pip install tox==2.9.1 + - pip install tox==3.18.1 script: - tox -v -e pytest artifacts: -- GitLab From ac7776f0070faffeb26abd16506611209a94f94e Mon Sep 17 00:00:00 2001 From: pwalkow <pawel.walkowiak@hotmail.com> Date: Fri, 16 Dec 2022 13:23:57 +0100 Subject: [PATCH 3/4] Update tool name --- pos_tagger.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pos_tagger.yaml b/pos_tagger.yaml index f90ae19..18b84c7 100644 --- a/pos_tagger.yaml +++ b/pos_tagger.yaml @@ -1,5 +1,5 @@ taggers: pl: default: - lpmn: ["morphoDita"] + lpmn: ["morphodita"] output: ccl -- GitLab From 47382d6e6c871fc442ed37f0b9acbe776ec9bfbe Mon Sep 17 00:00:00 2001 From: pwalkow <pawel.walkowiak@hotmail.com> Date: Fri, 16 Dec 2022 15:10:22 +0100 Subject: [PATCH 4/4] Add no text option --- src/converter.py | 17 +++++++++++------ src/tagger.py | 4 +++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/converter.py b/src/converter.py index 2883f1d..eee4920 100644 --- a/src/converter.py +++ b/src/converter.py @@ -1,15 +1,19 @@ """Implementation of ccl2json usage.""" import json +import pathlib from xml.sax import handler, make_parser from ccl2json.parse import CCLhandler -def ccl2json(path_in, path_out): +def ccl2json(path_in, options, path_out): """Function converting xml ccl to json. 
:param path_in: path to xml ccl file :type path_in: str + :param options: task_options: Dictionary containing options of ccl2json + json_text: if json output should contain original text (default = True) + :type: options: dict :param path_out: path to output json file :type path_out: str """ @@ -18,12 +22,13 @@ def ccl2json(path_in, path_out): parser.setContentHandler(CCLhandler()) parser.parse(path_in) - with open(path_out, 'w', encoding='utf-8') as fout: - dout = { - "filename": path_in.split('/')[-1].replace('.ccl', ''), - 'text': parser.getContentHandler().get_text(), + with open(path_out, 'w', encoding='utf-8') as file_out: + data_out = { + "filename": pathlib.Path(path_in).stem, 'tagset': 'nkjp', 'tokens': [token.as_dict() for token in parser.getContentHandler().get_tokens()] } - json.dump(dout, fout, ensure_ascii=False) + if 'json_text' not in options or options['json_text']: + data_out['text'] = parser.getContentHandler().get_text() + json.dump(data_out, file_out, ensure_ascii=False) diff --git a/src/tagger.py b/src/tagger.py index 1a36054..dacb0bf 100644 --- a/src/tagger.py +++ b/src/tagger.py @@ -41,6 +41,8 @@ class TaggerWorker(nlp_ws.NLPWorker): :param task_options: Dictionary containing options of pos_tagger lang: language of text (default = 'pl') output: format of results (default = 'json', values: json, ccl, lemmas) + json_text: bool if json output should contain original + text (default = True) :type task_options: dict :param output_path: Path to directory where the @@ -67,6 +69,6 @@ class TaggerWorker(nlp_ws.NLPWorker): elif tagger_output == "ccl" and output == "lemmas": lemmatizer.ccl_2_lemmas(l_result, output_path) elif tagger_output == "ccl" and output == "json": - converter.ccl2json(l_result, output_path) + converter.ccl2json(l_result, task_options, output_path) else: raise Exception("Unsupported format conversion") -- GitLab
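
For context, a minimal sketch of how the refactored helpers from these patches could be exercised outside the nlp_ws worker. This is an assumption-laden illustration, not part of the patch series: the `example.ccl` input and the output file names are hypothetical, a valid CCL XML file and the pinned `ccl2json` package are assumed to be available, and the repository root is assumed to be on `PYTHONPATH` so the `src` package resolves. Only behaviour visible in the diffs above is relied on: `ccl2json(path_in, options, path_out)` drops the `text` key when `json_text` is False (PATCH 4/4), and `ccl_2_lemmas` writes one line of space-separated lemmas per sentence.

```python
"""Sketch: exercising src.converter and src.lemmatizer directly (hypothetical paths)."""
import json

import src.converter as converter
import src.lemmatizer as lemmatizer

# Hypothetical CCL file, e.g. produced by the morphodita subtask defined in pos_tagger.yaml.
ccl_path = "example.ccl"

# JSON output without the original text, mirroring task_options = {"output": "json", "json_text": False}.
converter.ccl2json(ccl_path, {"json_text": False}, "example.json")
with open("example.json", encoding="utf-8") as f:
    data = json.load(f)
# 'text' is omitted because json_text is False; tagset and tokens are always present.
print(data["tagset"], len(data["tokens"]), "text" in data)

# Plain lemma output, mirroring task_options = {"output": "lemmas"}:
# one line of space-separated base forms per <sentence> element.
lemmatizer.ccl_2_lemmas(ccl_path, "example_lemmas.txt")
```

The same dispatch is what `TaggerWorker.process` performs after the subtask returns: it copies the result straight through when the tagger's native output already matches the requested format, and otherwise routes ccl output through `ccl_2_lemmas` or `ccl2json`, passing the full `task_options` dict so the new `json_text` flag reaches the converter.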