From 6e853db742014df589a66d5a0837da47951eae64 Mon Sep 17 00:00:00 2001 From: pwalkow <pawel.walkowiak@hotmail.com> Date: Fri, 16 Dec 2022 12:44:14 +0100 Subject: [PATCH 1/4] refactor code --- .gitlab-ci.yml | 76 +++++++++++++++++++++++++++++++++++++++++++++++ README.md | 31 +++---------------- main.py | 2 +- requirements.txt | 2 +- src/converter.py | 11 ++++++- src/lemmatizer.py | 11 +++---- src/tagger.py | 39 ++++++++++++------------ tests/test.py | 8 +++++ tox.ini | 63 +++++++++++++++++++++++++++++++++++++++ 9 files changed, 186 insertions(+), 57 deletions(-) create mode 100644 .gitlab-ci.yml create mode 100644 tests/test.py create mode 100644 tox.ini diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..407c6ed --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,76 @@ +image: clarinpl/python:3.8 + +cache: + paths: + - .tox + +stages: + - check_style + - tests + - pages + - build_master + - build_develop + +pep8: + stage: check_style + before_script: + - pip install tox==2.9.1 + script: + - tox -v -e pep8 + +docstyle: + stage: check_style + before_script: + - pip install tox==2.9.1 + script: + - tox -v -e docstyle + +tests: + stage: tests + before_script: + - pip install tox==2.9.1 + script: + - tox -v -e pytest + artifacts: + paths: + - htmlcov + expire_in: 1 week + reports: + junit: + - report.xml + +pages: + stage: pages + script: + - mkdir -p public/coverage + - cp -r htmlcov/* public/coverage/ + artifacts: + name: coverage + paths: + - public + + +build_develop: + except: + - master + stage: build_develop + image: docker:18.09.7 + services: + - 'docker:18.09.7-dind' + script: + - docker build -t $CI_REGISTRY_IMAGE:develop . + - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY + - docker push $CI_REGISTRY_IMAGE:develop + + +build_master: + stage: build_master + image: 'docker:18.09.7' + only: + - master + services: + - 'docker:18.09.7-dind' + script: + - docker build -t $CI_REGISTRY_IMAGE:latest . 
+ - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY + - docker push $CI_REGISTRY_IMAGE:latest diff --git a/README.md b/README.md index 4b43687..55901b6 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,13 @@ -## Fextor3 -Konwersja usługi fextorbis na python3 +## Pos_tagger +Wrapper for tagger services like morphodita, converting ccl output to json tagger output task_options ============ +`lang:` language of text (default = 'pl') -`lemmas-count: 'json file path'` - -Plik json zawierający informacje o lematach, częściach wielowyrazowców oraz końcowych termach, przykład dla wycinka ze słownika: - -| terms | lemmas | -|---------------------|---------------------| -| szkoła jezior | szkoła jezioro | - | szkoła literacka | szkoła literacki | -| szkoła strukturalna | szkoła strukturalny | -| szkoła sycylijska | szkoła sycylijski | -| szkoła śląska | szkoła śląski | -| szkoła ukraińska | szkoła ukraiński | - -"szkoła": [ \ -  {"lemma": "szkoła", "parts": []}, \ -  {"lemma": "szkoła jezioro", "parts": ["jezioro"], "term": "szkoła_jezior"}, \ -  {"lemma": "szkoła literacki", "parts": ["literacki"], "term": "szkoła_literacka"}, \ -  {"lemma": "szkoła strukturalny", "parts": ["strukturalny"], "term": "szkoła_strukturalna"}, \ -  {"lemma": "szkoła sycylijski", "parts": ["sycylijski"], "term": "szkoła_sycylijska"}, \ -  {"lemma": "szkoła śląski", "parts": ["śląski"], "term": "szkoła_śląska"}, \ -  {"lemma": "szkoła ukraiński", "parts": ["ukraiński"], "term": "szkoła_ukraińska"}]\ - -Element {"lemma": "szkoła", "parts": []} nie ma klucza "term", ponieważ słowo "szkoła" nie występuje jako osobny term. - -[Raport pokrycia kodu](http://nlpworkers.pages.clarin-pl.eu/fextor3/coverage/) +`output:` format of results (default = 'json', values: json, ccl, lemmas)` diff --git a/main.py b/main.py index 8cc3a2d..7cd46b4 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,4 @@ -"""Implementation of fextor3 worker.""" +"""Implementation of pos_tagger worker.""" import nlp_ws from src.tagger import TaggerWorker diff --git a/requirements.txt b/requirements.txt index af4c17c..64d57ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -nlp-ws +nlp-ws>=2.1 ccl2json==0.1.1 pyaml-env==1.2.1 \ No newline at end of file diff --git a/src/converter.py b/src/converter.py index 53b23e1..2883f1d 100644 --- a/src/converter.py +++ b/src/converter.py @@ -1,3 +1,4 @@ +"""Implementation of ccl2json usage.""" import json from xml.sax import handler, make_parser @@ -5,6 +6,13 @@ from ccl2json.parse import CCLhandler def ccl2json(path_in, path_out): + """Function converting xml ccl to json. 
+ + :param path_in: path to xml ccl file + :type path_in: str + :param path_out: path to output json file + :type path_out: str + """ parser = make_parser() parser.setFeature(handler.feature_external_ges, False) parser.setContentHandler(CCLhandler()) @@ -15,6 +23,7 @@ def ccl2json(path_in, path_out): "filename": path_in.split('/')[-1].replace('.ccl', ''), 'text': parser.getContentHandler().get_text(), 'tagset': 'nkjp', - 'tokens': [token.as_dict() for token in parser.getContentHandler().get_tokens()] + 'tokens': [token.as_dict() for token + in parser.getContentHandler().get_tokens()] } json.dump(dout, fout, ensure_ascii=False) diff --git a/src/lemmatizer.py b/src/lemmatizer.py index a314254..8a3561f 100644 --- a/src/lemmatizer.py +++ b/src/lemmatizer.py @@ -1,12 +1,11 @@ -#!/usr/bin/python3 -"""Implementation of lemmatizer from CCL""" +"""Implementation of lemmatizer from CCL.""" -from collections import defaultdict -from xml.dom import pulldom import logging +from xml.dom import pulldom _log = logging.getLogger(__name__) + def ccl_2_lemmas(input_file, output_file): """Implementation of lemmas extracting function. @@ -17,8 +16,6 @@ def ccl_2_lemmas(input_file, output_file): :param output_file: path to resulting text file with words in lemma forms :type output_file: str """ - - input_data = pulldom.parse(input_file) with open(output_file, 'wt', encoding='utf-8') as f: _log.error("here") @@ -27,6 +24,6 @@ def ccl_2_lemmas(input_file, output_file): input_data.expandNode(node) words = [base.firstChild.data for base in node.getElementsByTagName('base') - if base is not None and base.firstChild is not None] + if base is not None and base.firstChild is not None] f.write(" ".join(words)) f.write("\n") diff --git a/src/tagger.py b/src/tagger.py index 8d9ac6f..1a36054 100644 --- a/src/tagger.py +++ b/src/tagger.py @@ -1,17 +1,17 @@ -#!/usr/bin/python3 -"""Fextor3 worker implementation.""" +"""Tagger worker implementation.""" from __future__ import absolute_import, division, unicode_literals -import json import logging import shutil import nlp_ws from nlp_ws import SubTask -import src.converter as converter from pyaml_env import parse_config +import src.converter as converter +import src.lemmatizer as lemmatizer + _log = logging.getLogger(__name__) SubTask.turn_on() @@ -23,15 +23,13 @@ class TaggerWorker(nlp_ws.NLPWorker): def static_init(cls, config): """Initialize process.""" cls._taggers = {} - yaml_path = config.get('tool').get('config','pos_tagger.yaml') - yaml_config = parse_config(yaml_path) + yaml_path = config.get('tool').get('config', 'pos_tagger.yaml') + yaml_config = parse_config(yaml_path) cls._taggers = yaml_config["taggers"] _log.error(f"Config from yaml: {cls._taggers}") - - def process(self, input_path, task_options, output_path): - """Called for each request made to the worker. + """Call for each request made to the worker. Extract features defined in task option from file on input_path in xml ccl format and write json output into output_path. 
@@ -41,8 +39,8 @@ class TaggerWorker(nlp_ws.NLPWorker): :type input_path: str :param task_options: Dictionary containing options of pos_tagger - lang: language of text (deafult = 'pl') - output: format of resulrs (defualt = 'json', values: json, ccl, lemmas) + lang: language of text (default = 'pl') + output: format of results (default = 'json', values: json, ccl, lemmas) :type task_options: dict :param output_path: Path to directory where the @@ -52,22 +50,23 @@ class TaggerWorker(nlp_ws.NLPWorker): lang = "pl" if "lang" in task_options: lang = task_options["lang"] - if not lang in self._taggers: - raise Exception(f"Unsuported language: {lang}, supported {list(self._taggers.keys())}") - type = "default" - subtask = SubTask(input_path,self._taggers[lang][type]["lpmn"]) + if lang not in self._taggers: + raise Exception(f"Unsupported language: {lang}, " + f"supported {list(self._taggers.keys())}") + tagger_type = "default" + subtask = SubTask(input_path, self._taggers[lang][tagger_type]["lpmn"]) subtask.run(blocking=False) l_result = subtask.get_output_path() output = "json" if "output" in task_options: output = task_options["output"] - tager_output = self._taggers[lang][type]["output"] - if tager_output == output: + tagger_output = self._taggers[lang][tagger_type]["output"] + if tagger_output == output: shutil.copyfile(l_result, output_path) - elif tager_output=="ccl" and output == "lemmas": + elif tagger_output == "ccl" and output == "lemmas": lemmatizer.ccl_2_lemmas(l_result, output_path) - elif tager_output=="ccl" and output == "json": + elif tagger_output == "ccl" and output == "json": converter.ccl2json(l_result, output_path) else: - raise Exception(f"Unsuported format conversion") + raise Exception("Unsupported format conversion") diff --git a/tests/test.py b/tests/test.py new file mode 100644 index 0000000..da96d2b --- /dev/null +++ b/tests/test.py @@ -0,0 +1,8 @@ +import pytest + +from src.tagger import TaggerWorker + + +def test_init(): + worker = TaggerWorker() + assert type(worker).__name__ == 'TaggerWorker' \ No newline at end of file diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..25b159c --- /dev/null +++ b/tox.ini @@ -0,0 +1,63 @@ +[tox] +envlist = pep8,docstyle +skipsdist = True + +[testenv:pep8] +deps = + flake8 +basepython = python3 +commands = + flake8 {posargs} + +[testenv:docstyle] +deps = + pydocstyle +basepython = python3 +commands = + pydocstyle --verbose {posargs} + +[testenv:pytest] +deps = + pytest + pytest-pythonpath + coverage + nlp-ws>=2.1 + ccl2json==0.1.1 + pyaml-env==1.2.1 +commands = + coverage run --source=src -m pytest --junitxml=report.xml tests/test.py + coverage html + +[pytest] +python_paths = src src + +[run] +relative_files = True +branch = True + +[flake8] +# W504 skipped because it is overeager and unnecessary +ignore = W504 +show-source = True +exclude = .git,.venv,.tox,dist,doc,*egg,build,venv,tests +import-order-style = pep8 +max-line-length = 80 + + +[pydocstyle] +# D104 Missing docstring in public package +# D203 1 blank line required before class docstring +# D213 Multi-line docstring summary should start at the second line +# D214 Section is over-indented +# D215 Section underline is over-indented +# D401 First line should be in imperative mood; try rephrasing +# D405 Section name should be properly capitalized +# D406 Section name should end with a newline +# D407 Missing dashed underline after section +# D408 Section underline should be in the line following the section’s name +# D409 Section underline should match the 
length of its name +# D410 Missing blank line after section +# D411 Missing blank line before section +ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411 +match-dir = ^(?!\.tox|venv|tests).* +match = ^(?!setup).*\.py \ No newline at end of file -- GitLab From ef9f1cbb49f4a63541ac81f9ab032a5eade34a87 Mon Sep 17 00:00:00 2001 From: pwalkow <pawel.walkowiak@hotmail.com> Date: Fri, 16 Dec 2022 13:14:53 +0100 Subject: [PATCH 2/4] Update tox version --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 407c6ed..cbd1ab9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -14,21 +14,21 @@ stages: pep8: stage: check_style before_script: - - pip install tox==2.9.1 + - pip install tox==3.18.1 script: - tox -v -e pep8 docstyle: stage: check_style before_script: - - pip install tox==2.9.1 + - pip install tox==3.18.1 script: - tox -v -e docstyle tests: stage: tests before_script: - - pip install tox==2.9.1 + - pip install tox==3.18.1 script: - tox -v -e pytest artifacts: -- GitLab From ac7776f0070faffeb26abd16506611209a94f94e Mon Sep 17 00:00:00 2001 From: pwalkow <pawel.walkowiak@hotmail.com> Date: Fri, 16 Dec 2022 13:23:57 +0100 Subject: [PATCH 3/4] Update tool name --- pos_tagger.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pos_tagger.yaml b/pos_tagger.yaml index f90ae19..18b84c7 100644 --- a/pos_tagger.yaml +++ b/pos_tagger.yaml @@ -1,5 +1,5 @@ taggers: pl: default: - lpmn: ["morphoDita"] + lpmn: ["morphodita"] output: ccl -- GitLab From 47382d6e6c871fc442ed37f0b9acbe776ec9bfbe Mon Sep 17 00:00:00 2001 From: pwalkow <pawel.walkowiak@hotmail.com> Date: Fri, 16 Dec 2022 15:10:22 +0100 Subject: [PATCH 4/4] Add no text option --- src/converter.py | 17 +++++++++++------ src/tagger.py | 4 +++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/converter.py b/src/converter.py index 2883f1d..eee4920 100644 --- a/src/converter.py +++ b/src/converter.py @@ -1,15 +1,19 @@ """Implementation of ccl2json usage.""" import json +import pathlib from xml.sax import handler, make_parser from ccl2json.parse import CCLhandler -def ccl2json(path_in, path_out): +def ccl2json(path_in, options, path_out): """Function converting xml ccl to json. 
:param path_in: path to xml ccl file :type path_in: str + :param options: task_options: Dictionary containing options of ccl2json + json_text: if json output should contain original text (default = True) + :type: options: dict :param path_out: path to output json file :type path_out: str """ @@ -18,12 +22,13 @@ def ccl2json(path_in, path_out): parser.setContentHandler(CCLhandler()) parser.parse(path_in) - with open(path_out, 'w', encoding='utf-8') as fout: - dout = { - "filename": path_in.split('/')[-1].replace('.ccl', ''), - 'text': parser.getContentHandler().get_text(), + with open(path_out, 'w', encoding='utf-8') as file_out: + data_out = { + "filename": pathlib.Path(path_in).stem, 'tagset': 'nkjp', 'tokens': [token.as_dict() for token in parser.getContentHandler().get_tokens()] } - json.dump(dout, fout, ensure_ascii=False) + if 'json_text' not in options or options['json_text']: + data_out['text'] = parser.getContentHandler().get_text() + json.dump(data_out, file_out, ensure_ascii=False) diff --git a/src/tagger.py b/src/tagger.py index 1a36054..dacb0bf 100644 --- a/src/tagger.py +++ b/src/tagger.py @@ -41,6 +41,8 @@ class TaggerWorker(nlp_ws.NLPWorker): :param task_options: Dictionary containing options of pos_tagger lang: language of text (default = 'pl') output: format of results (default = 'json', values: json, ccl, lemmas) + json_text: bool if json output should contain original + text (default = True) :type task_options: dict :param output_path: Path to directory where the @@ -67,6 +69,6 @@ class TaggerWorker(nlp_ws.NLPWorker): elif tagger_output == "ccl" and output == "lemmas": lemmatizer.ccl_2_lemmas(l_result, output_path) elif tagger_output == "ccl" and output == "json": - converter.ccl2json(l_result, output_path) + converter.ccl2json(l_result, task_options, output_path) else: raise Exception("Unsupported format conversion") -- GitLab
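
For context, a minimal sketch of how the refactored helpers from these patches could be exercised outside the nlp_ws worker. This is an assumption-laden illustration, not part of the patch series: the `example.ccl` input and the output file names are hypothetical, a valid CCL XML file and the pinned `ccl2json` package are assumed to be available, and the repository root is assumed to be on `PYTHONPATH` so the `src` package resolves. Only behaviour visible in the diffs above is relied on: `ccl2json(path_in, options, path_out)` drops the `text` key when `json_text` is False (PATCH 4/4), and `ccl_2_lemmas` writes one line of space-separated lemmas per sentence.

```python
"""Sketch: exercising src.converter and src.lemmatizer directly (hypothetical paths)."""
import json

import src.converter as converter
import src.lemmatizer as lemmatizer

# Hypothetical CCL file, e.g. produced by the morphodita subtask defined in pos_tagger.yaml.
ccl_path = "example.ccl"

# JSON output without the original text, mirroring task_options = {"output": "json", "json_text": False}.
converter.ccl2json(ccl_path, {"json_text": False}, "example.json")
with open("example.json", encoding="utf-8") as f:
    data = json.load(f)
# 'text' is omitted because json_text is False; tagset and tokens are always present.
print(data["tagset"], len(data["tokens"]), "text" in data)

# Plain lemma output, mirroring task_options = {"output": "lemmas"}:
# one line of space-separated base forms per <sentence> element.
lemmatizer.ccl_2_lemmas(ccl_path, "example_lemmas.txt")
```

The same dispatch is what `TaggerWorker.process` performs after the subtask returns: it copies the result straight through when the tagger's native output already matches the requested format, and otherwise routes ccl output through `ccl_2_lemmas` or `ccl2json`, passing the full `task_options` dict so the new `json_text` flag reaches the converter.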