Skip to content
Snippets Groups Projects
Commit e6909115 authored by Paweł Walkowiak's avatar Paweł Walkowiak
Browse files

Merge branch 'add_ci' into 'master'

Add ci

See merge request !1
parents 87db876c 13a501fb
No related branches found
No related tags found
1 merge request!1Add ci
Pipeline #6880 passed
image: clarinpl/python:3.8

# Cache the tox environments between pipeline runs.
cache:
  paths:
    - .tox

stages:
  - check_style
  - tests
  - pages
  - build_master
  - build_develop

# PEP 8 style check via tox.
pep8:
  stage: check_style
  before_script:
    - pip install tox==3.18.1
  script:
    - tox -v -e pep8

# Docstring style check (pydocstyle) via tox.
docstyle:
  stage: check_style
  before_script:
    - pip install tox==3.18.1
  script:
    - tox -v -e docstyle

# Unit tests with coverage; JUnit report is picked up by GitLab.
tests:
  stage: tests
  before_script:
    - pip install tox==3.18.1
  script:
    - tox -v -e pytest
  artifacts:
    paths:
      - htmlcov
    expire_in: 1 week
    reports:
      junit:
        - report.xml

# Publish the HTML coverage report via GitLab Pages.
pages:
  stage: pages
  script:
    - mkdir -p public/coverage
    - cp -r htmlcov/* public/coverage/
  artifacts:
    name: coverage
    paths:
      - public

# Build and push the 'develop' image for every non-master branch.
build_develop:
  except:
    - master
  stage: build_develop
  image: docker:18.09.7
  services:
    - 'docker:18.09.7-dind'
  script:
    - docker build -t $CI_REGISTRY_IMAGE:develop .
    - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
    - docker push $CI_REGISTRY_IMAGE:develop

# Build and push the 'latest' image from master only.
build_master:
  stage: build_master
  image: 'docker:18.09.7'
  only:
    - master
  services:
    - 'docker:18.09.7-dind'
  script:
    - docker build -t $CI_REGISTRY_IMAGE:latest .
    - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
    - docker push $CI_REGISTRY_IMAGE:latest
## Fextor3
Konwersja usługi fextorbis na python3
## Pos_tagger
Wrapper for tagger services like morphodita, converting ccl output to json tagger output
task_options
============
`lang:` language of text (default = 'pl')
`lemmas-count: 'json file path'`
Plik json zawierający informacje o lematach, częściach wielowyrazowców oraz końcowych termach, przykład dla wycinka ze słownika:
| terms | lemmas |
|---------------------|---------------------|
| szkoła jezior | szkoła jezioro |
| szkoła literacka | szkoła literacki |
| szkoła strukturalna | szkoła strukturalny |
| szkoła sycylijska | szkoła sycylijski |
| szkoła śląska | szkoła śląski |
| szkoła ukraińska | szkoła ukraiński |
"szkoła": [ \
  {"lemma": "szkoła", "parts": []}, \
  {"lemma": "szkoła jezioro", "parts": ["jezioro"], "term": "szkoła_jezior"}, \
  {"lemma": "szkoła literacki", "parts": ["literacki"], "term": "szkoła_literacka"}, \
  {"lemma": "szkoła strukturalny", "parts": ["strukturalny"], "term": "szkoła_strukturalna"}, \
  {"lemma": "szkoła sycylijski", "parts": ["sycylijski"], "term": "szkoła_sycylijska"}, \
  {"lemma": "szkoła śląski", "parts": ["śląski"], "term": "szkoła_śląska"}, \
  {"lemma": "szkoła ukraiński", "parts": ["ukraiński"], "term": "szkoła_ukraińska"}]\
Element {"lemma": "szkoła", "parts": []} nie ma klucza "term", ponieważ słowo "szkoła" nie występuje jako osobny term.
[Raport pokrycia kodu](http://nlpworkers.pages.clarin-pl.eu/fextor3/coverage/)
`output:` format of results (default = 'json', values: json, ccl, lemmas)
......
"""Implementation of fextor3 worker."""
"""Implementation of pos_tagger worker."""
import nlp_ws
from src.tagger import TaggerWorker
......
taggers:
  pl:
    default:
      lpmn: ["morphodita"]
      output: ccl
nlp-ws>=2.1
ccl2json==0.1.1
pyaml-env==1.2.1
\ No newline at end of file
"""Implementation of ccl2json usage."""
import json
import pathlib
from xml.sax import handler, make_parser
from ccl2json.parse import CCLhandler
def ccl2json(path_in, path_out):
def ccl2json(path_in, options, path_out):
"""Function converting xml ccl to json.
:param path_in: path to xml ccl file
:type path_in: str
:param options: task_options: Dictionary containing options of ccl2json
json_text: if json output should contain original text (default = True)
:type: options: dict
:param path_out: path to output json file
:type path_out: str
"""
parser = make_parser()
parser.setFeature(handler.feature_external_ges, False)
parser.setContentHandler(CCLhandler())
parser.parse(path_in)
with open(path_out, 'w', encoding='utf-8') as fout:
dout = {
"filename": path_in.split('/')[-1].replace('.ccl', ''),
'text': parser.getContentHandler().get_text(),
with open(path_out, 'w', encoding='utf-8') as file_out:
data_out = {
"filename": pathlib.Path(path_in).stem,
'tagset': 'nkjp',
'tokens': [token.as_dict() for token in parser.getContentHandler().get_tokens()]
'tokens': [token.as_dict() for token
in parser.getContentHandler().get_tokens()]
}
json.dump(dout, fout, ensure_ascii=False)
if 'json_text' not in options or options['json_text']:
data_out['text'] = parser.getContentHandler().get_text()
json.dump(data_out, file_out, ensure_ascii=False)
#!/usr/bin/python3
"""Implementation of lemmatizer from CCL"""
"""Implementation of lemmatizer from CCL."""
from collections import defaultdict
from xml.dom import pulldom
import logging
from xml.dom import pulldom
_log = logging.getLogger(__name__)
def ccl_2_lemmas(input_file, output_file):
"""Implementation of lemmas extracting function.
......@@ -17,8 +16,6 @@ def ccl_2_lemmas(input_file, output_file):
:param output_file: path to resulting text file with words in lemma forms
:type output_file: str
"""
input_data = pulldom.parse(input_file)
with open(output_file, 'wt', encoding='utf-8') as f:
_log.error("here")
......
#!/usr/bin/python3
"""Fextor3 worker implementation."""
"""Tagger worker implementation."""
from __future__ import absolute_import, division, unicode_literals
import json
import logging
import shutil
import nlp_ws
from nlp_ws import SubTask
import src.converter as converter
from pyaml_env import parse_config
import src.converter as converter
import src.lemmatizer as lemmatizer
_log = logging.getLogger(__name__)
SubTask.turn_on()
......@@ -28,10 +28,8 @@ class TaggerWorker(nlp_ws.NLPWorker):
cls._taggers = yaml_config["taggers"]
_log.error(f"Config from yaml: {cls._taggers}")
def process(self, input_path, task_options, output_path):
"""Called for each request made to the worker.
"""Call for each request made to the worker.
Extract features defined in task option from file on
input_path in xml ccl format and write json output into output_path.
......@@ -41,8 +39,10 @@ class TaggerWorker(nlp_ws.NLPWorker):
:type input_path: str
:param task_options: Dictionary containing options of pos_tagger
lang: language of text (deafult = 'pl')
output: format of resulrs (defualt = 'json', values: json, ccl, lemmas)
lang: language of text (default = 'pl')
output: format of results (default = 'json', values: json, ccl, lemmas)
json_text: bool if json output should contain original
text (default = True)
:type task_options: dict
:param output_path: Path to directory where the
......@@ -52,22 +52,23 @@ class TaggerWorker(nlp_ws.NLPWorker):
lang = "pl"
if "lang" in task_options:
lang = task_options["lang"]
if not lang in self._taggers:
raise Exception(f"Unsuported language: {lang}, supported {list(self._taggers.keys())}")
type = "default"
subtask = SubTask(input_path,self._taggers[lang][type]["lpmn"])
if lang not in self._taggers:
raise Exception(f"Unsupported language: {lang}, "
f"supported {list(self._taggers.keys())}")
tagger_type = "default"
subtask = SubTask(input_path, self._taggers[lang][tagger_type]["lpmn"])
subtask.run(blocking=False)
l_result = subtask.get_output_path()
output = "json"
if "output" in task_options:
output = task_options["output"]
tager_output = self._taggers[lang][type]["output"]
if tager_output == output:
tagger_output = self._taggers[lang][tagger_type]["output"]
if tagger_output == output:
shutil.copyfile(l_result, output_path)
elif tager_output=="ccl" and output == "lemmas":
elif tagger_output == "ccl" and output == "lemmas":
lemmatizer.ccl_2_lemmas(l_result, output_path)
elif tager_output=="ccl" and output == "json":
converter.ccl2json(l_result, output_path)
elif tagger_output == "ccl" and output == "json":
converter.ccl2json(l_result, task_options, output_path)
else:
raise Exception(f"Unsuported format conversion")
raise Exception("Unsupported format conversion")
import pytest
from src.tagger import TaggerWorker
def test_init():
    """Smoke test: a TaggerWorker instance can be created."""
    instance = TaggerWorker()
    assert type(instance).__name__ == 'TaggerWorker'
\ No newline at end of file
tox.ini 0 → 100644
[tox]
envlist = pep8,docstyle
skipsdist = True

[testenv:pep8]
deps =
    flake8
basepython = python3
commands =
    flake8 {posargs}

[testenv:docstyle]
deps =
    pydocstyle
basepython = python3
commands =
    pydocstyle --verbose {posargs}

[testenv:pytest]
deps =
    pytest
    pytest-pythonpath
    coverage
    nlp-ws>=2.1
    ccl2json==0.1.1
    pyaml-env==1.2.1
commands =
    coverage run --source=src -m pytest --junitxml=report.xml tests/test.py
    coverage html

[pytest]
python_paths = src src

[run]
relative_files = True
branch = True

[flake8]
# W504 skipped because it is overeager and unnecessary
ignore = W504
show-source = True
exclude = .git,.venv,.tox,dist,doc,*egg,build,venv,tests
import-order-style = pep8
max-line-length = 80

[pydocstyle]
# D104 Missing docstring in public package
# D203 1 blank line required before class docstring
# D213 Multi-line docstring summary should start at the second line
# D214 Section is over-indented
# D215 Section underline is over-indented
# D401 First line should be in imperative mood; try rephrasing
# D405 Section name should be properly capitalized
# D406 Section name should end with a newline
# D407 Missing dashed underline after section
# D408 Section underline should be in the line following the section's name
# D409 Section underline should match the length of its name
# D410 Missing blank line after section
# D411 Missing blank line before section
ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
match-dir = ^(?!\.tox|venv|tests).*
match = ^(?!setup).*\.py
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment