Skip to content
Snippets Groups Projects
Commit e6909115 authored by Paweł Walkowiak's avatar Paweł Walkowiak
Browse files

Merge branch 'add_ci' into 'master'

Add ci

See merge request !1
parents 87db876c 13a501fb
No related branches found
No related tags found
1 merge request!1Add ci
Pipeline #6880 passed
image: clarinpl/python:3.8

# Cache the tox environments between pipeline runs.
cache:
  paths:
    - .tox

stages:
  - check_style
  - tests
  - pages
  - build_master
  - build_develop

# PEP 8 style check via tox.
pep8:
  stage: check_style
  before_script:
    - pip install tox==3.18.1
  script:
    - tox -v -e pep8

# Docstring style check (pydocstyle) via tox.
docstyle:
  stage: check_style
  before_script:
    - pip install tox==3.18.1
  script:
    - tox -v -e docstyle

# Unit tests with coverage; JUnit report is picked up by GitLab.
tests:
  stage: tests
  before_script:
    - pip install tox==3.18.1
  script:
    - tox -v -e pytest
  artifacts:
    paths:
      - htmlcov
    expire_in: 1 week
    reports:
      junit:
        - report.xml

# Publish the HTML coverage report via GitLab Pages.
pages:
  stage: pages
  script:
    - mkdir -p public/coverage
    - cp -r htmlcov/* public/coverage/
  artifacts:
    name: coverage
    paths:
      - public

# Build and push the 'develop' image for every non-master branch.
build_develop:
  except:
    - master
  stage: build_develop
  image: docker:18.09.7
  services:
    - 'docker:18.09.7-dind'
  script:
    - docker build -t $CI_REGISTRY_IMAGE:develop .
    - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
    - docker push $CI_REGISTRY_IMAGE:develop

# Build and push the 'latest' image from master only.
build_master:
  stage: build_master
  image: 'docker:18.09.7'
  only:
    - master
  services:
    - 'docker:18.09.7-dind'
  script:
    - docker build -t $CI_REGISTRY_IMAGE:latest .
    - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
    - docker push $CI_REGISTRY_IMAGE:latest
## Fextor3
Konwersja usługi fextorbis na python3
## Pos_tagger
Wrapper for tagger services like morphodita, converting ccl output to json tagger output
task_options
============
`lang:` language of text (default = 'pl')
`lemmas-count: 'json file path'`
Plik json zawierający informacje o lematach, częściach wielowyrazowców oraz końcowych termach, przykład dla wycinka ze słownika:
| terms | lemmas |
|---------------------|---------------------|
| szkoła jezior | szkoła jezioro |
| szkoła literacka | szkoła literacki |
| szkoła strukturalna | szkoła strukturalny |
| szkoła sycylijska | szkoła sycylijski |
| szkoła śląska | szkoła śląski |
| szkoła ukraińska | szkoła ukraiński |
"szkoła": [ \
  {"lemma": "szkoła", "parts": []}, \
  {"lemma": "szkoła jezioro", "parts": ["jezioro"], "term": "szkoła_jezior"}, \
  {"lemma": "szkoła literacki", "parts": ["literacki"], "term": "szkoła_literacka"}, \
  {"lemma": "szkoła strukturalny", "parts": ["strukturalny"], "term": "szkoła_strukturalna"}, \
  {"lemma": "szkoła sycylijski", "parts": ["sycylijski"], "term": "szkoła_sycylijska"}, \
  {"lemma": "szkoła śląski", "parts": ["śląski"], "term": "szkoła_śląska"}, \
  {"lemma": "szkoła ukraiński", "parts": ["ukraiński"], "term": "szkoła_ukraińska"}]\
Element {"lemma": "szkoła", "parts": []} nie ma klucza "term", ponieważ słowo "szkoła" nie występuje jako osobny term.
[Raport pokrycia kodu](http://nlpworkers.pages.clarin-pl.eu/fextor3/coverage/)
`output:` format of results (default = 'json', values: json, ccl, lemmas)
......
"""Implementation of fextor3 worker."""
"""Implementation of pos_tagger worker."""
import nlp_ws
from src.tagger import TaggerWorker
......
taggers:
  pl:
    default:
      lpmn: ["morphodita"]
      output: ccl
nlp-ws>=2.1
ccl2json==0.1.1
pyaml-env==1.2.1
\ No newline at end of file
"""Implementation of ccl2json usage."""
import json
import pathlib
from xml.sax import handler, make_parser
from ccl2json.parse import CCLhandler
def ccl2json(path_in, path_out):
def ccl2json(path_in, options, path_out):
"""Function converting xml ccl to json.
:param path_in: path to xml ccl file
:type path_in: str
:param options: task_options: Dictionary containing options of ccl2json
json_text: if json output should contain original text (default = True)
:type: options: dict
:param path_out: path to output json file
:type path_out: str
"""
parser = make_parser()
parser.setFeature(handler.feature_external_ges, False)
parser.setContentHandler(CCLhandler())
parser.parse(path_in)
with open(path_out, 'w', encoding='utf-8') as fout:
dout = {
"filename": path_in.split('/')[-1].replace('.ccl', ''),
'text': parser.getContentHandler().get_text(),
with open(path_out, 'w', encoding='utf-8') as file_out:
data_out = {
"filename": pathlib.Path(path_in).stem,
'tagset': 'nkjp',
'tokens': [token.as_dict() for token in parser.getContentHandler().get_tokens()]
'tokens': [token.as_dict() for token
in parser.getContentHandler().get_tokens()]
}
json.dump(dout, fout, ensure_ascii=False)
if 'json_text' not in options or options['json_text']:
data_out['text'] = parser.getContentHandler().get_text()
json.dump(data_out, file_out, ensure_ascii=False)
#!/usr/bin/python3
"""Implementation of lemmatizer from CCL"""
"""Implementation of lemmatizer from CCL."""
from collections import defaultdict
from xml.dom import pulldom
import logging
from xml.dom import pulldom
_log = logging.getLogger(__name__)
def ccl_2_lemmas(input_file, output_file):
"""Implementation of lemmas extracting function.
......@@ -17,8 +16,6 @@ def ccl_2_lemmas(input_file, output_file):
:param output_file: path to resulting text file with words in lemma forms
:type output_file: str
"""
input_data = pulldom.parse(input_file)
with open(output_file, 'wt', encoding='utf-8') as f:
_log.error("here")
......
#!/usr/bin/python3
"""Fextor3 worker implementation."""
"""Tagger worker implementation."""
from __future__ import absolute_import, division, unicode_literals
import json
import logging
import shutil
import nlp_ws
from nlp_ws import SubTask
import src.converter as converter
from pyaml_env import parse_config
import src.converter as converter
import src.lemmatizer as lemmatizer
_log = logging.getLogger(__name__)
SubTask.turn_on()
......@@ -28,10 +28,8 @@ class TaggerWorker(nlp_ws.NLPWorker):
cls._taggers = yaml_config["taggers"]
_log.error(f"Config from yaml: {cls._taggers}")
def process(self, input_path, task_options, output_path):
"""Called for each request made to the worker.
"""Call for each request made to the worker.
Extract features defined in task option from file on
input_path in xml ccl format and write json output into output_path.
......@@ -41,8 +39,10 @@ class TaggerWorker(nlp_ws.NLPWorker):
:type input_path: str
:param task_options: Dictionary containing options of pos_tagger
lang: language of text (deafult = 'pl')
output: format of resulrs (defualt = 'json', values: json, ccl, lemmas)
lang: language of text (default = 'pl')
output: format of results (default = 'json', values: json, ccl, lemmas)
json_text: bool if json output should contain original
text (default = True)
:type task_options: dict
:param output_path: Path to directory where the
......@@ -52,22 +52,23 @@ class TaggerWorker(nlp_ws.NLPWorker):
lang = "pl"
if "lang" in task_options:
lang = task_options["lang"]
if not lang in self._taggers:
raise Exception(f"Unsuported language: {lang}, supported {list(self._taggers.keys())}")
type = "default"
subtask = SubTask(input_path,self._taggers[lang][type]["lpmn"])
if lang not in self._taggers:
raise Exception(f"Unsupported language: {lang}, "
f"supported {list(self._taggers.keys())}")
tagger_type = "default"
subtask = SubTask(input_path, self._taggers[lang][tagger_type]["lpmn"])
subtask.run(blocking=False)
l_result = subtask.get_output_path()
output = "json"
if "output" in task_options:
output = task_options["output"]
tager_output = self._taggers[lang][type]["output"]
if tager_output == output:
tagger_output = self._taggers[lang][tagger_type]["output"]
if tagger_output == output:
shutil.copyfile(l_result, output_path)
elif tager_output=="ccl" and output == "lemmas":
elif tagger_output == "ccl" and output == "lemmas":
lemmatizer.ccl_2_lemmas(l_result, output_path)
elif tager_output=="ccl" and output == "json":
converter.ccl2json(l_result, output_path)
elif tagger_output == "ccl" and output == "json":
converter.ccl2json(l_result, task_options, output_path)
else:
raise Exception(f"Unsuported format conversion")
raise Exception("Unsupported format conversion")
import pytest
from src.tagger import TaggerWorker
def test_init():
    """Smoke test: a TaggerWorker instance can be created."""
    instance = TaggerWorker()
    assert type(instance).__name__ == 'TaggerWorker'
\ No newline at end of file
tox.ini 0 → 100644
[tox]
envlist = pep8,docstyle
skipsdist = True

[testenv:pep8]
deps =
    flake8
basepython = python3
commands =
    flake8 {posargs}

[testenv:docstyle]
deps =
    pydocstyle
basepython = python3
commands =
    pydocstyle --verbose {posargs}

[testenv:pytest]
deps =
    pytest
    pytest-pythonpath
    coverage
    nlp-ws>=2.1
    ccl2json==0.1.1
    pyaml-env==1.2.1
commands =
    coverage run --source=src -m pytest --junitxml=report.xml tests/test.py
    coverage html

[pytest]
python_paths = src src

[run]
relative_files = True
branch = True

[flake8]
# W504 skipped because it is overeager and unnecessary
ignore = W504
show-source = True
exclude = .git,.venv,.tox,dist,doc,*egg,build,venv,tests
import-order-style = pep8
max-line-length = 80

[pydocstyle]
# D104 Missing docstring in public package
# D203 1 blank line required before class docstring
# D213 Multi-line docstring summary should start at the second line
# D214 Section is over-indented
# D215 Section underline is over-indented
# D401 First line should be in imperative mood; try rephrasing
# D405 Section name should be properly capitalized
# D406 Section name should end with a newline
# D407 Missing dashed underline after section
# D408 Section underline should be in the line following the section's name
# D409 Section underline should match the length of its name
# D410 Missing blank line after section
# D411 Missing blank line before section
ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
match-dir = ^(?!\.tox|venv|tests).*
match = ^(?!setup).*\.py
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment