Commit 70d9ab2f authored by Tomasz Walkowiak

Initial commit

.coverage
/tests/.pytest_cache
.pytest_cache
.idea
*__pycache__
htmlcov
config-test.ini
FROM clarinpl/python:3.8
WORKDIR /home/worker
COPY ./src ./src
COPY requirements.txt .
COPY config.ini .
COPY main.py .
RUN pip install -r requirements.txt
CMD python main.py
## Fextor3
Conversion of the fextorbis service to Python 3.
task_options
============
`lemmas-count: 'json file path'`
A JSON file containing information about lemmas, parts of multiword expressions, and the resulting terms; an example for an excerpt from the dictionary:
| terms | lemmas |
|---------------------|---------------------|
| szkoła jezior | szkoła jezioro |
| szkoła literacka | szkoła literacki |
| szkoła strukturalna | szkoła strukturalny |
| szkoła sycylijska | szkoła sycylijski |
| szkoła śląska | szkoła śląski |
| szkoła ukraińska | szkoła ukraiński |
"szkoła": [ \
  {"lemma": "szkoła", "parts": []}, \
  {"lemma": "szkoła jezioro", "parts": ["jezioro"], "term": "szkoła_jezior"}, \
  {"lemma": "szkoła literacki", "parts": ["literacki"], "term": "szkoła_literacka"}, \
  {"lemma": "szkoła strukturalny", "parts": ["strukturalny"], "term": "szkoła_strukturalna"}, \
  {"lemma": "szkoła sycylijski", "parts": ["sycylijski"], "term": "szkoła_sycylijska"}, \
  {"lemma": "szkoła śląski", "parts": ["śląski"], "term": "szkoła_śląska"}, \
  {"lemma": "szkoła ukraiński", "parts": ["ukraiński"], "term": "szkoła_ukraińska"}]\
The element {"lemma": "szkoła", "parts": []} has no "term" key, because the word "szkoła" does not occur as a separate term.
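
A minimal sketch (not part of the service code) of how such a dictionary could be loaded and queried; the path and the `find_term` helper are illustrative assumptions:

```python
import json

# Load the lemmas-count dictionary (the path is an example).
with open("/samba/dictionary.json", encoding="utf-8") as f:
    lemmas = json.load(f)

def find_term(first_lemma, following_lemmas):
    """Return the term for a (possibly multiword) expression, or None."""
    for entry in lemmas.get(first_lemma, []):
        # "parts" holds the lemmas of the remaining words of the expression.
        if entry["parts"] == following_lemmas:
            return entry.get("term")
    return None

print(find_term("szkoła", ["jezioro"]))  # -> "szkoła_jezior"
print(find_term("szkoła", []))           # -> None; "szkoła" alone is not a term
```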
[Code coverage report](http://nlpworkers.pages.clarin-pl.eu/fextor3/coverage/)
[service]
tool = fextor3
root = /samba/requests/
rabbit_host = rabbit.clarin.ws
rabbit_user = clarin
rabbit_password = clarin123
[tool]
workers_number = 1
stoplist_basedir = /samba/
lemmas_count = /samba/dictionary.json
[logging]
port = 9098
local_log_level = INFO
[logging_levels]
__main__ = INFO
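For illustration only, the values above could be read with Python's standard configparser; the snippet below is a hedged sketch of accessing this file directly, not how nlp_ws actually loads and passes the configuration to the worker:
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

# Selected options from the [tool] and [logging] sections shown above.
workers_number = config.getint("tool", "workers_number")   # 1
lemmas_count = config.get("tool", "lemmas_count")          # /samba/dictionary.json
log_port = config.getint("logging", "port")                # 9098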
version: '3'
services:
  fextor3:
    container_name: clarin_tagger
    build: ./
    working_dir: /home/worker/
    entrypoint:
      - python
      - main.py
    volumes:
      - ./src:/home/worker/src
      - ./main.py:/home/worker/main.py
      - ./config-test.ini:/home/worker/config.ini
      - /samba:/samba
    environment:
      - PYTHONUNBUFFERED=0
"""Implementation of fextor3 worker."""
import nlp_ws
from src.tagger import TaggerWorker
if __name__ == '__main__':
    nlp_ws.NLPService.main(TaggerWorker, pause_at_exit=False)
taggers:
  pl:
    default:
      lpmn: morphoDiTa
      output: ccl
nlp-ws
ccl2json==0.1.1
import json
from xml.sax import handler, make_parser

from ccl2json.parse import CCLhandler


def ccl2json(path_in, path_out):
    parser = make_parser()
    parser.setFeature(handler.feature_external_ges, False)
    parser.setContentHandler(CCLhandler())
    parser.parse(path_in)
    with open(path_out, 'w', encoding='utf-8') as fout:
        dout = {
            "filename": path_in.split('/')[-1].replace('.ccl', ''),
            'text': parser.getContentHandler().get_text(),
            'tagset': 'nkjp',
            'tokens': [token.as_dict()
                       for token in parser.getContentHandler().get_tokens()]
        }
        json.dump(dout, fout, ensure_ascii=False)
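A hypothetical call to the converter above; the file paths are examples only:
from src.converter import ccl2json

# Convert a tagged CCL document into the JSON token representation
# (assumes the module lives at src/converter.py, as the worker imports it).
ccl2json("/tmp/example.ccl", "/tmp/example.json")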
#!/usr/bin/python3
"""Implementation of lemmatizer from CCL."""
from collections import defaultdict
from xml.dom import pulldom
import logging

_log = logging.getLogger(__name__)


def ccl_2_lemmas(input_file, output_file):
    """Implementation of lemmas extracting function.

    :param input_file: path to a ccl file with words lemmas,
        output of MorphoDiTa or Wcrft2
    :type input_file: str
    :param output_file: path to resulting text file with words in lemma forms
    :type output_file: str
    """
    input_data = pulldom.parse(input_file)
    with open(output_file, 'wt', encoding='utf-8') as f:
        for event, node in input_data:
            if event == pulldom.START_ELEMENT and node.tagName == 'sentence':
                input_data.expandNode(node)
                words = [base.firstChild.data for base
                         in node.getElementsByTagName('base')
                         if base is not None and base.firstChild is not None]
                f.write(" ".join(words))
                f.write("\n")
#!/usr/bin/python3
"""Fextor3 worker implementation."""
from __future__ import absolute_import, division, unicode_literals

import json
import logging
import shutil

import nlp_ws
from nlp_ws import SubTask

import src.converter as converter
import src.lemmatizer as lemmatizer

_log = logging.getLogger(__name__)

SubTask.turn_on()


class TaggerWorker(nlp_ws.NLPWorker):
    """Class implementing TaggerWorker worker."""

    @classmethod
    def static_init(cls, config):
        """Initialize process."""
        cls._taggers = {}
        c_tool = config.get('tool')
        if 'pl' in c_tool:
            cls._taggers['pl'] = c_tool["pl"]
        _log.info("DONE")

    def process(self, input_path, task_options, output_path):
        """Called for each request made to the worker.

        Extracts features defined in task_options from the file at
        input_path (XML CCL format) and writes the result to output_path.

        :param input_path: Path to the file from which
            the worker should read the XML data.
        :type input_path: str
        :param task_options: Dictionary containing the path to a JSON file
            under the key lemmas-count, with a lemmas dict of the structure
            first: [{"lemma": "first", "parts": []},
            {"lemma": "first second", "parts": ["second"],
            "term": "first_second"}];
            the "term" key is present only if the term occurs in the terms
            list.
        :type task_options: dict
        :param output_path: Path where the worker will store
            the resulting file.
        :type output_path: str
        """
        _log.info(self._taggers)
        lang = "pl"
        if "lang" in task_options:
            lang = task_options["lang"]
        if lang not in self._taggers:
            raise Exception(
                f"Unsupported language: {lang}, "
                f"supported: {list(self._taggers.keys())}")
        subtask = SubTask(input_path, [self._taggers[lang]])
        subtask.run(blocking=False)
        l_result = subtask.get_output_path()
        output = "ccl"
        if "output" in task_options:
            output = task_options["output"]
        if output == "lemmas":
            lemmatizer.ccl_2_lemmas(l_result, output_path)
        elif output == "json":
            converter.ccl2json(l_result, output_path)
        else:
            shutil.copyfile(l_result, output_path)
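For reference, a request to this worker might carry task options along these lines; this is an illustrative sketch assembled from the docstring and README above, not a recorded request:
# Illustrative task_options payload; the dictionary path is an example.
task_options = {
    "lang": "pl",                              # defaults to "pl" when omitted
    "output": "json",                          # "ccl" (default), "lemmas" or "json"
    "lemmas-count": "/samba/dictionary.json",  # lemmas/terms dictionary described in the README
}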