Commit 70d9ab2f authored by Tomasz Walkowiak

Initial commit

.coverage
/tests/.pytest_cache
.pytest_cache
.idea
*__pycache__
htmlcov
config-test.ini
FROM clarinpl/python:3.8
WORKDIR /home/worker
COPY ./src ./src
COPY requirements.txt .
COPY config.ini .
COPY main.py .
RUN pip install -r requirements.txt
CMD python main.py
## Fextor3
Conversion of the fextorbis service to Python 3.
task_options
============
`lemmas-count: 'json file path'`
A JSON file containing information about lemmas, parts of multiword expressions, and the resulting terms; an example for an excerpt from the dictionary:
| terms | lemmas |
|---------------------|---------------------|
| szkoła jezior | szkoła jezioro |
| szkoła literacka | szkoła literacki |
| szkoła strukturalna | szkoła strukturalny |
| szkoła sycylijska | szkoła sycylijski |
| szkoła śląska | szkoła śląski |
| szkoła ukraińska | szkoła ukraiński |
"szkoła": [ \
  {"lemma": "szkoła", "parts": []}, \
  {"lemma": "szkoła jezioro", "parts": ["jezioro"], "term": "szkoła_jezior"}, \
  {"lemma": "szkoła literacki", "parts": ["literacki"], "term": "szkoła_literacka"}, \
  {"lemma": "szkoła strukturalny", "parts": ["strukturalny"], "term": "szkoła_strukturalna"}, \
  {"lemma": "szkoła sycylijski", "parts": ["sycylijski"], "term": "szkoła_sycylijska"}, \
  {"lemma": "szkoła śląski", "parts": ["śląski"], "term": "szkoła_śląska"}, \
  {"lemma": "szkoła ukraiński", "parts": ["ukraiński"], "term": "szkoła_ukraińska"}]\
The element {"lemma": "szkoła", "parts": []} has no "term" key, because the word "szkoła" does not occur as a separate term.
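
A minimal sketch (not part of the service code) of how such a dictionary could be loaded and queried; the path and the `find_term` helper are illustrative assumptions:

```python
import json

# Load the lemmas-count dictionary (the path is an example).
with open("/samba/dictionary.json", encoding="utf-8") as f:
    lemmas = json.load(f)

def find_term(first_lemma, following_lemmas):
    """Return the term for a (possibly multiword) expression, or None."""
    for entry in lemmas.get(first_lemma, []):
        # "parts" holds the lemmas of the remaining words of the expression.
        if entry["parts"] == following_lemmas:
            return entry.get("term")
    return None

print(find_term("szkoła", ["jezioro"]))  # -> "szkoła_jezior"
print(find_term("szkoła", []))           # -> None; "szkoła" alone is not a term
```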
[Code coverage report](http://nlpworkers.pages.clarin-pl.eu/fextor3/coverage/)
[service]
tool = fextor3
root = /samba/requests/
rabbit_host = rabbit.clarin.ws
rabbit_user = clarin
rabbit_password = clarin123
[tool]
workers_number = 1
stoplist_basedir = /samba/
lemmas_count = /samba/dictionary.json
[logging]
port = 9098
local_log_level = INFO
[logging_levels]
__main__ = INFO
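For illustration only, the values above could be read with Python's standard configparser; the snippet below is a hedged sketch of accessing this file directly, not how nlp_ws actually loads and passes the configuration to the worker:
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

# Selected options from the [tool] and [logging] sections shown above.
workers_number = config.getint("tool", "workers_number")   # 1
lemmas_count = config.get("tool", "lemmas_count")          # /samba/dictionary.json
log_port = config.getint("logging", "port")                # 9098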
version: '3'
services:
  fextor3:
    container_name: clarin_tagger
    build: ./
    working_dir: /home/worker/
    entrypoint:
      - python
      - main.py
    volumes:
      - ./src:/home/worker/src
      - ./main.py:/home/worker/main.py
      - ./config-test.ini:/home/worker/config.ini
      - /samba:/samba
    environment:
      - PYTHONUNBUFFERED=0
"""Implementation of fextor3 worker."""
import nlp_ws
from src.tagger import TaggerWorker
if __name__ == '__main__':
    nlp_ws.NLPService.main(TaggerWorker, pause_at_exit=False)
taggers:
  pl:
    default:
      lpmn: morphoDiTa
      output: ccl
nlp-ws
ccl2json==0.1.1
import json
from xml.sax import handler, make_parser

from ccl2json.parse import CCLhandler


def ccl2json(path_in, path_out):
    parser = make_parser()
    parser.setFeature(handler.feature_external_ges, False)
    parser.setContentHandler(CCLhandler())
    parser.parse(path_in)
    with open(path_out, 'w', encoding='utf-8') as fout:
        dout = {
            "filename": path_in.split('/')[-1].replace('.ccl', ''),
            'text': parser.getContentHandler().get_text(),
            'tagset': 'nkjp',
            'tokens': [token.as_dict()
                       for token in parser.getContentHandler().get_tokens()]
        }
        json.dump(dout, fout, ensure_ascii=False)
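A hypothetical call to the converter above; the file paths are examples only:
from src.converter import ccl2json

# Convert a tagged CCL document into the JSON token representation
# (assumes the module lives at src/converter.py, as the worker imports it).
ccl2json("/tmp/example.ccl", "/tmp/example.json")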
#!/usr/bin/python3
"""Implementation of lemmatizer from CCL."""
from collections import defaultdict
from xml.dom import pulldom
import logging

_log = logging.getLogger(__name__)


def ccl_2_lemmas(input_file, output_file):
    """Implementation of lemmas extracting function.

    :param input_file: path to a ccl file with words lemmas,
        output of MorphoDiTa or Wcrft2
    :type input_file: str
    :param output_file: path to resulting text file with words in lemma forms
    :type output_file: str
    """
    input_data = pulldom.parse(input_file)
    with open(output_file, 'wt', encoding='utf-8') as f:
        for event, node in input_data:
            if event == pulldom.START_ELEMENT and node.tagName == 'sentence':
                input_data.expandNode(node)
                words = [base.firstChild.data for base
                         in node.getElementsByTagName('base')
                         if base is not None and base.firstChild is not None]
                f.write(" ".join(words))
                f.write("\n")
#!/usr/bin/python3
"""Fextor3 worker implementation."""
from __future__ import absolute_import, division, unicode_literals

import json
import logging
import shutil

import nlp_ws
from nlp_ws import SubTask

import src.converter as converter
import src.lemmatizer as lemmatizer

_log = logging.getLogger(__name__)

SubTask.turn_on()


class TaggerWorker(nlp_ws.NLPWorker):
    """Class implementing TaggerWorker worker."""

    @classmethod
    def static_init(cls, config):
        """Initialize process."""
        cls._taggers = {}
        c_tool = config.get('tool')
        if 'pl' in c_tool:
            cls._taggers['pl'] = c_tool["pl"]
        _log.info("DONE")

    def process(self, input_path, task_options, output_path):
        """Called for each request made to the worker.

        Extracts features defined in task_options from the file at
        input_path (XML CCL format) and writes the result to output_path.

        :param input_path: Path to the file from which
            the worker should read the XML data.
        :type input_path: str
        :param task_options: Dictionary containing the path to a JSON file
            under the key lemmas-count, with a lemmas dict of the structure
            first: [{"lemma": "first", "parts": []},
            {"lemma": "first second", "parts": ["second"],
            "term": "first_second"}];
            the "term" key is present only if the term occurs in the terms
            list.
        :type task_options: dict
        :param output_path: Path where the worker will store
            the resulting file.
        :type output_path: str
        """
        _log.info(self._taggers)
        lang = "pl"
        if "lang" in task_options:
            lang = task_options["lang"]
        if lang not in self._taggers:
            raise Exception(
                f"Unsupported language: {lang}, "
                f"supported: {list(self._taggers.keys())}")
        subtask = SubTask(input_path, [self._taggers[lang]])
        subtask.run(blocking=False)
        l_result = subtask.get_output_path()
        output = "ccl"
        if "output" in task_options:
            output = task_options["output"]
        if output == "lemmas":
            lemmatizer.ccl_2_lemmas(l_result, output_path)
        elif output == "json":
            converter.ccl2json(l_result, output_path)
        else:
            shutil.copyfile(l_result, output_path)
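For reference, a request to this worker might carry task options along these lines; this is an illustrative sketch assembled from the docstring and README above, not a recorded request:
# Illustrative task_options payload; the dictionary path is an example.
task_options = {
    "lang": "pl",                              # defaults to "pl" when omitted
    "output": "json",                          # "ccl" (default), "lemmas" or "json"
    "lemmas-count": "/samba/dictionary.json",  # lemmas/terms dictionary described in the README
}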