Merge branch 'winer_support' into 'master'

Winer support

See merge request !4

Merge branch 'winer_support' into 'master'
3063fa44 · Paweł Walkowiak · 7c9be063 · 2c550650 · 3063fa44 · 3063fa44
Commit 3063fa44 authored Feb 24, 2023 by Paweł Walkowiak
--- a/lpmn_queries.json
+++ b/lpmn_queries.json
@@ -7,5 +7,9 @@
    "post_any2txt": {"task": ["any2txt", {"postagger": {"lang": "en", "output": "json"}}], "input": "post_spacy_input", "expected": "post_spacy_expected.json"},
-    "postagger_lone_lemmas": {"task": [{"postagger": {"lang": "pl", "output": "lemmas"}}], "input": "post_postagger_input", "expected": "post_postagger_input_lemmas"}
+    "postagger_lone_lemmas": {"task": [{"postagger": {"lang": "pl", "output": "lemmas"}}], "input": "post_postagger_input", "expected": "post_postagger_input_lemmas"},
+    "ner_for_pl": {"task": [{"postagger": {"lang": "pl", "output": "json", "method": "ner"}}], "input": "post_postagger_input", "expected": "ner_for_pl.json"},
+    "ner_for_en": {"task": [{"postagger": {"lang": "en", "output": "json", "method": "ner"}}], "input": "post_spacy_input", "expected": "ner_for_en.json"}
 }
--- a/pos_tagger.yaml
+++ b/pos_tagger.yaml
@@ -9,3 +9,14 @@ taggers:
        lpmn: [{"spacy":{"lang":"en"}}]
        output: json
        tagset: ud
+ners:
+    pl:
+      default:
+        lpmn: ['morphodita', {'posconverter': {'input_format': 'ccl', 'output_format': 'json'}}, 'winer']
+        output: json
+        tagset: nkjp
+    en:
+      default:
+        lpmn: [{"spacy":{"lang":"en", 'method': 'ner'}}]
+        output: json
+        tagset: ud
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -15,6 +15,14 @@ import src.utils
 _log = logging.getLogger(__name__)
 SubTask.turn_on()
+DEFAULT_TYPE = "default"
+OUTPUT = "output"
+JSON = "json"
+TAGSET = "tagset"
+TAGGER = "tagger"
+NER = "ner"
+LPMN = "lpmn"
 class TaggerWorker(nlp_ws.NLPWorker):
    """Class implementing TaggerWorker worker."""
@@ -23,15 +31,18 @@ class TaggerWorker(nlp_ws.NLPWorker):
    def static_init(cls, config):
        """Initialize process."""
        cls._taggers = {}
+        cls._ners = {}
        yaml_path = config.get('tool').get('config', 'pos_tagger.yaml')
        yaml_config = parse_config(yaml_path)
        cls._taggers = yaml_config["taggers"]
-        _log.error(f"Config from yaml: {cls._taggers}")
+        cls._ners = yaml_config["ners"]
+        _log.info(f"Config taggers from yaml: {cls._taggers}")
+        _log.info(f"Config ners from yaml: {cls._ners}")
        cls._chunking_limit = config.get('tool').get('chunking_limit', 50000)
        if not isinstance(cls._chunking_limit, int):
            cls._chunking_limit = int(cls._chunking_limit)
-        _log.error(f"Chunk size: {cls._chunking_limit}")
+        _log.info(f"Chunk size: {cls._chunking_limit}")
    def get_converter_directive(self, input_format, input_tagset, output_format,
                                output_tagset, json_text):
@@ -63,37 +74,47 @@ class TaggerWorker(nlp_ws.NLPWorker):
        output: format of results (default = 'json', values: json, ccl, lemmas)
        json_text: bool if json output should contain original
        text (default = True)
+        method: method of processing (default = 'tagger', values: tagger, ner)
        :type task_options: dict
        :param output_path: Path to directory where the
            worker will store result file.
        :type output_path: str
        """
-        lang = "pl"
+        lang = task_options.get("lang", "pl")
-        if "lang" in task_options:
-            lang = task_options["lang"]
+        method = task_options.get("method", "tagger")
-        if lang not in self._taggers:
+        if method not in [TAGGER, NER]:
+            raise Exception(f"Unsupported method: {method}")
+        if method == TAGGER and lang not in self._taggers:
            raise Exception(f"Unsupported language: {lang}, "
                            f"supported {list(self._taggers.keys())}")
-        output = "json"
-        if "output" in task_options:
-            output = task_options["output"]
-        tagset = "identical"
+        if method == NER and lang not in self._ners:
-        if "tagset" in task_options:
+            raise Exception(f"Unsupported language: {lang}, "
-            tagset = task_options["tagset"]
+                            f"supported {list(self._ners.keys())}")
+        output = task_options.get("output", "json")
-        json_text = True
+        tagset = task_options.get("tagset", "identical")
-        if "json_text" in task_options:
-            json_text = task_options["json_text"]
-        tagger_type = "default"
+        json_text = task_options.get("json_text", True)
-        tagger_opt = self._taggers[lang][tagger_type]
+        tagger_opt = self._taggers[lang][DEFAULT_TYPE]
+        ner_opt = self._ners[lang][DEFAULT_TYPE]
        convert_lpmn = self.get_converter_directive(
-            tagger_opt["output"], tagger_opt["tagset"], output, tagset,
+            tagger_opt[OUTPUT], tagger_opt[TAGSET], output, tagset,
+            json_text) if method == TAGGER else self.get_converter_directive(
+            ner_opt[OUTPUT], ner_opt[TAGSET], output, tagset,
            json_text)
-        json_lpmn = tagger_opt["lpmn"].copy()
+        json_lpmn = (
-        if convert_lpmn is not None:
+            tagger_opt[LPMN].copy()
+            if method == TAGGER
+            else ner_opt[LPMN].copy()
+        )
+        if convert_lpmn is not None and method == TAGGER:
            json_lpmn.append(convert_lpmn)
        _dir_style = False
@@ -101,7 +122,7 @@ class TaggerWorker(nlp_ws.NLPWorker):
            _dir_style = True
            json_lpmn = [json_lpmn]
        _log.debug(f"Running LPMN: {json_lpmn}")
-        if output == "json" and not _dir_style:
+        if output == JSON and not _dir_style:
            # split file into chunks
            chunk_size = int(self._chunking_limit * 0.5)
            destination_path = os.path.join(

--- a/tests/testdata/expected/ner_for_en.json
+++ b/tests/testdata/expected/ner_for_en.json
+{"filename": "6801426b-6ece-403b-868e-574ae96ce660", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 4], "orth": "When", "lexemes": [{"lemma": "when", "mstag": "SCONJ", "disamb": true}]}, {"index": 2, "position": [5, 14], "orth": "Sebastian", "lexemes": [{"lemma": "Sebastian", "mstag": "PROPN", "disamb": true}]}, {"index": 3, "position": [15, 20], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 4, "position": [21, 28], "orth": "started", "lexemes": [{"lemma": "start", "mstag": "VERB", "disamb": true}]}, {"index": 5, "position": [29, 36], "orth": "working", "lexemes": [{"lemma": "work", "mstag": "VERB", "disamb": true}]}, {"index": 6, "position": [37, 39], "orth": "on", "lexemes": [{"lemma": "on", "mstag": "ADP", "disamb": true}]}, {"index": 7, "position": [40, 44], "orth": "self", "lexemes": [{"lemma": "self", "mstag": "NOUN", "disamb": true}]}, {"index": 8, "position": [45, 45], "orth": "-", "lexemes": [{"lemma": "-", "mstag": "PUNCT", "disamb": true}]}, {"index": 9, "position": [46, 52], "orth": "driving", "lexemes": [{"lemma": "drive", "mstag": "VERB", "disamb": true}]}, {"index": 10, "position": [53, 57], "orth": "cars", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 11, "position": [58, 60], "orth": "at", "lexemes": [{"lemma": "at", "mstag": "ADP", "disamb": true}]}, {"index": 12, "position": [61, 67], "orth": "Google", "lexemes": [{"lemma": "Google", "mstag": "PROPN", "disamb": true}]}, {"index": 13, "position": [68, 70], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 14, "position": [71, 75], "orth": "2007", "lexemes": [{"lemma": "2007", "mstag": "NUM", "disamb": true}]}, {"index": 15, "position": [76, 76], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 16, "position": [77, 80], "orth": "few", "lexemes": [{"lemma": "few", "mstag": "ADJ", "disamb": true}]}, {"index": 17, "position": [81, 87], 
"orth": "people", "lexemes": [{"lemma": "people", "mstag": "NOUN", "disamb": true}]}, {"index": 18, "position": [88, 95], "orth": "outside", "lexemes": [{"lemma": "outside", "mstag": "ADV", "disamb": true}]}, {"index": 19, "position": [96, 98], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 20, "position": [99, 102], "orth": "the", "lexemes": [{"lemma": "the", "mstag": "DET", "disamb": true}]}, {"index": 21, "position": [103, 110], "orth": "company", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 22, "position": [111, 115], "orth": "took", "lexemes": [{"lemma": "take", "mstag": "VERB", "disamb": true}]}, {"index": 23, "position": [116, 119], "orth": "him", "lexemes": [{"lemma": "he", "mstag": "PRON", "disamb": true}]}, {"index": 24, "position": [120, 129], "orth": "seriously", "lexemes": [{"lemma": "seriously", "mstag": "ADV", "disamb": true}]}, {"index": 25, "position": [130, 130], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 26, "position": [131, 132], "orth": "“", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 27, "position": [133, 133], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 28, "position": [134, 137], "orth": "can", "lexemes": [{"lemma": "can", "mstag": "AUX", "disamb": true}]}, {"index": 29, "position": [138, 142], "orth": "tell", "lexemes": [{"lemma": "tell", "mstag": "VERB", "disamb": true}]}, {"index": 30, "position": [143, 146], "orth": "you", "lexemes": [{"lemma": "you", "mstag": "PRON", "disamb": true}]}, {"index": 31, "position": [147, 151], "orth": "very", "lexemes": [{"lemma": "very", "mstag": "ADV", "disamb": true}]}, {"index": 32, "position": [152, 158], "orth": "senior", "lexemes": [{"lemma": "senior", "mstag": "ADJ", "disamb": true}]}, {"index": 33, "position": [159, 163], "orth": "CEOs", "lexemes": [{"lemma": "ceo", "mstag": "NOUN", "disamb": true}]}, 
{"index": 34, "position": [164, 166], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 35, "position": [167, 172], "orth": "major", "lexemes": [{"lemma": "major", "mstag": "ADJ", "disamb": true}]}, {"index": 36, "position": [173, 181], "orth": "American", "lexemes": [{"lemma": "american", "mstag": "ADJ", "disamb": true}]}, {"index": 37, "position": [182, 185], "orth": "car", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 38, "position": [186, 195], "orth": "companies", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 39, "position": [196, 201], "orth": "would", "lexemes": [{"lemma": "would", "mstag": "AUX", "disamb": true}]}, {"index": 40, "position": [202, 207], "orth": "shake", "lexemes": [{"lemma": "shake", "mstag": "VERB", "disamb": true}]}, {"index": 41, "position": [208, 210], "orth": "my", "lexemes": [{"lemma": "my", "mstag": "PRON", "disamb": true}]}, {"index": 42, "position": [211, 215], "orth": "hand", "lexemes": [{"lemma": "hand", "mstag": "NOUN", "disamb": true}]}, {"index": 43, "position": [216, 219], "orth": "and", "lexemes": [{"lemma": "and", "mstag": "CCONJ", "disamb": true}]}, {"index": 44, "position": [220, 224], "orth": "turn", "lexemes": [{"lemma": "turn", "mstag": "VERB", "disamb": true}]}, {"index": 45, "position": [225, 229], "orth": "away", "lexemes": [{"lemma": "away", "mstag": "ADV", "disamb": true}]}, {"index": 46, "position": [230, 237], "orth": "because", "lexemes": [{"lemma": "because", "mstag": "SCONJ", "disamb": true}]}, {"index": 47, "position": [238, 239], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 48, "position": [240, 243], "orth": "was", "lexemes": [{"lemma": "be", "mstag": "AUX", "disamb": true}]}, {"index": 49, "position": [244, 246], "orth": "n’t", "lexemes": [{"lemma": "not", "mstag": "PART", "disamb": true}]}, {"index": 50, "position": [247, 252], "orth": "worth", "lexemes": [{"lemma": 
"worth", "mstag": "ADJ", "disamb": true}]}, {"index": 51, "position": [253, 260], "orth": "talking", "lexemes": [{"lemma": "talk", "mstag": "VERB", "disamb": true}]}, {"index": 52, "position": [261, 263], "orth": "to", "lexemes": [{"lemma": "to", "mstag": "ADP", "disamb": true}]}, {"index": 53, "position": [264, 264], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 54, "position": [265, 265], "orth": "”", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 55, "position": [266, 270], "orth": "said", "lexemes": [{"lemma": "say", "mstag": "VERB", "disamb": true}]}, {"index": 56, "position": [271, 276], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 57, "position": [277, 277], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 58, "position": [278, 280], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 59, "position": [281, 283], "orth": "an", "lexemes": [{"lemma": "an", "mstag": "DET", "disamb": true}]}, {"index": 60, "position": [284, 293], "orth": "interview", "lexemes": [{"lemma": "interview", "mstag": "NOUN", "disamb": true}]}, {"index": 61, "position": [294, 298], "orth": "with", "lexemes": [{"lemma": "with", "mstag": "ADP", "disamb": true}]}, {"index": 62, "position": [299, 305], "orth": "Recode", "lexemes": [{"lemma": "Recode", "mstag": "PROPN", "disamb": true}]}, {"index": 63, "position": [306, 313], "orth": "earlier", "lexemes": [{"lemma": "early", "mstag": "ADV", "disamb": true}]}, {"index": 64, "position": [314, 318], "orth": "this", "lexemes": [{"lemma": "this", "mstag": "DET", "disamb": true}]}, {"index": 65, "position": [319, 323], "orth": "week", "lexemes": [{"lemma": "week", "mstag": "NOUN", "disamb": true}]}, {"index": 66, "position": [324, 324], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 67, "position": [325, 325], "orth": 
"\n", "lexemes": [{"lemma": "\n", "mstag": "SPACE", "disamb": true}]}], "text": "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.\n", "entities": [{"text": "Sebastian Thrun", "type": "PERSON", "tokens": [1, 3], "position": [5, 20]}, {"text": "Google", "type": "ORG", "tokens": [11, 12], "position": [61, 67]}, {"text": "2007", "type": "DATE", "tokens": [13, 14], "position": [71, 75]}, {"text": "American", "type": "NORP", "tokens": [35, 36], "position": [173, 181]}, {"text": "Thrun", "type": "PERSON", "tokens": [55, 56], "position": [271, 276]}, {"text": "Recode", "type": "ORG", "tokens": [61, 62], "position": [299, 305]}, {"text": "earlier this week", "type": "DATE", "tokens": [62, 65], "position": [306, 323]}]}
--- a/tests/testdata/expected/ner_for_pl.json
+++ b/tests/testdata/expected/ner_for_pl.json
--- a/tests/testdata/input/pos_tagger.yaml
+++ b/tests/testdata/input/pos_tagger.yaml
@@ -9,3 +9,14 @@ taggers:
        lpmn: [{"spacy":{"lang":"en"}}]
        output: json
        tagset: ud
+ners:
+    pl:
+      default:
+        lpmn: ['morphodita', {'posconverter': {'input_format': 'ccl', 'output_format': 'json'}}, 'winer']
+        output: json
+        tagset: nkjp
+    en:
+      default:
+        lpmn: [{"spacy":{"lang":"en", 'method': 'ner'}}]
+        output: json
+        tagset: ud