From 83ab0a116d4842e3bac38ac05fde1cc1c10d6634 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Walkowiak?= <pawel.walkowiak@pwr.edu.pl>
Date: Mon, 12 Feb 2024 07:24:24 +0000
Subject: [PATCH] New tagger

---
 .gitignore                           |   9 ++
 README.md                            |   8 +-
 pos_tagger.yaml                      |  58 ++++++++-
 src/tagger.py                        |  25 +++-
 src/utils.py                         |  87 +++++++-------
 tests/testdata/input/pos_tagger.yaml | 174 ++++++++++++++++-----------
 tox.ini                              |   3 -
 7 files changed, 242 insertions(+), 122 deletions(-)

diff --git a/.gitignore b/.gitignore
index d755acc..de37d40 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,12 @@
 htmlcov
 config-test.ini
 /tests/tmp-test.py
+example*
+ner_*
+report.xml
+test.json
+test
+test-em.json
+ascii.json
+test-ascii.json
+non_ascii.json
diff --git a/README.md b/README.md
index 71991a1..f377769 100644
--- a/README.md
+++ b/README.md
@@ -11,4 +11,10 @@ task_options
 
 `method`: method of tagging (default = 'tagger', values: 'tagger', 'ner')
 
-`ner_type`: type of named entity recognition tool (default = 'winer', values: 'winer', 'liner')
+`tagger_type`: type of tagger tool (default = 'morphodita',
+                                    values: 'morphodita', 'poldeepner2tagger', 'spacy', for pl,
+                                            'spacy' for other languages)
+
+`ner_type`: type of named entity recognition tool (default = 'winer',
+                                                   values: 'winer', 'liner', 'poldeepner2', 'spacy' for pl,
+                                                           'spacy', 'poldeepner2' for other languages)
diff --git a/pos_tagger.yaml b/pos_tagger.yaml
index ea0777e..e089919 100644
--- a/pos_tagger.yaml
+++ b/pos_tagger.yaml
@@ -4,6 +4,14 @@ taggers:
         lpmn: ["morphodita"]
         output: ccl
         tagset: nkjp
+      spacy:
+        lpmn: [{"spacy":{"lang":"pl"}}]
+        output: json
+        tagset: ud
+      poldeepner2tagger:
+        lpmn: ["witok", "poldeepner2tagger"]
+        output: json
+        tagset: nkjp
     en:
       default:
         lpmn: [{"spacy":{"lang":"en"}}]
@@ -37,9 +45,20 @@ taggers:
 ners:
     pl:
       default:
-        lpmn: ['morphodita', {'posconverter': {'input_format': 'ccl', 'output_format': 'json'}}, 'winer']
+        lpmn: ['winer']
+        output: json
+        tagset: nkjp
+        type: 'additive'
+      spacy:
+        lpmn: [{"spacy":{"lang":"pl", 'method': 'ner'}}]
+        output: json
+        tagset: ud
+        type: 'overwrite'
+      poldeepner2:
+        lpmn: ["poldeepner2"]
         output: json
         tagset: nkjp
+        type: 'additive'
       liner:
         lpmn: [
           'morphodita',
@@ -48,36 +67,73 @@ ners:
         ]
         output: json
         tagset: nkjp
+        type: 'overwrite'
     en:
       default:
         lpmn: [{"spacy":{"lang":"en", 'method': 'ner'}}]
         output: json
         tagset: ud
+        type: 'overwrite'
+      poldeepner2:
+        lpmn: ["poldeepner2"]
+        output: json
+        tagset: ud
+        type: 'additive'
     de:
       default:
         lpmn: [{"spacy":{"lang":"de", 'method': 'ner'}}]
         output: json
         tagset: ud
+        type: 'overwrite'
+      poldeepner2:
+        lpmn: ["poldeepner2"]
+        output: json
+        tagset: ud
+        type: 'additive'
     es:
       default:
         lpmn: [{"spacy":{"lang":"es", 'method': 'ner'}}]
         output: json
         tagset: ud
+        type: 'overwrite'
+      poldeepner2:
+        lpmn: ["poldeepner2"]
+        output: json
+        tagset: ud
+        type: 'additive'
     pt:
       default:
         lpmn: [{"spacy":{"lang":"pt", 'method': 'ner'}}]
         output: json
         tagset: ud
+        type: 'overwrite'
+      poldeepner2:
+        lpmn: ["poldeepner2"]
+        output: json
+        tagset: ud
+        type: 'additive'
     fr:
       default:
         lpmn: [{"spacy":{"lang":"fr", 'method': 'ner'}}]
         output: json
         tagset: ud
+        type: 'overwrite'
+      poldeepner2:
+        lpmn: ["poldeepner2"]
+        output: json
+        tagset: ud
+        type: 'additive'
     ru:
       default:
         lpmn: [{"spacy":{"lang":"ru", 'method': 'ner'}}]
         output: json
         tagset: ud
+        type: 'overwrite'
+      poldeepner2:
+        lpmn: ["poldeepner2"]
+        output: json
+        tagset: ud
+        type: 'additive'
 
 linkers:
   clalink:
diff --git a/src/tagger.py b/src/tagger.py
index 9d2632d..58cbd05 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -31,6 +31,7 @@ LINKING_TYPE = "linking_type"
 TASK = "task"
 MARKERS = "markers"
 KB_ID = "knowledge_base_id"
+TYPE = "type"
 
 
 class TaggerWorker(nlp_ws.NLPWorker):
@@ -101,10 +102,18 @@ class TaggerWorker(nlp_ws.NLPWorker):
 
         method = task_options.get("method", "tagger")
 
+        tagger_type = task_options.get("tagger_type", "default")
+        tagger_type = DEFAULT_TYPE \
+            if tagger_type == "default" else tagger_type
+        tagger_type = tagger_type \
+            if tagger_type in self._taggers[lang] else DEFAULT_TYPE
+
         ner_type = task_options.get("ner_type", "winer")
         ner_type = DEFAULT_TYPE if ner_type == "winer" else ner_type
         ner_type = ner_type if ner_type in self._ners[lang] else DEFAULT_TYPE
 
+        ner_query_type = self._ners[lang][ner_type][TYPE]
+
         if method not in [TAGGER, NER]:
             raise Exception(f"Unsupported method: {method}")
 
@@ -122,7 +131,7 @@ class TaggerWorker(nlp_ws.NLPWorker):
         linking_type = task_options.get("linking_type", None)
         if linking_name in self._linkers:
             linking = self._linkers[linking_name]
-        elif linking_name is None:
+        elif linking_name is None or linking_name == "None":
             linking = None
         else:
             raise Exception(f"Unsupported linking: {linking_name}")
@@ -141,17 +150,27 @@ class TaggerWorker(nlp_ws.NLPWorker):
             self._parallel_subtasks
         )
 
-        tagger_opt = self._taggers[lang][DEFAULT_TYPE]
+        tagger_opt = self._taggers[lang][tagger_type]
         ner_opt = self._ners[lang][ner_type]
         convert_lpmn = self.get_converter_directive(
             tagger_opt[OUTPUT], tagger_opt[TAGSET], output, tagset,
             json_text) if method == TAGGER else self.get_converter_directive(
             ner_opt[OUTPUT], ner_opt[TAGSET], output, tagset,
             json_text, ner_opt[NER] if NER in ner_opt else False)
+
+        ner_query = ner_opt[LPMN].copy()
+        if method == NER and ner_query_type == "additive":
+            ner_query = [*tagger_opt[LPMN].copy(), *ner_opt[LPMN].copy()]
+            if tagger_opt[OUTPUT] == 'ccl' and ner_opt[OUTPUT] == 'json':
+                ner_query = [*tagger_opt[LPMN].copy(),
+                             {'posconverter': {'input_format': 'ccl',
+                                               'output_format': 'json'}},
+                             *ner_opt[LPMN].copy()]
+
         json_lpmn = (
             tagger_opt[LPMN].copy()
             if method == TAGGER
-            else ner_opt[LPMN].copy()
+            else ner_query
         )
         if convert_lpmn is not None and method == TAGGER and output != TEI:
             json_lpmn.append(convert_lpmn)
diff --git a/src/utils.py b/src/utils.py
index a243d66..9a50e12 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -185,51 +185,50 @@ def merge_splits(output_path: str, destination_path: str,
             l_results = [subtask.get_output_path() for subtask in subtasks]
 
             for l_result in l_results:
-                with open(l_result, "r") as f:
-                    if merge_type == MergeType.PLAINTEXT2CCL:
-                        file_content2 = f.read()
-                        l_data2 = json.loads(file_content2)
-                        f2.write(f"{json.dumps(l_data2)}\n")
-                    elif merge_type == MergeType.PLAINTEXT2JSON:
-                        file_content = f.read()
-                        l_data = json.loads(file_content)
-                        doc = clarin_json.Document.from_dict(l_data)
-                        f2.write(doc)
-                    elif merge_type == MergeType.TAGGER2LEMMAS:
-                        file_content_lemmas = f.read()
-                        f2.write(file_content_lemmas)
-                    elif merge_type == MergeType.NER2LEMMAS:
-                        file_content_lemmas = f.read()
-                        l_data_lemmas = json.loads(file_content_lemmas)
-                        doc = clarin_json.Document.from_dict(l_data_lemmas)
-                        words = []
-                        for token in doc.tokens():
-                            for lexem in token.lexemes:
-                                words.append(lexem.lemma)
-                        f2.write(" ".join(words))
-                        f2.write("\n")
-                    elif merge_type == MergeType.CCLS2TEI:
-                        with open(l_result, "r") as f:
-                            try:
-                                xml = ET.fromstring(bytes(f.read(), 'utf-8'))
-                                for child in xml.iter('chunk'):
-                                    merged_ccl += ET.tostring(
-                                        child,
-                                        encoding='unicode',
-                                        pretty_print=True
-                                    )
-
-                            except ET.XMLSyntaxError:
-                                _log.error('File is not valid XML!')
-                                continue
-
-                    elif merge_type == MergeType.JSON2TEI:
+                if merge_type == MergeType.CCLS2TEI:
+                    with open(l_result, "r") as f:
                         try:
-                            file_data = json.loads(f.read())
-                        except json.JSONDecodeError:
-                            _log.error('File is not valid JSON!')
-                        else:
-                            json_parts.append(file_data)
+                            xml = ET.fromstring(bytes(f.read(), 'utf-8'))
+                            for child in xml.iter('chunk'):
+                                merged_ccl += ET.tostring(
+                                    child,
+                                    encoding='unicode',
+                                    pretty_print=True
+                                )
+
+                        except ET.XMLSyntaxError:
+                            _log.error('File is not valid XML!')
+                            continue
+                else:
+                    with open(l_result, "r") as f:
+                        for file_content in f.readlines():
+                            if merge_type == MergeType.PLAINTEXT2CCL:
+                                l_data2 = json.loads(file_content)
+                                f2.write(f"{json.dumps(l_data2)}\n")
+                            elif merge_type == MergeType.PLAINTEXT2JSON:
+                                l_data = json.loads(file_content)
+                                doc = clarin_json.Document.from_dict(l_data)
+                                f2.write(doc)
+                            elif merge_type == MergeType.TAGGER2LEMMAS:
+                                f2.write(file_content)
+                            elif merge_type == MergeType.NER2LEMMAS:
+                                l_data_lemmas = json.loads(file_content)
+                                doc = clarin_json.Document.from_dict(
+                                    l_data_lemmas
+                                )
+                                words = []
+                                for token in doc.tokens():
+                                    for lexem in token.lexemes:
+                                        words.append(lexem.lemma)
+                                f2.write(" ".join(words))
+                                f2.write("\n")
+                            elif merge_type == MergeType.JSON2TEI:
+                                try:
+                                    file_data = json.loads(file_content)
+                                except json.JSONDecodeError:
+                                    _log.error('File is not valid JSON!')
+                                else:
+                                    json_parts.append(file_data)
 
             del subtask_args_queue_awaiting[:parallel_subtasks]
 
diff --git a/tests/testdata/input/pos_tagger.yaml b/tests/testdata/input/pos_tagger.yaml
index e989f4a..28e990f 100644
--- a/tests/testdata/input/pos_tagger.yaml
+++ b/tests/testdata/input/pos_tagger.yaml
@@ -1,75 +1,109 @@
 taggers:
-    pl:
-      default:
-        lpmn: ["morphodita"]
-        output: ccl
-        tagset: nkjp
-    en:
-      default:
-        lpmn: [{"spacy":{"lang":"en"}}]
-        output: json
-        tagset: ud
-    de:
-      default:
-        lpmn: [{"spacy":{"lang":"de"}}]
-        output: json
-        tagset: ud
-    es:
-      default:
-        lpmn: [{"spacy":{"lang":"es"}}]
-        output: json
-        tagset: ud
-    pt:
-      default:
-        lpmn: [{"spacy":{"lang":"pt"}}]
-        output: json
-        tagset: ud
-    fr:
-      default:
-        lpmn: [{"spacy":{"lang":"fr"}}]
-        output: json
-        tagset: ud
-    ru:
-      default:
-        lpmn: [{"spacy":{"lang":"ru"}}]
-        output: json
-        tagset: ud
+  pl:
+    default:
+      lpmn: ["morphodita", {'posconverter': {'input_format': 'ccl', 'output_format': 'json' }}]
+      output: json
+      tagset: nkjp
+    spacy:
+      lpmn: [{"spacy":{"lang":"pl"}}]
+      output: json
+      tagset: ud
+    poldeepner2tagger:
+      lpmn: ["witok", "poldeepner2tagger"]
+      output: json
+      tagset: nkjp
+  en:
+    default:
+      lpmn: [{"spacy":{"lang":"en"}}]
+      output: json
+      tagset: ud
+  de:
+    default:
+      lpmn: [{"spacy":{"lang":"de"}}]
+      output: json
+      tagset: ud
+  es:
+    default:
+      lpmn: [{"spacy":{"lang":"es"}}]
+      output: json
+      tagset: ud
+  pt:
+    default:
+      lpmn: [{"spacy":{"lang":"pt"}}]
+      output: json
+      tagset: ud
+  fr:
+    default:
+      lpmn: [{"spacy":{"lang":"fr"}}]
+      output: json
+      tagset: ud
+  ru:
+    default:
+      lpmn: [{"spacy":{"lang":"ru"}}]
+      output: json
+      tagset: ud
 ners:
-    pl:
-      default:
-        lpmn: ['morphodita', {'posconverter': {'input_format': 'ccl', 'output_format': 'json'}}, 'winer']
-        output: json
-        tagset: nkjp
-    en:
-      default:
-        lpmn: [{"spacy":{"lang":"en", 'method': 'ner'}}]
-        output: json
-        tagset: ud
-    de:
-      default:
-        lpmn: [{"spacy":{"lang":"de", 'method': 'ner'}}]
-        output: json
-        tagset: ud
-    es:
-      default:
-        lpmn: [{"spacy":{"lang":"es", 'method': 'ner'}}]
-        output: json
-        tagset: ud
-    pt:
-      default:
-        lpmn: [{"spacy":{"lang":"pt", 'method': 'ner'}}]
-        output: json
-        tagset: ud
-    fr:
-      default:
-        lpmn: [{"spacy":{"lang":"fr", 'method': 'ner'}}]
-        output: json
-        tagset: ud
-    ru:
-      default:
-        lpmn: [{"spacy":{"lang":"ru", 'method': 'ner'}}]
-        output: json
-        tagset: ud
+  pl:
+    default:
+      lpmn: ['winer']
+      output: json
+      tagset: nkjp
+      type: 'additive'
+    spacy:
+      lpmn: [{"spacy":{"lang":"pl", 'method': 'ner'}}]
+      output: json
+      tagset: ud
+      type: 'overwrite'
+    poldeepner2:
+      lpmn: ["poldeepner2"]
+      output: json
+      tagset: nkjp
+      type: 'additive'
+    liner:
+      lpmn: [
+        'morphodita',
+        {'liner2': {'model': 'n82'}},
+        {'posconverter': {'input_format': 'ccl', 'output_format': 'json', 'ner': True}}
+      ]
+      output: json
+      tagset: nkjp
+      type: 'overwrite'
+  en:
+    default:
+      lpmn: [{"spacy":{"lang":"en", 'method': 'ner'}}]
+      output: json
+      tagset: ud
+      type: 'overwrite'
+  de:
+    default:
+      lpmn: [{"spacy":{"lang":"de", 'method': 'ner'}}]
+      output: json
+      tagset: ud
+      type: 'overwrite'
+  es:
+    default:
+      lpmn: [{"spacy":{"lang":"es", 'method': 'ner'}}]
+      output: json
+      tagset: ud
+      type: 'overwrite'
+  pt:
+    default:
+      lpmn: [{"spacy":{"lang":"pt", 'method': 'ner'}}]
+      output: json
+      tagset: ud
+      type: 'overwrite'
+  fr:
+    default:
+      lpmn: [{"spacy":{"lang":"fr", 'method': 'ner'}}]
+      output: json
+      tagset: ud
+      type: 'overwrite'
+  ru:
+    default:
+      lpmn: [{"spacy":{"lang":"ru", 'method': 'ner'}}]
+      output: json
+      tagset: ud
+      type: 'overwrite'
 
 linkers:
   clalink:
diff --git a/tox.ini b/tox.ini
index 0aa64da..01a3db1 100644
--- a/tox.ini
+++ b/tox.ini
@@ -33,9 +33,6 @@ commands =
     coverage run --source=src -m pytest --junitxml=report.xml tests/test.py
     coverage html
 
-[pytest]
-python_paths = src src
-
 [run]
 relative_files = True
 branch = True
-- 
GitLab