From 83ab0a116d4842e3bac38ac05fde1cc1c10d6634 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Walkowiak?= <pawel.walkowiak@pwr.edu.pl>
Date: Mon, 12 Feb 2024 07:24:24 +0000
Subject: [PATCH] New tagger

---
 .gitignore                           |   9 ++
 README.md                            |   8 +-
 pos_tagger.yaml                      |  58 ++++++++-
 src/tagger.py                        |  25 +++-
 src/utils.py                         |  87 +++++++-------
 tests/testdata/input/pos_tagger.yaml | 174 ++++++++++++++++-----------
 tox.ini                              |   3 -
 7 files changed, 242 insertions(+), 122 deletions(-)

diff --git a/.gitignore b/.gitignore
index d755acc..de37d40 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,12 @@
 htmlcov
 config-test.ini
 /tests/tmp-test.py
+example*
+ner_*
+report.xml
+test.json
+test
+test-em.json
+ascii.json
+test-ascii.json
+non_ascii.json
diff --git a/README.md b/README.md
index 71991a1..f377769 100644
--- a/README.md
+++ b/README.md
@@ -11,4 +11,10 @@ task_options
 
 `method`: method of tagging (default = 'tagger', values: 'tagger', 'ner')
 
-`ner_type`: type of named entity recognition tool (default = 'winer', values: 'winer', 'liner')
+`tagger_type`: type of tagger tool (default = 'morphodita',
+values: 'morphodita', 'poldeepner2tagger', 'spacy' for pl,
+'spacy' for other languages)
+
+`ner_type`: type of named entity recognition tool (default = 'winer',
+values: 'winer', 'liner', 'poldeepner2', 'spacy' for pl,
+'spacy', 'poldeepner2' for other languages)
diff --git a/pos_tagger.yaml b/pos_tagger.yaml
index ea0777e..e089919 100644
--- a/pos_tagger.yaml
+++ b/pos_tagger.yaml
@@ -4,6 +4,14 @@ taggers:
       lpmn: ["morphodita"]
       output: ccl
       tagset: nkjp
+    spacy:
+      lpmn: [{"spacy":{"lang":"pl"}}]
+      output: json
+      tagset: ud
+    poldeepner2tagger:
+      lpmn: ["witok", "poldeepner2tagger"]
+      output: json
+      tagset: nkjp
   en:
     default:
       lpmn: [{"spacy":{"lang":"en"}}]
@@ -37,9 +45,20 @@ taggers:
 ners:
   pl:
     default:
-      lpmn: ['morphodita', {'posconverter': {'input_format': 'ccl', 'output_format': 'json'}}, 'winer']
+      lpmn: ['winer']
       output: json
       tagset: nkjp
+      type: 'additive'
+    spacy:
+      lpmn: [{"spacy":{"lang":"pl", 'method': 'ner'}}]
+      output: json
+      tagset: ud
+      type: 'overwrite'
+    poldeepner2:
+      lpmn: ["poldeepner2"]
+      output: json
+      tagset: nkjp
+      type: 'additive'
     liner:
       lpmn: [
        'morphodita',
@@ -48,36 +67,73 @@ ners:
        {'liner2': {'model': 'n82'}},
        {'posconverter': {'input_format': 'ccl', 'output_format': 'json', 'ner': True}}
       ]
       output: json
       tagset: nkjp
+      type: 'overwrite'
   en:
     default:
       lpmn: [{"spacy":{"lang":"en", 'method': 'ner'}}]
       output: json
       tagset: ud
+      type: 'overwrite'
+    poldeepner2:
+      lpmn: ["poldeepner2"]
+      output: json
+      tagset: ud
+      type: 'additive'
   de:
     default:
       lpmn: [{"spacy":{"lang":"de", 'method': 'ner'}}]
       output: json
       tagset: ud
+      type: 'overwrite'
+    poldeepner2:
+      lpmn: [ "poldeepner2" ]
+      output: json
+      tagset: ud
+      type: 'additive'
   es:
     default:
       lpmn: [{"spacy":{"lang":"es", 'method': 'ner'}}]
       output: json
       tagset: ud
+      type: 'overwrite'
+    poldeepner2:
+      lpmn: [ "poldeepner2" ]
+      output: json
+      tagset: ud
+      type: 'additive'
   pt:
     default:
       lpmn: [{"spacy":{"lang":"pt", 'method': 'ner'}}]
       output: json
       tagset: ud
+      type: 'overwrite'
+    poldeepner2:
+      lpmn: [ "poldeepner2" ]
+      output: json
+      tagset: ud
+      type: 'additive'
   fr:
     default:
       lpmn: [{"spacy":{"lang":"fr", 'method': 'ner'}}]
       output: json
       tagset: ud
+      type: 'overwrite'
+    poldeepner2:
+      lpmn: [ "poldeepner2" ]
+      output: json
+      tagset: ud
+      type: 'additive'
   ru:
     default:
       lpmn: [{"spacy":{"lang":"ru", 'method': 'ner'}}]
       output: json
       tagset: ud
+      type: 'overwrite'
+    poldeepner2:
+      lpmn: [ "poldeepner2" ]
+      output: json
+      tagset: ud
+      type: 'additive'
 
 linkers:
   clalink:
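Note on the config schema above: each NER entry now carries a `type` field.
'additive' marks pipelines that expect a tagger's output to be prepended at
request time (winer, poldeepner2), while 'overwrite' marks self-contained
pipelines (spacy, liner). A minimal sketch of how a worker could resolve a
requested variant against this file, assuming PyYAML; the function and its
names are illustrative, not the worker's actual API:

    import yaml

    def resolve_variant(config_path, section, lang, requested):
        """Return one tagger/NER entry, falling back to the language default."""
        with open(config_path) as f:
            config = yaml.safe_load(f)
        variants = config[section][lang]  # e.g. config["ners"]["pl"]
        key = requested if requested in variants else "default"
        return variants[key]

    # resolve_variant("pos_tagger.yaml", "ners", "pl", "liner")
    # -> {'lpmn': [...], 'output': 'json', 'tagset': 'nkjp', 'type': 'overwrite'}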
diff --git a/src/tagger.py b/src/tagger.py
index 9d2632d..58cbd05 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -31,6 +31,7 @@ LINKING_TYPE = "linking_type"
 TASK = "task"
 MARKERS = "markers"
 KB_ID = "knowledge_base_id"
+TYPE = "type"
 
 
 class TaggerWorker(nlp_ws.NLPWorker):
@@ -101,10 +102,18 @@ class TaggerWorker(nlp_ws.NLPWorker):
 
         method = task_options.get("method", "tagger")
 
+        tagger_type = task_options.get("tagger_type", "default")
+        tagger_type = DEFAULT_TYPE \
+            if tagger_type == "default" else tagger_type
+        tagger_type = tagger_type \
+            if tagger_type in self._taggers[lang] else DEFAULT_TYPE
+
         ner_type = task_options.get("ner_type", "winer")
         ner_type = DEFAULT_TYPE if ner_type == "winer" else ner_type
         ner_type = ner_type if ner_type in self._ners[lang] else DEFAULT_TYPE
 
+        ner_query_type = self._ners[lang][ner_type][TYPE]
+
         if method not in [TAGGER, NER]:
             raise Exception(f"Unsupported method: {method}")
 
@@ -122,7 +131,7 @@ class TaggerWorker(nlp_ws.NLPWorker):
         linking_type = task_options.get("linking_type", None)
         if linking_name in self._linkers:
             linking = self._linkers[linking_name]
-        elif linking_name is None:
+        elif linking_name is None or linking_name == "None":
             linking = None
         else:
             raise Exception(f"Unsupported linking: {linking_name}")
@@ -141,17 +150,27 @@ class TaggerWorker(nlp_ws.NLPWorker):
             self._parallel_subtasks
         )
 
-        tagger_opt = self._taggers[lang][DEFAULT_TYPE]
+        tagger_opt = self._taggers[lang][tagger_type]
         ner_opt = self._ners[lang][ner_type]
         convert_lpmn = self.get_converter_directive(
             tagger_opt[OUTPUT], tagger_opt[TAGSET], output, tagset,
             json_text) if method == TAGGER else self.get_converter_directive(
             ner_opt[OUTPUT], ner_opt[TAGSET], output, tagset, json_text,
             ner_opt[NER] if NER in ner_opt else False)
+
+        ner_query = ner_opt[LPMN].copy()
+        if method == NER and ner_query_type == "additive":
+            ner_query = [*tagger_opt[LPMN].copy(), *ner_opt[LPMN].copy()]
+            if tagger_opt[OUTPUT] == 'ccl' and ner_opt[OUTPUT] == 'json':
+                ner_query = [*tagger_opt[LPMN].copy(),
+                             {'posconverter': {'input_format': 'ccl',
+                                               'output_format': 'json'}},
+                             *ner_opt[LPMN].copy()]
+
         json_lpmn = (
             tagger_opt[LPMN].copy() if method == TAGGER
-            else ner_opt[LPMN].copy()
+            else ner_query
         )
         if convert_lpmn is not None and method == TAGGER and output != TEI:
             json_lpmn.append(convert_lpmn)
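Note on the additive path above: when method == NER and the selected NER is
'additive', process() prepends the tagger pipeline, inserting a posconverter
step when the tagger emits CCL but the NER consumes JSON. The same assembly
as a standalone, testable sketch (dict keys follow pos_tagger.yaml; the
helper name is illustrative, not part of the worker):

    def build_ner_query(tagger_opt: dict, ner_opt: dict, query_type: str) -> list:
        """Prepend the tagger pipeline for 'additive' NERs, bridging CCL->JSON."""
        if query_type != "additive":
            return ner_opt["lpmn"].copy()
        bridge = []
        if tagger_opt["output"] == "ccl" and ner_opt["output"] == "json":
            bridge = [{"posconverter": {"input_format": "ccl",
                                        "output_format": "json"}}]
        return [*tagger_opt["lpmn"], *bridge, *ner_opt["lpmn"]]

    # build_ner_query({"lpmn": ["morphodita"], "output": "ccl"},
    #                 {"lpmn": ["winer"], "output": "json"}, "additive")
    # -> ['morphodita', {'posconverter': {...}}, 'winer'], i.e. the pipeline
    #    the old hard-coded pl default spelled out by hand.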
diff --git a/src/utils.py b/src/utils.py
index a243d66..9a50e12 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -185,51 +185,50 @@ def merge_splits(output_path: str, destination_path: str,
         l_results = [subtask.get_output_path() for subtask in subtasks]
 
         for l_result in l_results:
-            with open(l_result, "r") as f:
-                if merge_type == MergeType.PLAINTEXT2CCL:
-                    file_content2 = f.read()
-                    l_data2 = json.loads(file_content2)
-                    f2.write(f"{json.dumps(l_data2)}\n")
-                elif merge_type == MergeType.PLAINTEXT2JSON:
-                    file_content = f.read()
-                    l_data = json.loads(file_content)
-                    doc = clarin_json.Document.from_dict(l_data)
-                    f2.write(doc)
-                elif merge_type == MergeType.TAGGER2LEMMAS:
-                    file_content_lemmas = f.read()
-                    f2.write(file_content_lemmas)
-                elif merge_type == MergeType.NER2LEMMAS:
-                    file_content_lemmas = f.read()
-                    l_data_lemmas = json.loads(file_content_lemmas)
-                    doc = clarin_json.Document.from_dict(l_data_lemmas)
-                    words = []
-                    for token in doc.tokens():
-                        for lexem in token.lexemes:
-                            words.append(lexem.lemma)
-                    f2.write(" ".join(words))
-                    f2.write("\n")
-                elif merge_type == MergeType.CCLS2TEI:
-                    with open(l_result, "r") as f:
-                        try:
-                            xml = ET.fromstring(bytes(f.read(), 'utf-8'))
-                            for child in xml.iter('chunk'):
-                                merged_ccl += ET.tostring(
-                                    child,
-                                    encoding='unicode',
-                                    pretty_print=True
-                                )
-
-                        except ET.XMLSyntaxError:
-                            _log.error('File is not valid XML!')
-                            continue
-
-                elif merge_type == MergeType.JSON2TEI:
+            if merge_type == MergeType.CCLS2TEI:
+                with open(l_result, "r") as f:
                     try:
-                        file_data = json.loads(f.read())
-                    except json.JSONDecodeError:
-                        _log.error('File is not valid JSON!')
-                    else:
-                        json_parts.append(file_data)
+                        xml = ET.fromstring(bytes(f.read(), 'utf-8'))
+                        for child in xml.iter('chunk'):
+                            merged_ccl += ET.tostring(
+                                child,
+                                encoding='unicode',
+                                pretty_print=True
+                            )
+
+                    except ET.XMLSyntaxError:
+                        _log.error('File is not valid XML!')
+                        continue
+            else:
+                with open(l_result, "r") as f:
+                    for file_content in f.readlines():
+                        if merge_type == MergeType.PLAINTEXT2CCL:
+                            l_data2 = json.loads(file_content)
+                            f2.write(f"{json.dumps(l_data2)}\n")
+                        elif merge_type == MergeType.PLAINTEXT2JSON:
+                            l_data = json.loads(file_content)
+                            doc = clarin_json.Document.from_dict(l_data)
+                            f2.write(doc)
+                        elif merge_type == MergeType.TAGGER2LEMMAS:
+                            f2.write(file_content)
+                        elif merge_type == MergeType.NER2LEMMAS:
+                            l_data_lemmas = json.loads(file_content)
+                            doc = clarin_json.Document.from_dict(
+                                l_data_lemmas
+                            )
+                            words = []
+                            for token in doc.tokens():
+                                for lexem in token.lexemes:
+                                    words.append(lexem.lemma)
+                            f2.write(" ".join(words))
+                            f2.write("\n")
+                        elif merge_type == MergeType.JSON2TEI:
+                            try:
+                                file_data = json.loads(file_content)
+                            except json.JSONDecodeError:
+                                _log.error('File is not valid JSON!')
+                            else:
+                                json_parts.append(file_data)
 
         del subtask_args_queue_awaiting[:parallel_subtasks]
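Note on the merge rewrite above: the non-CCL branches now iterate line by
line instead of calling f.read() once, so a subtask result holding several
newline-delimited JSON documents merges correctly. A reduced illustration of
the two behaviours (the two-document input string is hypothetical):

    import json

    content = '{"id": 1}\n{"id": 2}\n'

    # whole-file read (old behaviour): concatenated documents fail to parse
    try:
        json.loads(content)
    except json.JSONDecodeError:
        pass  # two documents in one string are not a single JSON value

    # line-by-line (new behaviour, as in the patched merge_splits loop)
    docs = [json.loads(line) for line in content.splitlines()]
    assert docs == [{"id": 1}, {"id": 2}]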
[{"spacy":{"lang":"pt", 'method': 'ner'}}] - output: json - tagset: ud - fr: - default: - lpmn: [{"spacy":{"lang":"fr", 'method': 'ner'}}] - output: json - tagset: ud - ru: - default: - lpmn: [{"spacy":{"lang":"ru", 'method': 'ner'}}] - output: json - tagset: ud + pl: + default: + lpmn: ['winer'] + output: json + tagset: nkjp + type: 'additive' + spacy: + lpmn: [{"spacy":{"lang":"pl", 'method': 'ner'}}] + output: json + tagset: ud + type: 'overwrite' + poldeepner2: + lpmn: ["poldeepner2"] + output: json + tagset: nkjp + type: 'additive' + liner: + lpmn: [ + 'morphodita', + {'liner2': {'model': 'n82'}}, + {'posconverter': {'input_format': 'ccl', 'output_format': 'json', 'ner': True}} + ] + output: json + tagset: nkjp + type: 'overwrite' + en: + default: + lpmn: [{"spacy":{"lang":"en", 'method': 'ner'}}] + output: json + tagset: ud + type: 'overwrite' + de: + default: + lpmn: [{"spacy":{"lang":"de", 'method': 'ner'}}] + output: json + tagset: ud + type: 'overwrite' + es: + default: + lpmn: [{"spacy":{"lang":"es", 'method': 'ner'}}] + output: json + tagset: ud + type: 'overwrite' + pt: + default: + lpmn: [{"spacy":{"lang":"pt", 'method': 'ner'}}] + output: json + tagset: ud + type: 'overwrite' + fr: + default: + lpmn: [{"spacy":{"lang":"fr", 'method': 'ner'}}] + output: json + tagset: ud + type: 'overwrite' + ru: + default: + lpmn: [{"spacy":{"lang":"ru", 'method': 'ner'}}] + output: json + tagset: ud + type: 'overwrite' linkers: clalink: diff --git a/tox.ini b/tox.ini index 0aa64da..01a3db1 100644 --- a/tox.ini +++ b/tox.ini @@ -33,9 +33,6 @@ commands = coverage run --source=src -m pytest --junitxml=report.xml tests/test.py coverage html -[pytest] -python_paths = src src - [run] relative_files = True branch = True -- GitLab