diff --git a/.gitignore b/.gitignore
index de37d402095918631c5f3b34b4bb0c072f497ef5..ac16f32c65a77973a783fd431943288256025306 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,7 @@ test-em.json
 ascii.json
 test-ascii.json
 non_ascii.json
+response_*
+test_*
+samba
+.DS_Store
diff --git a/README.md b/README.md
index f37776957ebd23d3a5ac7e57d42d1ec99ebd46c5..07111afcb3ea58e868204bb37c17ced27149707d 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,8 @@ task_options
 
 `lang`: language of text (default = 'pl')
 
+`tagset`: tagset of results (default = 'identical', values: 'identical', 'nkjp', 'ud')
+
 `output`: format of results (default = 'json', values: 'json', 'ccl', 'lemmas', 'tei')
 
 `method`: method of tagging (default = 'tagger', values: 'tagger', 'ner')
@@ -18,3 +20,10 @@ task_options
 
 `ner_type`: type of named entity recognition tool (default = 'winer', values: 'winer', 'liner', 'poldeepner2', 'spacy' for pl, 'spacy', 'poldeepner2' for other languages)
 
+`linking`: names of the linkers to run; a single name as str or a list of names to run several linkers (default = None,
+values: 'clalink', 'senselink', ['clalink', 'senselink'])
+
+
+`linking_type`: the kind of unit each linker links to its knowledge base, given per linker (default = None,
+values: {'clalink': 'ner', 'senselink': ['noun', 'verb', 'adverb', 'adjective']})
\ No newline at end of file
diff --git a/pos_tagger.yaml b/pos_tagger.yaml
index e08991937395a85719bbf084219c991ff8ed0703..94322355c78005e7a848a9f7ec47a10fdfc02d52 100644
--- a/pos_tagger.yaml
+++ b/pos_tagger.yaml
@@ -12,6 +12,10 @@ taggers:
       lpmn: ["witok", "poldeepner2tagger"]
       output: json
       tagset: nkjp
+    ptag:
+      lpmn: ["ptag"]
+      output: json
+      tagset: nkjp
   en:
     default:
       lpmn: [{"spacy":{"lang":"en"}}]
@@ -141,3 +145,10 @@ linkers:
     markers: ["[unused0]", "[unused1]"]
     linking_type: "ner"
     knowledge_base_id: "wikidata"
+  senselink:
+    task: "senselink"
+    markers: ["[unused0]", "[unused1]"]
+    linking_type: ["noun", "verb", "adverb", "adjective"]
+    knowledge_base_id: "wordnet"
+
+
diff --git a/src/tagger.py b/src/tagger.py
index 58cbd0586cde1b564edd2aa3d9f9d1ef2d28c4fd..1e59ca3f245adeece0eb402166a6e4656894188f 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -127,14 +127,29 @@ class TaggerWorker(nlp_ws.NLPWorker):
 
         output = task_options.get("output", "json")
 
-        linking_name = task_options.get("linking", None)
+        linking_names = task_options.get("linking", [])
+        linking = []
         linking_type = task_options.get("linking_type", None)
-        if linking_name in self._linkers:
-            linking = self._linkers[linking_name]
-        elif linking_name is None or linking_name == "None":
-            linking = None
+        linking_types = {}
+        if linking_names is not None and linking_names != "None":
+            if not isinstance(linking_names, list):
+                linking_names = [linking_names]
         else:
-            raise Exception(f"Unsupported linking: {linking_name}")
+            linking_names = []
+        for linking_name in linking_names:
+            if linking_name in self._linkers:
+                linking.append(self._linkers[linking_name])
+            else:
+                raise Exception(f"Unsupported linking: {linking_name}")
+
+        if linking_type is not None and linking_type != "None":
+            if isinstance(linking_type, dict):
+                for link_name, l_type in linking_type.items():
+                    if link_name in self._linkers:
+                        linking_types[self._linkers[link_name][TASK]] = l_type
+            else:
+                raise Exception(f"Unsupported linking type: "
+                                f"{linking_type} should be dict")
 
         if (output, method) == (CCL, NER) and lang != "pl" and \
                 lang in self._ners.keys():
@@ -244,15 +259,24 @@ class TaggerWorker(nlp_ws.NLPWorker):
             if os.path.exists(destination_path):
                 shutil.rmtree(destination_path)
             if output == JSON and linking:
+                links = []
+                for linker in linking:
+                    l_type = linking_types[linker[TASK]] \
+                        if linker[TASK] in linking_types \
+                        else linker[LINKING_TYPE]
+                    links.append(
+                        {
+                            "task": linker[TASK],
+                            "marker_start": linker[MARKERS][0],
+                            "marker_stop": linker[MARKERS][1],
+                            "linking_type": l_type,
+                            "knowledge_base_id": linker[KB_ID]
+                        }
+                    )
                 src.utils.run_linking(
-                    linking[TASK],
                     result_path, output_path,
-                    linking[MARKERS][0],
-                    linking[MARKERS][1],
-                    linking_type if linking_type else linking[LINKING_TYPE],
-                    linking[KB_ID],
-                    _log
+                    links
                 )
         except Exception as e:
             if os.path.exists(destination_path):
diff --git a/src/utils.py b/src/utils.py
index 9a50e12372bcf324b56650311ee74c8c76b6b2f6..9d0c06d8e37be0d8952582c7bec251982d5ce150 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -25,20 +25,49 @@ SENT_LIMIT = 150
 
 LINKING_MARGIN_SIZE = 100
 
 TEI_HEADER = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n' \
-             '<teiCorpus xmlns:nkjp="http://www.nkjp.pl/ns/1.0" ' \
-             'xmlns:xi="http://www.w3.org/2001/XInclude" ' \
-             'xmlns="http://www.tei-c.org/ns/1.0">\n' \
-             ' <xi:include href="PPC_header.xml"/>\n' \
-             ' <TEI>\n' \
-             ' <xi:include href="header.xml"/>\n' \
-             ' <text>\n' \
-             ' <body>\n'
-
+    '<teiCorpus xmlns:nkjp="http://www.nkjp.pl/ns/1.0" ' \
+    'xmlns:xi="http://www.w3.org/2001/XInclude" ' \
+    'xmlns="http://www.tei-c.org/ns/1.0">\n' \
+    ' <xi:include href="PPC_header.xml"/>\n' \
+    ' <TEI>\n' \
+    ' <xi:include href="header.xml"/>\n' \
+    ' <text>\n' \
+    ' <body>\n'
 TEI_FOOTER = ' </body>\n' \
-             ' </text>\n' \
-             ' </TEI>\n' \
-             '</teiCorpus>\n'
+    ' </text>\n' \
+    ' </TEI>\n' \
+    '</teiCorpus>\n'
+
+NKJP2PLWN = {
+    "verb": [
+        "fin",
+        "praet",
+        "impt",
+        "imps",
+        "pcon",
+        "pant",
+        "ger",
+        "pact",
+        "aglt"
+    ],
+    "noun": ["subst", "depr"],
+    "adverb": ["adv"],
+    "adjective": ["adj"],
+}
+
+UD2PLWN = {
+    "verb": [
+        "VERB",
+        "AUX"
+    ],
+    "noun": [
+        "NOUN",
+        "PROPN"
+    ],
+    "adverb": ["ADV"],
+    "adjective": ["ADJ"]
+}
 
 
 class MergeType(Enum):
@@ -164,14 +193,14 @@ def merge_splits(output_path: str, destination_path: str,
     _log.debug(f"Subtask args queue: {subtask_args_queue_awaiting}")
 
     merged_ccl = '<?xml version="1.0" encoding="UTF-8"?>\n' \
-                 '<!DOCTYPE chunkList SYSTEM "ccl.dtd">\n' \
-                 '<chunkList>\n'
+        '<!DOCTYPE chunkList SYSTEM "ccl.dtd">\n' \
+        '<chunkList>\n'
 
     json_parts = []
 
     with WriterCM(
-        output_path, 'a',
-        merge_type == MergeType.PLAINTEXT2JSON
+            output_path, 'a',
+            merge_type == MergeType.PLAINTEXT2JSON
     ) as f2:
         while len(subtask_args_queue_awaiting) > 0:
             args = subtask_args_queue_awaiting[:parallel_subtasks]
@@ -308,7 +337,7 @@ def write_ccl_as_tei(xml, output_path):
                 # inner fs
                 indent_lvl = _write_opening_tag(
                     'fs-lex',
-                    [f'{i_chunk+1}.{sentence_id}.{i_token+1}'],
+                    [f'{i_chunk + 1}.{sentence_id}.{i_token + 1}'],
                     indent_lvl, output_file
                 )
                 # inner multipe f tags
@@ -343,7 +372,7 @@ def write_ccl_as_tei(xml, output_path):
                             f'<symbol value="'
                             f'{":".join(ctag.text.split(":")[1:])}'
                             f'" xml:id="morph_'
-                            f'{i_chunk+1}.{sentence_id}.{i_token+1}'
+                            f'{i_chunk + 1}.{sentence_id}.{i_token + 1}'
                             f'.1.1-msd"/>\n'
                         )
                         indent_lvl = _write_closing_tag(
@@ -371,7 +400,7 @@ def write_ccl_as_tei(xml, output_path):
                 output_file.write(' ' * indent_lvl)
                 output_file.write(
                     f'<f fVal="#morph_'
-                    f'{i_chunk+1}.{sentence_id}.{i_token+1}'
+                    f'{i_chunk + 1}.{sentence_id}.{i_token + 1}'
                     f'.1.1-msd" name="choice"/>\n'
                 )
                 indent_lvl = _write_opening_tag(
@@ -438,7 +467,7 @@ def write_json_as_tei(json_parts, output_path):
         )
 
         sent_present = "spans" in json_part and \
-            "sentence" in json_part["spans"]
"sentence" in json_part["spans"] p_sent = 0 i_token_in_sent = 1 @@ -452,13 +481,13 @@ def write_json_as_tei(json_parts, output_path): if sent_present and token['start'] == \ json_part['spans']['sentence'][p_sent]['start']: indent_lvl = _write_opening_tag( - 's', [f'{i_json_part+1}.{p_sent+1}'], + 's', [f'{i_json_part + 1}.{p_sent + 1}'], indent_lvl, output_file ) i_token_in_sent = 1 indent_lvl = _write_opening_tag( - 'seg', [f'{i_json_part+1}.{segment_id}'], + 'seg', [f'{i_json_part + 1}.{segment_id}'], indent_lvl, output_file ) @@ -494,7 +523,7 @@ def write_json_as_tei(json_parts, output_path): indent_lvl = _write_opening_tag( 'fs-lex', - [f'{i_json_part+1}.{sentence_id}.{i_token_in_sent}'], + [f'{i_json_part + 1}.{sentence_id}.{i_token_in_sent}'], indent_lvl, output_file ) @@ -540,7 +569,7 @@ def write_json_as_tei(json_parts, output_path): f'<symbol value="' f'{":".join(token["lexemes"][0]["pos"].split(":")[1:])}' f'" xml:id="morph_' - f'{i_json_part+1}.{sentence_id}.{i_token_in_sent}' + f'{i_json_part + 1}.{sentence_id}.{i_token_in_sent}' f'.1.1-msd"/>\n' ) @@ -571,7 +600,7 @@ def write_json_as_tei(json_parts, output_path): output_file.write(' ' * indent_lvl) output_file.write( f'<f fVal="#morph_' - f'{i_json_part+1}.{sentence_id}.{i_token_in_sent}' + f'{i_json_part + 1}.{sentence_id}.{i_token_in_sent}' f'.1.1.msd" name="choice"/>\n' ) @@ -739,8 +768,8 @@ def merge_ccls(output_path, l_results, _log): :type _log: logging.Logger """ header = '<?xml version="1.0" encoding="UTF-8"?>\n' \ - '<!DOCTYPE chunkList SYSTEM "ccl.dtd">\n' \ - '<chunkList>\n' + '<!DOCTYPE chunkList SYSTEM "ccl.dtd">\n' \ + '<chunkList>\n' footer = '</chunkList>\n' @@ -764,6 +793,65 @@ def merge_ccls(output_path, l_results, _log): f.write(footer) +def run_sent(sentences, task): + """Runs subtask on sentences. + + :param sentences: list of sentences + :type sentences: list + :param task: task name + :type task: str + """ + results = {} + for i in range(ceil(len(sentences) / SENT_LIMIT)): + sent_local = sentences[i * SENT_LIMIT:min((i + 1) * SENT_LIMIT, + len(sentences))] + l_subtask = Sentence(sent_local, task) + l_subtask.run(blocking=False) + for idx, link in enumerate(l_subtask.get_results()): + results[i * SENT_LIMIT + idx] = link + return results + + +def get_pos_sentences(document, marker_start, marker_stop, pos_list): + """Returns list of pos sentences. + + :param document: document + :type document: clarin.Document + :param marker_start: start marker + :type marker_start: str + :param marker_stop: stop marker + :type marker_stop: str + :param pos_list: list of pos + :type pos_list: list + """ + pos_to_mark = [] + tagset = document.tagset + for pos in pos_list: + if tagset == "nkjp": + pos_to_mark.extend(NKJP2PLWN[pos]) + elif tagset == "ud": + pos_to_mark.extend(UD2PLWN[pos]) + sentences = [] + choosen_tokens = [] + tokens = document.tokens() + for idx, token in enumerate(tokens): + if token.lexemes[0].pos.split(":")[0] in pos_to_mark: + start, stop = token.start, token.stop + choosen_tokens.append(document.text[start:stop]) + part_start = tokens[ + max(idx - LINKING_MARGIN_SIZE, 0) + ].start + part_stop = tokens[ + min(idx + LINKING_MARGIN_SIZE, len(tokens) - 1) + ].stop + sentences.append( + f"{document.text[part_start:start - 1]} {marker_start} " + f"{document.text[start:stop]} {marker_stop} " + f"{document.text[stop + 1:part_stop]}" + ) + return sentences, choosen_tokens + + def get_spans_sentences(document, marker_start, marker_stop, span_type): """Returns list of spans sentences. 
@@ -811,8 +899,7 @@ def add_linking(
     marker_start,
     marker_stop,
     linking_type,
-    knowledge_base_id,
-    _log: logging.Logger
+    knowledge_base_id
 ):
     """Adds linking to NER results.
 
@@ -828,77 +915,82 @@ def add_linking(
     :type linking_type: str
     :param knowledge_base_id: knowledge base id
     :type knowledge_base_id: str
-    :param _log: logging.Logger object
-    :type _log: logging.Logger
     """
-    res_documents = []
+    records = []
     for document in documents:
-        if linking_type not in document.get_span_types():
+        if isinstance(linking_type, list) \
+                and linking_type[0] in NKJP2PLWN.keys():
+            sentences, ids = get_pos_sentences(
+                document, marker_start, marker_stop, linking_type
+            )
+        elif linking_type not in document.get_span_types():
             raise Exception(f"Linking {task} of {linking_type} "
                             f"requires spans {linking_type}")
-        sentences, ids = get_spans_sentences(
-            document, marker_start, marker_stop, linking_type
-        )
+        else:
+            sentences, ids = get_spans_sentences(
+                document, marker_start, marker_stop, linking_type
+            )
         links = []
-        for i in range(ceil(len(sentences) / SENT_LIMIT)):
-            sent_local = sentences[i * SENT_LIMIT:min((i + 1) * SENT_LIMIT,
-                                                      len(sentences))]
-            l_subtask = Sentence(sent_local, task)
-            l_subtask.run(blocking=False)
-
-            for idx, link in enumerate(l_subtask.get_results()):
-                links.append({
-                    OBJ_ID: ids[i * SENT_LIMIT + idx],
-                    RESULTS: link
-                })
-        document.set_records(clarin_json.Record(**{task: {
+        res = run_sent(sentences, task)
+        for idx, link in res.items():
+            links.append({
+                OBJ_ID: ids[idx],
+                RESULTS: link
+            })
+        records.append({task: {
             KNOWLEDGE_BASE: knowledge_base_id,
-            LINKING_TYPE: f"spans/{linking_type}",
-            LINKS: links
-        }}), 'linking')
-        res_documents.append(document)
-    return res_documents
+            LINKING_TYPE: f"spans/{linking_type}"
+            if isinstance(linking_type, str) else "pos",
+            LINKS: links}
+        })
+    return records
+
+
+def add_multiple_links(
+    documents,
+    link_kwargs,
+):
+    """Adds multiple linkings.
+
+    :param documents: list of documents
+    :type documents: List[clarin_json.Document]
+    :param link_kwargs: list of linking kwargs
+    :type link_kwargs: list
+    """
+    records = {}
+    res_docs = []
+    documents = list(documents)  # convert iterator to list ;)
+    for kwargs in link_kwargs:
+        records[kwargs['task']] = add_linking(documents, **kwargs)
+    for idx, document in enumerate(documents):
+        doc_records = {}
+        for task, task_records in records.items():
+            doc_records = {**doc_records, **task_records[idx]}
+        document.set_records(
+            clarin_json.Record(**doc_records), 'linking')
+        res_docs.append(document)
+    return res_docs
 
 
 def run_linking(
-    task,
-    result_path,
-    output_path,
-    marker_start,
-    marker_stop,
-    linking_type,
-    knowledge_base_id,
-    _log: logging.Logger
+    result_path,
+    output_path,
+    link_kwargs,
 ):
     """Runs linking on NER results.
 
-    :param task: task name
-    :type task: str
     :param result_path: path to results
     :type result_path: str
     :param output_path: path to output file
     :type output_path: str
-    :param marker_start: start marker
-    :type marker_start: str
-    :param marker_stop: stop marker
-    :type marker_stop: str
-    :param linking_type: linking type
-    :type linking_type: str
-    :param knowledge_base_id: knowledge base id
-    :type knowledge_base_id: str
-    :param _log: logging.Logger object
-    :type _log: logging.Logger
+    :param link_kwargs: list of linking kwargs
+    :type link_kwargs: list
     """
     clarin_json.process(
         result_path,
-        add_linking,
+        add_multiple_links,
         output_path,
         ensure_ascii=False,
-        task=task,
-        marker_start=marker_start,
-        marker_stop=marker_stop,
-        linking_type=linking_type,
-        knowledge_base_id=knowledge_base_id,
-        _log=_log
+        link_kwargs=link_kwargs
     )
     os.remove(result_path)
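
A minimal sketch of how the new `linking` and `linking_type` task options described in README.md could be combined in one request. Only the key names and values documented in the README and pos_tagger.yaml changes above come from this diff; the payload as a whole and the surrounding comments are assumptions for illustration, not part of the change.

# Hypothetical task_options payload exercising the new linking options.
task_options = {
    "lang": "pl",
    "output": "json",   # linking is applied only when output == "json"
    "method": "ner",    # clalink links 'ner' spans, so NER spans must be present
    "tagset": "nkjp",   # POS-based linking maps NKJP tags via NKJP2PLWN
    "linking": ["clalink", "senselink"],   # run both configured linkers
    "linking_type": {                      # per-linker override of what gets linked
        "clalink": "ner",
        "senselink": ["noun", "verb", "adverb", "adjective"],
    },
}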