Skip to content
Snippets Groups Projects
Commit 3063fa44 authored by Paweł Walkowiak
Browse files

Merge branch 'winer_support' into 'master'

Winer support

See merge request !4
parents 7c9be063 2c550650
No related branches found
No related tags found
1 merge request: !4 Winer support
Pipeline #8191 passed
...@@ -7,5 +7,9 @@ ...@@ -7,5 +7,9 @@
"post_any2txt": {"task": ["any2txt", {"postagger": {"lang": "en", "output": "json"}}], "input": "post_spacy_input", "expected": "post_spacy_expected.json"}, "post_any2txt": {"task": ["any2txt", {"postagger": {"lang": "en", "output": "json"}}], "input": "post_spacy_input", "expected": "post_spacy_expected.json"},
"postagger_lone_lemmas": {"task": [{"postagger": {"lang": "pl", "output": "lemmas"}}], "input": "post_postagger_input", "expected": "post_postagger_input_lemmas"} "postagger_lone_lemmas": {"task": [{"postagger": {"lang": "pl", "output": "lemmas"}}], "input": "post_postagger_input", "expected": "post_postagger_input_lemmas"},
"ner_for_pl": {"task": [{"postagger": {"lang": "pl", "output": "json", "method": "ner"}}], "input": "post_postagger_input", "expected": "ner_for_pl.json"},
"ner_for_en": {"task": [{"postagger": {"lang": "en", "output": "json", "method": "ner"}}], "input": "post_spacy_input", "expected": "ner_for_en.json"}
} }
...@@ -9,3 +9,14 @@ taggers: ...@@ -9,3 +9,14 @@ taggers:
lpmn: [{"spacy":{"lang":"en"}}] lpmn: [{"spacy":{"lang":"en"}}]
output: json output: json
tagset: ud tagset: ud
ners:
pl:
default:
lpmn: ['morphodita', {'posconverter': {'input_format': 'ccl', 'output_format': 'json'}}, 'winer']
output: json
tagset: nkjp
en:
default:
lpmn: [{"spacy":{"lang":"en", 'method': 'ner'}}]
output: json
tagset: ud
...@@ -15,6 +15,14 @@ import src.utils ...@@ -15,6 +15,14 @@ import src.utils
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
SubTask.turn_on() SubTask.turn_on()
DEFAULT_TYPE = "default"
OUTPUT = "output"
JSON = "json"
TAGSET = "tagset"
TAGGER = "tagger"
NER = "ner"
LPMN = "lpmn"
class TaggerWorker(nlp_ws.NLPWorker): class TaggerWorker(nlp_ws.NLPWorker):
"""Class implementing TaggerWorker worker.""" """Class implementing TaggerWorker worker."""
...@@ -23,15 +31,18 @@ class TaggerWorker(nlp_ws.NLPWorker): ...@@ -23,15 +31,18 @@ class TaggerWorker(nlp_ws.NLPWorker):
def static_init(cls, config): def static_init(cls, config):
"""Initialize process.""" """Initialize process."""
cls._taggers = {} cls._taggers = {}
cls._ners = {}
yaml_path = config.get('tool').get('config', 'pos_tagger.yaml') yaml_path = config.get('tool').get('config', 'pos_tagger.yaml')
yaml_config = parse_config(yaml_path) yaml_config = parse_config(yaml_path)
cls._taggers = yaml_config["taggers"] cls._taggers = yaml_config["taggers"]
_log.error(f"Config from yaml: {cls._taggers}") cls._ners = yaml_config["ners"]
_log.info(f"Config taggers from yaml: {cls._taggers}")
_log.info(f"Config ners from yaml: {cls._ners}")
cls._chunking_limit = config.get('tool').get('chunking_limit', 50000) cls._chunking_limit = config.get('tool').get('chunking_limit', 50000)
if not isinstance(cls._chunking_limit, int): if not isinstance(cls._chunking_limit, int):
cls._chunking_limit = int(cls._chunking_limit) cls._chunking_limit = int(cls._chunking_limit)
_log.error(f"Chunk size: {cls._chunking_limit}") _log.info(f"Chunk size: {cls._chunking_limit}")
def get_converter_directive(self, input_format, input_tagset, output_format, def get_converter_directive(self, input_format, input_tagset, output_format,
output_tagset, json_text): output_tagset, json_text):
...@@ -63,37 +74,47 @@ class TaggerWorker(nlp_ws.NLPWorker): ...@@ -63,37 +74,47 @@ class TaggerWorker(nlp_ws.NLPWorker):
output: format of results (default = 'json', values: json, ccl, lemmas) output: format of results (default = 'json', values: json, ccl, lemmas)
json_text: bool if json output should contain original json_text: bool if json output should contain original
text (default = True) text (default = True)
method: method of processing (default = 'tagger', values: tagger, ner)
:type task_options: dict :type task_options: dict
:param output_path: Path to directory where the :param output_path: Path to directory where the
worker will store result file. worker will store result file.
:type output_path: str :type output_path: str
""" """
lang = "pl" lang = task_options.get("lang", "pl")
if "lang" in task_options:
lang = task_options["lang"] method = task_options.get("method", "tagger")
if lang not in self._taggers:
if method not in [TAGGER, NER]:
raise Exception(f"Unsupported method: {method}")
if method == TAGGER and lang not in self._taggers:
raise Exception(f"Unsupported language: {lang}, " raise Exception(f"Unsupported language: {lang}, "
f"supported {list(self._taggers.keys())}") f"supported {list(self._taggers.keys())}")
output = "json"
if "output" in task_options:
output = task_options["output"]
tagset = "identical" if method == NER and lang not in self._ners:
if "tagset" in task_options: raise Exception(f"Unsupported language: {lang}, "
tagset = task_options["tagset"] f"supported {list(self._ners.keys())}")
output = task_options.get("output", "json")
json_text = True tagset = task_options.get("tagset", "identical")
if "json_text" in task_options:
json_text = task_options["json_text"]
tagger_type = "default" json_text = task_options.get("json_text", True)
tagger_opt = self._taggers[lang][tagger_type]
tagger_opt = self._taggers[lang][DEFAULT_TYPE]
ner_opt = self._ners[lang][DEFAULT_TYPE]
convert_lpmn = self.get_converter_directive( convert_lpmn = self.get_converter_directive(
tagger_opt["output"], tagger_opt["tagset"], output, tagset, tagger_opt[OUTPUT], tagger_opt[TAGSET], output, tagset,
json_text) if method == TAGGER else self.get_converter_directive(
ner_opt[OUTPUT], ner_opt[TAGSET], output, tagset,
json_text) json_text)
json_lpmn = tagger_opt["lpmn"].copy() json_lpmn = (
if convert_lpmn is not None: tagger_opt[LPMN].copy()
if method == TAGGER
else ner_opt[LPMN].copy()
)
if convert_lpmn is not None and method == TAGGER:
json_lpmn.append(convert_lpmn) json_lpmn.append(convert_lpmn)
_dir_style = False _dir_style = False
...@@ -101,7 +122,7 @@ class TaggerWorker(nlp_ws.NLPWorker): ...@@ -101,7 +122,7 @@ class TaggerWorker(nlp_ws.NLPWorker):
_dir_style = True _dir_style = True
json_lpmn = [json_lpmn] json_lpmn = [json_lpmn]
_log.debug(f"Running LPMN: {json_lpmn}") _log.debug(f"Running LPMN: {json_lpmn}")
if output == "json" and not _dir_style: if output == JSON and not _dir_style:
# split file into chunks # split file into chunks
chunk_size = int(self._chunking_limit * 0.5) chunk_size = int(self._chunking_limit * 0.5)
destination_path = os.path.join( destination_path = os.path.join(
......
{"filename": "6801426b-6ece-403b-868e-574ae96ce660", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 4], "orth": "When", "lexemes": [{"lemma": "when", "mstag": "SCONJ", "disamb": true}]}, {"index": 2, "position": [5, 14], "orth": "Sebastian", "lexemes": [{"lemma": "Sebastian", "mstag": "PROPN", "disamb": true}]}, {"index": 3, "position": [15, 20], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 4, "position": [21, 28], "orth": "started", "lexemes": [{"lemma": "start", "mstag": "VERB", "disamb": true}]}, {"index": 5, "position": [29, 36], "orth": "working", "lexemes": [{"lemma": "work", "mstag": "VERB", "disamb": true}]}, {"index": 6, "position": [37, 39], "orth": "on", "lexemes": [{"lemma": "on", "mstag": "ADP", "disamb": true}]}, {"index": 7, "position": [40, 44], "orth": "self", "lexemes": [{"lemma": "self", "mstag": "NOUN", "disamb": true}]}, {"index": 8, "position": [45, 45], "orth": "-", "lexemes": [{"lemma": "-", "mstag": "PUNCT", "disamb": true}]}, {"index": 9, "position": [46, 52], "orth": "driving", "lexemes": [{"lemma": "drive", "mstag": "VERB", "disamb": true}]}, {"index": 10, "position": [53, 57], "orth": "cars", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 11, "position": [58, 60], "orth": "at", "lexemes": [{"lemma": "at", "mstag": "ADP", "disamb": true}]}, {"index": 12, "position": [61, 67], "orth": "Google", "lexemes": [{"lemma": "Google", "mstag": "PROPN", "disamb": true}]}, {"index": 13, "position": [68, 70], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 14, "position": [71, 75], "orth": "2007", "lexemes": [{"lemma": "2007", "mstag": "NUM", "disamb": true}]}, {"index": 15, "position": [76, 76], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 16, "position": [77, 80], "orth": "few", "lexemes": [{"lemma": "few", "mstag": "ADJ", "disamb": true}]}, {"index": 17, "position": [81, 87], 
"orth": "people", "lexemes": [{"lemma": "people", "mstag": "NOUN", "disamb": true}]}, {"index": 18, "position": [88, 95], "orth": "outside", "lexemes": [{"lemma": "outside", "mstag": "ADV", "disamb": true}]}, {"index": 19, "position": [96, 98], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 20, "position": [99, 102], "orth": "the", "lexemes": [{"lemma": "the", "mstag": "DET", "disamb": true}]}, {"index": 21, "position": [103, 110], "orth": "company", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 22, "position": [111, 115], "orth": "took", "lexemes": [{"lemma": "take", "mstag": "VERB", "disamb": true}]}, {"index": 23, "position": [116, 119], "orth": "him", "lexemes": [{"lemma": "he", "mstag": "PRON", "disamb": true}]}, {"index": 24, "position": [120, 129], "orth": "seriously", "lexemes": [{"lemma": "seriously", "mstag": "ADV", "disamb": true}]}, {"index": 25, "position": [130, 130], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 26, "position": [131, 132], "orth": "“", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 27, "position": [133, 133], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 28, "position": [134, 137], "orth": "can", "lexemes": [{"lemma": "can", "mstag": "AUX", "disamb": true}]}, {"index": 29, "position": [138, 142], "orth": "tell", "lexemes": [{"lemma": "tell", "mstag": "VERB", "disamb": true}]}, {"index": 30, "position": [143, 146], "orth": "you", "lexemes": [{"lemma": "you", "mstag": "PRON", "disamb": true}]}, {"index": 31, "position": [147, 151], "orth": "very", "lexemes": [{"lemma": "very", "mstag": "ADV", "disamb": true}]}, {"index": 32, "position": [152, 158], "orth": "senior", "lexemes": [{"lemma": "senior", "mstag": "ADJ", "disamb": true}]}, {"index": 33, "position": [159, 163], "orth": "CEOs", "lexemes": [{"lemma": "ceo", "mstag": "NOUN", "disamb": true}]}, 
{"index": 34, "position": [164, 166], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 35, "position": [167, 172], "orth": "major", "lexemes": [{"lemma": "major", "mstag": "ADJ", "disamb": true}]}, {"index": 36, "position": [173, 181], "orth": "American", "lexemes": [{"lemma": "american", "mstag": "ADJ", "disamb": true}]}, {"index": 37, "position": [182, 185], "orth": "car", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 38, "position": [186, 195], "orth": "companies", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 39, "position": [196, 201], "orth": "would", "lexemes": [{"lemma": "would", "mstag": "AUX", "disamb": true}]}, {"index": 40, "position": [202, 207], "orth": "shake", "lexemes": [{"lemma": "shake", "mstag": "VERB", "disamb": true}]}, {"index": 41, "position": [208, 210], "orth": "my", "lexemes": [{"lemma": "my", "mstag": "PRON", "disamb": true}]}, {"index": 42, "position": [211, 215], "orth": "hand", "lexemes": [{"lemma": "hand", "mstag": "NOUN", "disamb": true}]}, {"index": 43, "position": [216, 219], "orth": "and", "lexemes": [{"lemma": "and", "mstag": "CCONJ", "disamb": true}]}, {"index": 44, "position": [220, 224], "orth": "turn", "lexemes": [{"lemma": "turn", "mstag": "VERB", "disamb": true}]}, {"index": 45, "position": [225, 229], "orth": "away", "lexemes": [{"lemma": "away", "mstag": "ADV", "disamb": true}]}, {"index": 46, "position": [230, 237], "orth": "because", "lexemes": [{"lemma": "because", "mstag": "SCONJ", "disamb": true}]}, {"index": 47, "position": [238, 239], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 48, "position": [240, 243], "orth": "was", "lexemes": [{"lemma": "be", "mstag": "AUX", "disamb": true}]}, {"index": 49, "position": [244, 246], "orth": "n’t", "lexemes": [{"lemma": "not", "mstag": "PART", "disamb": true}]}, {"index": 50, "position": [247, 252], "orth": "worth", "lexemes": [{"lemma": 
"worth", "mstag": "ADJ", "disamb": true}]}, {"index": 51, "position": [253, 260], "orth": "talking", "lexemes": [{"lemma": "talk", "mstag": "VERB", "disamb": true}]}, {"index": 52, "position": [261, 263], "orth": "to", "lexemes": [{"lemma": "to", "mstag": "ADP", "disamb": true}]}, {"index": 53, "position": [264, 264], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 54, "position": [265, 265], "orth": "”", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 55, "position": [266, 270], "orth": "said", "lexemes": [{"lemma": "say", "mstag": "VERB", "disamb": true}]}, {"index": 56, "position": [271, 276], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 57, "position": [277, 277], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 58, "position": [278, 280], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 59, "position": [281, 283], "orth": "an", "lexemes": [{"lemma": "an", "mstag": "DET", "disamb": true}]}, {"index": 60, "position": [284, 293], "orth": "interview", "lexemes": [{"lemma": "interview", "mstag": "NOUN", "disamb": true}]}, {"index": 61, "position": [294, 298], "orth": "with", "lexemes": [{"lemma": "with", "mstag": "ADP", "disamb": true}]}, {"index": 62, "position": [299, 305], "orth": "Recode", "lexemes": [{"lemma": "Recode", "mstag": "PROPN", "disamb": true}]}, {"index": 63, "position": [306, 313], "orth": "earlier", "lexemes": [{"lemma": "early", "mstag": "ADV", "disamb": true}]}, {"index": 64, "position": [314, 318], "orth": "this", "lexemes": [{"lemma": "this", "mstag": "DET", "disamb": true}]}, {"index": 65, "position": [319, 323], "orth": "week", "lexemes": [{"lemma": "week", "mstag": "NOUN", "disamb": true}]}, {"index": 66, "position": [324, 324], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 67, "position": [325, 325], "orth": 
"\n", "lexemes": [{"lemma": "\n", "mstag": "SPACE", "disamb": true}]}], "text": "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.\n", "entities": [{"text": "Sebastian Thrun", "type": "PERSON", "tokens": [1, 3], "position": [5, 20]}, {"text": "Google", "type": "ORG", "tokens": [11, 12], "position": [61, 67]}, {"text": "2007", "type": "DATE", "tokens": [13, 14], "position": [71, 75]}, {"text": "American", "type": "NORP", "tokens": [35, 36], "position": [173, 181]}, {"text": "Thrun", "type": "PERSON", "tokens": [55, 56], "position": [271, 276]}, {"text": "Recode", "type": "ORG", "tokens": [61, 62], "position": [299, 305]}, {"text": "earlier this week", "type": "DATE", "tokens": [62, 65], "position": [306, 323]}]}
This diff is collapsed.
...@@ -9,3 +9,14 @@ taggers: ...@@ -9,3 +9,14 @@ taggers:
lpmn: [{"spacy":{"lang":"en"}}] lpmn: [{"spacy":{"lang":"en"}}]
output: json output: json
tagset: ud tagset: ud
ners:
pl:
default:
lpmn: ['morphodita', {'posconverter': {'input_format': 'ccl', 'output_format': 'json'}}, 'winer']
output: json
tagset: nkjp
en:
default:
lpmn: [{"spacy":{"lang":"en", 'method': 'ner'}}]
output: json
tagset: ud
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment