diff --git a/lpmn_queries.json b/lpmn_queries.json index 0a8f6a1ce6415435cecb27812bee7586edc3baff..67b852dd65863f8857667dd2dca7bb16ec824696 100644 --- a/lpmn_queries.json +++ b/lpmn_queries.json @@ -7,5 +7,9 @@ "post_any2txt": {"task": ["any2txt", {"postagger": {"lang": "en", "output": "json"}}], "input": "post_spacy_input", "expected": "post_spacy_expected.json"}, - "postagger_lone_lemmas": {"task": [{"postagger": {"lang": "pl", "output": "lemmas"}}], "input": "post_postagger_input", "expected": "post_postagger_input_lemmas"} + "postagger_lone_lemmas": {"task": [{"postagger": {"lang": "pl", "output": "lemmas"}}], "input": "post_postagger_input", "expected": "post_postagger_input_lemmas"}, + + "ner_for_pl": {"task": [{"postagger": {"lang": "pl", "output": "json", "method": "ner"}}], "input": "post_postagger_input", "expected": "ner_for_pl.json"}, + + "ner_for_en": {"task": [{"postagger": {"lang": "en", "output": "json", "method": "ner"}}], "input": "post_spacy_input", "expected": "ner_for_en.json"} } diff --git a/pos_tagger.yaml b/pos_tagger.yaml index 9b8e4431d08327a73cbc9f857dd5e36caccb82ee..441831026407b1d274335f3db0d455fcfa966286 100644 --- a/pos_tagger.yaml +++ b/pos_tagger.yaml @@ -9,3 +9,14 @@ taggers: lpmn: [{"spacy":{"lang":"en"}}] output: json tagset: ud +ners: + pl: + default: + lpmn: ['morphodita', {'posconverter': {'input_format': 'ccl', 'output_format': 'json'}}, 'winer'] + output: json + tagset: nkjp + en: + default: + lpmn: [{"spacy":{"lang":"en", 'method': 'ner'}}] + output: json + tagset: ud diff --git a/src/tagger.py b/src/tagger.py index cff461484835afcba9520a2c2a83ce0603395048..04b2d7c5907732f05a29f2d55cd5a6955eab3074 100644 --- a/src/tagger.py +++ b/src/tagger.py @@ -15,6 +15,14 @@ import src.utils _log = logging.getLogger(__name__) SubTask.turn_on() +DEFAULT_TYPE = "default" +OUTPUT = "output" +JSON = "json" +TAGSET = "tagset" +TAGGER = "tagger" +NER = "ner" +LPMN = "lpmn" + class TaggerWorker(nlp_ws.NLPWorker): """Class implementing TaggerWorker worker.""" @@ -23,15 +31,18 @@ class TaggerWorker(nlp_ws.NLPWorker): def static_init(cls, config): """Initialize process.""" cls._taggers = {} + cls._ners = {} yaml_path = config.get('tool').get('config', 'pos_tagger.yaml') yaml_config = parse_config(yaml_path) cls._taggers = yaml_config["taggers"] - _log.error(f"Config from yaml: {cls._taggers}") + cls._ners = yaml_config["ners"] + _log.info(f"Config taggers from yaml: {cls._taggers}") + _log.info(f"Config ners from yaml: {cls._ners}") cls._chunking_limit = config.get('tool').get('chunking_limit', 50000) if not isinstance(cls._chunking_limit, int): cls._chunking_limit = int(cls._chunking_limit) - _log.error(f"Chunk size: {cls._chunking_limit}") + _log.info(f"Chunk size: {cls._chunking_limit}") def get_converter_directive(self, input_format, input_tagset, output_format, output_tagset, json_text): @@ -63,37 +74,47 @@ class TaggerWorker(nlp_ws.NLPWorker): output: format of results (default = 'json', values: json, ccl, lemmas) json_text: bool if json output should contain original text (default = True) + method: method of processing (default = 'tagger', values: tagger, ner) :type task_options: dict :param output_path: Path to directory where the worker will store result file. :type output_path: str """ - lang = "pl" - if "lang" in task_options: - lang = task_options["lang"] - if lang not in self._taggers: + lang = task_options.get("lang", "pl") + + method = task_options.get("method", "tagger") + + if method not in [TAGGER, NER]: + raise Exception(f"Unsupported method: {method}") + + if method == TAGGER and lang not in self._taggers: raise Exception(f"Unsupported language: {lang}, " f"supported {list(self._taggers.keys())}") - output = "json" - if "output" in task_options: - output = task_options["output"] - tagset = "identical" - if "tagset" in task_options: - tagset = task_options["tagset"] + if method == NER and lang not in self._ners: + raise Exception(f"Unsupported language: {lang}, " + f"supported {list(self._ners.keys())}") + + output = task_options.get("output", "json") + + tagset = task_options.get("tagset", "identical") - json_text = True - if "json_text" in task_options: - json_text = task_options["json_text"] + json_text = task_options.get("json_text", True) - tagger_type = "default" - tagger_opt = self._taggers[lang][tagger_type] + tagger_opt = self._taggers[lang][DEFAULT_TYPE] + ner_opt = self._ners[lang][DEFAULT_TYPE] convert_lpmn = self.get_converter_directive( - tagger_opt["output"], tagger_opt["tagset"], output, tagset, + tagger_opt[OUTPUT], tagger_opt[TAGSET], output, tagset, + json_text) if method == TAGGER else self.get_converter_directive( + ner_opt[OUTPUT], ner_opt[TAGSET], output, tagset, json_text) - json_lpmn = tagger_opt["lpmn"].copy() - if convert_lpmn is not None: + json_lpmn = ( + tagger_opt[LPMN].copy() + if method == TAGGER + else ner_opt[LPMN].copy() + ) + if convert_lpmn is not None and method == TAGGER: json_lpmn.append(convert_lpmn) _dir_style = False @@ -101,7 +122,7 @@ class TaggerWorker(nlp_ws.NLPWorker): _dir_style = True json_lpmn = [json_lpmn] _log.debug(f"Running LPMN: {json_lpmn}") - if output == "json" and not _dir_style: + if output == JSON and not _dir_style: # split file into chunks chunk_size = int(self._chunking_limit * 0.5) destination_path = os.path.join( diff --git a/tests/testdata/expected/ner_for_en.json b/tests/testdata/expected/ner_for_en.json new file mode 100644 index 0000000000000000000000000000000000000000..96cf588b3cec5443d3524dcc0789ac8296ed1ae7 --- /dev/null +++ b/tests/testdata/expected/ner_for_en.json @@ -0,0 +1 @@ +{"filename": "6801426b-6ece-403b-868e-574ae96ce660", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 4], "orth": "When", "lexemes": [{"lemma": "when", "mstag": "SCONJ", "disamb": true}]}, {"index": 2, "position": [5, 14], "orth": "Sebastian", "lexemes": [{"lemma": "Sebastian", "mstag": "PROPN", "disamb": true}]}, {"index": 3, "position": [15, 20], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 4, "position": [21, 28], "orth": "started", "lexemes": [{"lemma": "start", "mstag": "VERB", "disamb": true}]}, {"index": 5, "position": [29, 36], "orth": "working", "lexemes": [{"lemma": "work", "mstag": "VERB", "disamb": true}]}, {"index": 6, "position": [37, 39], "orth": "on", "lexemes": [{"lemma": "on", "mstag": "ADP", "disamb": true}]}, {"index": 7, "position": [40, 44], "orth": "self", "lexemes": [{"lemma": "self", "mstag": "NOUN", "disamb": true}]}, {"index": 8, "position": [45, 45], "orth": "-", "lexemes": [{"lemma": "-", "mstag": "PUNCT", "disamb": true}]}, {"index": 9, "position": [46, 52], "orth": "driving", "lexemes": [{"lemma": "drive", "mstag": "VERB", "disamb": true}]}, {"index": 10, "position": [53, 57], "orth": "cars", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 11, "position": [58, 60], "orth": "at", "lexemes": [{"lemma": "at", "mstag": "ADP", "disamb": true}]}, {"index": 12, "position": [61, 67], "orth": "Google", "lexemes": [{"lemma": "Google", "mstag": "PROPN", "disamb": true}]}, {"index": 13, "position": [68, 70], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 14, "position": [71, 75], "orth": "2007", "lexemes": [{"lemma": "2007", "mstag": "NUM", "disamb": true}]}, {"index": 15, "position": [76, 76], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 16, "position": [77, 80], "orth": "few", "lexemes": [{"lemma": "few", "mstag": "ADJ", "disamb": true}]}, {"index": 17, "position": [81, 87], "orth": "people", "lexemes": [{"lemma": "people", "mstag": "NOUN", "disamb": true}]}, {"index": 18, "position": [88, 95], "orth": "outside", "lexemes": [{"lemma": "outside", "mstag": "ADV", "disamb": true}]}, {"index": 19, "position": [96, 98], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 20, "position": [99, 102], "orth": "the", "lexemes": [{"lemma": "the", "mstag": "DET", "disamb": true}]}, {"index": 21, "position": [103, 110], "orth": "company", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 22, "position": [111, 115], "orth": "took", "lexemes": [{"lemma": "take", "mstag": "VERB", "disamb": true}]}, {"index": 23, "position": [116, 119], "orth": "him", "lexemes": [{"lemma": "he", "mstag": "PRON", "disamb": true}]}, {"index": 24, "position": [120, 129], "orth": "seriously", "lexemes": [{"lemma": "seriously", "mstag": "ADV", "disamb": true}]}, {"index": 25, "position": [130, 130], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 26, "position": [131, 132], "orth": "“", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 27, "position": [133, 133], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 28, "position": [134, 137], "orth": "can", "lexemes": [{"lemma": "can", "mstag": "AUX", "disamb": true}]}, {"index": 29, "position": [138, 142], "orth": "tell", "lexemes": [{"lemma": "tell", "mstag": "VERB", "disamb": true}]}, {"index": 30, "position": [143, 146], "orth": "you", "lexemes": [{"lemma": "you", "mstag": "PRON", "disamb": true}]}, {"index": 31, "position": [147, 151], "orth": "very", "lexemes": [{"lemma": "very", "mstag": "ADV", "disamb": true}]}, {"index": 32, "position": [152, 158], "orth": "senior", "lexemes": [{"lemma": "senior", "mstag": "ADJ", "disamb": true}]}, {"index": 33, "position": [159, 163], "orth": "CEOs", "lexemes": [{"lemma": "ceo", "mstag": "NOUN", "disamb": true}]}, {"index": 34, "position": [164, 166], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 35, "position": [167, 172], "orth": "major", "lexemes": [{"lemma": "major", "mstag": "ADJ", "disamb": true}]}, {"index": 36, "position": [173, 181], "orth": "American", "lexemes": [{"lemma": "american", "mstag": "ADJ", "disamb": true}]}, {"index": 37, "position": [182, 185], "orth": "car", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 38, "position": [186, 195], "orth": "companies", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 39, "position": [196, 201], "orth": "would", "lexemes": [{"lemma": "would", "mstag": "AUX", "disamb": true}]}, {"index": 40, "position": [202, 207], "orth": "shake", "lexemes": [{"lemma": "shake", "mstag": "VERB", "disamb": true}]}, {"index": 41, "position": [208, 210], "orth": "my", "lexemes": [{"lemma": "my", "mstag": "PRON", "disamb": true}]}, {"index": 42, "position": [211, 215], "orth": "hand", "lexemes": [{"lemma": "hand", "mstag": "NOUN", "disamb": true}]}, {"index": 43, "position": [216, 219], "orth": "and", "lexemes": [{"lemma": "and", "mstag": "CCONJ", "disamb": true}]}, {"index": 44, "position": [220, 224], "orth": "turn", "lexemes": [{"lemma": "turn", "mstag": "VERB", "disamb": true}]}, {"index": 45, "position": [225, 229], "orth": "away", "lexemes": [{"lemma": "away", "mstag": "ADV", "disamb": true}]}, {"index": 46, "position": [230, 237], "orth": "because", "lexemes": [{"lemma": "because", "mstag": "SCONJ", "disamb": true}]}, {"index": 47, "position": [238, 239], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 48, "position": [240, 243], "orth": "was", "lexemes": [{"lemma": "be", "mstag": "AUX", "disamb": true}]}, {"index": 49, "position": [244, 246], "orth": "n’t", "lexemes": [{"lemma": "not", "mstag": "PART", "disamb": true}]}, {"index": 50, "position": [247, 252], "orth": "worth", "lexemes": [{"lemma": "worth", "mstag": "ADJ", "disamb": true}]}, {"index": 51, "position": [253, 260], "orth": "talking", "lexemes": [{"lemma": "talk", "mstag": "VERB", "disamb": true}]}, {"index": 52, "position": [261, 263], "orth": "to", "lexemes": [{"lemma": "to", "mstag": "ADP", "disamb": true}]}, {"index": 53, "position": [264, 264], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 54, "position": [265, 265], "orth": "”", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 55, "position": [266, 270], "orth": "said", "lexemes": [{"lemma": "say", "mstag": "VERB", "disamb": true}]}, {"index": 56, "position": [271, 276], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 57, "position": [277, 277], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 58, "position": [278, 280], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 59, "position": [281, 283], "orth": "an", "lexemes": [{"lemma": "an", "mstag": "DET", "disamb": true}]}, {"index": 60, "position": [284, 293], "orth": "interview", "lexemes": [{"lemma": "interview", "mstag": "NOUN", "disamb": true}]}, {"index": 61, "position": [294, 298], "orth": "with", "lexemes": [{"lemma": "with", "mstag": "ADP", "disamb": true}]}, {"index": 62, "position": [299, 305], "orth": "Recode", "lexemes": [{"lemma": "Recode", "mstag": "PROPN", "disamb": true}]}, {"index": 63, "position": [306, 313], "orth": "earlier", "lexemes": [{"lemma": "early", "mstag": "ADV", "disamb": true}]}, {"index": 64, "position": [314, 318], "orth": "this", "lexemes": [{"lemma": "this", "mstag": "DET", "disamb": true}]}, {"index": 65, "position": [319, 323], "orth": "week", "lexemes": [{"lemma": "week", "mstag": "NOUN", "disamb": true}]}, {"index": 66, "position": [324, 324], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 67, "position": [325, 325], "orth": "\n", "lexemes": [{"lemma": "\n", "mstag": "SPACE", "disamb": true}]}], "text": "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.\n", "entities": [{"text": "Sebastian Thrun", "type": "PERSON", "tokens": [1, 3], "position": [5, 20]}, {"text": "Google", "type": "ORG", "tokens": [11, 12], "position": [61, 67]}, {"text": "2007", "type": "DATE", "tokens": [13, 14], "position": [71, 75]}, {"text": "American", "type": "NORP", "tokens": [35, 36], "position": [173, 181]}, {"text": "Thrun", "type": "PERSON", "tokens": [55, 56], "position": [271, 276]}, {"text": "Recode", "type": "ORG", "tokens": [61, 62], "position": [299, 305]}, {"text": "earlier this week", "type": "DATE", "tokens": [62, 65], "position": [306, 323]}]} diff --git a/tests/testdata/expected/ner_for_pl.json b/tests/testdata/expected/ner_for_pl.json new file mode 100644 index 0000000000000000000000000000000000000000..2c61d37d463ba03d78dce39eb5ee52b9d17ff97d --- /dev/null +++ b/tests/testdata/expected/ner_for_pl.json @@ -0,0 +1 @@ +{"filename": "5b4e69c7-c7b3-460d-bee3-36fef23b13ed", "text": "Woda jest jedną z najpospolitszych substancji we Wszechświecie. Cząsteczka wody jest trzecią najbardziej rozpowszechnioną molekułą w ośrodku międzygwiazdowym, po cząsteczkowym wodorze i tlenku węgla. Jest również szeroko rozpowszechniona w Układzie Słonecznym: stanowi istotny element budowy Ceres i księżyców lodowych krążących wokół planet-olbrzymów, jako domieszka występuje w ich atmosferach, a przypuszcza się, że duże jej ilości znajdują się we wnętrzach tych planet. Jako lód występuje także na części planetoid, a zapewne również na obiektach transneptunowych. Woda jest bardzo rozpowszechniona także na powierzchni Ziemi. Występuje głównie w oceanach, które pokrywają 70,8% powierzchni globu, ale także w rzekach, jeziorach i w postaci stałej w lodowcach. Część wody znajduje się w atmosferze (chmury, para wodna). Niektóre związki chemiczne zawierają cząsteczki wody w swojej budowie (hydraty – określa się ją wówczas mianem wody krystalizacyjnej). Zawartość wody włączonej w strukturę minerałów w płaszczu Ziemi może przekraczać łączną zawartość wody w oceanach i innych zbiornikach powierzchniowych nawet dziesięciokrotnie. Woda występująca w przyrodzie jest roztworem soli i gazów. Najwięcej soli mineralnych zawiera woda morska i wody mineralne; najmniej woda z opadów atmosferycznych. Wodę o małej zawartości składników mineralnych nazywamy wodą miękką, natomiast zawierającą znaczne ilości soli wapnia i magnezu – wodą twardą. Oprócz tego wody naturalne zawierają rozpuszczone substancje pochodzenia organicznego, np. mocznik, kwasy humusowe itp.", "tokens": [{"index": 1, "position": [0, 4], "orth": "Woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 2, "position": [5, 9], "orth": "jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 3, "position": [10, 15], "orth": "jedną", "lexemes": [{"lemma": "jeden", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 4, "position": [16, 17], "orth": "z", "lexemes": [{"lemma": "z", "mstag": "prep:gen:nwok", "disamb": true}]}, {"index": 5, "position": [18, 34], "orth": "najpospolitszych", "lexemes": [{"lemma": "pospolity", "mstag": "adj:pl:gen:f:sup", "disamb": true}]}, {"index": 6, "position": [35, 45], "orth": "substancji", "lexemes": [{"lemma": "substancja", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 7, "position": [46, 48], "orth": "we", "lexemes": [{"lemma": "w", "mstag": "prep:loc:wok", "disamb": true}]}, {"index": 8, "position": [49, 62], "orth": "Wszechświecie", "lexemes": [{"lemma": "wszechświat", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 9, "position": [62, 63], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 10, "position": [64, 74], "orth": "Cząsteczka", "lexemes": [{"lemma": "cząsteczka", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 11, "position": [75, 79], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 12, "position": [80, 84], "orth": "jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 13, "position": [85, 92], "orth": "trzecią", "lexemes": [{"lemma": "trzeci", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 14, "position": [93, 104], "orth": "najbardziej", "lexemes": [{"lemma": "bardzo", "mstag": "adv:sup", "disamb": true}]}, {"index": 15, "position": [105, 121], "orth": "rozpowszechnioną", "lexemes": [{"lemma": "rozpowszechniony", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 16, "position": [122, 130], "orth": "molekułą", "lexemes": [{"lemma": "molekuła", "mstag": "subst:sg:inst:f", "disamb": true}]}, {"index": 17, "position": [131, 132], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 18, "position": [133, 140], "orth": "ośrodku", "lexemes": [{"lemma": "ośrodek", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 19, "position": [141, 157], "orth": "międzygwiazdowym", "lexemes": [{"lemma": "międzygwiazdowy", "mstag": "adj:sg:loc:m3:pos", "disamb": true}]}, {"index": 20, "position": [157, 158], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 21, "position": [159, 161], "orth": "po", "lexemes": [{"lemma": "po", "mstag": "prep:loc", "disamb": true}]}, {"index": 22, "position": [162, 175], "orth": "cząsteczkowym", "lexemes": [{"lemma": "cząsteczkowy", "mstag": "adj:sg:loc:m3:pos", "disamb": true}]}, {"index": 23, "position": [176, 183], "orth": "wodorze", "lexemes": [{"lemma": "wodór", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 24, "position": [184, 185], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 25, "position": [186, 192], "orth": "tlenku", "lexemes": [{"lemma": "tlenek", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 26, "position": [193, 198], "orth": "węgla", "lexemes": [{"lemma": "węgiel", "mstag": "subst:sg:gen:m3", "disamb": true}]}, {"index": 27, "position": [198, 199], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 28, "position": [200, 204], "orth": "Jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 29, "position": [205, 212], "orth": "również", "lexemes": [{"lemma": "również", "mstag": "qub", "disamb": true}]}, {"index": 30, "position": [213, 220], "orth": "szeroko", "lexemes": [{"lemma": "szeroko", "mstag": "adv:pos", "disamb": true}]}, {"index": 31, "position": [221, 237], "orth": "rozpowszechniona", "lexemes": [{"lemma": "rozpowszechniony", "mstag": "adj:sg:nom:f:pos", "disamb": true}]}, {"index": 32, "position": [238, 239], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 33, "position": [240, 248], "orth": "Układzie", "lexemes": [{"lemma": "Układ", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 34, "position": [249, 259], "orth": "Słonecznym", "lexemes": [{"lemma": "Słoneczny", "mstag": "adj:sg:loc:m3:pos", "disamb": true}]}, {"index": 35, "position": [259, 260], "orth": ":", "lexemes": [{"lemma": ":", "mstag": "interp", "disamb": true}]}, {"index": 36, "position": [261, 268], "orth": "stanowi", "lexemes": [{"lemma": "stanowić", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 37, "position": [269, 276], "orth": "istotny", "lexemes": [{"lemma": "istotny", "mstag": "adj:sg:acc:m3:pos", "disamb": true}]}, {"index": 38, "position": [277, 284], "orth": "element", "lexemes": [{"lemma": "element", "mstag": "subst:sg:acc:m3", "disamb": true}]}, {"index": 39, "position": [285, 291], "orth": "budowy", "lexemes": [{"lemma": "budowa", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 40, "position": [292, 297], "orth": "Ceres", "lexemes": [{"lemma": "ceres", "mstag": "subst:sg:acc:m3", "disamb": true}]}, {"index": 41, "position": [298, 299], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 42, "position": [300, 309], "orth": "księżyców", "lexemes": [{"lemma": "księżyc", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 43, "position": [310, 318], "orth": "lodowych", "lexemes": [{"lemma": "lodowy", "mstag": "adj:pl:gen:m3:pos", "disamb": true}]}, {"index": 44, "position": [319, 328], "orth": "krążących", "lexemes": [{"lemma": "krążyć", "mstag": "pact:pl:gen:f:imperf:aff", "disamb": true}]}, {"index": 45, "position": [329, 334], "orth": "wokół", "lexemes": [{"lemma": "wokół", "mstag": "prep:gen", "disamb": true}]}, {"index": 46, "position": [335, 341], "orth": "planet", "lexemes": [{"lemma": "planeta", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 47, "position": [341, 342], "orth": "-", "lexemes": [{"lemma": "-", "mstag": "interp", "disamb": true}]}, {"index": 48, "position": [342, 351], "orth": "olbrzymów", "lexemes": [{"lemma": "olbrzym", "mstag": "subst:pl:gen:m1", "disamb": true}]}, {"index": 49, "position": [351, 352], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 50, "position": [353, 357], "orth": "jako", "lexemes": [{"lemma": "jako", "mstag": "adv", "disamb": true}]}, {"index": 51, "position": [358, 367], "orth": "domieszka", "lexemes": [{"lemma": "domieszka", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 52, "position": [368, 377], "orth": "występuje", "lexemes": [{"lemma": "występować", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 53, "position": [378, 379], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 54, "position": [380, 383], "orth": "ich", "lexemes": [{"lemma": "on", "mstag": "ppron3:pl:gen:m1:ter:akc:npraep", "disamb": true}]}, {"index": 55, "position": [384, 395], "orth": "atmosferach", "lexemes": [{"lemma": "atmosfera", "mstag": "subst:pl:loc:f", "disamb": true}]}, {"index": 56, "position": [395, 396], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 57, "position": [397, 398], "orth": "a", "lexemes": [{"lemma": "a", "mstag": "conj", "disamb": true}]}, {"index": 58, "position": [399, 410], "orth": "przypuszcza", "lexemes": [{"lemma": "przypuszczać", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 59, "position": [411, 414], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "qub", "disamb": true}]}, {"index": 60, "position": [414, 415], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 61, "position": [416, 418], "orth": "że", "lexemes": [{"lemma": "że", "mstag": "comp", "disamb": true}]}, {"index": 62, "position": [419, 423], "orth": "duże", "lexemes": [{"lemma": "duży", "mstag": "adj:pl:nom:f:pos", "disamb": true}]}, {"index": 63, "position": [424, 427], "orth": "jej", "lexemes": [{"lemma": "on", "mstag": "ppron3:sg:gen:f:ter:akc:npraep", "disamb": true}]}, {"index": 64, "position": [428, 434], "orth": "ilości", "lexemes": [{"lemma": "ilość", "mstag": "subst:pl:nom:f", "disamb": true}]}, {"index": 65, "position": [435, 443], "orth": "znajdują", "lexemes": [{"lemma": "znajdować", "mstag": "fin:pl:ter:imperf", "disamb": true}]}, {"index": 66, "position": [444, 447], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "qub", "disamb": true}]}, {"index": 67, "position": [448, 450], "orth": "we", "lexemes": [{"lemma": "w", "mstag": "prep:loc:wok", "disamb": true}]}, {"index": 68, "position": [451, 460], "orth": "wnętrzach", "lexemes": [{"lemma": "wnętrze", "mstag": "subst:pl:loc:n", "disamb": true}]}, {"index": 69, "position": [461, 465], "orth": "tych", "lexemes": [{"lemma": "ten", "mstag": "adj:pl:gen:f:pos", "disamb": true}]}, {"index": 70, "position": [466, 472], "orth": "planet", "lexemes": [{"lemma": "planeta", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 71, "position": [472, 473], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 72, "position": [474, 478], "orth": "Jako", "lexemes": [{"lemma": "jako", "mstag": "conj", "disamb": true}]}, {"index": 73, "position": [479, 482], "orth": "lód", "lexemes": [{"lemma": "lód", "mstag": "subst:sg:nom:m3", "disamb": true}]}, {"index": 74, "position": [483, 492], "orth": "występuje", "lexemes": [{"lemma": "występować", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 75, "position": [493, 498], "orth": "także", "lexemes": [{"lemma": "także", "mstag": "qub", "disamb": true}]}, {"index": 76, "position": [499, 501], "orth": "na", "lexemes": [{"lemma": "na", "mstag": "prep:loc", "disamb": true}]}, {"index": 77, "position": [502, 508], "orth": "części", "lexemes": [{"lemma": "część", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 78, "position": [509, 518], "orth": "planetoid", "lexemes": [{"lemma": "planetoida", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 79, "position": [518, 519], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 80, "position": [520, 521], "orth": "a", "lexemes": [{"lemma": "a", "mstag": "conj", "disamb": true}]}, {"index": 81, "position": [522, 529], "orth": "zapewne", "lexemes": [{"lemma": "zapewne", "mstag": "qub", "disamb": true}]}, {"index": 82, "position": [530, 537], "orth": "również", "lexemes": [{"lemma": "również", "mstag": "qub", "disamb": true}]}, {"index": 83, "position": [538, 540], "orth": "na", "lexemes": [{"lemma": "na", "mstag": "prep:loc", "disamb": true}]}, {"index": 84, "position": [541, 550], "orth": "obiektach", "lexemes": [{"lemma": "obiekt", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 85, "position": [551, 567], "orth": "transneptunowych", "lexemes": [{"lemma": "transneptunowych", "mstag": "ign", "disamb": true}]}, {"index": 86, "position": [567, 568], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 87, "position": [569, 573], "orth": "Woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 88, "position": [574, 578], "orth": "jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 89, "position": [579, 585], "orth": "bardzo", "lexemes": [{"lemma": "bardzo", "mstag": "adv:pos", "disamb": true}]}, {"index": 90, "position": [586, 602], "orth": "rozpowszechniona", "lexemes": [{"lemma": "rozpowszechniony", "mstag": "adj:sg:nom:f:pos", "disamb": true}]}, {"index": 91, "position": [603, 608], "orth": "także", "lexemes": [{"lemma": "także", "mstag": "conj", "disamb": true}]}, {"index": 92, "position": [609, 611], "orth": "na", "lexemes": [{"lemma": "na", "mstag": "prep:loc", "disamb": true}]}, {"index": 93, "position": [612, 623], "orth": "powierzchni", "lexemes": [{"lemma": "powierzchnia", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 94, "position": [624, 629], "orth": "Ziemi", "lexemes": [{"lemma": "Ziemia", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 95, "position": [629, 630], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 96, "position": [631, 640], "orth": "Występuje", "lexemes": [{"lemma": "występować", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 97, "position": [641, 648], "orth": "głównie", "lexemes": [{"lemma": "głównie", "mstag": "qub", "disamb": true}]}, {"index": 98, "position": [649, 650], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 99, "position": [651, 659], "orth": "oceanach", "lexemes": [{"lemma": "ocean", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 100, "position": [659, 660], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 101, "position": [661, 666], "orth": "które", "lexemes": [{"lemma": "który", "mstag": "adj:pl:nom:m3:pos", "disamb": true}]}, {"index": 102, "position": [667, 676], "orth": "pokrywają", "lexemes": [{"lemma": "pokrywać", "mstag": "fin:pl:ter:imperf", "disamb": true}]}, {"index": 103, "position": [677, 679], "orth": "70", "lexemes": [{"lemma": "70", "mstag": "num:pl:acc:m3:rec", "disamb": true}]}, {"index": 104, "position": [679, 680], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 105, "position": [680, 681], "orth": "8", "lexemes": [{"lemma": "8", "mstag": "num:pl:acc:m3:rec", "disamb": true}]}, {"index": 106, "position": [681, 682], "orth": "%", "lexemes": [{"lemma": "%", "mstag": "subst:sg:nom:m3", "disamb": true}]}, {"index": 107, "position": [683, 694], "orth": "powierzchni", "lexemes": [{"lemma": "powierzchnia", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 108, "position": [695, 700], "orth": "globu", "lexemes": [{"lemma": "glob", "mstag": "subst:sg:gen:m3", "disamb": true}]}, {"index": 109, "position": [700, 701], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 110, "position": [702, 705], "orth": "ale", "lexemes": [{"lemma": "ale", "mstag": "conj", "disamb": true}]}, {"index": 111, "position": [706, 711], "orth": "także", "lexemes": [{"lemma": "także", "mstag": "conj", "disamb": true}]}, {"index": 112, "position": [712, 713], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 113, "position": [714, 721], "orth": "rzekach", "lexemes": [{"lemma": "rzeka", "mstag": "subst:pl:loc:f", "disamb": true}]}, {"index": 114, "position": [721, 722], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 115, "position": [723, 732], "orth": "jeziorach", "lexemes": [{"lemma": "jezioro", "mstag": "subst:pl:loc:n", "disamb": true}]}, {"index": 116, "position": [733, 734], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 117, "position": [735, 736], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 118, "position": [737, 744], "orth": "postaci", "lexemes": [{"lemma": "postać", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 119, "position": [745, 751], "orth": "stałej", "lexemes": [{"lemma": "stały", "mstag": "adj:sg:loc:f:pos", "disamb": true}]}, {"index": 120, "position": [752, 753], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 121, "position": [754, 763], "orth": "lodowcach", "lexemes": [{"lemma": "lodowiec", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 122, "position": [763, 764], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 123, "position": [765, 770], "orth": "Część", "lexemes": [{"lemma": "część", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 124, "position": [771, 775], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 125, "position": [776, 784], "orth": "znajduje", "lexemes": [{"lemma": "znajdować", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 126, "position": [785, 788], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "qub", "disamb": true}]}, {"index": 127, "position": [789, 790], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 128, "position": [791, 801], "orth": "atmosferze", "lexemes": [{"lemma": "atmosfera", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 129, "position": [802, 803], "orth": "(", "lexemes": [{"lemma": "(", "mstag": "interp", "disamb": true}]}, {"index": 130, "position": [803, 809], "orth": "chmury", "lexemes": [{"lemma": "chmura", "mstag": "subst:pl:nom:f", "disamb": true}]}, {"index": 131, "position": [809, 810], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 132, "position": [811, 815], "orth": "para", "lexemes": [{"lemma": "para", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 133, "position": [816, 821], "orth": "wodna", "lexemes": [{"lemma": "wodny", "mstag": "adj:sg:nom:f:pos", "disamb": true}]}, {"index": 134, "position": [821, 822], "orth": ")", "lexemes": [{"lemma": ")", "mstag": "interp", "disamb": true}]}, {"index": 135, "position": [822, 823], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 136, "position": [824, 832], "orth": "Niektóre", "lexemes": [{"lemma": "niektóry", "mstag": "adj:pl:nom:m3:pos", "disamb": true}]}, {"index": 137, "position": [833, 840], "orth": "związki", "lexemes": [{"lemma": "związek", "mstag": "subst:pl:nom:m3", "disamb": true}]}, {"index": 138, "position": [841, 850], "orth": "chemiczne", "lexemes": [{"lemma": "chemiczny", "mstag": "adj:pl:nom:m3:pos", "disamb": true}]}, {"index": 139, "position": [851, 860], "orth": "zawierają", "lexemes": [{"lemma": "zawierać", "mstag": "fin:pl:ter:imperf", "disamb": true}]}, {"index": 140, "position": [861, 871], "orth": "cząsteczki", "lexemes": [{"lemma": "cząsteczka", "mstag": "subst:pl:acc:f", "disamb": true}]}, {"index": 141, "position": [872, 876], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 142, "position": [877, 878], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 143, "position": [879, 885], "orth": "swojej", "lexemes": [{"lemma": "swój", "mstag": "adj:sg:loc:f:pos", "disamb": true}]}, {"index": 144, "position": [886, 893], "orth": "budowie", "lexemes": [{"lemma": "budowa", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 145, "position": [894, 895], "orth": "(", "lexemes": [{"lemma": "(", "mstag": "interp", "disamb": true}]}, {"index": 146, "position": [895, 902], "orth": "hydraty", "lexemes": [{"lemma": "hydrat", "mstag": "subst:pl:nom:m3", "disamb": true}]}, {"index": 147, "position": [903, 904], "orth": "–", "lexemes": [{"lemma": "–", "mstag": "interp", "disamb": true}]}, {"index": 148, "position": [905, 912], "orth": "określa", "lexemes": [{"lemma": "określać", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 149, "position": [913, 916], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "qub", "disamb": true}]}, {"index": 150, "position": [917, 919], "orth": "ją", "lexemes": [{"lemma": "on", "mstag": "ppron3:sg:acc:f:ter:akc:npraep", "disamb": true}]}, {"index": 151, "position": [920, 927], "orth": "wówczas", "lexemes": [{"lemma": "wówczas", "mstag": "adv", "disamb": true}]}, {"index": 152, "position": [928, 934], "orth": "mianem", "lexemes": [{"lemma": "miano", "mstag": "subst:sg:inst:n", "disamb": true}]}, {"index": 153, "position": [935, 939], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 154, "position": [940, 956], "orth": "krystalizacyjnej", "lexemes": [{"lemma": "krystalizacyjny", "mstag": "adj:sg:gen:f:pos", "disamb": true}]}, {"index": 155, "position": [956, 957], "orth": ")", "lexemes": [{"lemma": ")", "mstag": "interp", "disamb": true}]}, {"index": 156, "position": [957, 958], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 157, "position": [959, 968], "orth": "Zawartość", "lexemes": [{"lemma": "zawartość", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 158, "position": [969, 973], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 159, "position": [974, 983], "orth": "włączonej", "lexemes": [{"lemma": "włączyć", "mstag": "ppas:sg:gen:f:perf:aff", "disamb": true}]}, {"index": 160, "position": [984, 985], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:acc:nwok", "disamb": true}]}, {"index": 161, "position": [986, 995], "orth": "strukturę", "lexemes": [{"lemma": "struktura", "mstag": "subst:sg:acc:f", "disamb": true}]}, {"index": 162, "position": [996, 1005], "orth": "minerałów", "lexemes": [{"lemma": "minerał", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 163, "position": [1006, 1007], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 164, "position": [1008, 1016], "orth": "płaszczu", "lexemes": [{"lemma": "płaszcz", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 165, "position": [1017, 1022], "orth": "Ziemi", "lexemes": [{"lemma": "Ziemia", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 166, "position": [1023, 1027], "orth": "może", "lexemes": [{"lemma": "móc", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 167, "position": [1028, 1039], "orth": "przekraczać", "lexemes": [{"lemma": "przekraczać", "mstag": "inf:imperf", "disamb": true}]}, {"index": 168, "position": [1040, 1046], "orth": "łączną", "lexemes": [{"lemma": "łączny", "mstag": "adj:sg:acc:f:pos", "disamb": true}]}, {"index": 169, "position": [1047, 1056], "orth": "zawartość", "lexemes": [{"lemma": "zawartość", "mstag": "subst:sg:acc:f", "disamb": true}]}, {"index": 170, "position": [1057, 1061], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 171, "position": [1062, 1063], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 172, "position": [1064, 1072], "orth": "oceanach", "lexemes": [{"lemma": "ocean", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 173, "position": [1073, 1074], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 174, "position": [1075, 1081], "orth": "innych", "lexemes": [{"lemma": "inny", "mstag": "adj:pl:loc:m3:pos", "disamb": true}]}, {"index": 175, "position": [1082, 1093], "orth": "zbiornikach", "lexemes": [{"lemma": "zbiornik", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 176, "position": [1094, 1110], "orth": "powierzchniowych", "lexemes": [{"lemma": "powierzchniowy", "mstag": "adj:pl:loc:m3:pos", "disamb": true}]}, {"index": 177, "position": [1111, 1116], "orth": "nawet", "lexemes": [{"lemma": "nawet", "mstag": "qub", "disamb": true}]}, {"index": 178, "position": [1117, 1134], "orth": "dziesięciokrotnie", "lexemes": [{"lemma": "dziesięciokrotnie", "mstag": "adv:pos", "disamb": true}]}, {"index": 179, "position": [1134, 1135], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 180, "position": [1136, 1140], "orth": "Woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 181, "position": [1141, 1152], "orth": "występująca", "lexemes": [{"lemma": "występować", "mstag": "pact:sg:nom:f:imperf:aff", "disamb": true}]}, {"index": 182, "position": [1153, 1154], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 183, "position": [1155, 1165], "orth": "przyrodzie", "lexemes": [{"lemma": "przyroda", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 184, "position": [1166, 1170], "orth": "jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 185, "position": [1171, 1180], "orth": "roztworem", "lexemes": [{"lemma": "roztwór", "mstag": "subst:sg:inst:m3", "disamb": true}]}, {"index": 186, "position": [1181, 1185], "orth": "soli", "lexemes": [{"lemma": "sól", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 187, "position": [1186, 1187], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 188, "position": [1188, 1193], "orth": "gazów", "lexemes": [{"lemma": "gaz", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 189, "position": [1193, 1194], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 190, "position": [1195, 1204], "orth": "Najwięcej", "lexemes": [{"lemma": "najwięcej", "mstag": "num:pl:acc:f:rec", "disamb": true}]}, {"index": 191, "position": [1205, 1209], "orth": "soli", "lexemes": [{"lemma": "sól", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 192, "position": [1210, 1221], "orth": "mineralnych", "lexemes": [{"lemma": "mineralny", "mstag": "adj:pl:gen:f:pos", "disamb": true}]}, {"index": 193, "position": [1222, 1229], "orth": "zawiera", "lexemes": [{"lemma": "zawierać", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 194, "position": [1230, 1234], "orth": "woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 195, "position": [1235, 1241], "orth": "morska", "lexemes": [{"lemma": "morski", "mstag": "adj:sg:nom:f:pos", "disamb": true}]}, {"index": 196, "position": [1242, 1243], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 197, "position": [1244, 1248], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:pl:acc:f", "disamb": true}]}, {"index": 198, "position": [1249, 1258], "orth": "mineralne", "lexemes": [{"lemma": "mineralny", "mstag": "adj:pl:acc:f:pos", "disamb": true}]}, {"index": 199, "position": [1258, 1259], "orth": ";", "lexemes": [{"lemma": ";", "mstag": "interp", "disamb": true}]}, {"index": 200, "position": [1260, 1268], "orth": "najmniej", "lexemes": [{"lemma": "najmniej", "mstag": "num:pl:nom:f:rec", "disamb": true}]}, {"index": 201, "position": [1269, 1273], "orth": "woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 202, "position": [1274, 1275], "orth": "z", "lexemes": [{"lemma": "z", "mstag": "prep:gen:nwok", "disamb": true}]}, {"index": 203, "position": [1276, 1282], "orth": "opadów", "lexemes": [{"lemma": "opad", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 204, "position": [1283, 1298], "orth": "atmosferycznych", "lexemes": [{"lemma": "atmosferyczny", "mstag": "adj:pl:gen:m3:pos", "disamb": true}]}, {"index": 205, "position": [1298, 1299], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 206, "position": [1300, 1304], "orth": "Wodę", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:acc:f", "disamb": true}]}, {"index": 207, "position": [1305, 1306], "orth": "o", "lexemes": [{"lemma": "o", "mstag": "prep:loc", "disamb": true}]}, {"index": 208, "position": [1307, 1312], "orth": "małej", "lexemes": [{"lemma": "mały", "mstag": "adj:sg:loc:f:pos", "disamb": true}]}, {"index": 209, "position": [1313, 1323], "orth": "zawartości", "lexemes": [{"lemma": "zawartość", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 210, "position": [1324, 1334], "orth": "składników", "lexemes": [{"lemma": "składnik", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 211, "position": [1335, 1346], "orth": "mineralnych", "lexemes": [{"lemma": "mineralny", "mstag": "adj:pl:gen:m3:pos", "disamb": true}]}, {"index": 212, "position": [1347, 1355], "orth": "nazywamy", "lexemes": [{"lemma": "nazywać", "mstag": "fin:pl:pri:imperf", "disamb": true}]}, {"index": 213, "position": [1356, 1360], "orth": "wodą", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:inst:f", "disamb": true}]}, {"index": 214, "position": [1361, 1367], "orth": "miękką", "lexemes": [{"lemma": "miękki", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 215, "position": [1367, 1368], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 216, "position": [1369, 1378], "orth": "natomiast", "lexemes": [{"lemma": "natomiast", "mstag": "conj", "disamb": true}]}, {"index": 217, "position": [1379, 1390], "orth": "zawierającą", "lexemes": [{"lemma": "zawierać", "mstag": "pact:sg:acc:f:imperf:aff", "disamb": true}]}, {"index": 218, "position": [1391, 1398], "orth": "znaczne", "lexemes": [{"lemma": "znaczny", "mstag": "adj:pl:acc:f:pos", "disamb": true}]}, {"index": 219, "position": [1399, 1405], "orth": "ilości", "lexemes": [{"lemma": "ilość", "mstag": "subst:pl:acc:f", "disamb": true}]}, {"index": 220, "position": [1406, 1410], "orth": "soli", "lexemes": [{"lemma": "sól", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 221, "position": [1411, 1417], "orth": "wapnia", "lexemes": [{"lemma": "wapń", "mstag": "subst:sg:gen:m3", "disamb": true}]}, {"index": 222, "position": [1418, 1419], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 223, "position": [1420, 1427], "orth": "magnezu", "lexemes": [{"lemma": "magnez", "mstag": "subst:sg:gen:m3", "disamb": true}]}, {"index": 224, "position": [1428, 1429], "orth": "–", "lexemes": [{"lemma": "–", "mstag": "interp", "disamb": true}]}, {"index": 225, "position": [1430, 1434], "orth": "wodą", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:inst:f", "disamb": true}]}, {"index": 226, "position": [1435, 1441], "orth": "twardą", "lexemes": [{"lemma": "twardy", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 227, "position": [1441, 1442], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 228, "position": [1443, 1449], "orth": "Oprócz", "lexemes": [{"lemma": "oprócz", "mstag": "prep:gen", "disamb": true}]}, {"index": 229, "position": [1450, 1454], "orth": "tego", "lexemes": [{"lemma": "to", "mstag": "subst:sg:gen:n", "disamb": true}]}, {"index": 230, "position": [1455, 1459], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:pl:nom:f", "disamb": true}]}, {"index": 231, "position": [1460, 1469], "orth": "naturalne", "lexemes": [{"lemma": "naturalny", "mstag": "adj:pl:nom:f:pos", "disamb": true}]}, {"index": 232, "position": [1470, 1479], "orth": "zawierają", "lexemes": [{"lemma": "zawierać", "mstag": "fin:pl:ter:imperf", "disamb": true}]}, {"index": 233, "position": [1480, 1492], "orth": "rozpuszczone", "lexemes": [{"lemma": "rozpuścić", "mstag": "ppas:pl:nom:f:perf:aff", "disamb": true}]}, {"index": 234, "position": [1493, 1503], "orth": "substancje", "lexemes": [{"lemma": "substancja", "mstag": "subst:pl:nom:f", "disamb": true}]}, {"index": 235, "position": [1504, 1515], "orth": "pochodzenia", "lexemes": [{"lemma": "pochodzenie", "mstag": "subst:sg:gen:n", "disamb": true}]}, {"index": 236, "position": [1516, 1528], "orth": "organicznego", "lexemes": [{"lemma": "organiczny", "mstag": "adj:sg:gen:n:pos", "disamb": true}]}, {"index": 237, "position": [1528, 1529], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 238, "position": [1530, 1532], "orth": "np", "lexemes": [{"lemma": "na przykład", "mstag": "brev:pun", "disamb": true}]}, {"index": 239, "position": [1532, 1533], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 240, "position": [1534, 1541], "orth": "mocznik", "lexemes": [{"lemma": "mocznik", "mstag": "subst:sg:nom:m3", "disamb": true}]}, {"index": 241, "position": [1541, 1542], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 242, "position": [1543, 1548], "orth": "kwasy", "lexemes": [{"lemma": "kwas", "mstag": "subst:pl:nom:m3", "disamb": true}]}, {"index": 243, "position": [1549, 1557], "orth": "humusowe", "lexemes": [{"lemma": "humusowy", "mstag": "adj:pl:nom:m3:pos", "disamb": true}]}, {"index": 244, "position": [1558, 1561], "orth": "itp", "lexemes": [{"lemma": "i tym podobne", "mstag": "brev:pun", "disamb": true}]}, {"index": 245, "position": [1561, 1562], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}], "entities": [{"text": "Wszechświecie", "type": "nam_loc_astronomical", "tokens": [7, 8], "positions": [49, 62]}, {"text": "Układzie", "type": "nam_loc_astronomical", "tokens": [32, 33], "positions": [240, 248]}, {"text": "Słonecznym", "type": "nam_loc_country_region", "tokens": [33, 34], "positions": [249, 259]}, {"text": "Ceres", "type": "nam_loc_astronomical", "tokens": [39, 40], "positions": [292, 297]}, {"text": "Ziemi", "type": "nam_loc_astronomical", "tokens": [93, 94], "positions": [624, 629]}, {"text": "Ziemi", "type": "nam_loc_astronomical", "tokens": [164, 165], "positions": [1017, 1022]}]} diff --git a/tests/testdata/input/pos_tagger.yaml b/tests/testdata/input/pos_tagger.yaml index 9b8e4431d08327a73cbc9f857dd5e36caccb82ee..441831026407b1d274335f3db0d455fcfa966286 100644 --- a/tests/testdata/input/pos_tagger.yaml +++ b/tests/testdata/input/pos_tagger.yaml @@ -9,3 +9,14 @@ taggers: lpmn: [{"spacy":{"lang":"en"}}] output: json tagset: ud +ners: + pl: + default: + lpmn: ['morphodita', {'posconverter': {'input_format': 'ccl', 'output_format': 'json'}}, 'winer'] + output: json + tagset: nkjp + en: + default: + lpmn: [{"spacy":{"lang":"en", 'method': 'ner'}}] + output: json + tagset: ud