diff --git a/lpmn_queries.json b/lpmn_queries.json new file mode 100644 index 0000000000000000000000000000000000000000..0a8f6a1ce6415435cecb27812bee7586edc3baff --- /dev/null +++ b/lpmn_queries.json @@ -0,0 +1,11 @@ +{ + "postagger_lone_json": {"task": [{"postagger": {"lang": "pl", "output": "json", "json_text": false}}], "input": "post_postagger_input", "expected": "post_postagger_input.json"}, + + "pre_winer": {"task": [{"postagger": {"lang": "pl", "output": "json"}}, "winer"], "input": "post_postagger_input", "expected": "pre_winer.json"}, + + "pre_fextor3": {"task": ["any2txt", {"postagger": {"lang": "pl", "output": "json"}}, "fextor3"], "input": "pre_fextor3_input", "expected": "pre_fextor3_expected.json"}, + + "post_any2txt": {"task": ["any2txt", {"postagger": {"lang": "en", "output": "json"}}], "input": "post_spacy_input", "expected": "post_spacy_expected.json"}, + + "postagger_lone_lemmas": {"task": [{"postagger": {"lang": "pl", "output": "lemmas"}}], "input": "post_postagger_input", "expected": "post_postagger_input_lemmas"} +} diff --git a/src/tagger.py b/src/tagger.py index a5843fcd770b1da795f7123494f331f351d9003c..cff461484835afcba9520a2c2a83ce0603395048 100644 --- a/src/tagger.py +++ b/src/tagger.py @@ -131,7 +131,8 @@ class TaggerWorker(nlp_ws.NLPWorker): _log ) # remove tmp directory - shutil.rmtree(destination_path) + if os.path.exists(destination_path): + shutil.rmtree(destination_path) except Exception as e: if os.path.exists(destination_path): shutil.rmtree(destination_path) diff --git a/src/utils.py b/src/utils.py index 4a1f4f10281396a078df0a5af4bd9e704652df5d..05227d747d415d1096c2f6095d93a2e792fd77d9 100644 --- a/src/utils.py +++ b/src/utils.py @@ -105,8 +105,9 @@ def split_corpus(source_path: str, destination_path: str, file_name: str, return None if chunk_size >= file_size: - logging.error('Chunk size is greater than/equal to file size!') - return None + logging.info('Chunk size is greater than/equal to ' + 'file size, no splitting') + return [source_path] logging.debug(f'Creating the tree... ({destination_path})') Path(destination_path).mkdir(parents=True, exist_ok=True) diff --git a/tests/conftest.py b/tests/conftest.py index 77e0a40ba09f447d58ec09b78c125e8a9b86ac1d..e19c9463700e4ff398f02b26f775618c49d7214b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -29,6 +29,11 @@ def input_file1(): return 'post_spacy_input' +@pytest.fixture +def input_file_small(): + return 'post_spacy_small_limit_input' + + @pytest.fixture def input_dir2(): return 'input_dir2' @@ -39,6 +44,11 @@ def config(input_dir): return {'tool': {'config': join(input_dir, 'pos_tagger.yaml')}} +@pytest.fixture +def config_small(input_dir): + return {'tool': {'config': join(input_dir, 'pos_tagger.yaml'), 'chunking_limit': 50}} + + @pytest.fixture def worker(config): worker = TaggerWorker() @@ -46,3 +56,8 @@ def worker(config): return worker +@pytest.fixture +def worker_small(config_small): + worker = TaggerWorker() + worker.static_init(config_small) + return worker diff --git a/tests/test.py b/tests/test.py index 99661a4d937b31cb680b2b8101498ac442b9889c..d79f410883d63aaf7830a5c17c4c2432fd567124 100644 --- a/tests/test.py +++ b/tests/test.py @@ -22,20 +22,39 @@ def test_init(): assert type(worker).__name__ == 'TaggerWorker' -# def test_base_process_file(mocker, worker, input_dir, input_file1, -# output_dir, expected_dir): -# mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None) -# mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path) -# mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask) -# mocker.patch('nlp_ws._worker.NLPWorker.update_progress') -# SubTask.prepare_subtask( -# {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()}, -# os.getpid() -# ) -# worker.process( -# os.path.join(input_dir, input_file1), -# {}, os.path.join(output_dir, input_file1) -# ) -# assert cmp(os.path.join(output_dir, input_file1), -# os.path.join(expected_dir, input_file1)) -# os.remove(os.path.join(output_dir, input_file1)) +def test_base_process_file(mocker, worker, input_dir, input_file1, + output_dir, expected_dir): + mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None) + mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path) + mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask) + mocker.patch('nlp_ws._worker.NLPWorker.update_progress') + SubTask.prepare_subtask( + {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()}, + os.getpid() + ) + worker.process( + os.path.join(input_dir, input_file1), + {}, os.path.join(output_dir, input_file1) + ) + assert cmp(os.path.join(output_dir, input_file1), + os.path.join(expected_dir, input_file1)) + os.remove(os.path.join(output_dir, input_file1)) + + +def test_base_process_file_small_limit(mocker, worker_small, input_dir, input_file_small, + output_dir, expected_dir): + mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None) + mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path) + mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask) + mocker.patch('nlp_ws._worker.NLPWorker.update_progress') + SubTask.prepare_subtask( + {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()}, + os.getpid() + ) + worker_small.process( + os.path.join(input_dir, input_file_small), + {}, os.path.join(output_dir, input_file_small) + ) + assert cmp(os.path.join(output_dir, input_file_small), + os.path.join(expected_dir, input_file_small)) + os.remove(os.path.join(output_dir, input_file_small)) diff --git a/tests/testdata/expected/post_postagger_input.json b/tests/testdata/expected/post_postagger_input.json new file mode 100644 index 0000000000000000000000000000000000000000..2ff888963654c042d5f489e5ef228bc57aba11b7 --- /dev/null +++ b/tests/testdata/expected/post_postagger_input.json @@ -0,0 +1 @@ +{"filename": "a4281403-fffb-4382-b2a1-2ad171c4d004", "tagset": "nkjp", "tokens": [{"index": 1, "position": [0, 4], "orth": "Woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 2, "position": [5, 9], "orth": "jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 3, "position": [10, 15], "orth": "jedną", "lexemes": [{"lemma": "jeden", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 4, "position": [16, 17], "orth": "z", "lexemes": [{"lemma": "z", "mstag": "prep:gen:nwok", "disamb": true}]}, {"index": 5, "position": [18, 34], "orth": "najpospolitszych", "lexemes": [{"lemma": "pospolity", "mstag": "adj:pl:gen:f:sup", "disamb": true}]}, {"index": 6, "position": [35, 45], "orth": "substancji", "lexemes": [{"lemma": "substancja", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 7, "position": [46, 48], "orth": "we", "lexemes": [{"lemma": "w", "mstag": "prep:loc:wok", "disamb": true}]}, {"index": 8, "position": [49, 62], "orth": "Wszechświecie", "lexemes": [{"lemma": "wszechświat", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 9, "position": [62, 63], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 10, "position": [64, 74], "orth": "Cząsteczka", "lexemes": [{"lemma": "cząsteczka", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 11, "position": [75, 79], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 12, "position": [80, 84], "orth": "jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 13, "position": [85, 92], "orth": "trzecią", "lexemes": [{"lemma": "trzeci", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 14, "position": [93, 104], "orth": "najbardziej", "lexemes": [{"lemma": "bardzo", "mstag": "adv:sup", "disamb": true}]}, {"index": 15, "position": [105, 121], "orth": "rozpowszechnioną", "lexemes": [{"lemma": "rozpowszechniony", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 16, "position": [122, 130], "orth": "molekułą", "lexemes": [{"lemma": "molekuła", "mstag": "subst:sg:inst:f", "disamb": true}]}, {"index": 17, "position": [131, 132], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 18, "position": [133, 140], "orth": "ośrodku", "lexemes": [{"lemma": "ośrodek", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 19, "position": [141, 157], "orth": "międzygwiazdowym", "lexemes": [{"lemma": "międzygwiazdowy", "mstag": "adj:sg:loc:m3:pos", "disamb": true}]}, {"index": 20, "position": [157, 158], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 21, "position": [159, 161], "orth": "po", "lexemes": [{"lemma": "po", "mstag": "prep:loc", "disamb": true}]}, {"index": 22, "position": [162, 175], "orth": "cząsteczkowym", "lexemes": [{"lemma": "cząsteczkowy", "mstag": "adj:sg:loc:m3:pos", "disamb": true}]}, {"index": 23, "position": [176, 183], "orth": "wodorze", "lexemes": [{"lemma": "wodór", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 24, "position": [184, 185], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 25, "position": [186, 192], "orth": "tlenku", "lexemes": [{"lemma": "tlenek", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 26, "position": [193, 198], "orth": "węgla", "lexemes": [{"lemma": "węgiel", "mstag": "subst:sg:gen:m3", "disamb": true}]}, {"index": 27, "position": [198, 199], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 28, "position": [200, 204], "orth": "Jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 29, "position": [205, 212], "orth": "również", "lexemes": [{"lemma": "również", "mstag": "qub", "disamb": true}]}, {"index": 30, "position": [213, 220], "orth": "szeroko", "lexemes": [{"lemma": "szeroko", "mstag": "adv:pos", "disamb": true}]}, {"index": 31, "position": [221, 237], "orth": "rozpowszechniona", "lexemes": [{"lemma": "rozpowszechniony", "mstag": "adj:sg:nom:f:pos", "disamb": true}]}, {"index": 32, "position": [238, 239], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 33, "position": [240, 248], "orth": "Układzie", "lexemes": [{"lemma": "Układ", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 34, "position": [249, 259], "orth": "Słonecznym", "lexemes": [{"lemma": "Słoneczny", "mstag": "adj:sg:loc:m3:pos", "disamb": true}]}, {"index": 35, "position": [259, 260], "orth": ":", "lexemes": [{"lemma": ":", "mstag": "interp", "disamb": true}]}, {"index": 36, "position": [261, 268], "orth": "stanowi", "lexemes": [{"lemma": "stanowić", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 37, "position": [269, 276], "orth": "istotny", "lexemes": [{"lemma": "istotny", "mstag": "adj:sg:acc:m3:pos", "disamb": true}]}, {"index": 38, "position": [277, 284], "orth": "element", "lexemes": [{"lemma": "element", "mstag": "subst:sg:acc:m3", "disamb": true}]}, {"index": 39, "position": [285, 291], "orth": "budowy", "lexemes": [{"lemma": "budowa", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 40, "position": [292, 297], "orth": "Ceres", "lexemes": [{"lemma": "ceres", "mstag": "subst:sg:acc:m3", "disamb": true}]}, {"index": 41, "position": [298, 299], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 42, "position": [300, 309], "orth": "księżyców", "lexemes": [{"lemma": "księżyc", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 43, "position": [310, 318], "orth": "lodowych", "lexemes": [{"lemma": "lodowy", "mstag": "adj:pl:gen:m3:pos", "disamb": true}]}, {"index": 44, "position": [319, 328], "orth": "krążących", "lexemes": [{"lemma": "krążyć", "mstag": "pact:pl:gen:f:imperf:aff", "disamb": true}]}, {"index": 45, "position": [329, 334], "orth": "wokół", "lexemes": [{"lemma": "wokół", "mstag": "prep:gen", "disamb": true}]}, {"index": 46, "position": [335, 341], "orth": "planet", "lexemes": [{"lemma": "planeta", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 47, "position": [341, 342], "orth": "-", "lexemes": [{"lemma": "-", "mstag": "interp", "disamb": true}]}, {"index": 48, "position": [342, 351], "orth": "olbrzymów", "lexemes": [{"lemma": "olbrzym", "mstag": "subst:pl:gen:m1", "disamb": true}]}, {"index": 49, "position": [351, 352], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 50, "position": [353, 357], "orth": "jako", "lexemes": [{"lemma": "jako", "mstag": "adv", "disamb": true}]}, {"index": 51, "position": [358, 367], "orth": "domieszka", "lexemes": [{"lemma": "domieszka", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 52, "position": [368, 377], "orth": "występuje", "lexemes": [{"lemma": "występować", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 53, "position": [378, 379], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 54, "position": [380, 383], "orth": "ich", "lexemes": [{"lemma": "on", "mstag": "ppron3:pl:gen:m1:ter:akc:npraep", "disamb": true}]}, {"index": 55, "position": [384, 395], "orth": "atmosferach", "lexemes": [{"lemma": "atmosfera", "mstag": "subst:pl:loc:f", "disamb": true}]}, {"index": 56, "position": [395, 396], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 57, "position": [397, 398], "orth": "a", "lexemes": [{"lemma": "a", "mstag": "conj", "disamb": true}]}, {"index": 58, "position": [399, 410], "orth": "przypuszcza", "lexemes": [{"lemma": "przypuszczać", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 59, "position": [411, 414], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "qub", "disamb": true}]}, {"index": 60, "position": [414, 415], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 61, "position": [416, 418], "orth": "że", "lexemes": [{"lemma": "że", "mstag": "comp", "disamb": true}]}, {"index": 62, "position": [419, 423], "orth": "duże", "lexemes": [{"lemma": "duży", "mstag": "adj:pl:nom:f:pos", "disamb": true}]}, {"index": 63, "position": [424, 427], "orth": "jej", "lexemes": [{"lemma": "on", "mstag": "ppron3:sg:gen:f:ter:akc:npraep", "disamb": true}]}, {"index": 64, "position": [428, 434], "orth": "ilości", "lexemes": [{"lemma": "ilość", "mstag": "subst:pl:nom:f", "disamb": true}]}, {"index": 65, "position": [435, 443], "orth": "znajdują", "lexemes": [{"lemma": "znajdować", "mstag": "fin:pl:ter:imperf", "disamb": true}]}, {"index": 66, "position": [444, 447], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "qub", "disamb": true}]}, {"index": 67, "position": [448, 450], "orth": "we", "lexemes": [{"lemma": "w", "mstag": "prep:loc:wok", "disamb": true}]}, {"index": 68, "position": [451, 460], "orth": "wnętrzach", "lexemes": [{"lemma": "wnętrze", "mstag": "subst:pl:loc:n", "disamb": true}]}, {"index": 69, "position": [461, 465], "orth": "tych", "lexemes": [{"lemma": "ten", "mstag": "adj:pl:gen:f:pos", "disamb": true}]}, {"index": 70, "position": [466, 472], "orth": "planet", "lexemes": [{"lemma": "planeta", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 71, "position": [472, 473], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 72, "position": [474, 478], "orth": "Jako", "lexemes": [{"lemma": "jako", "mstag": "conj", "disamb": true}]}, {"index": 73, "position": [479, 482], "orth": "lód", "lexemes": [{"lemma": "lód", "mstag": "subst:sg:nom:m3", "disamb": true}]}, {"index": 74, "position": [483, 492], "orth": "występuje", "lexemes": [{"lemma": "występować", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 75, "position": [493, 498], "orth": "także", "lexemes": [{"lemma": "także", "mstag": "qub", "disamb": true}]}, {"index": 76, "position": [499, 501], "orth": "na", "lexemes": [{"lemma": "na", "mstag": "prep:loc", "disamb": true}]}, {"index": 77, "position": [502, 508], "orth": "części", "lexemes": [{"lemma": "część", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 78, "position": [509, 518], "orth": "planetoid", "lexemes": [{"lemma": "planetoida", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 79, "position": [518, 519], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 80, "position": [520, 521], "orth": "a", "lexemes": [{"lemma": "a", "mstag": "conj", "disamb": true}]}, {"index": 81, "position": [522, 529], "orth": "zapewne", "lexemes": [{"lemma": "zapewne", "mstag": "qub", "disamb": true}]}, {"index": 82, "position": [530, 537], "orth": "również", "lexemes": [{"lemma": "również", "mstag": "qub", "disamb": true}]}, {"index": 83, "position": [538, 540], "orth": "na", "lexemes": [{"lemma": "na", "mstag": "prep:loc", "disamb": true}]}, {"index": 84, "position": [541, 550], "orth": "obiektach", "lexemes": [{"lemma": "obiekt", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 85, "position": [551, 567], "orth": "transneptunowych", "lexemes": [{"lemma": "transneptunowych", "mstag": "ign", "disamb": true}]}, {"index": 86, "position": [567, 568], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 87, "position": [569, 573], "orth": "Woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 88, "position": [574, 578], "orth": "jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 89, "position": [579, 585], "orth": "bardzo", "lexemes": [{"lemma": "bardzo", "mstag": "adv:pos", "disamb": true}]}, {"index": 90, "position": [586, 602], "orth": "rozpowszechniona", "lexemes": [{"lemma": "rozpowszechniony", "mstag": "adj:sg:nom:f:pos", "disamb": true}]}, {"index": 91, "position": [603, 608], "orth": "także", "lexemes": [{"lemma": "także", "mstag": "conj", "disamb": true}]}, {"index": 92, "position": [609, 611], "orth": "na", "lexemes": [{"lemma": "na", "mstag": "prep:loc", "disamb": true}]}, {"index": 93, "position": [612, 623], "orth": "powierzchni", "lexemes": [{"lemma": "powierzchnia", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 94, "position": [624, 629], "orth": "Ziemi", "lexemes": [{"lemma": "Ziemia", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 95, "position": [629, 630], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 96, "position": [631, 640], "orth": "Występuje", "lexemes": [{"lemma": "występować", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 97, "position": [641, 648], "orth": "głównie", "lexemes": [{"lemma": "głównie", "mstag": "qub", "disamb": true}]}, {"index": 98, "position": [649, 650], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 99, "position": [651, 659], "orth": "oceanach", "lexemes": [{"lemma": "ocean", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 100, "position": [659, 660], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 101, "position": [661, 666], "orth": "które", "lexemes": [{"lemma": "który", "mstag": "adj:pl:nom:m3:pos", "disamb": true}]}, {"index": 102, "position": [667, 676], "orth": "pokrywają", "lexemes": [{"lemma": "pokrywać", "mstag": "fin:pl:ter:imperf", "disamb": true}]}, {"index": 103, "position": [677, 679], "orth": "70", "lexemes": [{"lemma": "70", "mstag": "num:pl:acc:m3:rec", "disamb": true}]}, {"index": 104, "position": [679, 680], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 105, "position": [680, 681], "orth": "8", "lexemes": [{"lemma": "8", "mstag": "num:pl:acc:m3:rec", "disamb": true}]}, {"index": 106, "position": [681, 682], "orth": "%", "lexemes": [{"lemma": "%", "mstag": "subst:sg:nom:m3", "disamb": true}]}, {"index": 107, "position": [683, 694], "orth": "powierzchni", "lexemes": [{"lemma": "powierzchnia", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 108, "position": [695, 700], "orth": "globu", "lexemes": [{"lemma": "glob", "mstag": "subst:sg:gen:m3", "disamb": true}]}, {"index": 109, "position": [700, 701], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 110, "position": [702, 705], "orth": "ale", "lexemes": [{"lemma": "ale", "mstag": "conj", "disamb": true}]}, {"index": 111, "position": [706, 711], "orth": "także", "lexemes": [{"lemma": "także", "mstag": "conj", "disamb": true}]}, {"index": 112, "position": [712, 713], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 113, "position": [714, 721], "orth": "rzekach", "lexemes": [{"lemma": "rzeka", "mstag": "subst:pl:loc:f", "disamb": true}]}, {"index": 114, "position": [721, 722], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 115, "position": [723, 732], "orth": "jeziorach", "lexemes": [{"lemma": "jezioro", "mstag": "subst:pl:loc:n", "disamb": true}]}, {"index": 116, "position": [733, 734], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 117, "position": [735, 736], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 118, "position": [737, 744], "orth": "postaci", "lexemes": [{"lemma": "postać", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 119, "position": [745, 751], "orth": "stałej", "lexemes": [{"lemma": "stały", "mstag": "adj:sg:loc:f:pos", "disamb": true}]}, {"index": 120, "position": [752, 753], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 121, "position": [754, 763], "orth": "lodowcach", "lexemes": [{"lemma": "lodowiec", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 122, "position": [763, 764], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 123, "position": [765, 770], "orth": "Część", "lexemes": [{"lemma": "część", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 124, "position": [771, 775], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 125, "position": [776, 784], "orth": "znajduje", "lexemes": [{"lemma": "znajdować", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 126, "position": [785, 788], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "qub", "disamb": true}]}, {"index": 127, "position": [789, 790], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 128, "position": [791, 801], "orth": "atmosferze", "lexemes": [{"lemma": "atmosfera", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 129, "position": [802, 803], "orth": "(", "lexemes": [{"lemma": "(", "mstag": "interp", "disamb": true}]}, {"index": 130, "position": [803, 809], "orth": "chmury", "lexemes": [{"lemma": "chmura", "mstag": "subst:pl:nom:f", "disamb": true}]}, {"index": 131, "position": [809, 810], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 132, "position": [811, 815], "orth": "para", "lexemes": [{"lemma": "para", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 133, "position": [816, 821], "orth": "wodna", "lexemes": [{"lemma": "wodny", "mstag": "adj:sg:nom:f:pos", "disamb": true}]}, {"index": 134, "position": [821, 822], "orth": ")", "lexemes": [{"lemma": ")", "mstag": "interp", "disamb": true}]}, {"index": 135, "position": [822, 823], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 136, "position": [824, 832], "orth": "Niektóre", "lexemes": [{"lemma": "niektóry", "mstag": "adj:pl:nom:m3:pos", "disamb": true}]}, {"index": 137, "position": [833, 840], "orth": "związki", "lexemes": [{"lemma": "związek", "mstag": "subst:pl:nom:m3", "disamb": true}]}, {"index": 138, "position": [841, 850], "orth": "chemiczne", "lexemes": [{"lemma": "chemiczny", "mstag": "adj:pl:nom:m3:pos", "disamb": true}]}, {"index": 139, "position": [851, 860], "orth": "zawierają", "lexemes": [{"lemma": "zawierać", "mstag": "fin:pl:ter:imperf", "disamb": true}]}, {"index": 140, "position": [861, 871], "orth": "cząsteczki", "lexemes": [{"lemma": "cząsteczka", "mstag": "subst:pl:acc:f", "disamb": true}]}, {"index": 141, "position": [872, 876], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 142, "position": [877, 878], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 143, "position": [879, 885], "orth": "swojej", "lexemes": [{"lemma": "swój", "mstag": "adj:sg:loc:f:pos", "disamb": true}]}, {"index": 144, "position": [886, 893], "orth": "budowie", "lexemes": [{"lemma": "budowa", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 145, "position": [894, 895], "orth": "(", "lexemes": [{"lemma": "(", "mstag": "interp", "disamb": true}]}, {"index": 146, "position": [895, 902], "orth": "hydraty", "lexemes": [{"lemma": "hydrat", "mstag": "subst:pl:nom:m3", "disamb": true}]}, {"index": 147, "position": [903, 904], "orth": "–", "lexemes": [{"lemma": "–", "mstag": "interp", "disamb": true}]}, {"index": 148, "position": [905, 912], "orth": "określa", "lexemes": [{"lemma": "określać", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 149, "position": [913, 916], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "qub", "disamb": true}]}, {"index": 150, "position": [917, 919], "orth": "ją", "lexemes": [{"lemma": "on", "mstag": "ppron3:sg:acc:f:ter:akc:npraep", "disamb": true}]}, {"index": 151, "position": [920, 927], "orth": "wówczas", "lexemes": [{"lemma": "wówczas", "mstag": "adv", "disamb": true}]}, {"index": 152, "position": [928, 934], "orth": "mianem", "lexemes": [{"lemma": "miano", "mstag": "subst:sg:inst:n", "disamb": true}]}, {"index": 153, "position": [935, 939], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 154, "position": [940, 956], "orth": "krystalizacyjnej", "lexemes": [{"lemma": "krystalizacyjny", "mstag": "adj:sg:gen:f:pos", "disamb": true}]}, {"index": 155, "position": [956, 957], "orth": ")", "lexemes": [{"lemma": ")", "mstag": "interp", "disamb": true}]}, {"index": 156, "position": [957, 958], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 157, "position": [959, 968], "orth": "Zawartość", "lexemes": [{"lemma": "zawartość", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 158, "position": [969, 973], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 159, "position": [974, 983], "orth": "włączonej", "lexemes": [{"lemma": "włączyć", "mstag": "ppas:sg:gen:f:perf:aff", "disamb": true}]}, {"index": 160, "position": [984, 985], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:acc:nwok", "disamb": true}]}, {"index": 161, "position": [986, 995], "orth": "strukturę", "lexemes": [{"lemma": "struktura", "mstag": "subst:sg:acc:f", "disamb": true}]}, {"index": 162, "position": [996, 1005], "orth": "minerałów", "lexemes": [{"lemma": "minerał", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 163, "position": [1006, 1007], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 164, "position": [1008, 1016], "orth": "płaszczu", "lexemes": [{"lemma": "płaszcz", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 165, "position": [1017, 1022], "orth": "Ziemi", "lexemes": [{"lemma": "Ziemia", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 166, "position": [1023, 1027], "orth": "może", "lexemes": [{"lemma": "móc", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 167, "position": [1028, 1039], "orth": "przekraczać", "lexemes": [{"lemma": "przekraczać", "mstag": "inf:imperf", "disamb": true}]}, {"index": 168, "position": [1040, 1046], "orth": "łączną", "lexemes": [{"lemma": "łączny", "mstag": "adj:sg:acc:f:pos", "disamb": true}]}, {"index": 169, "position": [1047, 1056], "orth": "zawartość", "lexemes": [{"lemma": "zawartość", "mstag": "subst:sg:acc:f", "disamb": true}]}, {"index": 170, "position": [1057, 1061], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 171, "position": [1062, 1063], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 172, "position": [1064, 1072], "orth": "oceanach", "lexemes": [{"lemma": "ocean", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 173, "position": [1073, 1074], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 174, "position": [1075, 1081], "orth": "innych", "lexemes": [{"lemma": "inny", "mstag": "adj:pl:loc:m3:pos", "disamb": true}]}, {"index": 175, "position": [1082, 1093], "orth": "zbiornikach", "lexemes": [{"lemma": "zbiornik", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 176, "position": [1094, 1110], "orth": "powierzchniowych", "lexemes": [{"lemma": "powierzchniowy", "mstag": "adj:pl:loc:m3:pos", "disamb": true}]}, {"index": 177, "position": [1111, 1116], "orth": "nawet", "lexemes": [{"lemma": "nawet", "mstag": "qub", "disamb": true}]}, {"index": 178, "position": [1117, 1134], "orth": "dziesięciokrotnie", "lexemes": [{"lemma": "dziesięciokrotnie", "mstag": "adv:pos", "disamb": true}]}, {"index": 179, "position": [1134, 1135], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 180, "position": [1136, 1140], "orth": "Woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 181, "position": [1141, 1152], "orth": "występująca", "lexemes": [{"lemma": "występować", "mstag": "pact:sg:nom:f:imperf:aff", "disamb": true}]}, {"index": 182, "position": [1153, 1154], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 183, "position": [1155, 1165], "orth": "przyrodzie", "lexemes": [{"lemma": "przyroda", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 184, "position": [1166, 1170], "orth": "jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 185, "position": [1171, 1180], "orth": "roztworem", "lexemes": [{"lemma": "roztwór", "mstag": "subst:sg:inst:m3", "disamb": true}]}, {"index": 186, "position": [1181, 1185], "orth": "soli", "lexemes": [{"lemma": "sól", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 187, "position": [1186, 1187], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 188, "position": [1188, 1193], "orth": "gazów", "lexemes": [{"lemma": "gaz", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 189, "position": [1193, 1194], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 190, "position": [1195, 1204], "orth": "Najwięcej", "lexemes": [{"lemma": "najwięcej", "mstag": "num:pl:acc:f:rec", "disamb": true}]}, {"index": 191, "position": [1205, 1209], "orth": "soli", "lexemes": [{"lemma": "sól", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 192, "position": [1210, 1221], "orth": "mineralnych", "lexemes": [{"lemma": "mineralny", "mstag": "adj:pl:gen:f:pos", "disamb": true}]}, {"index": 193, "position": [1222, 1229], "orth": "zawiera", "lexemes": [{"lemma": "zawierać", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 194, "position": [1230, 1234], "orth": "woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 195, "position": [1235, 1241], "orth": "morska", "lexemes": [{"lemma": "morski", "mstag": "adj:sg:nom:f:pos", "disamb": true}]}, {"index": 196, "position": [1242, 1243], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 197, "position": [1244, 1248], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:pl:acc:f", "disamb": true}]}, {"index": 198, "position": [1249, 1258], "orth": "mineralne", "lexemes": [{"lemma": "mineralny", "mstag": "adj:pl:acc:f:pos", "disamb": true}]}, {"index": 199, "position": [1258, 1259], "orth": ";", "lexemes": [{"lemma": ";", "mstag": "interp", "disamb": true}]}, {"index": 200, "position": [1260, 1268], "orth": "najmniej", "lexemes": [{"lemma": "najmniej", "mstag": "num:pl:nom:f:rec", "disamb": true}]}, {"index": 201, "position": [1269, 1273], "orth": "woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 202, "position": [1274, 1275], "orth": "z", "lexemes": [{"lemma": "z", "mstag": "prep:gen:nwok", "disamb": true}]}, {"index": 203, "position": [1276, 1282], "orth": "opadów", "lexemes": [{"lemma": "opad", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 204, "position": [1283, 1298], "orth": "atmosferycznych", "lexemes": [{"lemma": "atmosferyczny", "mstag": "adj:pl:gen:m3:pos", "disamb": true}]}, {"index": 205, "position": [1298, 1299], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 206, "position": [1300, 1304], "orth": "Wodę", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:acc:f", "disamb": true}]}, {"index": 207, "position": [1305, 1306], "orth": "o", "lexemes": [{"lemma": "o", "mstag": "prep:loc", "disamb": true}]}, {"index": 208, "position": [1307, 1312], "orth": "małej", "lexemes": [{"lemma": "mały", "mstag": "adj:sg:loc:f:pos", "disamb": true}]}, {"index": 209, "position": [1313, 1323], "orth": "zawartości", "lexemes": [{"lemma": "zawartość", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 210, "position": [1324, 1334], "orth": "składników", "lexemes": [{"lemma": "składnik", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 211, "position": [1335, 1346], "orth": "mineralnych", "lexemes": [{"lemma": "mineralny", "mstag": "adj:pl:gen:m3:pos", "disamb": true}]}, {"index": 212, "position": [1347, 1355], "orth": "nazywamy", "lexemes": [{"lemma": "nazywać", "mstag": "fin:pl:pri:imperf", "disamb": true}]}, {"index": 213, "position": [1356, 1360], "orth": "wodą", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:inst:f", "disamb": true}]}, {"index": 214, "position": [1361, 1367], "orth": "miękką", "lexemes": [{"lemma": "miękki", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 215, "position": [1367, 1368], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 216, "position": [1369, 1378], "orth": "natomiast", "lexemes": [{"lemma": "natomiast", "mstag": "conj", "disamb": true}]}, {"index": 217, "position": [1379, 1390], "orth": "zawierającą", "lexemes": [{"lemma": "zawierać", "mstag": "pact:sg:acc:f:imperf:aff", "disamb": true}]}, {"index": 218, "position": [1391, 1398], "orth": "znaczne", "lexemes": [{"lemma": "znaczny", "mstag": "adj:pl:acc:f:pos", "disamb": true}]}, {"index": 219, "position": [1399, 1405], "orth": "ilości", "lexemes": [{"lemma": "ilość", "mstag": "subst:pl:acc:f", "disamb": true}]}, {"index": 220, "position": [1406, 1410], "orth": "soli", "lexemes": [{"lemma": "sól", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 221, "position": [1411, 1417], "orth": "wapnia", "lexemes": [{"lemma": "wapń", "mstag": "subst:sg:gen:m3", "disamb": true}]}, {"index": 222, "position": [1418, 1419], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 223, "position": [1420, 1427], "orth": "magnezu", "lexemes": [{"lemma": "magnez", "mstag": "subst:sg:gen:m3", "disamb": true}]}, {"index": 224, "position": [1428, 1429], "orth": "–", "lexemes": [{"lemma": "–", "mstag": "interp", "disamb": true}]}, {"index": 225, "position": [1430, 1434], "orth": "wodą", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:inst:f", "disamb": true}]}, {"index": 226, "position": [1435, 1441], "orth": "twardą", "lexemes": [{"lemma": "twardy", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 227, "position": [1441, 1442], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 228, "position": [1443, 1449], "orth": "Oprócz", "lexemes": [{"lemma": "oprócz", "mstag": "prep:gen", "disamb": true}]}, {"index": 229, "position": [1450, 1454], "orth": "tego", "lexemes": [{"lemma": "to", "mstag": "subst:sg:gen:n", "disamb": true}]}, {"index": 230, "position": [1455, 1459], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:pl:nom:f", "disamb": true}]}, {"index": 231, "position": [1460, 1469], "orth": "naturalne", "lexemes": [{"lemma": "naturalny", "mstag": "adj:pl:nom:f:pos", "disamb": true}]}, {"index": 232, "position": [1470, 1479], "orth": "zawierają", "lexemes": [{"lemma": "zawierać", "mstag": "fin:pl:ter:imperf", "disamb": true}]}, {"index": 233, "position": [1480, 1492], "orth": "rozpuszczone", "lexemes": [{"lemma": "rozpuścić", "mstag": "ppas:pl:nom:f:perf:aff", "disamb": true}]}, {"index": 234, "position": [1493, 1503], "orth": "substancje", "lexemes": [{"lemma": "substancja", "mstag": "subst:pl:nom:f", "disamb": true}]}, {"index": 235, "position": [1504, 1515], "orth": "pochodzenia", "lexemes": [{"lemma": "pochodzenie", "mstag": "subst:sg:gen:n", "disamb": true}]}, {"index": 236, "position": [1516, 1528], "orth": "organicznego", "lexemes": [{"lemma": "organiczny", "mstag": "adj:sg:gen:n:pos", "disamb": true}]}, {"index": 237, "position": [1528, 1529], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 238, "position": [1530, 1532], "orth": "np", "lexemes": [{"lemma": "na przykład", "mstag": "brev:pun", "disamb": true}]}, {"index": 239, "position": [1532, 1533], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 240, "position": [1534, 1541], "orth": "mocznik", "lexemes": [{"lemma": "mocznik", "mstag": "subst:sg:nom:m3", "disamb": true}]}, {"index": 241, "position": [1541, 1542], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 242, "position": [1543, 1548], "orth": "kwasy", "lexemes": [{"lemma": "kwas", "mstag": "subst:pl:nom:m3", "disamb": true}]}, {"index": 243, "position": [1549, 1557], "orth": "humusowe", "lexemes": [{"lemma": "humusowy", "mstag": "adj:pl:nom:m3:pos", "disamb": true}]}, {"index": 244, "position": [1558, 1561], "orth": "itp", "lexemes": [{"lemma": "i tym podobne", "mstag": "brev:pun", "disamb": true}]}, {"index": 245, "position": [1561, 1562], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}]} \ No newline at end of file diff --git a/tests/testdata/expected/post_postagger_input_lemmas b/tests/testdata/expected/post_postagger_input_lemmas new file mode 100644 index 0000000000000000000000000000000000000000..deac1e9775047d2c17d106dda1423711fad86ea6 --- /dev/null +++ b/tests/testdata/expected/post_postagger_input_lemmas @@ -0,0 +1,13 @@ +woda być jeden z pospolity substancja w wszechświat . +cząsteczka woda być trzeci bardzo rozpowszechniony molekuła w ośrodek międzygwiazdowy , po cząsteczkowy wodór i tlenek węgiel . +być również szeroko rozpowszechniony w Układ Słoneczny : stanowić istotny element budowa ceres i księżyc lodowy krążyć wokół planeta - olbrzym , jako domieszka występować w on atmosfera , a przypuszczać się , że duży on ilość znajdować się w wnętrze ten planeta . +jako lód występować także na część planetoida , a zapewne również na obiekt transneptunowych . +woda być bardzo rozpowszechniony także na powierzchnia Ziemia . +występować głównie w ocean , który pokrywać 70 , 8 % powierzchnia glob , ale także w rzeka , jezioro i w postać stały w lodowiec . +część woda znajdować się w atmosfera ( chmura , para wodny ) . +niektóry związek chemiczny zawierać cząsteczka woda w swój budowa ( hydrat – określać się on wówczas miano woda krystalizacyjny ) . +zawartość woda włączyć w struktura minerał w płaszcz Ziemia móc przekraczać łączny zawartość woda w ocean i inny zbiornik powierzchniowy nawet dziesięciokrotnie . +woda występować w przyroda być roztwór sól i gaz . +najwięcej sól mineralny zawierać woda morski i woda mineralny ; najmniej woda z opad atmosferyczny . +woda o mały zawartość składnik mineralny nazywać woda miękki , natomiast zawierać znaczny ilość sól wapń i magnez – woda twardy . +oprócz to woda naturalny zawierać rozpuścić substancja pochodzenie organiczny , na przykład . mocznik , kwas humusowy i tym podobne . diff --git a/tests/testdata/expected/post_spacy_expected.json b/tests/testdata/expected/post_spacy_expected.json new file mode 100644 index 0000000000000000000000000000000000000000..bb3c76a1de82141dbe7fb9f453c8c68e99aa1e23 --- /dev/null +++ b/tests/testdata/expected/post_spacy_expected.json @@ -0,0 +1 @@ +{"filename": "45b8d169-fc50-4a49-88ee-9327de089183", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 4], "orth": "When", "lexemes": [{"lemma": "when", "mstag": "SCONJ", "disamb": true}]}, {"index": 2, "position": [5, 14], "orth": "Sebastian", "lexemes": [{"lemma": "Sebastian", "mstag": "PROPN", "disamb": true}]}, {"index": 3, "position": [15, 20], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 4, "position": [21, 28], "orth": "started", "lexemes": [{"lemma": "start", "mstag": "VERB", "disamb": true}]}, {"index": 5, "position": [29, 36], "orth": "working", "lexemes": [{"lemma": "work", "mstag": "VERB", "disamb": true}]}, {"index": 6, "position": [37, 39], "orth": "on", "lexemes": [{"lemma": "on", "mstag": "ADP", "disamb": true}]}, {"index": 7, "position": [40, 44], "orth": "self", "lexemes": [{"lemma": "self", "mstag": "NOUN", "disamb": true}]}, {"index": 8, "position": [45, 45], "orth": "-", "lexemes": [{"lemma": "-", "mstag": "PUNCT", "disamb": true}]}, {"index": 9, "position": [46, 52], "orth": "driving", "lexemes": [{"lemma": "drive", "mstag": "VERB", "disamb": true}]}, {"index": 10, "position": [53, 57], "orth": "cars", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 11, "position": [58, 60], "orth": "at", "lexemes": [{"lemma": "at", "mstag": "ADP", "disamb": true}]}, {"index": 12, "position": [61, 67], "orth": "Google", "lexemes": [{"lemma": "Google", "mstag": "PROPN", "disamb": true}]}, {"index": 13, "position": [68, 70], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 14, "position": [71, 75], "orth": "2007", "lexemes": [{"lemma": "2007", "mstag": "NUM", "disamb": true}]}, {"index": 15, "position": [76, 76], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 16, "position": [77, 80], "orth": "few", "lexemes": [{"lemma": "few", "mstag": "ADJ", "disamb": true}]}, {"index": 17, "position": [81, 87], "orth": "people", "lexemes": [{"lemma": "people", "mstag": "NOUN", "disamb": true}]}, {"index": 18, "position": [88, 95], "orth": "outside", "lexemes": [{"lemma": "outside", "mstag": "ADV", "disamb": true}]}, {"index": 19, "position": [96, 98], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 20, "position": [99, 102], "orth": "the", "lexemes": [{"lemma": "the", "mstag": "DET", "disamb": true}]}, {"index": 21, "position": [103, 110], "orth": "company", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 22, "position": [111, 115], "orth": "took", "lexemes": [{"lemma": "take", "mstag": "VERB", "disamb": true}]}, {"index": 23, "position": [116, 119], "orth": "him", "lexemes": [{"lemma": "he", "mstag": "PRON", "disamb": true}]}, {"index": 24, "position": [120, 129], "orth": "seriously", "lexemes": [{"lemma": "seriously", "mstag": "ADV", "disamb": true}]}, {"index": 25, "position": [130, 130], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 26, "position": [131, 132], "orth": "“", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 27, "position": [133, 133], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 28, "position": [134, 137], "orth": "can", "lexemes": [{"lemma": "can", "mstag": "AUX", "disamb": true}]}, {"index": 29, "position": [138, 142], "orth": "tell", "lexemes": [{"lemma": "tell", "mstag": "VERB", "disamb": true}]}, {"index": 30, "position": [143, 146], "orth": "you", "lexemes": [{"lemma": "you", "mstag": "PRON", "disamb": true}]}, {"index": 31, "position": [147, 151], "orth": "very", "lexemes": [{"lemma": "very", "mstag": "ADV", "disamb": true}]}, {"index": 32, "position": [152, 158], "orth": "senior", "lexemes": [{"lemma": "senior", "mstag": "ADJ", "disamb": true}]}, {"index": 33, "position": [159, 163], "orth": "CEOs", "lexemes": [{"lemma": "ceo", "mstag": "NOUN", "disamb": true}]}, {"index": 34, "position": [164, 166], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 35, "position": [167, 172], "orth": "major", "lexemes": [{"lemma": "major", "mstag": "ADJ", "disamb": true}]}, {"index": 36, "position": [173, 181], "orth": "American", "lexemes": [{"lemma": "american", "mstag": "ADJ", "disamb": true}]}, {"index": 37, "position": [182, 185], "orth": "car", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 38, "position": [186, 195], "orth": "companies", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 39, "position": [196, 201], "orth": "would", "lexemes": [{"lemma": "would", "mstag": "AUX", "disamb": true}]}, {"index": 40, "position": [202, 207], "orth": "shake", "lexemes": [{"lemma": "shake", "mstag": "VERB", "disamb": true}]}, {"index": 41, "position": [208, 210], "orth": "my", "lexemes": [{"lemma": "my", "mstag": "PRON", "disamb": true}]}, {"index": 42, "position": [211, 215], "orth": "hand", "lexemes": [{"lemma": "hand", "mstag": "NOUN", "disamb": true}]}, {"index": 43, "position": [216, 219], "orth": "and", "lexemes": [{"lemma": "and", "mstag": "CCONJ", "disamb": true}]}, {"index": 44, "position": [220, 224], "orth": "turn", "lexemes": [{"lemma": "turn", "mstag": "VERB", "disamb": true}]}, {"index": 45, "position": [225, 229], "orth": "away", "lexemes": [{"lemma": "away", "mstag": "ADV", "disamb": true}]}, {"index": 46, "position": [230, 237], "orth": "because", "lexemes": [{"lemma": "because", "mstag": "SCONJ", "disamb": true}]}, {"index": 47, "position": [238, 239], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 48, "position": [240, 243], "orth": "was", "lexemes": [{"lemma": "be", "mstag": "AUX", "disamb": true}]}, {"index": 49, "position": [244, 246], "orth": "n’t", "lexemes": [{"lemma": "not", "mstag": "PART", "disamb": true}]}, {"index": 50, "position": [247, 252], "orth": "worth", "lexemes": [{"lemma": "worth", "mstag": "ADJ", "disamb": true}]}, {"index": 51, "position": [253, 260], "orth": "talking", "lexemes": [{"lemma": "talk", "mstag": "VERB", "disamb": true}]}, {"index": 52, "position": [261, 263], "orth": "to", "lexemes": [{"lemma": "to", "mstag": "ADP", "disamb": true}]}, {"index": 53, "position": [264, 264], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 54, "position": [265, 265], "orth": "”", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 55, "position": [266, 270], "orth": "said", "lexemes": [{"lemma": "say", "mstag": "VERB", "disamb": true}]}, {"index": 56, "position": [271, 276], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 57, "position": [277, 277], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 58, "position": [278, 280], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 59, "position": [281, 283], "orth": "an", "lexemes": [{"lemma": "an", "mstag": "DET", "disamb": true}]}, {"index": 60, "position": [284, 293], "orth": "interview", "lexemes": [{"lemma": "interview", "mstag": "NOUN", "disamb": true}]}, {"index": 61, "position": [294, 298], "orth": "with", "lexemes": [{"lemma": "with", "mstag": "ADP", "disamb": true}]}, {"index": 62, "position": [299, 305], "orth": "Recode", "lexemes": [{"lemma": "Recode", "mstag": "PROPN", "disamb": true}]}, {"index": 63, "position": [306, 313], "orth": "earlier", "lexemes": [{"lemma": "early", "mstag": "ADV", "disamb": true}]}, {"index": 64, "position": [314, 318], "orth": "this", "lexemes": [{"lemma": "this", "mstag": "DET", "disamb": true}]}, {"index": 65, "position": [319, 323], "orth": "week", "lexemes": [{"lemma": "week", "mstag": "NOUN", "disamb": true}]}, {"index": 66, "position": [324, 324], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 67, "position": [325, 326], "orth": "\n\n", "lexemes": [{"lemma": "\n\n", "mstag": "SPACE", "disamb": true}]}], "text": "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.\n\n"} \ No newline at end of file diff --git a/tests/testdata/expected/post_spacy_input b/tests/testdata/expected/post_spacy_input index deafd543ca3f10c8f3f0fc9c70ec398b843f7c28..61660e8f4512d6866b59ef007fb3cb5dd7c7c536 100644 --- a/tests/testdata/expected/post_spacy_input +++ b/tests/testdata/expected/post_spacy_input @@ -1,7 +1 @@ foobar, baz -foobar, baz -foobar, baz -foobar, baz -foobar, baz -foobar, baz -foobar, baz diff --git a/tests/testdata/expected/post_spacy_small_limit_input b/tests/testdata/expected/post_spacy_small_limit_input new file mode 100644 index 0000000000000000000000000000000000000000..c60249bb429da018c151f3785059c7e96a0342a8 --- /dev/null +++ b/tests/testdata/expected/post_spacy_small_limit_input @@ -0,0 +1,15 @@ +foobar, baz +foobar, baz +foobar, baz +foobar, baz +foobar, baz +foobar, baz +foobar, baz +foobar, baz +foobar, baz +foobar, baz +foobar, baz +foobar, baz +foobar, baz +foobar, baz +foobar, baz diff --git a/tests/testdata/expected/pre_fextor3_expected.json b/tests/testdata/expected/pre_fextor3_expected.json new file mode 100644 index 0000000000000000000000000000000000000000..f926f5f802cc142011049da0e4f94a3a31aaaf5c --- /dev/null +++ b/tests/testdata/expected/pre_fextor3_expected.json @@ -0,0 +1 @@ +{"base": {"woda": 16, "być": 5, "jeden": 1, "z": 2, "pospolity": 1, "substancja": 2, "w": 15, "wszechświat": 1, ".": 14, "cząsteczka": 2, "trzeci": 1, "bardzo": 2, "rozpowszechniony": 3, "molekuła": 1, "ośrodek": 1, "międzygwiazdowy": 1, ",": 13, "po": 1, "cząsteczkowy": 1, "wodór": 1, "i": 7, "tlenek": 1, "węgiel": 1, "również": 2, "szeroko": 1, "Układ": 1, "Słoneczny": 1, ":": 1, "stanowić": 1, "istotny": 1, "element": 1, "budowa": 2, "ceres": 1, "księżyc": 1, "lodowy": 1, "krążyć": 1, "wokół": 1, "planeta": 2, "-": 1, "olbrzym": 1, "jako": 2, "domieszka": 1, "występować": 4, "on": 3, "atmosfera": 2, "a": 2, "przypuszczać": 1, "się": 4, "że": 1, "duży": 1, "ilość": 2, "znajdować": 2, "wnętrze": 1, "ten": 1, "lód": 1, "także": 3, "na": 3, "część": 2, "planetoida": 1, "zapewne": 1, "obiekt": 1, "transneptunowych": 1, "powierzchnia": 2, "Ziemia": 2, "głównie": 1, "ocean": 2, "który": 1, "pokrywać": 1, "70": 1, "8": 1, "%": 1, "glob": 1, "ale": 1, "rzeka": 1, "jezioro": 1, "postać": 1, "stały": 1, "lodowiec": 1, "(": 2, "chmura": 1, "para": 1, "wodny": 1, ")": 2, "niektóry": 1, "związek": 1, "chemiczny": 1, "zawierać": 4, "swój": 1, "hydrat": 1, "–": 2, "określać": 1, "wówczas": 1, "miano": 1, "krystalizacyjny": 1, "zawartość": 3, "włączyć": 1, "struktura": 1, "minerał": 1, "płaszcz": 1, "móc": 1, "przekraczać": 1, "łączny": 1, "inny": 1, "zbiornik": 1, "powierzchniowy": 1, "nawet": 1, "dziesięciokrotnie": 1, "przyroda": 1, "roztwór": 1, "sól": 3, "gaz": 1, "najwięcej": 1, "mineralny": 3, "morski": 1, ";": 1, "najmniej": 1, "opad": 1, "atmosferyczny": 1, "o": 1, "mały": 1, "składnik": 1, "nazywać": 1, "miękki": 1, "natomiast": 1, "znaczny": 1, "wapń": 1, "magnez": 1, "twardy": 1, "oprócz": 1, "to": 1, "naturalny": 1, "rozpuścić": 1, "pochodzenie": 1, "organiczny": 1, "na przykład": 1, "mocznik": 1, "kwas": 1, "humusowy": 1, "i tym podobne": 1}} \ No newline at end of file diff --git a/tests/testdata/expected/pre_winer.json b/tests/testdata/expected/pre_winer.json new file mode 100644 index 0000000000000000000000000000000000000000..b78743d4a75f1f133b718474c46395c1d1631d24 --- /dev/null +++ b/tests/testdata/expected/pre_winer.json @@ -0,0 +1 @@ +{"filename": "c63859cd-7534-49fa-b876-0ee066440f0d", "text": "Woda jest jedną z najpospolitszych substancji we Wszechświecie. Cząsteczka wody jest trzecią najbardziej rozpowszechnioną molekułą w ośrodku międzygwiazdowym, po cząsteczkowym wodorze i tlenku węgla. Jest również szeroko rozpowszechniona w Układzie Słonecznym: stanowi istotny element budowy Ceres i księżyców lodowych krążących wokół planet-olbrzymów, jako domieszka występuje w ich atmosferach, a przypuszcza się, że duże jej ilości znajdują się we wnętrzach tych planet. Jako lód występuje także na części planetoid, a zapewne również na obiektach transneptunowych. Woda jest bardzo rozpowszechniona także na powierzchni Ziemi. Występuje głównie w oceanach, które pokrywają 70,8% powierzchni globu, ale także w rzekach, jeziorach i w postaci stałej w lodowcach. Część wody znajduje się w atmosferze (chmury, para wodna). Niektóre związki chemiczne zawierają cząsteczki wody w swojej budowie (hydraty – określa się ją wówczas mianem wody krystalizacyjnej). Zawartość wody włączonej w strukturę minerałów w płaszczu Ziemi może przekraczać łączną zawartość wody w oceanach i innych zbiornikach powierzchniowych nawet dziesięciokrotnie. Woda występująca w przyrodzie jest roztworem soli i gazów. Najwięcej soli mineralnych zawiera woda morska i wody mineralne; najmniej woda z opadów atmosferycznych. Wodę o małej zawartości składników mineralnych nazywamy wodą miękką, natomiast zawierającą znaczne ilości soli wapnia i magnezu – wodą twardą. Oprócz tego wody naturalne zawierają rozpuszczone substancje pochodzenia organicznego, np. mocznik, kwasy humusowe itp.", "tokens": [{"index": 1, "position": [0, 4], "orth": "Woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 2, "position": [5, 9], "orth": "jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 3, "position": [10, 15], "orth": "jedną", "lexemes": [{"lemma": "jeden", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 4, "position": [16, 17], "orth": "z", "lexemes": [{"lemma": "z", "mstag": "prep:gen:nwok", "disamb": true}]}, {"index": 5, "position": [18, 34], "orth": "najpospolitszych", "lexemes": [{"lemma": "pospolity", "mstag": "adj:pl:gen:f:sup", "disamb": true}]}, {"index": 6, "position": [35, 45], "orth": "substancji", "lexemes": [{"lemma": "substancja", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 7, "position": [46, 48], "orth": "we", "lexemes": [{"lemma": "w", "mstag": "prep:loc:wok", "disamb": true}]}, {"index": 8, "position": [49, 62], "orth": "Wszechświecie", "lexemes": [{"lemma": "wszechświat", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 9, "position": [62, 63], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 10, "position": [64, 74], "orth": "Cząsteczka", "lexemes": [{"lemma": "cząsteczka", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 11, "position": [75, 79], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 12, "position": [80, 84], "orth": "jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 13, "position": [85, 92], "orth": "trzecią", "lexemes": [{"lemma": "trzeci", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 14, "position": [93, 104], "orth": "najbardziej", "lexemes": [{"lemma": "bardzo", "mstag": "adv:sup", "disamb": true}]}, {"index": 15, "position": [105, 121], "orth": "rozpowszechnioną", "lexemes": [{"lemma": "rozpowszechniony", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 16, "position": [122, 130], "orth": "molekułą", "lexemes": [{"lemma": "molekuła", "mstag": "subst:sg:inst:f", "disamb": true}]}, {"index": 17, "position": [131, 132], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 18, "position": [133, 140], "orth": "ośrodku", "lexemes": [{"lemma": "ośrodek", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 19, "position": [141, 157], "orth": "międzygwiazdowym", "lexemes": [{"lemma": "międzygwiazdowy", "mstag": "adj:sg:loc:m3:pos", "disamb": true}]}, {"index": 20, "position": [157, 158], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 21, "position": [159, 161], "orth": "po", "lexemes": [{"lemma": "po", "mstag": "prep:loc", "disamb": true}]}, {"index": 22, "position": [162, 175], "orth": "cząsteczkowym", "lexemes": [{"lemma": "cząsteczkowy", "mstag": "adj:sg:loc:m3:pos", "disamb": true}]}, {"index": 23, "position": [176, 183], "orth": "wodorze", "lexemes": [{"lemma": "wodór", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 24, "position": [184, 185], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 25, "position": [186, 192], "orth": "tlenku", "lexemes": [{"lemma": "tlenek", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 26, "position": [193, 198], "orth": "węgla", "lexemes": [{"lemma": "węgiel", "mstag": "subst:sg:gen:m3", "disamb": true}]}, {"index": 27, "position": [198, 199], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 28, "position": [200, 204], "orth": "Jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 29, "position": [205, 212], "orth": "również", "lexemes": [{"lemma": "również", "mstag": "qub", "disamb": true}]}, {"index": 30, "position": [213, 220], "orth": "szeroko", "lexemes": [{"lemma": "szeroko", "mstag": "adv:pos", "disamb": true}]}, {"index": 31, "position": [221, 237], "orth": "rozpowszechniona", "lexemes": [{"lemma": "rozpowszechniony", "mstag": "adj:sg:nom:f:pos", "disamb": true}]}, {"index": 32, "position": [238, 239], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 33, "position": [240, 248], "orth": "Układzie", "lexemes": [{"lemma": "Układ", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 34, "position": [249, 259], "orth": "Słonecznym", "lexemes": [{"lemma": "Słoneczny", "mstag": "adj:sg:loc:m3:pos", "disamb": true}]}, {"index": 35, "position": [259, 260], "orth": ":", "lexemes": [{"lemma": ":", "mstag": "interp", "disamb": true}]}, {"index": 36, "position": [261, 268], "orth": "stanowi", "lexemes": [{"lemma": "stanowić", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 37, "position": [269, 276], "orth": "istotny", "lexemes": [{"lemma": "istotny", "mstag": "adj:sg:acc:m3:pos", "disamb": true}]}, {"index": 38, "position": [277, 284], "orth": "element", "lexemes": [{"lemma": "element", "mstag": "subst:sg:acc:m3", "disamb": true}]}, {"index": 39, "position": [285, 291], "orth": "budowy", "lexemes": [{"lemma": "budowa", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 40, "position": [292, 297], "orth": "Ceres", "lexemes": [{"lemma": "ceres", "mstag": "subst:sg:acc:m3", "disamb": true}]}, {"index": 41, "position": [298, 299], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 42, "position": [300, 309], "orth": "księżyców", "lexemes": [{"lemma": "księżyc", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 43, "position": [310, 318], "orth": "lodowych", "lexemes": [{"lemma": "lodowy", "mstag": "adj:pl:gen:m3:pos", "disamb": true}]}, {"index": 44, "position": [319, 328], "orth": "krążących", "lexemes": [{"lemma": "krążyć", "mstag": "pact:pl:gen:f:imperf:aff", "disamb": true}]}, {"index": 45, "position": [329, 334], "orth": "wokół", "lexemes": [{"lemma": "wokół", "mstag": "prep:gen", "disamb": true}]}, {"index": 46, "position": [335, 341], "orth": "planet", "lexemes": [{"lemma": "planeta", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 47, "position": [341, 342], "orth": "-", "lexemes": [{"lemma": "-", "mstag": "interp", "disamb": true}]}, {"index": 48, "position": [342, 351], "orth": "olbrzymów", "lexemes": [{"lemma": "olbrzym", "mstag": "subst:pl:gen:m1", "disamb": true}]}, {"index": 49, "position": [351, 352], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 50, "position": [353, 357], "orth": "jako", "lexemes": [{"lemma": "jako", "mstag": "adv", "disamb": true}]}, {"index": 51, "position": [358, 367], "orth": "domieszka", "lexemes": [{"lemma": "domieszka", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 52, "position": [368, 377], "orth": "występuje", "lexemes": [{"lemma": "występować", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 53, "position": [378, 379], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 54, "position": [380, 383], "orth": "ich", "lexemes": [{"lemma": "on", "mstag": "ppron3:pl:gen:m1:ter:akc:npraep", "disamb": true}]}, {"index": 55, "position": [384, 395], "orth": "atmosferach", "lexemes": [{"lemma": "atmosfera", "mstag": "subst:pl:loc:f", "disamb": true}]}, {"index": 56, "position": [395, 396], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 57, "position": [397, 398], "orth": "a", "lexemes": [{"lemma": "a", "mstag": "conj", "disamb": true}]}, {"index": 58, "position": [399, 410], "orth": "przypuszcza", "lexemes": [{"lemma": "przypuszczać", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 59, "position": [411, 414], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "qub", "disamb": true}]}, {"index": 60, "position": [414, 415], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 61, "position": [416, 418], "orth": "że", "lexemes": [{"lemma": "że", "mstag": "comp", "disamb": true}]}, {"index": 62, "position": [419, 423], "orth": "duże", "lexemes": [{"lemma": "duży", "mstag": "adj:pl:nom:f:pos", "disamb": true}]}, {"index": 63, "position": [424, 427], "orth": "jej", "lexemes": [{"lemma": "on", "mstag": "ppron3:sg:gen:f:ter:akc:npraep", "disamb": true}]}, {"index": 64, "position": [428, 434], "orth": "ilości", "lexemes": [{"lemma": "ilość", "mstag": "subst:pl:nom:f", "disamb": true}]}, {"index": 65, "position": [435, 443], "orth": "znajdują", "lexemes": [{"lemma": "znajdować", "mstag": "fin:pl:ter:imperf", "disamb": true}]}, {"index": 66, "position": [444, 447], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "qub", "disamb": true}]}, {"index": 67, "position": [448, 450], "orth": "we", "lexemes": [{"lemma": "w", "mstag": "prep:loc:wok", "disamb": true}]}, {"index": 68, "position": [451, 460], "orth": "wnętrzach", "lexemes": [{"lemma": "wnętrze", "mstag": "subst:pl:loc:n", "disamb": true}]}, {"index": 69, "position": [461, 465], "orth": "tych", "lexemes": [{"lemma": "ten", "mstag": "adj:pl:gen:f:pos", "disamb": true}]}, {"index": 70, "position": [466, 472], "orth": "planet", "lexemes": [{"lemma": "planeta", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 71, "position": [472, 473], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 72, "position": [474, 478], "orth": "Jako", "lexemes": [{"lemma": "jako", "mstag": "conj", "disamb": true}]}, {"index": 73, "position": [479, 482], "orth": "lód", "lexemes": [{"lemma": "lód", "mstag": "subst:sg:nom:m3", "disamb": true}]}, {"index": 74, "position": [483, 492], "orth": "występuje", "lexemes": [{"lemma": "występować", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 75, "position": [493, 498], "orth": "także", "lexemes": [{"lemma": "także", "mstag": "qub", "disamb": true}]}, {"index": 76, "position": [499, 501], "orth": "na", "lexemes": [{"lemma": "na", "mstag": "prep:loc", "disamb": true}]}, {"index": 77, "position": [502, 508], "orth": "części", "lexemes": [{"lemma": "część", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 78, "position": [509, 518], "orth": "planetoid", "lexemes": [{"lemma": "planetoida", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 79, "position": [518, 519], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 80, "position": [520, 521], "orth": "a", "lexemes": [{"lemma": "a", "mstag": "conj", "disamb": true}]}, {"index": 81, "position": [522, 529], "orth": "zapewne", "lexemes": [{"lemma": "zapewne", "mstag": "qub", "disamb": true}]}, {"index": 82, "position": [530, 537], "orth": "również", "lexemes": [{"lemma": "również", "mstag": "qub", "disamb": true}]}, {"index": 83, "position": [538, 540], "orth": "na", "lexemes": [{"lemma": "na", "mstag": "prep:loc", "disamb": true}]}, {"index": 84, "position": [541, 550], "orth": "obiektach", "lexemes": [{"lemma": "obiekt", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 85, "position": [551, 567], "orth": "transneptunowych", "lexemes": [{"lemma": "transneptunowych", "mstag": "ign", "disamb": true}]}, {"index": 86, "position": [567, 568], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 87, "position": [569, 573], "orth": "Woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 88, "position": [574, 578], "orth": "jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 89, "position": [579, 585], "orth": "bardzo", "lexemes": [{"lemma": "bardzo", "mstag": "adv:pos", "disamb": true}]}, {"index": 90, "position": [586, 602], "orth": "rozpowszechniona", "lexemes": [{"lemma": "rozpowszechniony", "mstag": "adj:sg:nom:f:pos", "disamb": true}]}, {"index": 91, "position": [603, 608], "orth": "także", "lexemes": [{"lemma": "także", "mstag": "conj", "disamb": true}]}, {"index": 92, "position": [609, 611], "orth": "na", "lexemes": [{"lemma": "na", "mstag": "prep:loc", "disamb": true}]}, {"index": 93, "position": [612, 623], "orth": "powierzchni", "lexemes": [{"lemma": "powierzchnia", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 94, "position": [624, 629], "orth": "Ziemi", "lexemes": [{"lemma": "Ziemia", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 95, "position": [629, 630], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 96, "position": [631, 640], "orth": "Występuje", "lexemes": [{"lemma": "występować", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 97, "position": [641, 648], "orth": "głównie", "lexemes": [{"lemma": "głównie", "mstag": "qub", "disamb": true}]}, {"index": 98, "position": [649, 650], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 99, "position": [651, 659], "orth": "oceanach", "lexemes": [{"lemma": "ocean", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 100, "position": [659, 660], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 101, "position": [661, 666], "orth": "które", "lexemes": [{"lemma": "który", "mstag": "adj:pl:nom:m3:pos", "disamb": true}]}, {"index": 102, "position": [667, 676], "orth": "pokrywają", "lexemes": [{"lemma": "pokrywać", "mstag": "fin:pl:ter:imperf", "disamb": true}]}, {"index": 103, "position": [677, 679], "orth": "70", "lexemes": [{"lemma": "70", "mstag": "num:pl:acc:m3:rec", "disamb": true}]}, {"index": 104, "position": [679, 680], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 105, "position": [680, 681], "orth": "8", "lexemes": [{"lemma": "8", "mstag": "num:pl:acc:m3:rec", "disamb": true}]}, {"index": 106, "position": [681, 682], "orth": "%", "lexemes": [{"lemma": "%", "mstag": "subst:sg:nom:m3", "disamb": true}]}, {"index": 107, "position": [683, 694], "orth": "powierzchni", "lexemes": [{"lemma": "powierzchnia", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 108, "position": [695, 700], "orth": "globu", "lexemes": [{"lemma": "glob", "mstag": "subst:sg:gen:m3", "disamb": true}]}, {"index": 109, "position": [700, 701], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 110, "position": [702, 705], "orth": "ale", "lexemes": [{"lemma": "ale", "mstag": "conj", "disamb": true}]}, {"index": 111, "position": [706, 711], "orth": "także", "lexemes": [{"lemma": "także", "mstag": "conj", "disamb": true}]}, {"index": 112, "position": [712, 713], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 113, "position": [714, 721], "orth": "rzekach", "lexemes": [{"lemma": "rzeka", "mstag": "subst:pl:loc:f", "disamb": true}]}, {"index": 114, "position": [721, 722], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 115, "position": [723, 732], "orth": "jeziorach", "lexemes": [{"lemma": "jezioro", "mstag": "subst:pl:loc:n", "disamb": true}]}, {"index": 116, "position": [733, 734], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 117, "position": [735, 736], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 118, "position": [737, 744], "orth": "postaci", "lexemes": [{"lemma": "postać", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 119, "position": [745, 751], "orth": "stałej", "lexemes": [{"lemma": "stały", "mstag": "adj:sg:loc:f:pos", "disamb": true}]}, {"index": 120, "position": [752, 753], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 121, "position": [754, 763], "orth": "lodowcach", "lexemes": [{"lemma": "lodowiec", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 122, "position": [763, 764], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 123, "position": [765, 770], "orth": "Część", "lexemes": [{"lemma": "część", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 124, "position": [771, 775], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 125, "position": [776, 784], "orth": "znajduje", "lexemes": [{"lemma": "znajdować", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 126, "position": [785, 788], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "qub", "disamb": true}]}, {"index": 127, "position": [789, 790], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 128, "position": [791, 801], "orth": "atmosferze", "lexemes": [{"lemma": "atmosfera", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 129, "position": [802, 803], "orth": "(", "lexemes": [{"lemma": "(", "mstag": "interp", "disamb": true}]}, {"index": 130, "position": [803, 809], "orth": "chmury", "lexemes": [{"lemma": "chmura", "mstag": "subst:pl:nom:f", "disamb": true}]}, {"index": 131, "position": [809, 810], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 132, "position": [811, 815], "orth": "para", "lexemes": [{"lemma": "para", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 133, "position": [816, 821], "orth": "wodna", "lexemes": [{"lemma": "wodny", "mstag": "adj:sg:nom:f:pos", "disamb": true}]}, {"index": 134, "position": [821, 822], "orth": ")", "lexemes": [{"lemma": ")", "mstag": "interp", "disamb": true}]}, {"index": 135, "position": [822, 823], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 136, "position": [824, 832], "orth": "Niektóre", "lexemes": [{"lemma": "niektóry", "mstag": "adj:pl:nom:m3:pos", "disamb": true}]}, {"index": 137, "position": [833, 840], "orth": "związki", "lexemes": [{"lemma": "związek", "mstag": "subst:pl:nom:m3", "disamb": true}]}, {"index": 138, "position": [841, 850], "orth": "chemiczne", "lexemes": [{"lemma": "chemiczny", "mstag": "adj:pl:nom:m3:pos", "disamb": true}]}, {"index": 139, "position": [851, 860], "orth": "zawierają", "lexemes": [{"lemma": "zawierać", "mstag": "fin:pl:ter:imperf", "disamb": true}]}, {"index": 140, "position": [861, 871], "orth": "cząsteczki", "lexemes": [{"lemma": "cząsteczka", "mstag": "subst:pl:acc:f", "disamb": true}]}, {"index": 141, "position": [872, 876], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 142, "position": [877, 878], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 143, "position": [879, 885], "orth": "swojej", "lexemes": [{"lemma": "swój", "mstag": "adj:sg:loc:f:pos", "disamb": true}]}, {"index": 144, "position": [886, 893], "orth": "budowie", "lexemes": [{"lemma": "budowa", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 145, "position": [894, 895], "orth": "(", "lexemes": [{"lemma": "(", "mstag": "interp", "disamb": true}]}, {"index": 146, "position": [895, 902], "orth": "hydraty", "lexemes": [{"lemma": "hydrat", "mstag": "subst:pl:nom:m3", "disamb": true}]}, {"index": 147, "position": [903, 904], "orth": "–", "lexemes": [{"lemma": "–", "mstag": "interp", "disamb": true}]}, {"index": 148, "position": [905, 912], "orth": "określa", "lexemes": [{"lemma": "określać", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 149, "position": [913, 916], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "qub", "disamb": true}]}, {"index": 150, "position": [917, 919], "orth": "ją", "lexemes": [{"lemma": "on", "mstag": "ppron3:sg:acc:f:ter:akc:npraep", "disamb": true}]}, {"index": 151, "position": [920, 927], "orth": "wówczas", "lexemes": [{"lemma": "wówczas", "mstag": "adv", "disamb": true}]}, {"index": 152, "position": [928, 934], "orth": "mianem", "lexemes": [{"lemma": "miano", "mstag": "subst:sg:inst:n", "disamb": true}]}, {"index": 153, "position": [935, 939], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 154, "position": [940, 956], "orth": "krystalizacyjnej", "lexemes": [{"lemma": "krystalizacyjny", "mstag": "adj:sg:gen:f:pos", "disamb": true}]}, {"index": 155, "position": [956, 957], "orth": ")", "lexemes": [{"lemma": ")", "mstag": "interp", "disamb": true}]}, {"index": 156, "position": [957, 958], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 157, "position": [959, 968], "orth": "Zawartość", "lexemes": [{"lemma": "zawartość", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 158, "position": [969, 973], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 159, "position": [974, 983], "orth": "włączonej", "lexemes": [{"lemma": "włączyć", "mstag": "ppas:sg:gen:f:perf:aff", "disamb": true}]}, {"index": 160, "position": [984, 985], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:acc:nwok", "disamb": true}]}, {"index": 161, "position": [986, 995], "orth": "strukturę", "lexemes": [{"lemma": "struktura", "mstag": "subst:sg:acc:f", "disamb": true}]}, {"index": 162, "position": [996, 1005], "orth": "minerałów", "lexemes": [{"lemma": "minerał", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 163, "position": [1006, 1007], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 164, "position": [1008, 1016], "orth": "płaszczu", "lexemes": [{"lemma": "płaszcz", "mstag": "subst:sg:loc:m3", "disamb": true}]}, {"index": 165, "position": [1017, 1022], "orth": "Ziemi", "lexemes": [{"lemma": "Ziemia", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 166, "position": [1023, 1027], "orth": "może", "lexemes": [{"lemma": "móc", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 167, "position": [1028, 1039], "orth": "przekraczać", "lexemes": [{"lemma": "przekraczać", "mstag": "inf:imperf", "disamb": true}]}, {"index": 168, "position": [1040, 1046], "orth": "łączną", "lexemes": [{"lemma": "łączny", "mstag": "adj:sg:acc:f:pos", "disamb": true}]}, {"index": 169, "position": [1047, 1056], "orth": "zawartość", "lexemes": [{"lemma": "zawartość", "mstag": "subst:sg:acc:f", "disamb": true}]}, {"index": 170, "position": [1057, 1061], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 171, "position": [1062, 1063], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 172, "position": [1064, 1072], "orth": "oceanach", "lexemes": [{"lemma": "ocean", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 173, "position": [1073, 1074], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 174, "position": [1075, 1081], "orth": "innych", "lexemes": [{"lemma": "inny", "mstag": "adj:pl:loc:m3:pos", "disamb": true}]}, {"index": 175, "position": [1082, 1093], "orth": "zbiornikach", "lexemes": [{"lemma": "zbiornik", "mstag": "subst:pl:loc:m3", "disamb": true}]}, {"index": 176, "position": [1094, 1110], "orth": "powierzchniowych", "lexemes": [{"lemma": "powierzchniowy", "mstag": "adj:pl:loc:m3:pos", "disamb": true}]}, {"index": 177, "position": [1111, 1116], "orth": "nawet", "lexemes": [{"lemma": "nawet", "mstag": "qub", "disamb": true}]}, {"index": 178, "position": [1117, 1134], "orth": "dziesięciokrotnie", "lexemes": [{"lemma": "dziesięciokrotnie", "mstag": "adv:pos", "disamb": true}]}, {"index": 179, "position": [1134, 1135], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 180, "position": [1136, 1140], "orth": "Woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 181, "position": [1141, 1152], "orth": "występująca", "lexemes": [{"lemma": "występować", "mstag": "pact:sg:nom:f:imperf:aff", "disamb": true}]}, {"index": 182, "position": [1153, 1154], "orth": "w", "lexemes": [{"lemma": "w", "mstag": "prep:loc:nwok", "disamb": true}]}, {"index": 183, "position": [1155, 1165], "orth": "przyrodzie", "lexemes": [{"lemma": "przyroda", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 184, "position": [1166, 1170], "orth": "jest", "lexemes": [{"lemma": "być", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 185, "position": [1171, 1180], "orth": "roztworem", "lexemes": [{"lemma": "roztwór", "mstag": "subst:sg:inst:m3", "disamb": true}]}, {"index": 186, "position": [1181, 1185], "orth": "soli", "lexemes": [{"lemma": "sól", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 187, "position": [1186, 1187], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 188, "position": [1188, 1193], "orth": "gazów", "lexemes": [{"lemma": "gaz", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 189, "position": [1193, 1194], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 190, "position": [1195, 1204], "orth": "Najwięcej", "lexemes": [{"lemma": "najwięcej", "mstag": "num:pl:acc:f:rec", "disamb": true}]}, {"index": 191, "position": [1205, 1209], "orth": "soli", "lexemes": [{"lemma": "sól", "mstag": "subst:pl:gen:f", "disamb": true}]}, {"index": 192, "position": [1210, 1221], "orth": "mineralnych", "lexemes": [{"lemma": "mineralny", "mstag": "adj:pl:gen:f:pos", "disamb": true}]}, {"index": 193, "position": [1222, 1229], "orth": "zawiera", "lexemes": [{"lemma": "zawierać", "mstag": "fin:sg:ter:imperf", "disamb": true}]}, {"index": 194, "position": [1230, 1234], "orth": "woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 195, "position": [1235, 1241], "orth": "morska", "lexemes": [{"lemma": "morski", "mstag": "adj:sg:nom:f:pos", "disamb": true}]}, {"index": 196, "position": [1242, 1243], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 197, "position": [1244, 1248], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:pl:acc:f", "disamb": true}]}, {"index": 198, "position": [1249, 1258], "orth": "mineralne", "lexemes": [{"lemma": "mineralny", "mstag": "adj:pl:acc:f:pos", "disamb": true}]}, {"index": 199, "position": [1258, 1259], "orth": ";", "lexemes": [{"lemma": ";", "mstag": "interp", "disamb": true}]}, {"index": 200, "position": [1260, 1268], "orth": "najmniej", "lexemes": [{"lemma": "najmniej", "mstag": "num:pl:nom:f:rec", "disamb": true}]}, {"index": 201, "position": [1269, 1273], "orth": "woda", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:nom:f", "disamb": true}]}, {"index": 202, "position": [1274, 1275], "orth": "z", "lexemes": [{"lemma": "z", "mstag": "prep:gen:nwok", "disamb": true}]}, {"index": 203, "position": [1276, 1282], "orth": "opadów", "lexemes": [{"lemma": "opad", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 204, "position": [1283, 1298], "orth": "atmosferycznych", "lexemes": [{"lemma": "atmosferyczny", "mstag": "adj:pl:gen:m3:pos", "disamb": true}]}, {"index": 205, "position": [1298, 1299], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 206, "position": [1300, 1304], "orth": "Wodę", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:acc:f", "disamb": true}]}, {"index": 207, "position": [1305, 1306], "orth": "o", "lexemes": [{"lemma": "o", "mstag": "prep:loc", "disamb": true}]}, {"index": 208, "position": [1307, 1312], "orth": "małej", "lexemes": [{"lemma": "mały", "mstag": "adj:sg:loc:f:pos", "disamb": true}]}, {"index": 209, "position": [1313, 1323], "orth": "zawartości", "lexemes": [{"lemma": "zawartość", "mstag": "subst:sg:loc:f", "disamb": true}]}, {"index": 210, "position": [1324, 1334], "orth": "składników", "lexemes": [{"lemma": "składnik", "mstag": "subst:pl:gen:m3", "disamb": true}]}, {"index": 211, "position": [1335, 1346], "orth": "mineralnych", "lexemes": [{"lemma": "mineralny", "mstag": "adj:pl:gen:m3:pos", "disamb": true}]}, {"index": 212, "position": [1347, 1355], "orth": "nazywamy", "lexemes": [{"lemma": "nazywać", "mstag": "fin:pl:pri:imperf", "disamb": true}]}, {"index": 213, "position": [1356, 1360], "orth": "wodą", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:inst:f", "disamb": true}]}, {"index": 214, "position": [1361, 1367], "orth": "miękką", "lexemes": [{"lemma": "miękki", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 215, "position": [1367, 1368], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 216, "position": [1369, 1378], "orth": "natomiast", "lexemes": [{"lemma": "natomiast", "mstag": "conj", "disamb": true}]}, {"index": 217, "position": [1379, 1390], "orth": "zawierającą", "lexemes": [{"lemma": "zawierać", "mstag": "pact:sg:acc:f:imperf:aff", "disamb": true}]}, {"index": 218, "position": [1391, 1398], "orth": "znaczne", "lexemes": [{"lemma": "znaczny", "mstag": "adj:pl:acc:f:pos", "disamb": true}]}, {"index": 219, "position": [1399, 1405], "orth": "ilości", "lexemes": [{"lemma": "ilość", "mstag": "subst:pl:acc:f", "disamb": true}]}, {"index": 220, "position": [1406, 1410], "orth": "soli", "lexemes": [{"lemma": "sól", "mstag": "subst:sg:gen:f", "disamb": true}]}, {"index": 221, "position": [1411, 1417], "orth": "wapnia", "lexemes": [{"lemma": "wapń", "mstag": "subst:sg:gen:m3", "disamb": true}]}, {"index": 222, "position": [1418, 1419], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "conj", "disamb": true}]}, {"index": 223, "position": [1420, 1427], "orth": "magnezu", "lexemes": [{"lemma": "magnez", "mstag": "subst:sg:gen:m3", "disamb": true}]}, {"index": 224, "position": [1428, 1429], "orth": "–", "lexemes": [{"lemma": "–", "mstag": "interp", "disamb": true}]}, {"index": 225, "position": [1430, 1434], "orth": "wodą", "lexemes": [{"lemma": "woda", "mstag": "subst:sg:inst:f", "disamb": true}]}, {"index": 226, "position": [1435, 1441], "orth": "twardą", "lexemes": [{"lemma": "twardy", "mstag": "adj:sg:inst:f:pos", "disamb": true}]}, {"index": 227, "position": [1441, 1442], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 228, "position": [1443, 1449], "orth": "Oprócz", "lexemes": [{"lemma": "oprócz", "mstag": "prep:gen", "disamb": true}]}, {"index": 229, "position": [1450, 1454], "orth": "tego", "lexemes": [{"lemma": "to", "mstag": "subst:sg:gen:n", "disamb": true}]}, {"index": 230, "position": [1455, 1459], "orth": "wody", "lexemes": [{"lemma": "woda", "mstag": "subst:pl:nom:f", "disamb": true}]}, {"index": 231, "position": [1460, 1469], "orth": "naturalne", "lexemes": [{"lemma": "naturalny", "mstag": "adj:pl:nom:f:pos", "disamb": true}]}, {"index": 232, "position": [1470, 1479], "orth": "zawierają", "lexemes": [{"lemma": "zawierać", "mstag": "fin:pl:ter:imperf", "disamb": true}]}, {"index": 233, "position": [1480, 1492], "orth": "rozpuszczone", "lexemes": [{"lemma": "rozpuścić", "mstag": "ppas:pl:nom:f:perf:aff", "disamb": true}]}, {"index": 234, "position": [1493, 1503], "orth": "substancje", "lexemes": [{"lemma": "substancja", "mstag": "subst:pl:nom:f", "disamb": true}]}, {"index": 235, "position": [1504, 1515], "orth": "pochodzenia", "lexemes": [{"lemma": "pochodzenie", "mstag": "subst:sg:gen:n", "disamb": true}]}, {"index": 236, "position": [1516, 1528], "orth": "organicznego", "lexemes": [{"lemma": "organiczny", "mstag": "adj:sg:gen:n:pos", "disamb": true}]}, {"index": 237, "position": [1528, 1529], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 238, "position": [1530, 1532], "orth": "np", "lexemes": [{"lemma": "na przykład", "mstag": "brev:pun", "disamb": true}]}, {"index": 239, "position": [1532, 1533], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}, {"index": 240, "position": [1534, 1541], "orth": "mocznik", "lexemes": [{"lemma": "mocznik", "mstag": "subst:sg:nom:m3", "disamb": true}]}, {"index": 241, "position": [1541, 1542], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "interp", "disamb": true}]}, {"index": 242, "position": [1543, 1548], "orth": "kwasy", "lexemes": [{"lemma": "kwas", "mstag": "subst:pl:nom:m3", "disamb": true}]}, {"index": 243, "position": [1549, 1557], "orth": "humusowe", "lexemes": [{"lemma": "humusowy", "mstag": "adj:pl:nom:m3:pos", "disamb": true}]}, {"index": 244, "position": [1558, 1561], "orth": "itp", "lexemes": [{"lemma": "i tym podobne", "mstag": "brev:pun", "disamb": true}]}, {"index": 245, "position": [1561, 1562], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "interp", "disamb": true}]}], "entities": [{"text": "Wszechświecie", "type": "nam_loc_astronomical", "tokens": [7, 8], "positions": [49, 62]}, {"text": "Układzie", "type": "nam_loc_astronomical", "tokens": [32, 33], "positions": [240, 248]}, {"text": "Słonecznym", "type": "nam_loc_country_region", "tokens": [33, 34], "positions": [249, 259]}, {"text": "Ceres", "type": "nam_loc_astronomical", "tokens": [39, 40], "positions": [292, 297]}, {"text": "Ziemi", "type": "nam_loc_astronomical", "tokens": [93, 94], "positions": [624, 629]}, {"text": "Ziemi", "type": "nam_loc_astronomical", "tokens": [164, 165], "positions": [1017, 1022]}]} \ No newline at end of file diff --git a/tests/testdata/input/post_spacy_small_limit_input b/tests/testdata/input/post_spacy_small_limit_input new file mode 100644 index 0000000000000000000000000000000000000000..46d611b84cf7da72ec3d897a53a15444bee27657 --- /dev/null +++ b/tests/testdata/input/post_spacy_small_limit_input @@ -0,0 +1 @@ +When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week. diff --git a/tests/testdata/input/pre_fextor3_input b/tests/testdata/input/pre_fextor3_input new file mode 100644 index 0000000000000000000000000000000000000000..c1ac9f7e1a6d505b4fe7f401fcbd337dd82276d2 --- /dev/null +++ b/tests/testdata/input/pre_fextor3_input @@ -0,0 +1,3 @@ +Woda jest jedną z najpospolitszych substancji we Wszechświecie. +Cząsteczka wody jest trzecią najbardziej rozpowszechnioną molekułą w ośrodku międzygwiazdowym, po cząsteczkowym wodorze i tlenku węgla. Jest również szeroko rozpowszechniona w Układzie Słonecznym: stanowi istotny element budowy Ceres i księżyców lodowych krążących wokół planet-olbrzymów, jako domieszka występuje w ich atmosferach, a przypuszcza się, że duże jej ilości znajdują się we wnętrzach tych planet. Jako lód występuje także na części planetoid, a zapewne również na obiektach transneptunowych. Woda jest bardzo rozpowszechniona także na powierzchni Ziemi. Występuje głównie w oceanach, które pokrywają 70,8% powierzchni globu, ale także w rzekach, jeziorach i w postaci stałej w lodowcach. Część wody znajduje się w atmosferze (chmury, para wodna). Niektóre związki chemiczne zawierają cząsteczki wody w swojej budowie (hydraty – określa się ją wówczas mianem wody krystalizacyjnej). Zawartość wody włączonej w strukturę minerałów w płaszczu Ziemi może przekraczać łączną zawartość wody w oceanach i innych zbiornikach powierzchniowych nawet dziesięciokrotnie. +Woda występująca w przyrodzie jest roztworem soli i gazów. Najwięcej soli mineralnych zawiera woda morska i wody mineralne; najmniej woda z opadów atmosferycznych. Wodę o małej zawartości składników mineralnych nazywamy wodą miękką, natomiast zawierającą znaczne ilości soli wapnia i magnezu – wodą twardą. Oprócz tego wody naturalne zawierają rozpuszczone substancje pochodzenia organicznego, np. mocznik, kwasy humusowe itp.