Merge branch 'tests' into 'master'

Tests See merge request !1

Merge branch 'tests' into 'master'
Tests See merge request !1
fdebbc6f · Paweł Walkowiak · f8cdf0a1 · da034cf0 · fdebbc6f · fdebbc6f
Commit fdebbc6f authored 2 years ago by Paweł Walkowiak
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -6,6 +6,8 @@ cache:

 stages:
  - check_style
+  - tests
+  - pages
  - build_master
  - build_develop

@@ -23,6 +25,30 @@ docstyle:
  script:
    - tox -v -e docstyle

+tests:
+  stage: tests
+  before_script:
+    - pip install tox==2.9.1
+  script:
+    - tox -v -e pytest
+  artifacts:
+    paths:
+      - htmlcov
+    expire_in: 1 week
+    reports:
+      junit:
+        - report.xml
+
+pages:
+  stage: pages
+  script:
+    - mkdir -p public/coverage
+    - cp -r htmlcov/* public/coverage/
+  artifacts:
+    name: coverage
+    paths:
+      - public
+
 build_develop:
  except:
    - master

--- a/lpmn_queries.json
+++ b/lpmn_queries.json
+{
+    "posconverter_lone": {"task": [{"posconverter": {"input_format": "json", "output_format": "json", "input_tagset": "ud", "output_tagset": "identical", "json_text": false}}], "input": "remove_json_text.json", "expected": "remove_json_text.json"},
+
+
+    "post_spacy_tagger_json_text": {"task": [{"spacy": {"lang": "en", "method": "tagger"}}, {"posconverter": {"input_format": "json", "output_format": "json", "input_tagset": "ud", "output_tagset": "identical", "json_text": true}}], "input": "post_spacy_input", "expected": "post_spacy_tagger_json_text_expected.json"},
+
+    "post_spacy_tagger_no_json_text": {"task": [{"spacy": {"lang": "en", "method": "tagger"}}, {"posconverter": {"input_format": "json", "output_format": "json", "input_tagset": "ud", "output_tagset": "identical", "json_text": false}}], "input": "post_spacy_input", "expected": "post_spacy_tagger_no_json_text_expected.json"},
+
+    "post_spacy_ner_no_json_text": {"task": [{"spacy": {"lang": "en", "method": "ner"}}, {"posconverter": {"input_format": "json", "output_format": "json", "input_tagset": "ud", "output_tagset": "identical", "json_text": false}}], "input": "post_spacy_input", "expected": "post_spacy_ner_no_json_text_expected.json"},
+
+
+    "post_postagger_json_json_text": {"task": [{"postagger": {"lang": "pl", "output": "json"}}, {"posconverter": {"input_format": "json", "output_format": "json", "input_tagset": "ud", "output_tagset": "identical", "json_text": true}}], "input": "post_postagger_input", "expected": "post_postagger_json_json_text_expected.json"},
+
+    "post_postagger_ccl_json_text": {"task": [{"postagger": {"lang": "pl", "output": "ccl"}}, {"posconverter": {"input_format": "ccl", "output_format": "json", "input_tagset": "ud", "output_tagset": "identical", "json_text": true}}], "input": "post_postagger_input", "expected": "post_postagger_ccl_json_text_expected.json"},
+
+    "post_postagger_ccl_no_json_text": {"task": [{"postagger": {"lang": "pl", "output": "ccl"}}, {"posconverter": {"input_format": "ccl", "output_format": "json", "input_tagset": "ud", "output_tagset": "identical", "json_text": false}}], "input": "post_postagger_input", "expected": "post_postagger_ccl_no_json_text_expected.json"},
+
+
+    "pre_fextor3": {"task": [{"posconverter": {"input_format": "json", "output_format": "json", "input_tagset": "nkjp", "output_tagset": "identical", "json_text": false}}, "fextor3"], "input": "pre_fextor3_input.json", "expected": "pre_fextor3_expected.json"}
+}
--- a/src/converter.py
+++ b/src/converter.py
@@ -5,7 +5,7 @@ import os
 import nlp_ws
 import json
 import pathlib
-from xml.sax import handler, make_parser
+from xml.sax import handler, make_parser, SAXParseException
 from ccl2json.parse import CCLhandler
 from xml.dom import pulldom
 import configparser
@@ -76,18 +76,43 @@ def ccl2json(path_in, path_out, tagset='nkjp', json_text=True):
    parser = make_parser()
    parser.setFeature(handler.feature_external_ges, False)
    parser.setContentHandler(CCLhandler())
-    parser.parse(path_in)
-
-    with open(path_out, 'w', encoding='utf-8') as file_out:
-        data_out = {
-            "filename": pathlib.Path(path_in).stem,
-            'tagset': tagset,
-            'tokens': [_convert_tagset_in_token(token.as_dict(), tagset)
-                       for token in parser.getContentHandler().get_tokens()]
-        }
-        if json_text:
-            data_out['text'] = parser.getContentHandler().get_text()
-        json.dump(data_out, file_out, ensure_ascii=False)
+    try:
+        parser.parse(path_in)
+    except SAXParseException:
+        raise Exception("Error parsing file, check file syntax") from None
+    else:
+        _log.log(logging.INFO, "Parser initialised and input data read ")
+
+        with open(path_out, 'w', encoding='utf-8') as file_out:
+            data_out = {
+                "filename": pathlib.Path(path_in).stem,
+                'tagset': tagset,
+                'tokens': [_convert_tagset_in_token(token.as_dict(), tagset)
+                           for token in parser.getContentHandler().get_tokens()]
+            }
+            _log.log(logging.INFO, "Data to write prepared ")
+            if json_text:
+                data_out['text'] = parser.getContentHandler().get_text()
+                _log.log(logging.INFO, "Original text attached to output data ")
+            json.dump(data_out, file_out, ensure_ascii=False)
+            _log.log(logging.INFO, "Data dumped to output file ")
+
+
+def remove_attribute(input_path, output_path, attribute):
+    """Remove a single top-level key from a json file and save the result.
+
+    The input is read and closed before the output is opened. Both files
+    are handled as UTF-8: the output is dumped with ``ensure_ascii=False``,
+    so relying on the locale encoding could fail on non-ASCII text.
+
+    :param input_path: path to input json file
+    :type input_path: str
+    :param output_path: path to output json file
+    :type output_path: str
+    :param attribute: key being removed; a missing key is ignored
+    :type attribute: str
+    """
+    with open(input_path, encoding='utf-8') as input_file:
+        json_obj = json.load(input_file)
+    # pop with a default so a document without the key passes through as is
+    json_obj.pop(attribute, None)
+    with open(output_path, 'w', encoding='utf-8') as output_file:
+        json.dump(json_obj, output_file, ensure_ascii=False)


 class ConverterWorker(nlp_ws.NLPWorker):
@@ -128,9 +153,12 @@ class ConverterWorker(nlp_ws.NLPWorker):
            (default = True)
        :type json_text: bool
        """
-        if input_format == output_format and \
-                (output_format == "identical" or
-                 input_format == output_format):
+        if input_format == "json" and output_format == "json" and \
+                not json_text:
+            remove_attribute(input_path, output_path, "text")
+        elif input_format == output_format and \
+                (output_tagset == "identical" or
+                 input_tagset == output_tagset):
            shutil.copyfile(input_path, output_path)
        elif input_format == "ccl" and output_format == "lemmas":
            ccl_2_lemmas(input_path, output_path)

--- a/tests/conftest.py
+++ b/tests/conftest.py
+from os.path import join
+
+import pytest
+
+from src.converter import ConverterWorker
+
+@pytest.fixture
+def dir_testdata():
+    return join("tests", "testdata")
+
+
+@pytest.fixture
+def input_dir(dir_testdata):
+    return join(dir_testdata, 'input')
+
+
+@pytest.fixture
+def output_dir(dir_testdata):
+    return join(dir_testdata, 'output')
+
+
+@pytest.fixture
+def expected_dir(dir_testdata):
+    return join(dir_testdata, 'expected')
+
+
+@pytest.fixture
+def identical_input_output_formats_input():
+    return 'identical_input_output_formats.json'
+
+@pytest.fixture
+def identical_input_output_formats_output():
+    return 'identical_input_output_formats.json'
+
+@pytest.fixture
+def identical_input_output_formats_expected():
+    return 'identical_input_output_formats.json'
+
+
+@pytest.fixture
+def identical_input_output_formats_task_options():
+    return {
+            'input_format': 'json',
+            'output_format': 'json',
+            'input_tagset': 'ud',
+            'output_tagset': 'identical',
+            'json_text': True,
+           }
+
+
+@pytest.fixture
+def remove_json_text_input():
+    return 'remove_json_text.json'
+
+@pytest.fixture
+def remove_json_text_output():
+    return 'remove_json_text.json'
+
+@pytest.fixture
+def remove_json_text_expected():
+    return 'remove_json_text.json'
+
+@pytest.fixture
+def remove_json_text_task_options():
+    return {
+            'input_format': 'json',
+            'output_format': 'json',
+            'input_tagset': 'ud',
+            'output_tagset': 'identical',
+            'json_text': False,
+           }
+
+
+@pytest.fixture
+def simple_ccl2json_conversion_input():
+    return 'simple_ccl2json_conversion.xml'
+
+@pytest.fixture
+def simple_ccl2json_conversion_output():
+    return 'simple_ccl2json_conversion.json'
+
+@pytest.fixture
+def simple_ccl2json_conversion_expected():
+    return 'simple_ccl2json_conversion.json'
+
+
+@pytest.fixture
+def simple_ccl2json_conversion_task_options():
+    return {
+            'input_format': 'ccl',
+            'output_format': 'json',
+            'input_tagset': 'nkjp',
+            'output_tagset': 'identical',
+            'json_text': False,
+           }
+
+
+@pytest.fixture
+def simple_nkjp2ud_conversion_input():
+    return 'simple_nkjp2ud_conversion.xml'
+
+@pytest.fixture
+def simple_nkjp2ud_conversion_output():
+    return 'simple_nkjp2ud_conversion.json'
+
+@pytest.fixture
+def simple_nkjp2ud_conversion_expected():
+    return 'simple_nkjp2ud_conversion.json'
+
+
+@pytest.fixture
+def simple_nkjp2ud_conversion_task_options():
+    return {
+            'input_format': 'ccl',
+            'output_format': 'json',
+            'input_tagset': 'nkjp',
+            'output_tagset': 'ud',
+            'json_text': False,
+           }
+
+
+@pytest.fixture
+def simple_ccl2lemmas_conversion_input():
+    return 'simple_ccl2lemmas_conversion.xml'
+
+@pytest.fixture
+def simple_ccl2lemmas_conversion_output():
+    return 'simple_ccl2lemmas_conversion'
+
+@pytest.fixture
+def simple_ccl2lemmas_conversion_expected():
+    return 'simple_ccl2lemmas_conversion'
+
+
+@pytest.fixture
+def simple_ccl2lemmas_conversion_task_options():
+    return {
+            'input_format': 'ccl',
+            'output_format': 'lemmas',
+            'input_tagset': 'nkjp',
+            'output_tagset': 'identical',
+            'json_text': False,
+           }
+
+
+@pytest.fixture
+def undefined_input_format_input(identical_input_output_formats_input):
+    """Reuse the identical-formats input file name for this scenario.
+
+    The sibling fixture is requested as an argument so that its value is
+    returned; referring to it by bare name would return the fixture
+    function object instead of a file name.
+    """
+    return identical_input_output_formats_input
+
+
+@pytest.fixture
+def undefined_input_format_task_options():
+    return {
+            'input_format': 'foo',
+            'output_format': 'json',
+            'input_tagset': 'ud',
+            'output_tagset': 'identical',
+            'json_text': False,
+           }
+
+
+@pytest.fixture
+def undefined_output_format_input(identical_input_output_formats_input):
+    """Reuse the identical-formats input file name for this scenario.
+
+    The sibling fixture is requested as an argument so that its value is
+    returned; referring to it by bare name would return the fixture
+    function object instead of a file name.
+    """
+    return identical_input_output_formats_input
+
+
+@pytest.fixture
+def undefined_output_format_task_options():
+    return {
+            'input_format': 'json',
+            'output_format': 'foobar',
+            'input_tagset': 'ud',
+            'output_tagset': 'identical',
+            'json_text': False,
+           }
+
+
+@pytest.fixture
+def config():
+    return {}
+
+
+@pytest.fixture
+def worker(config):
+    """Build a ConverterWorker and run its static_init with ``config``."""
+    converter_worker = ConverterWorker()
+    converter_worker.static_init(config)
+    return converter_worker
--- a/tests/test.py
+++ b/tests/test.py
+import os
+from filecmp import cmp
+from os.path import join
+
+import pytest
+
+from src.converter import ConverterWorker
+
+
+def test_init():
+    """Check that a ConverterWorker can be constructed without arguments."""
+    worker = ConverterWorker()
+    assert isinstance(worker, ConverterWorker)
+
+def test_identical_input_output_formats(
+        input_dir, output_dir, expected_dir,
+        identical_input_output_formats_input,
+        identical_input_output_formats_task_options,
+        identical_input_output_formats_output,
+        identical_input_output_formats_expected,
+        worker):
+    """Run the identical-formats scenario and compare with the expected file."""
+    output_path = join(output_dir, identical_input_output_formats_output)
+    expected_path = join(expected_dir,
+                         identical_input_output_formats_expected)
+    try:
+        worker.process(
+            join(input_dir, identical_input_output_formats_input),
+            identical_input_output_formats_task_options,
+            output_path
+        )
+        # shallow=False forces a content comparison instead of trusting
+        # matching os.stat() signatures
+        assert cmp(output_path, expected_path, shallow=False)
+    finally:
+        # remove the produced file even when processing or the assert fails
+        if os.path.exists(output_path):
+            os.remove(output_path)
+
+
+def test_remove_json_text(
+        input_dir, output_dir, expected_dir,
+        remove_json_text_input,
+        remove_json_text_task_options,
+        remove_json_text_output,
+        remove_json_text_expected,
+        worker):
+    """Run the remove-json-text scenario and compare with the expected file."""
+    output_path = join(output_dir, remove_json_text_output)
+    expected_path = join(expected_dir, remove_json_text_expected)
+    try:
+        worker.process(
+            join(input_dir, remove_json_text_input),
+            remove_json_text_task_options,
+            output_path
+        )
+        # shallow=False forces a content comparison instead of trusting
+        # matching os.stat() signatures
+        assert cmp(output_path, expected_path, shallow=False)
+    finally:
+        # remove the produced file even when processing or the assert fails
+        if os.path.exists(output_path):
+            os.remove(output_path)
+
+
+def test_simple_ccl2json_conversion(
+        input_dir, output_dir, expected_dir,
+        simple_ccl2json_conversion_input,
+        simple_ccl2json_conversion_task_options,
+        simple_ccl2json_conversion_output,
+        simple_ccl2json_conversion_expected,
+        worker):
+    """Run the ccl2json scenario and compare with the expected file."""
+    output_path = join(output_dir, simple_ccl2json_conversion_output)
+    expected_path = join(expected_dir, simple_ccl2json_conversion_expected)
+    try:
+        worker.process(
+            join(input_dir, simple_ccl2json_conversion_input),
+            simple_ccl2json_conversion_task_options,
+            output_path
+        )
+        # shallow=False forces a content comparison instead of trusting
+        # matching os.stat() signatures
+        assert cmp(output_path, expected_path, shallow=False)
+    finally:
+        # remove the produced file even when processing or the assert fails
+        if os.path.exists(output_path):
+            os.remove(output_path)
+
+
+def test_simple_nkjp2ud_conversion(
+        input_dir, output_dir, expected_dir,
+        simple_nkjp2ud_conversion_input,
+        simple_nkjp2ud_conversion_task_options,
+        simple_nkjp2ud_conversion_output,
+        simple_nkjp2ud_conversion_expected,
+        worker):
+    """Run the nkjp2ud scenario and compare with the expected file."""
+    output_path = join(output_dir, simple_nkjp2ud_conversion_output)
+    expected_path = join(expected_dir, simple_nkjp2ud_conversion_expected)
+    try:
+        worker.process(
+            join(input_dir, simple_nkjp2ud_conversion_input),
+            simple_nkjp2ud_conversion_task_options,
+            output_path
+        )
+        # shallow=False forces a content comparison instead of trusting
+        # matching os.stat() signatures
+        assert cmp(output_path, expected_path, shallow=False)
+    finally:
+        # remove the produced file even when processing or the assert fails
+        if os.path.exists(output_path):
+            os.remove(output_path)
+
+
+def test_simple_ccl2lemmas_conversion(
+        input_dir, output_dir, expected_dir,
+        simple_ccl2lemmas_conversion_input,
+        simple_ccl2lemmas_conversion_task_options,
+        simple_ccl2lemmas_conversion_output,
+        simple_ccl2lemmas_conversion_expected,
+        worker):
+    """Run the ccl2lemmas scenario and compare with the expected file."""
+    output_path = join(output_dir, simple_ccl2lemmas_conversion_output)
+    expected_path = join(expected_dir,
+                         simple_ccl2lemmas_conversion_expected)
+    try:
+        worker.process(
+            join(input_dir, simple_ccl2lemmas_conversion_input),
+            simple_ccl2lemmas_conversion_task_options,
+            output_path
+        )
+        # shallow=False forces a content comparison instead of trusting
+        # matching os.stat() signatures
+        assert cmp(output_path, expected_path, shallow=False)
+    finally:
+        # remove the produced file even when processing or the assert fails
+        if os.path.exists(output_path):
+            os.remove(output_path)
+
+
+#raises exception
+def test_undefined_input_format(
+        input_dir, output_dir,
+        undefined_input_format_input,
+        undefined_input_format_task_options,
+        worker):
+    """Expect the undefined-input-format scenario to raise an exception."""
+    with pytest.raises(Exception):
+        source_path = join(input_dir, undefined_input_format_input)
+        target_path = join(output_dir, undefined_input_format_input)
+        worker.process(source_path, undefined_input_format_task_options,
+                       target_path)
+
+
+# raises exception
+def test_undefined_output_format(
+        input_dir, output_dir,
+        undefined_output_format_input,
+        undefined_output_format_task_options,
+        worker):
+    """Expect the undefined-output-format scenario to raise an exception."""
+    with pytest.raises(Exception):
+        source_path = join(input_dir, undefined_output_format_input)
+        target_path = join(output_dir, undefined_output_format_input)
+        worker.process(source_path, undefined_output_format_task_options,
+                       target_path)
--- a/tests/testdata/expected/identical_input_output_formats.json
+++ b/tests/testdata/expected/identical_input_output_formats.json
+{"filename": "simple_text_pl", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 7], "orth": "Nazywam", "lexemes": [{"lemma": "Nazywam", "mstag": "VERB", "disamb": true}]}, {"index": 2, "position": [8, 11], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "PRON", "disamb": true}]}, {"index": 3, "position": [12, 15], "orth": "Jan", "lexemes": [{"lemma": "Jan", "mstag": "PROPN", "disamb": true}]}, {"index": 4, "position": [16, 24], "orth": "Kowalski", "lexemes": [{"lemma": "Kowalski", "mstag": "PROPN", "disamb": true}]}, {"index": 5, "position": [25, 26], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "CCONJ", "disamb": true}]}, {"index": 6, "position": [27, 35], "orth": "mieszkam", "lexemes": [{"lemma": "mieszkać", "mstag": "VERB", "disamb": true}]}, {"index": 7, "position": [36, 38], "orth": "we", "lexemes": [{"lemma": "w", "mstag": "ADP", "disamb": true}]}, {"index": 8, "position": [39, 48], "orth": "Wrocławiu", "lexemes": [{"lemma": "Wrocław", "mstag": "PROPN", "disamb": true}]}, {"index": 9, "position": [49, 49], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 10, "position": [50, 53], "orth": "Mój", "lexemes": [{"lemma": "mój", "mstag": "DET", "disamb": true}]}, {"index": 11, "position": [54, 62], "orth": "rodzinny", "lexemes": [{"lemma": "rodzinny", "mstag": "ADJ", "disamb": true}]}, {"index": 12, "position": [63, 66], "orth": "dom", "lexemes": [{"lemma": "dom", "mstag": "NOUN", "disamb": true}]}, {"index": 13, "position": [67, 73], "orth": "mieści", "lexemes": [{"lemma": "mieścić", "mstag": "VERB", "disamb": true}]}, {"index": 14, "position": [74, 77], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "PRON", "disamb": true}]}, {"index": 15, "position": [78, 82], "orth": "przy", "lexemes": [{"lemma": "przy", "mstag": "ADP", "disamb": true}]}, {"index": 16, "position": [83, 88], "orth": "aleji", "lexemes": [{"lemma": "aleji", "mstag": "NOUN", "disamb": true}]}, {"index": 17, "position": [89, 93], "orth": 
"Jana", "lexemes": [{"lemma": "Jan", "mstag": "PROPN", "disamb": true}]}, {"index": 18, "position": [94, 99], "orth": "Pawła", "lexemes": [{"lemma": "Paweł", "mstag": "PROPN", "disamb": true}]}, {"index": 19, "position": [100, 102], "orth": "II", "lexemes": [{"lemma": "II", "mstag": "ADJ", "disamb": true}]}, {"index": 20, "position": [103, 103], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}], "text": "Nazywam się Jan Kowalski i mieszkam we Wrocławiu. Mój rodzinny dom mieści się przy aleji Jana Pawła II."}
\ No newline at end of file
--- a/tests/testdata/expected/post_postagger_ccl_json_text_expected.json
+++ b/tests/testdata/expected/post_postagger_ccl_json_text_expected.json
--- a/tests/testdata/expected/post_postagger_ccl_no_json_text_expected.json
+++ b/tests/testdata/expected/post_postagger_ccl_no_json_text_expected.json
--- a/tests/testdata/expected/post_postagger_json_json_text_expected.json
+++ b/tests/testdata/expected/post_postagger_json_json_text_expected.json
--- a/tests/testdata/expected/post_spacy_ner_no_json_text_expected.json
+++ b/tests/testdata/expected/post_spacy_ner_no_json_text_expected.json
+{"filename": "post_spacy_input", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 4], "orth": "When", "lexemes": [{"lemma": "when", "mstag": "SCONJ", "disamb": true}]}, {"index": 2, "position": [5, 14], "orth": "Sebastian", "lexemes": [{"lemma": "Sebastian", "mstag": "PROPN", "disamb": true}]}, {"index": 3, "position": [15, 20], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 4, "position": [21, 28], "orth": "started", "lexemes": [{"lemma": "start", "mstag": "VERB", "disamb": true}]}, {"index": 5, "position": [29, 36], "orth": "working", "lexemes": [{"lemma": "work", "mstag": "VERB", "disamb": true}]}, {"index": 6, "position": [37, 39], "orth": "on", "lexemes": [{"lemma": "on", "mstag": "ADP", "disamb": true}]}, {"index": 7, "position": [40, 44], "orth": "self", "lexemes": [{"lemma": "self", "mstag": "NOUN", "disamb": true}]}, {"index": 8, "position": [45, 45], "orth": "-", "lexemes": [{"lemma": "-", "mstag": "PUNCT", "disamb": true}]}, {"index": 9, "position": [46, 52], "orth": "driving", "lexemes": [{"lemma": "drive", "mstag": "VERB", "disamb": true}]}, {"index": 10, "position": [53, 57], "orth": "cars", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 11, "position": [58, 60], "orth": "at", "lexemes": [{"lemma": "at", "mstag": "ADP", "disamb": true}]}, {"index": 12, "position": [61, 67], "orth": "Google", "lexemes": [{"lemma": "Google", "mstag": "PROPN", "disamb": true}]}, {"index": 13, "position": [68, 70], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 14, "position": [71, 75], "orth": "2007", "lexemes": [{"lemma": "2007", "mstag": "NUM", "disamb": true}]}, {"index": 15, "position": [76, 76], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 16, "position": [77, 80], "orth": "few", "lexemes": [{"lemma": "few", "mstag": "ADJ", "disamb": true}]}, {"index": 17, "position": [81, 87], "orth": "people", 
"lexemes": [{"lemma": "people", "mstag": "NOUN", "disamb": true}]}, {"index": 18, "position": [88, 95], "orth": "outside", "lexemes": [{"lemma": "outside", "mstag": "ADV", "disamb": true}]}, {"index": 19, "position": [96, 98], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 20, "position": [99, 102], "orth": "the", "lexemes": [{"lemma": "the", "mstag": "DET", "disamb": true}]}, {"index": 21, "position": [103, 110], "orth": "company", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 22, "position": [111, 115], "orth": "took", "lexemes": [{"lemma": "take", "mstag": "VERB", "disamb": true}]}, {"index": 23, "position": [116, 119], "orth": "him", "lexemes": [{"lemma": "he", "mstag": "PRON", "disamb": true}]}, {"index": 24, "position": [120, 129], "orth": "seriously", "lexemes": [{"lemma": "seriously", "mstag": "ADV", "disamb": true}]}, {"index": 25, "position": [130, 130], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 26, "position": [131, 132], "orth": "“", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 27, "position": [133, 133], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 28, "position": [134, 137], "orth": "can", "lexemes": [{"lemma": "can", "mstag": "AUX", "disamb": true}]}, {"index": 29, "position": [138, 142], "orth": "tell", "lexemes": [{"lemma": "tell", "mstag": "VERB", "disamb": true}]}, {"index": 30, "position": [143, 146], "orth": "you", "lexemes": [{"lemma": "you", "mstag": "PRON", "disamb": true}]}, {"index": 31, "position": [147, 151], "orth": "very", "lexemes": [{"lemma": "very", "mstag": "ADV", "disamb": true}]}, {"index": 32, "position": [152, 158], "orth": "senior", "lexemes": [{"lemma": "senior", "mstag": "ADJ", "disamb": true}]}, {"index": 33, "position": [159, 163], "orth": "CEOs", "lexemes": [{"lemma": "ceo", "mstag": "NOUN", "disamb": true}]}, {"index": 34, "position": 
[164, 166], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 35, "position": [167, 172], "orth": "major", "lexemes": [{"lemma": "major", "mstag": "ADJ", "disamb": true}]}, {"index": 36, "position": [173, 181], "orth": "American", "lexemes": [{"lemma": "american", "mstag": "ADJ", "disamb": true}]}, {"index": 37, "position": [182, 185], "orth": "car", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 38, "position": [186, 195], "orth": "companies", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 39, "position": [196, 201], "orth": "would", "lexemes": [{"lemma": "would", "mstag": "AUX", "disamb": true}]}, {"index": 40, "position": [202, 207], "orth": "shake", "lexemes": [{"lemma": "shake", "mstag": "VERB", "disamb": true}]}, {"index": 41, "position": [208, 210], "orth": "my", "lexemes": [{"lemma": "my", "mstag": "PRON", "disamb": true}]}, {"index": 42, "position": [211, 215], "orth": "hand", "lexemes": [{"lemma": "hand", "mstag": "NOUN", "disamb": true}]}, {"index": 43, "position": [216, 219], "orth": "and", "lexemes": [{"lemma": "and", "mstag": "CCONJ", "disamb": true}]}, {"index": 44, "position": [220, 224], "orth": "turn", "lexemes": [{"lemma": "turn", "mstag": "VERB", "disamb": true}]}, {"index": 45, "position": [225, 229], "orth": "away", "lexemes": [{"lemma": "away", "mstag": "ADV", "disamb": true}]}, {"index": 46, "position": [230, 237], "orth": "because", "lexemes": [{"lemma": "because", "mstag": "SCONJ", "disamb": true}]}, {"index": 47, "position": [238, 239], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 48, "position": [240, 243], "orth": "was", "lexemes": [{"lemma": "be", "mstag": "AUX", "disamb": true}]}, {"index": 49, "position": [244, 246], "orth": "n’t", "lexemes": [{"lemma": "not", "mstag": "PART", "disamb": true}]}, {"index": 50, "position": [247, 252], "orth": "worth", "lexemes": [{"lemma": "worth", "mstag": "ADJ", 
"disamb": true}]}, {"index": 51, "position": [253, 260], "orth": "talking", "lexemes": [{"lemma": "talk", "mstag": "VERB", "disamb": true}]}, {"index": 52, "position": [261, 263], "orth": "to", "lexemes": [{"lemma": "to", "mstag": "ADP", "disamb": true}]}, {"index": 53, "position": [264, 264], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 54, "position": [265, 265], "orth": "”", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 55, "position": [266, 270], "orth": "said", "lexemes": [{"lemma": "say", "mstag": "VERB", "disamb": true}]}, {"index": 56, "position": [271, 276], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 57, "position": [277, 277], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 58, "position": [278, 280], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 59, "position": [281, 283], "orth": "an", "lexemes": [{"lemma": "an", "mstag": "DET", "disamb": true}]}, {"index": 60, "position": [284, 293], "orth": "interview", "lexemes": [{"lemma": "interview", "mstag": "NOUN", "disamb": true}]}, {"index": 61, "position": [294, 298], "orth": "with", "lexemes": [{"lemma": "with", "mstag": "ADP", "disamb": true}]}, {"index": 62, "position": [299, 305], "orth": "Recode", "lexemes": [{"lemma": "Recode", "mstag": "PROPN", "disamb": true}]}, {"index": 63, "position": [306, 313], "orth": "earlier", "lexemes": [{"lemma": "early", "mstag": "ADV", "disamb": true}]}, {"index": 64, "position": [314, 318], "orth": "this", "lexemes": [{"lemma": "this", "mstag": "DET", "disamb": true}]}, {"index": 65, "position": [319, 323], "orth": "week", "lexemes": [{"lemma": "week", "mstag": "NOUN", "disamb": true}]}, {"index": 66, "position": [324, 324], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 67, "position": [325, 325], "orth": "\n", "lexemes": [{"lemma": 
"\n", "mstag": "SPACE", "disamb": true}]}], "entities": [{"text": "Sebastian Thrun", "type": "PERSON", "tokens": [1, 3], "position": [5, 20]}, {"text": "Google", "type": "ORG", "tokens": [11, 12], "position": [61, 67]}, {"text": "2007", "type": "DATE", "tokens": [13, 14], "position": [71, 75]}, {"text": "American", "type": "NORP", "tokens": [35, 36], "position": [173, 181]}, {"text": "Thrun", "type": "PERSON", "tokens": [55, 56], "position": [271, 276]}, {"text": "Recode", "type": "ORG", "tokens": [61, 62], "position": [299, 305]}, {"text": "earlier this week", "type": "DATE", "tokens": [62, 65], "position": [306, 323]}]}
\ No newline at end of file
--- a/tests/testdata/expected/post_spacy_tagger_json_text_expected.json
+++ b/tests/testdata/expected/post_spacy_tagger_json_text_expected.json
+{"filename": "post_spacy_input", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 4], "orth": "When", "lexemes": [{"lemma": "when", "mstag": "SCONJ", "disamb": true}]}, {"index": 2, "position": [5, 14], "orth": "Sebastian", "lexemes": [{"lemma": "Sebastian", "mstag": "PROPN", "disamb": true}]}, {"index": 3, "position": [15, 20], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 4, "position": [21, 28], "orth": "started", "lexemes": [{"lemma": "start", "mstag": "VERB", "disamb": true}]}, {"index": 5, "position": [29, 36], "orth": "working", "lexemes": [{"lemma": "work", "mstag": "VERB", "disamb": true}]}, {"index": 6, "position": [37, 39], "orth": "on", "lexemes": [{"lemma": "on", "mstag": "ADP", "disamb": true}]}, {"index": 7, "position": [40, 44], "orth": "self", "lexemes": [{"lemma": "self", "mstag": "NOUN", "disamb": true}]}, {"index": 8, "position": [45, 45], "orth": "-", "lexemes": [{"lemma": "-", "mstag": "PUNCT", "disamb": true}]}, {"index": 9, "position": [46, 52], "orth": "driving", "lexemes": [{"lemma": "drive", "mstag": "VERB", "disamb": true}]}, {"index": 10, "position": [53, 57], "orth": "cars", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 11, "position": [58, 60], "orth": "at", "lexemes": [{"lemma": "at", "mstag": "ADP", "disamb": true}]}, {"index": 12, "position": [61, 67], "orth": "Google", "lexemes": [{"lemma": "Google", "mstag": "PROPN", "disamb": true}]}, {"index": 13, "position": [68, 70], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 14, "position": [71, 75], "orth": "2007", "lexemes": [{"lemma": "2007", "mstag": "NUM", "disamb": true}]}, {"index": 15, "position": [76, 76], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 16, "position": [77, 80], "orth": "few", "lexemes": [{"lemma": "few", "mstag": "ADJ", "disamb": true}]}, {"index": 17, "position": [81, 87], "orth": "people", 
"lexemes": [{"lemma": "people", "mstag": "NOUN", "disamb": true}]}, {"index": 18, "position": [88, 95], "orth": "outside", "lexemes": [{"lemma": "outside", "mstag": "ADV", "disamb": true}]}, {"index": 19, "position": [96, 98], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 20, "position": [99, 102], "orth": "the", "lexemes": [{"lemma": "the", "mstag": "DET", "disamb": true}]}, {"index": 21, "position": [103, 110], "orth": "company", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 22, "position": [111, 115], "orth": "took", "lexemes": [{"lemma": "take", "mstag": "VERB", "disamb": true}]}, {"index": 23, "position": [116, 119], "orth": "him", "lexemes": [{"lemma": "he", "mstag": "PRON", "disamb": true}]}, {"index": 24, "position": [120, 129], "orth": "seriously", "lexemes": [{"lemma": "seriously", "mstag": "ADV", "disamb": true}]}, {"index": 25, "position": [130, 130], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 26, "position": [131, 132], "orth": "“", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 27, "position": [133, 133], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 28, "position": [134, 137], "orth": "can", "lexemes": [{"lemma": "can", "mstag": "AUX", "disamb": true}]}, {"index": 29, "position": [138, 142], "orth": "tell", "lexemes": [{"lemma": "tell", "mstag": "VERB", "disamb": true}]}, {"index": 30, "position": [143, 146], "orth": "you", "lexemes": [{"lemma": "you", "mstag": "PRON", "disamb": true}]}, {"index": 31, "position": [147, 151], "orth": "very", "lexemes": [{"lemma": "very", "mstag": "ADV", "disamb": true}]}, {"index": 32, "position": [152, 158], "orth": "senior", "lexemes": [{"lemma": "senior", "mstag": "ADJ", "disamb": true}]}, {"index": 33, "position": [159, 163], "orth": "CEOs", "lexemes": [{"lemma": "ceo", "mstag": "NOUN", "disamb": true}]}, {"index": 34, "position": 
[164, 166], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 35, "position": [167, 172], "orth": "major", "lexemes": [{"lemma": "major", "mstag": "ADJ", "disamb": true}]}, {"index": 36, "position": [173, 181], "orth": "American", "lexemes": [{"lemma": "american", "mstag": "ADJ", "disamb": true}]}, {"index": 37, "position": [182, 185], "orth": "car", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 38, "position": [186, 195], "orth": "companies", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 39, "position": [196, 201], "orth": "would", "lexemes": [{"lemma": "would", "mstag": "AUX", "disamb": true}]}, {"index": 40, "position": [202, 207], "orth": "shake", "lexemes": [{"lemma": "shake", "mstag": "VERB", "disamb": true}]}, {"index": 41, "position": [208, 210], "orth": "my", "lexemes": [{"lemma": "my", "mstag": "PRON", "disamb": true}]}, {"index": 42, "position": [211, 215], "orth": "hand", "lexemes": [{"lemma": "hand", "mstag": "NOUN", "disamb": true}]}, {"index": 43, "position": [216, 219], "orth": "and", "lexemes": [{"lemma": "and", "mstag": "CCONJ", "disamb": true}]}, {"index": 44, "position": [220, 224], "orth": "turn", "lexemes": [{"lemma": "turn", "mstag": "VERB", "disamb": true}]}, {"index": 45, "position": [225, 229], "orth": "away", "lexemes": [{"lemma": "away", "mstag": "ADV", "disamb": true}]}, {"index": 46, "position": [230, 237], "orth": "because", "lexemes": [{"lemma": "because", "mstag": "SCONJ", "disamb": true}]}, {"index": 47, "position": [238, 239], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 48, "position": [240, 243], "orth": "was", "lexemes": [{"lemma": "be", "mstag": "AUX", "disamb": true}]}, {"index": 49, "position": [244, 246], "orth": "n’t", "lexemes": [{"lemma": "not", "mstag": "PART", "disamb": true}]}, {"index": 50, "position": [247, 252], "orth": "worth", "lexemes": [{"lemma": "worth", "mstag": "ADJ", 
"disamb": true}]}, {"index": 51, "position": [253, 260], "orth": "talking", "lexemes": [{"lemma": "talk", "mstag": "VERB", "disamb": true}]}, {"index": 52, "position": [261, 263], "orth": "to", "lexemes": [{"lemma": "to", "mstag": "ADP", "disamb": true}]}, {"index": 53, "position": [264, 264], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 54, "position": [265, 265], "orth": "”", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 55, "position": [266, 270], "orth": "said", "lexemes": [{"lemma": "say", "mstag": "VERB", "disamb": true}]}, {"index": 56, "position": [271, 276], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 57, "position": [277, 277], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 58, "position": [278, 280], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 59, "position": [281, 283], "orth": "an", "lexemes": [{"lemma": "an", "mstag": "DET", "disamb": true}]}, {"index": 60, "position": [284, 293], "orth": "interview", "lexemes": [{"lemma": "interview", "mstag": "NOUN", "disamb": true}]}, {"index": 61, "position": [294, 298], "orth": "with", "lexemes": [{"lemma": "with", "mstag": "ADP", "disamb": true}]}, {"index": 62, "position": [299, 305], "orth": "Recode", "lexemes": [{"lemma": "Recode", "mstag": "PROPN", "disamb": true}]}, {"index": 63, "position": [306, 313], "orth": "earlier", "lexemes": [{"lemma": "early", "mstag": "ADV", "disamb": true}]}, {"index": 64, "position": [314, 318], "orth": "this", "lexemes": [{"lemma": "this", "mstag": "DET", "disamb": true}]}, {"index": 65, "position": [319, 323], "orth": "week", "lexemes": [{"lemma": "week", "mstag": "NOUN", "disamb": true}]}, {"index": 66, "position": [324, 324], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 67, "position": [325, 325], "orth": "\n", "lexemes": [{"lemma": 
"\n", "mstag": "SPACE", "disamb": true}]}], "text": "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.\n"}
\ No newline at end of file
--- a/tests/testdata/expected/post_spacy_tagger_no_json_text_expected.json
+++ b/tests/testdata/expected/post_spacy_tagger_no_json_text_expected.json
+{"filename": "post_spacy_input", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 4], "orth": "When", "lexemes": [{"lemma": "when", "mstag": "SCONJ", "disamb": true}]}, {"index": 2, "position": [5, 14], "orth": "Sebastian", "lexemes": [{"lemma": "Sebastian", "mstag": "PROPN", "disamb": true}]}, {"index": 3, "position": [15, 20], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 4, "position": [21, 28], "orth": "started", "lexemes": [{"lemma": "start", "mstag": "VERB", "disamb": true}]}, {"index": 5, "position": [29, 36], "orth": "working", "lexemes": [{"lemma": "work", "mstag": "VERB", "disamb": true}]}, {"index": 6, "position": [37, 39], "orth": "on", "lexemes": [{"lemma": "on", "mstag": "ADP", "disamb": true}]}, {"index": 7, "position": [40, 44], "orth": "self", "lexemes": [{"lemma": "self", "mstag": "NOUN", "disamb": true}]}, {"index": 8, "position": [45, 45], "orth": "-", "lexemes": [{"lemma": "-", "mstag": "PUNCT", "disamb": true}]}, {"index": 9, "position": [46, 52], "orth": "driving", "lexemes": [{"lemma": "drive", "mstag": "VERB", "disamb": true}]}, {"index": 10, "position": [53, 57], "orth": "cars", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 11, "position": [58, 60], "orth": "at", "lexemes": [{"lemma": "at", "mstag": "ADP", "disamb": true}]}, {"index": 12, "position": [61, 67], "orth": "Google", "lexemes": [{"lemma": "Google", "mstag": "PROPN", "disamb": true}]}, {"index": 13, "position": [68, 70], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 14, "position": [71, 75], "orth": "2007", "lexemes": [{"lemma": "2007", "mstag": "NUM", "disamb": true}]}, {"index": 15, "position": [76, 76], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 16, "position": [77, 80], "orth": "few", "lexemes": [{"lemma": "few", "mstag": "ADJ", "disamb": true}]}, {"index": 17, "position": [81, 87], "orth": "people", 
"lexemes": [{"lemma": "people", "mstag": "NOUN", "disamb": true}]}, {"index": 18, "position": [88, 95], "orth": "outside", "lexemes": [{"lemma": "outside", "mstag": "ADV", "disamb": true}]}, {"index": 19, "position": [96, 98], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 20, "position": [99, 102], "orth": "the", "lexemes": [{"lemma": "the", "mstag": "DET", "disamb": true}]}, {"index": 21, "position": [103, 110], "orth": "company", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 22, "position": [111, 115], "orth": "took", "lexemes": [{"lemma": "take", "mstag": "VERB", "disamb": true}]}, {"index": 23, "position": [116, 119], "orth": "him", "lexemes": [{"lemma": "he", "mstag": "PRON", "disamb": true}]}, {"index": 24, "position": [120, 129], "orth": "seriously", "lexemes": [{"lemma": "seriously", "mstag": "ADV", "disamb": true}]}, {"index": 25, "position": [130, 130], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 26, "position": [131, 132], "orth": "“", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 27, "position": [133, 133], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 28, "position": [134, 137], "orth": "can", "lexemes": [{"lemma": "can", "mstag": "AUX", "disamb": true}]}, {"index": 29, "position": [138, 142], "orth": "tell", "lexemes": [{"lemma": "tell", "mstag": "VERB", "disamb": true}]}, {"index": 30, "position": [143, 146], "orth": "you", "lexemes": [{"lemma": "you", "mstag": "PRON", "disamb": true}]}, {"index": 31, "position": [147, 151], "orth": "very", "lexemes": [{"lemma": "very", "mstag": "ADV", "disamb": true}]}, {"index": 32, "position": [152, 158], "orth": "senior", "lexemes": [{"lemma": "senior", "mstag": "ADJ", "disamb": true}]}, {"index": 33, "position": [159, 163], "orth": "CEOs", "lexemes": [{"lemma": "ceo", "mstag": "NOUN", "disamb": true}]}, {"index": 34, "position": 
[164, 166], "orth": "of", "lexemes": [{"lemma": "of", "mstag": "ADP", "disamb": true}]}, {"index": 35, "position": [167, 172], "orth": "major", "lexemes": [{"lemma": "major", "mstag": "ADJ", "disamb": true}]}, {"index": 36, "position": [173, 181], "orth": "American", "lexemes": [{"lemma": "american", "mstag": "ADJ", "disamb": true}]}, {"index": 37, "position": [182, 185], "orth": "car", "lexemes": [{"lemma": "car", "mstag": "NOUN", "disamb": true}]}, {"index": 38, "position": [186, 195], "orth": "companies", "lexemes": [{"lemma": "company", "mstag": "NOUN", "disamb": true}]}, {"index": 39, "position": [196, 201], "orth": "would", "lexemes": [{"lemma": "would", "mstag": "AUX", "disamb": true}]}, {"index": 40, "position": [202, 207], "orth": "shake", "lexemes": [{"lemma": "shake", "mstag": "VERB", "disamb": true}]}, {"index": 41, "position": [208, 210], "orth": "my", "lexemes": [{"lemma": "my", "mstag": "PRON", "disamb": true}]}, {"index": 42, "position": [211, 215], "orth": "hand", "lexemes": [{"lemma": "hand", "mstag": "NOUN", "disamb": true}]}, {"index": 43, "position": [216, 219], "orth": "and", "lexemes": [{"lemma": "and", "mstag": "CCONJ", "disamb": true}]}, {"index": 44, "position": [220, 224], "orth": "turn", "lexemes": [{"lemma": "turn", "mstag": "VERB", "disamb": true}]}, {"index": 45, "position": [225, 229], "orth": "away", "lexemes": [{"lemma": "away", "mstag": "ADV", "disamb": true}]}, {"index": 46, "position": [230, 237], "orth": "because", "lexemes": [{"lemma": "because", "mstag": "SCONJ", "disamb": true}]}, {"index": 47, "position": [238, 239], "orth": "I", "lexemes": [{"lemma": "I", "mstag": "PRON", "disamb": true}]}, {"index": 48, "position": [240, 243], "orth": "was", "lexemes": [{"lemma": "be", "mstag": "AUX", "disamb": true}]}, {"index": 49, "position": [244, 246], "orth": "n’t", "lexemes": [{"lemma": "not", "mstag": "PART", "disamb": true}]}, {"index": 50, "position": [247, 252], "orth": "worth", "lexemes": [{"lemma": "worth", "mstag": "ADJ", 
"disamb": true}]}, {"index": 51, "position": [253, 260], "orth": "talking", "lexemes": [{"lemma": "talk", "mstag": "VERB", "disamb": true}]}, {"index": 52, "position": [261, 263], "orth": "to", "lexemes": [{"lemma": "to", "mstag": "ADP", "disamb": true}]}, {"index": 53, "position": [264, 264], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 54, "position": [265, 265], "orth": "”", "lexemes": [{"lemma": "\"", "mstag": "PUNCT", "disamb": true}]}, {"index": 55, "position": [266, 270], "orth": "said", "lexemes": [{"lemma": "say", "mstag": "VERB", "disamb": true}]}, {"index": 56, "position": [271, 276], "orth": "Thrun", "lexemes": [{"lemma": "Thrun", "mstag": "PROPN", "disamb": true}]}, {"index": 57, "position": [277, 277], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 58, "position": [278, 280], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 59, "position": [281, 283], "orth": "an", "lexemes": [{"lemma": "an", "mstag": "DET", "disamb": true}]}, {"index": 60, "position": [284, 293], "orth": "interview", "lexemes": [{"lemma": "interview", "mstag": "NOUN", "disamb": true}]}, {"index": 61, "position": [294, 298], "orth": "with", "lexemes": [{"lemma": "with", "mstag": "ADP", "disamb": true}]}, {"index": 62, "position": [299, 305], "orth": "Recode", "lexemes": [{"lemma": "Recode", "mstag": "PROPN", "disamb": true}]}, {"index": 63, "position": [306, 313], "orth": "earlier", "lexemes": [{"lemma": "early", "mstag": "ADV", "disamb": true}]}, {"index": 64, "position": [314, 318], "orth": "this", "lexemes": [{"lemma": "this", "mstag": "DET", "disamb": true}]}, {"index": 65, "position": [319, 323], "orth": "week", "lexemes": [{"lemma": "week", "mstag": "NOUN", "disamb": true}]}, {"index": 66, "position": [324, 324], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 67, "position": [325, 325], "orth": "\n", "lexemes": [{"lemma": 
"\n", "mstag": "SPACE", "disamb": true}]}]}
\ No newline at end of file
--- a/tests/testdata/expected/pre_fextor3_expected.json
+++ b/tests/testdata/expected/pre_fextor3_expected.json
+{"base": {"Nazywam": 1, "się": 2, "Jan": 2, "Kowalski": 1, "i": 1, "mieszkać": 1, "w": 1, "Wrocław": 1, ".": 2, "mój": 1, "rodzinny": 1, "dom": 1, "mieścić": 1, "przy": 1, "aleji": 1, "Paweł": 1, "II": 1}}
\ No newline at end of file
--- a/tests/testdata/expected/remove_json_text.json
+++ b/tests/testdata/expected/remove_json_text.json
+{"filename": "simple_text_pl", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 7], "orth": "Nazywam", "lexemes": [{"lemma": "Nazywam", "mstag": "VERB", "disamb": true}]}, {"index": 2, "position": [8, 11], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "PRON", "disamb": true}]}, {"index": 3, "position": [12, 15], "orth": "Jan", "lexemes": [{"lemma": "Jan", "mstag": "PROPN", "disamb": true}]}, {"index": 4, "position": [16, 24], "orth": "Kowalski", "lexemes": [{"lemma": "Kowalski", "mstag": "PROPN", "disamb": true}]}, {"index": 5, "position": [25, 26], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "CCONJ", "disamb": true}]}, {"index": 6, "position": [27, 35], "orth": "mieszkam", "lexemes": [{"lemma": "mieszkać", "mstag": "VERB", "disamb": true}]}, {"index": 7, "position": [36, 38], "orth": "we", "lexemes": [{"lemma": "w", "mstag": "ADP", "disamb": true}]}, {"index": 8, "position": [39, 48], "orth": "Wrocławiu", "lexemes": [{"lemma": "Wrocław", "mstag": "PROPN", "disamb": true}]}, {"index": 9, "position": [49, 49], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 10, "position": [50, 53], "orth": "Mój", "lexemes": [{"lemma": "mój", "mstag": "DET", "disamb": true}]}, {"index": 11, "position": [54, 62], "orth": "rodzinny", "lexemes": [{"lemma": "rodzinny", "mstag": "ADJ", "disamb": true}]}, {"index": 12, "position": [63, 66], "orth": "dom", "lexemes": [{"lemma": "dom", "mstag": "NOUN", "disamb": true}]}, {"index": 13, "position": [67, 73], "orth": "mieści", "lexemes": [{"lemma": "mieścić", "mstag": "VERB", "disamb": true}]}, {"index": 14, "position": [74, 77], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "PRON", "disamb": true}]}, {"index": 15, "position": [78, 82], "orth": "przy", "lexemes": [{"lemma": "przy", "mstag": "ADP", "disamb": true}]}, {"index": 16, "position": [83, 88], "orth": "aleji", "lexemes": [{"lemma": "aleji", "mstag": "NOUN", "disamb": true}]}, {"index": 17, "position": [89, 93], "orth": 
"Jana", "lexemes": [{"lemma": "Jan", "mstag": "PROPN", "disamb": true}]}, {"index": 18, "position": [94, 99], "orth": "Pawła", "lexemes": [{"lemma": "Paweł", "mstag": "PROPN", "disamb": true}]}, {"index": 19, "position": [100, 102], "orth": "II", "lexemes": [{"lemma": "II", "mstag": "ADJ", "disamb": true}]}, {"index": 20, "position": [103, 103], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}]}
\ No newline at end of file
--- a/tests/testdata/expected/simple_ccl2json_conversion.json
+++ b/tests/testdata/expected/simple_ccl2json_conversion.json
--- a/tests/testdata/expected/simple_ccl2lemmas_conversion
+++ b/tests/testdata/expected/simple_ccl2lemmas_conversion
+woda być jeden z pospolity substancja w wszechświat . cząsteczka woda być trzeci bardzo rozpowszechniony molekuła w ośrodek międzygwiazdowy , po cząsteczkowy wodór i tlenek węgiel .
+być również szeroko rozpowszechniony w Układ Słoneczny : stanowić istotny element budowa ceres i księżyc lodowy krążyć wokół planeta - olbrzym , jako domieszka występować w on atmosfera , a przypuszczać się , że duży on ilość znajdować się w wnętrze ten planeta .
+jako lód występować także na część planetoida , a zapewne również na obiekt transneptunowych .
+woda być bardzo rozpowszechniony także na powierzchnia Ziemia .
+występować głównie w ocean , który pokrywać 70 , 8 % powierzchnia glob , ale także w rzeka , jezioro i w postać stały w lodowiec .
+część woda znajdować się w atmosfera ( chmura , para wodny ) .
+niektóry związek chemiczny zawierać cząsteczka woda w swój budowa ( hydrat – określać się on wówczas miano woda krystalizacyjny ) .
+zawartość woda włączyć w struktura minerał w płaszcz Ziemia móc przekraczać łączny zawartość woda w ocean i inny zbiornik powierzchniowy nawet dziesięciokrotnie .
+woda występować w przyroda być roztwór sól i gaz .
+najwięcej sól mineralny zawierać woda morski i woda mineralny ; najmniej woda z opad atmosferyczny .
+woda o mały zawartość składnik mineralny nazywać woda miękki , natomiast zawierać znaczny ilość sól wapń i magnez – woda twardy .
+oprócz to woda naturalny zawierać rozpuścić substancja pochodzenie organiczny , na przykład . mocznik , kwas humusowy i tym podobne .
--- a/tests/testdata/expected/simple_nkjp2ud_conversion.json
+++ b/tests/testdata/expected/simple_nkjp2ud_conversion.json
--- a/tests/testdata/input/identical_input_output_formats.json
+++ b/tests/testdata/input/identical_input_output_formats.json
+{"filename": "simple_text_pl", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 7], "orth": "Nazywam", "lexemes": [{"lemma": "Nazywam", "mstag": "VERB", "disamb": true}]}, {"index": 2, "position": [8, 11], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "PRON", "disamb": true}]}, {"index": 3, "position": [12, 15], "orth": "Jan", "lexemes": [{"lemma": "Jan", "mstag": "PROPN", "disamb": true}]}, {"index": 4, "position": [16, 24], "orth": "Kowalski", "lexemes": [{"lemma": "Kowalski", "mstag": "PROPN", "disamb": true}]}, {"index": 5, "position": [25, 26], "orth": "i", "lexemes": [{"lemma": "i", "mstag": "CCONJ", "disamb": true}]}, {"index": 6, "position": [27, 35], "orth": "mieszkam", "lexemes": [{"lemma": "mieszkać", "mstag": "VERB", "disamb": true}]}, {"index": 7, "position": [36, 38], "orth": "we", "lexemes": [{"lemma": "w", "mstag": "ADP", "disamb": true}]}, {"index": 8, "position": [39, 48], "orth": "Wrocławiu", "lexemes": [{"lemma": "Wrocław", "mstag": "PROPN", "disamb": true}]}, {"index": 9, "position": [49, 49], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 10, "position": [50, 53], "orth": "Mój", "lexemes": [{"lemma": "mój", "mstag": "DET", "disamb": true}]}, {"index": 11, "position": [54, 62], "orth": "rodzinny", "lexemes": [{"lemma": "rodzinny", "mstag": "ADJ", "disamb": true}]}, {"index": 12, "position": [63, 66], "orth": "dom", "lexemes": [{"lemma": "dom", "mstag": "NOUN", "disamb": true}]}, {"index": 13, "position": [67, 73], "orth": "mieści", "lexemes": [{"lemma": "mieścić", "mstag": "VERB", "disamb": true}]}, {"index": 14, "position": [74, 77], "orth": "się", "lexemes": [{"lemma": "się", "mstag": "PRON", "disamb": true}]}, {"index": 15, "position": [78, 82], "orth": "przy", "lexemes": [{"lemma": "przy", "mstag": "ADP", "disamb": true}]}, {"index": 16, "position": [83, 88], "orth": "aleji", "lexemes": [{"lemma": "aleji", "mstag": "NOUN", "disamb": true}]}, {"index": 17, "position": [89, 93], "orth": 
"Jana", "lexemes": [{"lemma": "Jan", "mstag": "PROPN", "disamb": true}]}, {"index": 18, "position": [94, 99], "orth": "Pawła", "lexemes": [{"lemma": "Paweł", "mstag": "PROPN", "disamb": true}]}, {"index": 19, "position": [100, 102], "orth": "II", "lexemes": [{"lemma": "II", "mstag": "ADJ", "disamb": true}]}, {"index": 20, "position": [103, 103], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}], "text": "Nazywam się Jan Kowalski i mieszkam we Wrocławiu. Mój rodzinny dom mieści się przy aleji Jana Pawła II."}
\ No newline at end of file
--- a/tests/testdata/input/post_postagger_input
+++ b/tests/testdata/input/post_postagger_input
+Woda jest jedną z najpospolitszych substancji we Wszechświecie.
+Cząsteczka wody jest trzecią najbardziej rozpowszechnioną molekułą w ośrodku międzygwiazdowym, po cząsteczkowym wodorze i tlenku węgla. Jest również szeroko rozpowszechniona w Układzie Słonecznym: stanowi istotny element budowy Ceres i księżyców lodowych krążących wokół planet-olbrzymów, jako domieszka występuje w ich atmosferach, a przypuszcza się, że duże jej ilości znajdują się we wnętrzach tych planet. Jako lód występuje także na części planetoid, a zapewne również na obiektach transneptunowych. Woda jest bardzo rozpowszechniona także na powierzchni Ziemi. Występuje głównie w oceanach, które pokrywają 70,8% powierzchni globu, ale także w rzekach, jeziorach i w postaci stałej w lodowcach. Część wody znajduje się w atmosferze (chmury, para wodna). Niektóre związki chemiczne zawierają cząsteczki wody w swojej budowie (hydraty – określa się ją wówczas mianem wody krystalizacyjnej). Zawartość wody włączonej w strukturę minerałów w płaszczu Ziemi może przekraczać łączną zawartość wody w oceanach i innych zbiornikach powierzchniowych nawet dziesięciokrotnie. 
+Woda występująca w przyrodzie jest roztworem soli i gazów. Najwięcej soli mineralnych zawiera woda morska i wody mineralne; najmniej woda z opadów atmosferycznych. Wodę o małej zawartości składników mineralnych nazywamy wodą miękką, natomiast zawierającą znaczne ilości soli wapnia i magnezu – wodą twardą. Oprócz tego wody naturalne zawierają rozpuszczone substancje pochodzenia organicznego, np. mocznik, kwasy humusowe itp.
--- a/tests/testdata/input/post_spacy_input
+++ b/tests/testdata/input/post_spacy_input
+When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.