Merge branch 'json2lemmas' into 'master'

Json2lemmas. See merge request !2.

Merge branch 'json2lemmas' into 'master'
077614de · Paweł Walkowiak · fdebbc6f · e9779d57 · 077614de · 077614de
Commit 077614de authored May 16, 2023 by Paweł Walkowiak
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@
 *__pycache__
 htmlcov
 config-test.ini
+tests/testdata/output/postagger_lone_json/
--- a/lpmn_queries.json
+++ b/lpmn_queries.json
 {
    "posconverter_lone": {"task": [{"posconverter": {"input_format": "json", "output_format": "json", "input_tagset": "ud", "output_tagset": "identical", "json_text": false}}], "input": "remove_json_text.json", "expected": "remove_json_text.json"},
+    "posconverter_lone_json2lemmas": {"task": [{"posconverter": {"input_format": "json", "output_format": "lemmas", "input_tagset": "ud", "output_tagset": "identical", "json_text": false}}], "input": "remove_json_text.json", "expected": "remove_json_text_lemmas.txt"},
    "post_spacy_tagger_json_text": {"task": [{"spacy": {"lang": "en", "method": "tagger"}}, {"posconverter": {"input_format": "json", "output_format": "json", "input_tagset": "ud", "output_tagset": "identical", "json_text": true}}], "input": "post_spacy_input", "expected": "post_spacy_tagger_json_text_expected.json"},

--- a/src/converter.py
+++ b/src/converter.py
@@ -36,6 +36,27 @@ def ccl_2_lemmas(input_file, output_file):
                f.write("\n")
+def json2lemmas(input_file, output_file):
+    """Extract lemmas from a JSON file and write them as one line of text.
+
+    Loads the JSON document, walks ``data['tokens']`` and, for every token,
+    collects the ``'lemma'`` of each entry in ``token['lexemes']``. All
+    lexemes of a token are kept, in the order they appear. The collected
+    lemmas are written to the output file separated by single spaces and
+    followed by a newline.
+
+    :param input_file: path to a JSON file with word lemmas,
+        output of MorphoDiTa or Wcrft2
+    :type input_file: str
+    :param output_file: path to the resulting text file with words in
+        lemma form
+    :type output_file: str
+    """
+    with open(input_file, 'rt', encoding='utf-8') as f:
+        data = json.load(f)
+    words = []
+    # The output file is opened for writing before the tokens are walked.
+    with open(output_file, 'wt', encoding='utf-8') as f:
+        for token in data['tokens']:
+            # Every lexeme of the token contributes its lemma.
+            for lexem in token['lexemes']:
+                words.append(lexem['lemma'])
+        f.write(" ".join(words))
+        f.write("\n")
 def _convert_tagset_in_token(token, tagset):
    """Function converting mstag in token (nkjp one) to 'tagset'.
@@ -162,6 +183,8 @@ class ConverterWorker(nlp_ws.NLPWorker):
            shutil.copyfile(input_path, output_path)
        elif input_format == "ccl" and output_format == "lemmas":
            ccl_2_lemmas(input_path, output_path)
+        elif input_format == "json" and output_format == "lemmas":
+            json2lemmas(input_path, output_path)
        elif input_format == "ccl" and output_format == "json":
            tagset = "nkjp"
            if output_tagset == "ud":

--- a/tests/testdata/expected/remove_json_text_lemmas.txt
+++ b/tests/testdata/expected/remove_json_text_lemmas.txt
+Nazywam się Jan Kowalski i mieszkać w Wrocław . mój rodzinny dom mieścić się przy aleji Jan Paweł II .