Skip to content
Snippets Groups Projects
Commit 077614de authored by Paweł Walkowiak
Browse files

Merge branch 'json2lemmas' into 'master'

Json2lemmas

See merge request !2
parents fdebbc6f e9779d57
No related branches found
No related tags found
1 merge request!2Json2lemmas
Pipeline #10581 failed
...@@ -5,3 +5,4 @@ ...@@ -5,3 +5,4 @@
*__pycache__ *__pycache__
htmlcov htmlcov
config-test.ini config-test.ini
tests/testdata/output/postagger_lone_json/
{ {
"posconverter_lone": {"task": [{"posconverter": {"input_format": "json", "output_format": "json", "input_tagset": "ud", "output_tagset": "identical", "json_text": false}}], "input": "remove_json_text.json", "expected": "remove_json_text.json"}, "posconverter_lone": {"task": [{"posconverter": {"input_format": "json", "output_format": "json", "input_tagset": "ud", "output_tagset": "identical", "json_text": false}}], "input": "remove_json_text.json", "expected": "remove_json_text.json"},
"posconverter_lone_json2lemmas": {"task": [{"posconverter": {"input_format": "json", "output_format": "lemmas", "input_tagset": "ud", "output_tagset": "identical", "json_text": false}}], "input": "remove_json_text.json", "expected": "remove_json_text_lemmas.txt"},
"post_spacy_tagger_json_text": {"task": [{"spacy": {"lang": "en", "method": "tagger"}}, {"posconverter": {"input_format": "json", "output_format": "json", "input_tagset": "ud", "output_tagset": "identical", "json_text": true}}], "input": "post_spacy_input", "expected": "post_spacy_tagger_json_text_expected.json"}, "post_spacy_tagger_json_text": {"task": [{"spacy": {"lang": "en", "method": "tagger"}}, {"posconverter": {"input_format": "json", "output_format": "json", "input_tagset": "ud", "output_tagset": "identical", "json_text": true}}], "input": "post_spacy_input", "expected": "post_spacy_tagger_json_text_expected.json"},
......
...@@ -36,6 +36,27 @@ def ccl_2_lemmas(input_file, output_file): ...@@ -36,6 +36,27 @@ def ccl_2_lemmas(input_file, output_file):
f.write("\n") f.write("\n")
def json2lemmas(input_file, output_file):
    """Implementation of lemmas extracting function for json input.

    Reads ``input_file`` as JSON, collects the ``lemma`` value of every
    entry in every token's ``lexemes`` list (in document order) and writes
    them to ``output_file`` as a single line: lemmas separated by one space
    and terminated by a newline. A document with no lexemes yields a file
    containing only the newline.

    :param input_file: path to a json file with words lemmas
        (NOTE(review): originally described as output of MorphoDiTa or
        Wcrft2 -- confirm this still holds for json input)
    :type input_file: str
    :param output_file: path to resulting text file with words in lemma forms
    :type output_file: str
    :raises KeyError: if the document has no ``tokens`` key, a token has no
        ``lexemes`` key, or a lexeme has no ``lemma`` key
    """
    with open(input_file, 'rt', encoding='utf-8') as f:
        data = json.load(f)
    # Gather all lemmas before touching the output path, so that malformed
    # input raises without creating or truncating ``output_file``.
    words = [
        lexem['lemma']
        for token in data['tokens']
        for lexem in token['lexemes']
    ]
    with open(output_file, 'wt', encoding='utf-8') as f:
        f.write(" ".join(words))
        f.write("\n")
def _convert_tagset_in_token(token, tagset): def _convert_tagset_in_token(token, tagset):
"""Function converting mstag in token (nkjp one) to 'tagset'. """Function converting mstag in token (nkjp one) to 'tagset'.
...@@ -162,6 +183,8 @@ class ConverterWorker(nlp_ws.NLPWorker): ...@@ -162,6 +183,8 @@ class ConverterWorker(nlp_ws.NLPWorker):
shutil.copyfile(input_path, output_path) shutil.copyfile(input_path, output_path)
elif input_format == "ccl" and output_format == "lemmas": elif input_format == "ccl" and output_format == "lemmas":
ccl_2_lemmas(input_path, output_path) ccl_2_lemmas(input_path, output_path)
elif input_format == "json" and output_format == "lemmas":
json2lemmas(input_path, output_path)
elif input_format == "ccl" and output_format == "json": elif input_format == "ccl" and output_format == "json":
tagset = "nkjp" tagset = "nkjp"
if output_tagset == "ud": if output_tagset == "ud":
......
Nazywam się Jan Kowalski i mieszkać w Wrocław . mój rodzinny dom mieścić się przy aleji Jan Paweł II .
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment