Skip to content
Snippets Groups Projects
Commit 47382d6e authored by Paweł Walkowiak's avatar Paweł Walkowiak
Browse files

Add no text option

parent ac7776f0
No related branches found
No related tags found
1 merge request: !1 "Add ci"
Pipeline #6875 passed
"""Implementation of ccl2json usage."""
import json
import pathlib
from xml.sax import handler, make_parser
from ccl2json.parse import CCLhandler
def ccl2json(path_in, path_out):
def ccl2json(path_in, options, path_out):
"""Function converting xml ccl to json.
:param path_in: path to xml ccl file
:type path_in: str
:param options: Dictionary containing task options of ccl2json
json_text: whether the json output should contain the original text (default = True)
:type options: dict
:param path_out: path to output json file
:type path_out: str
"""
......@@ -18,12 +22,13 @@ def ccl2json(path_in, path_out):
parser.setContentHandler(CCLhandler())
parser.parse(path_in)
with open(path_out, 'w', encoding='utf-8') as fout:
dout = {
"filename": path_in.split('/')[-1].replace('.ccl', ''),
'text': parser.getContentHandler().get_text(),
with open(path_out, 'w', encoding='utf-8') as file_out:
data_out = {
"filename": pathlib.Path(path_in).stem,
'tagset': 'nkjp',
'tokens': [token.as_dict() for token
in parser.getContentHandler().get_tokens()]
}
json.dump(dout, fout, ensure_ascii=False)
if 'json_text' not in options or options['json_text']:
data_out['text'] = parser.getContentHandler().get_text()
json.dump(data_out, file_out, ensure_ascii=False)
......@@ -41,6 +41,8 @@ class TaggerWorker(nlp_ws.NLPWorker):
:param task_options: Dictionary containing options of pos_tagger
lang: language of text (default = 'pl')
output: format of results (default = 'json', values: json, ccl, lemmas)
json_text: bool, whether the json output should contain the original
text (default = True)
:type task_options: dict
:param output_path: Path to directory where the
......@@ -67,6 +69,6 @@ class TaggerWorker(nlp_ws.NLPWorker):
elif tagger_output == "ccl" and output == "lemmas":
lemmatizer.ccl_2_lemmas(l_result, output_path)
elif tagger_output == "ccl" and output == "json":
converter.ccl2json(l_result, output_path)
converter.ccl2json(l_result, task_options, output_path)
else:
raise Exception("Unsupported format conversion")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment