Skip to content
Snippets Groups Projects
Commit c86119bb authored by Paweł Walkowiak
Browse files

Save multiple labels in doc

parent 497a8f70
No related branches found
No related tags found
No related merge requests found
Pipeline #14671 passed
......@@ -58,7 +58,20 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
labels_path = labels_path_list[0]
labels_name_list = [Path(labels_path).name.split(".")[0]]
try:
with clarin_json.open(input_path, "r") as fin:
documents_in = fin.read_all()
documents = [{"text": doc.text} for doc in documents_in]
except json.decoder.JSONDecodeError:
with open(input_path, "r") as fin:
text = fin.read()
documents_in = clarin_json.Document(
id=str(uuid4()), text=text
)
documents = [{"text": text}]
for labels_path, labels_dict_name in zip(labels_path_list, labels_name_list):
result_documents = []
# Load labels
with open(labels_path, "r", encoding="utf-8") as json_file:
labels = json.load(json_file)
......@@ -71,13 +84,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
**task_options,
)
with clarin_json.open(output_path, "w", ensure_ascii=False) as fout:
try:
with clarin_json.open(input_path, "r") as fin:
documents_in = fin.read_all()
documents = [{"text": doc.text} for doc in documents_in]
documents_out = matcher.match_docs(documents)
for doc_in, doc_out in zip(documents_in, documents_out):
doc_in.set_spans(
[
......@@ -97,29 +104,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
],
"easymatcher-{}".format(labels_dict_name),
)
fout.write(doc_in)
except json.decoder.JSONDecodeError:
# plik tekstowy
with open(input_path, "r") as fin:
text = fin.read()
document_out = matcher.match(text)
em_spans = [
Span.from_dict(
{
"id": idx,
"start": start_idx,
"stop": end_idx,
"class": class_name,
}
)
for idx, (start_idx, end_idx, class_name) in enumerate(
document_out
)
]
doc = clarin_json.Document(
id=str(uuid4()), text=text, spans=em_spans
) # zapis spanów z easymacther-a
fout.write(doc)
result_documents.append(doc_in)
documents_in = result_documents
with clarin_json.open(output_path, "w", ensure_ascii=False) as fout:
fout.write(documents_in)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment