Skip to content
Snippets Groups Projects
Commit c86119bb authored by Paweł Walkowiak's avatar Paweł Walkowiak
Browse files

Save multiple labels in doc

parent 497a8f70
No related merge requests found
Pipeline #14671 passed with stages
in 12 minutes and 2 seconds
......@@ -23,10 +23,10 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
"""
def process(
self,
input_path: str,
task_options: dict[str, str | int | float],
output_path: str,
self,
input_path: str,
task_options: dict[str, str | int | float],
output_path: str,
) -> None:
"""Called for each request made to the worker.
......@@ -58,7 +58,20 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
labels_path = labels_path_list[0]
labels_name_list = [Path(labels_path).name.split(".")[0]]
try:
with clarin_json.open(input_path, "r") as fin:
documents_in = fin.read_all()
documents = [{"text": doc.text} for doc in documents_in]
except json.decoder.JSONDecodeError:
with open(input_path, "r") as fin:
text = fin.read()
documents_in = clarin_json.Document(
id=str(uuid4()), text=text
)
documents = [{"text": text}]
for labels_path, labels_dict_name in zip(labels_path_list, labels_name_list):
result_documents = []
# Load labels
with open(labels_path, "r", encoding="utf-8") as json_file:
labels = json.load(json_file)
......@@ -71,55 +84,27 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
**task_options,
)
with clarin_json.open(output_path, "w", ensure_ascii=False) as fout:
try:
with clarin_json.open(input_path, "r") as fin:
documents_in = fin.read_all()
documents = [{"text": doc.text} for doc in documents_in]
documents_out = matcher.match_docs(documents)
for doc_in, doc_out in zip(documents_in, documents_out):
doc_in.set_spans(
[
Span.from_dict(
{
"id": idx,
"start": start_idx,
"stop": end_idx,
"class": class_name,
}
)
for idx, (
start_idx,
end_idx,
class_name,
) in enumerate(doc_out["label"])
],
"easymatcher-{}".format(labels_dict_name),
)
fout.write(doc_in)
except json.decoder.JSONDecodeError:
# plik tekstowy
with open(input_path, "r") as fin:
text = fin.read()
document_out = matcher.match(text)
em_spans = [
Span.from_dict(
{
"id": idx,
"start": start_idx,
"stop": end_idx,
"class": class_name,
}
)
for idx, (start_idx, end_idx, class_name) in enumerate(
document_out
)
]
doc = clarin_json.Document(
id=str(uuid4()), text=text, spans=em_spans
) # zapis spanów z easymacther-a
fout.write(doc)
documents_out = matcher.match_docs(documents)
for doc_in, doc_out in zip(documents_in, documents_out):
doc_in.set_spans(
[
Span.from_dict(
{
"id": idx,
"start": start_idx,
"stop": end_idx,
"class": class_name,
}
)
for idx, (
start_idx,
end_idx,
class_name,
) in enumerate(doc_out["label"])
],
"easymatcher-{}".format(labels_dict_name),
)
result_documents.append(doc_in)
documents_in = result_documents
with clarin_json.open(output_path, "w", ensure_ascii=False) as fout:
fout.write(documents_in)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment