Skip to content
Snippets Groups Projects
Commit 8fb2d227 authored by Konrad Wojtasik's avatar Konrad Wojtasik
Browse files

Quick fix with adding task option for labels name and multiple dictionaries at once

parent 072cf37f
No related merge requests found
Pipeline #14660 failed with stages
in 2 minutes and 50 seconds
......@@ -4,6 +4,7 @@ import json
import logging
from io import UnsupportedOperation
from pathlib import Path
from uuid import uuid4
import nlp_ws
import clarin_json
......@@ -47,48 +48,72 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
worker will store result json file.
:type output_path: str
"""
if (labels_path := task_options.pop("labels_path", None)) is None:
if (labels_path_list := task_options.pop("labels_path", None)) is None:
raise UnsupportedOperation(
"'labels_path' should be passed with 'task_options'."
)
if (labels_name_list := task_options.pop("labels_name", None)) is None:
# Get labels filename
labels_path = labels_path_list[0]
labels_name_list = [Path(labels_path).name.split(".")[0]]
# Load labels
with open(labels_path, "r", encoding="utf-8") as json_file:
labels = json.load(json_file)
# Get labels filename
labels_dict_name = Path(labels_path).name.split(".")[0]
matcher = MatrixMatcher(
labels["labels"],
multiple_labels=True,
sim_threshold=0.8,
tfidf_ngram_range=(2, 3),
**task_options,
)
with clarin_json.open(output_path, "w", ensure_ascii=False) as fout:
with clarin_json.open(input_path, "r") as fin:
documents_in = fin.read_all()
documents = [{"text": doc.text} for doc in documents_in]
documents_out = matcher.match_docs(documents)
for doc_in, doc_out in zip(documents_in, documents_out):
doc_in.set_spans(
[
Span.from_dict(
{
"id": idx,
"start": start_idx,
"stop": end_idx,
"class": class_name,
}
)
for idx, (start_idx, end_idx, class_name) in enumerate(
doc_out["label"]
for labels_path, labels_dict_name in zip(labels_path_list, labels_name_list):
# Load labels
with open(labels_path, "r", encoding="utf-8") as json_file:
labels = json.load(json_file)
matcher = MatrixMatcher(
labels["labels"],
multiple_labels=True,
sim_threshold=0.8,
tfidf_ngram_range=(2, 3),
**task_options,
)
with clarin_json.open(output_path, "w", ensure_ascii=False) as fout:
try:
with clarin_json.open(input_path, "r") as fin:
documents_in = fin.read_all()
documents = [{"text": doc.text} for doc in documents_in]
documents_out = matcher.match_docs(documents)
for doc_in, doc_out in zip(documents_in, documents_out):
doc_in.set_spans(
[
Span.from_dict(
{
"id": idx,
"start": start_idx,
"stop": end_idx,
"class": class_name,
}
)
for idx, (start_idx, end_idx, class_name) in enumerate(
doc_out["label"]
)
],
"easymatcher-{}".format(labels_dict_name),
)
],
"easymatcher-{}".format(labels_dict_name),
)
fout.write(doc_in)
fout.write(doc_in)
except json.decoder.JSONDecodeError:
# plain-text file (input could not be parsed as JSON)
with open(input_path, 'r') as fin:
text = fin.read()
document_out = matcher.match(text)
em_spans = [Span.from_dict(
{
"id": idx,
"start": start_idx,
"stop": end_idx,
"class": class_name,
}
)
for idx, (start_idx, end_idx, class_name) in enumerate(document_out)]
doc = clarin_json.Document(id=str(uuid4()), text=text, spans=em_spans) # zapis spanów z easymacther-a
fout.write(doc)
......@@ -30,7 +30,7 @@ def test_easymatcher_process_document(
output_dir: Path,
expected_dir: Path,
):
task_options = {"labels_path": example_labels_path}
task_options = {"labels_path": [example_labels_path]}
output_path = output_dir / f"{example_document_path.stem}.jsonl"
expected_path = expected_dir / f"{example_document_path.stem}.jsonl"
worker.process(example_document_path, task_options, output_path)
......@@ -44,7 +44,7 @@ def test_easymatcher_process_documents(
output_dir: Path,
expected_dir: Path,
):
task_options = {"labels_path": example_labels_path}
task_options = {"labels_path": [example_labels_path]}
output_path = output_dir / "documents.jsonl"
expected_path = expected_dir / "documents.jsonl"
worker.process(example_documents_path_jsonl, task_options, output_path)
......@@ -58,7 +58,7 @@ def test_easymatcher_process_jsonl_document_structure(
output_dir: Path,
expected_dir: Path,
):
task_options = {"labels_path": example_labels_path_jsonl}
task_options = {"labels_path": [example_labels_path_jsonl]}
output_path = output_dir / f"{example_document_path_jsonl.stem}.jsonl"
expected_path = expected_dir / f"{example_document_path_jsonl.stem}.jsonl"
worker.process(example_document_path_jsonl, task_options, output_path)
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment