Quick fix: add a task option for label names and support multiple dictionaries at once

8fb2d227 · Konrad Wojtasik · 072cf37f · 8fb2d227 · 8fb2d227
Commit 8fb2d227 authored Nov 7, 2023 by Konrad Wojtasik
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -4,6 +4,7 @@ import json
 import logging
 from io import UnsupportedOperation
 from pathlib import Path
+from uuid import uuid4

 import nlp_ws
 import clarin_json
@@ -47,17 +48,22 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
          worker will store result json file.
        :type output_path: str
        """
-        if (labels_path := task_options.pop("labels_path", None)) is None:
+        if (labels_path_list := task_options.pop("labels_path", None)) is None:
            raise UnsupportedOperation(
                "'labels_path' should be passed with 'task_options'."
            )
        
+        if (labels_name_list := task_options.pop("labels_name", None)) is None:
+            # Get labels filename
+            labels_path = labels_path_list[0]
+            labels_name_list = [Path(labels_path).name.split(".")[0]]
+
+        for labels_path, labels_dict_name in zip(labels_path_list, labels_name_list):
+
            # Load labels
            with open(labels_path, "r", encoding="utf-8") as json_file:
                labels = json.load(json_file)

-        # Get labels filename
-        labels_dict_name = Path(labels_path).name.split(".")[0]

            matcher = MatrixMatcher(
                labels["labels"],
@@ -68,6 +74,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
            )

            with clarin_json.open(output_path, "w", ensure_ascii=False) as fout:
+                try:
                    with clarin_json.open(input_path, "r") as fin:
                        documents_in = fin.read_all()
                        documents = [{"text": doc.text} for doc in documents_in]
@@ -92,3 +99,21 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
                            )

                            fout.write(doc_in)
+                except json.decoder.JSONDecodeError:      
+                    # input is a plain-text file (not JSON)
+                    with open(input_path, 'r') as fin:
+                        text = fin.read()
+                    
+                        document_out = matcher.match(text)
+                        em_spans = [Span.from_dict(
+                                        {
+                                            "id": idx,
+                                            "start": start_idx,
+                                            "stop": end_idx,
+                                            "class": class_name,
+                                        }
+                                    )
+                            for idx, (start_idx, end_idx, class_name) in enumerate(document_out)]
+
+                        doc = clarin_json.Document(id=str(uuid4()), text=text, spans=em_spans) # store the spans produced by easymatcher
+                        fout.write(doc)
--- a/tests/worker/test_worker.py
+++ b/tests/worker/test_worker.py
@@ -30,7 +30,7 @@ def test_easymatcher_process_document(
    output_dir: Path,
    expected_dir: Path,
 ):
-    task_options = {"labels_path": example_labels_path}
+    task_options = {"labels_path": [example_labels_path]}
    output_path = output_dir / f"{example_document_path.stem}.jsonl"
    expected_path = expected_dir / f"{example_document_path.stem}.jsonl"
    worker.process(example_document_path, task_options, output_path)
@@ -44,7 +44,7 @@ def test_easymatcher_process_documents(
    output_dir: Path,
    expected_dir: Path,
 ):
-    task_options = {"labels_path": example_labels_path}
+    task_options = {"labels_path": [example_labels_path]}
    output_path = output_dir / "documents.jsonl"
    expected_path = expected_dir / "documents.jsonl"
    worker.process(example_documents_path_jsonl, task_options, output_path)
@@ -58,7 +58,7 @@ def test_easymatcher_process_jsonl_document_structure(
    output_dir: Path,
    expected_dir: Path,
 ):
-    task_options = {"labels_path": example_labels_path_jsonl}
+    task_options = {"labels_path": [example_labels_path_jsonl]}
    output_path = output_dir / f"{example_document_path_jsonl.stem}.jsonl"
    expected_path = expected_dir / f"{example_document_path_jsonl.stem}.jsonl"
    worker.process(example_document_path_jsonl, task_options, output_path)