Skip to content
Snippets Groups Projects
Commit c86119bb authored by Paweł Walkowiak's avatar Paweł Walkowiak
Browse files

Save multiple labels in doc

parent 497a8f70
No related merge requests found
Pipeline #14671 passed with stages
in 12 minutes and 2 seconds
......@@ -23,10 +23,10 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
"""
def process(
self,
input_path: str,
task_options: dict[str, str | int | float],
output_path: str,
self,
input_path: str,
task_options: dict[str, str | int | float],
output_path: str,
) -> None:
"""Called for each request made to the worker.
......@@ -58,7 +58,20 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
labels_path = labels_path_list[0]
labels_name_list = [Path(labels_path).name.split(".")[0]]
try:
with clarin_json.open(input_path, "r") as fin:
documents_in = fin.read_all()
documents = [{"text": doc.text} for doc in documents_in]
except json.decoder.JSONDecodeError:
with open(input_path, "r") as fin:
text = fin.read()
documents_in = clarin_json.Document(
id=str(uuid4()), text=text
)
documents = [{"text": text}]
for labels_path, labels_dict_name in zip(labels_path_list, labels_name_list):
result_documents = []
# Load labels
with open(labels_path, "r", encoding="utf-8") as json_file:
labels = json.load(json_file)
......@@ -71,55 +84,27 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
**task_options,
)
with clarin_json.open(output_path, "w", ensure_ascii=False) as fout:
try:
with clarin_json.open(input_path, "r") as fin:
documents_in = fin.read_all()
documents = [{"text": doc.text} for doc in documents_in]
documents_out = matcher.match_docs(documents)
for doc_in, doc_out in zip(documents_in, documents_out):
doc_in.set_spans(
[
Span.from_dict(
{
"id": idx,
"start": start_idx,
"stop": end_idx,
"class": class_name,
}
)
for idx, (
start_idx,
end_idx,
class_name,
) in enumerate(doc_out["label"])
],
"easymatcher-{}".format(labels_dict_name),
)
fout.write(doc_in)
except json.decoder.JSONDecodeError:
# plik tekstowy
with open(input_path, "r") as fin:
text = fin.read()
document_out = matcher.match(text)
em_spans = [
Span.from_dict(
{
"id": idx,
"start": start_idx,
"stop": end_idx,
"class": class_name,
}
)
for idx, (start_idx, end_idx, class_name) in enumerate(
document_out
)
]
doc = clarin_json.Document(
id=str(uuid4()), text=text, spans=em_spans
) # zapis spanów z easymacther-a
fout.write(doc)
documents_out = matcher.match_docs(documents)
for doc_in, doc_out in zip(documents_in, documents_out):
doc_in.set_spans(
[
Span.from_dict(
{
"id": idx,
"start": start_idx,
"stop": end_idx,
"class": class_name,
}
)
for idx, (
start_idx,
end_idx,
class_name,
) in enumerate(doc_out["label"])
],
"easymatcher-{}".format(labels_dict_name),
)
result_documents.append(doc_in)
documents_in = result_documents
with clarin_json.open(output_path, "w", ensure_ascii=False) as fout:
fout.write(documents_in)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment