Fix style

508ecbb0 · Konrad Wojtasik · 0abbdb13 · 508ecbb0 · 508ecbb0
Commit 508ecbb0 authored Oct 27, 2023 by Konrad Wojtasik
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -2,17 +2,14 @@
 import json
 import logging
-import os
 from io import UnsupportedOperation
 from pathlib import Path
-from tempfile import NamedTemporaryFile
 import nlp_ws
 import clarin_json
 from clarin_json import Span
 from easymatcher.models import MatrixMatcher
-from easymatcher.utils import load_data
 _log = logging.getLogger(__name__)
@@ -24,40 +21,6 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
    https://gitlab.clarin-pl.eu/knowledge-extraction/tools/easymatcher
    """
-    # @staticmethod
-    # def is_jsonl(
-    #         document_path: str | Path
-    # ) -> bool:
-    #     """Validates whether text file has json/jsonl structure."""
-    #     try:
-    #         with open(document_path, 'r', encoding="utf-8") as file:
-    #             for line in file:
-    #                 json_obj = json.loads(line)
-    #                 if "text" not in json_obj:
-    #                     return False
-    #         return True
-    #     except (json.JSONDecodeError, FileNotFoundError):
-    #         return False
-    # @staticmethod
-    # def prepare_and_append_document(
-    #         file_path: str | Path, document_path: str | Path
-    # ) -> None:
-    #     """Formats and appends plain texts into jsonl file."""
-    #     document = {}
-    #     if EasymatcherWorker.is_jsonl(document_path):
-    #         with open(file_path, "a", encoding="utf-8") as _f:
-    #             with open(document_path, "r", encoding="utf-8") as _df:
-    #                 for line in _df:
-    #                     line_data = json.loads(line)
-    #                     _f.write(json.dumps(line_data) + "\n")
-    #     else:
-    #         with open(document_path, "r", encoding="utf-8") as _df:
-    #             document["text"] = _df.read()
-    #         with open(file_path, "a", encoding="utf-8") as _f:
-    #             _f.write(json.dumps(document) + "\n")
    def process(
        self,
        input_path: str,
@@ -96,12 +59,13 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
        # Get labels filename
        labels_dict_name = Path(labels_path).name.split(".")[0]
+        matcher = MatrixMatcher(
-        matcher = MatrixMatcher(labels["labels"],
+            labels["labels"],
            multiple_labels=True,
            sim_threshold=0.8,
            tfidf_ngram_range=(2, 3),
-                                **task_options)
+            **task_options,
+        )
        with clarin_json.open(output_path, "w", ensure_ascii=False) as fout:
            with clarin_json.open(input_path, "r") as fin:
@@ -111,10 +75,20 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
                for doc_in, doc_out in zip(documents_in, documents_out):
                    doc_in.set_spans(
-                        [Span.from_dict({"id": idx, "start": start_idx, "stop": end_idx, "class": class_name}) 
+                        [
-                        for idx, (start_idx, end_idx, class_name) 
+                            Span.from_dict(
-                        in enumerate(doc_out['label'])],
+                                {
-                        "easymatcher-{}".format(labels_dict_name)
+                                    "id": idx,
+                                    "start": start_idx,
+                                    "stop": end_idx,
+                                    "class": class_name,
+                                }
+                            )
+                            for idx, (start_idx, end_idx, class_name) in enumerate(
+                                doc_out["label"]
+                            )
+                        ],
+                        "easymatcher-{}".format(labels_dict_name),
                    )
                    fout.write(doc_in)
--- a/tox.ini
+++ b/tox.ini
@@ -40,7 +40,7 @@ ignore = W504
 show-source = True
 exclude = .git,.venv,.tox,dist,doc,*egg,build,venv,tests,.vscode
 import-order-style = pep8
-max-line-length = 80
+max-line-length = 120
 [pydocstyle]