From 73c0b676c4a0ba5efb8f4d363a77b3e1c0d8f667 Mon Sep 17 00:00:00 2001
From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com>
Date: Mon, 26 Jun 2023 17:43:37 +0200
Subject: [PATCH] Add feature to check whether document has a valid json/jsonl
 format, add tests

---
 src/easymatcher_worker.py                     | 20 +++++++++++++------
 ...crete.jsonl => document_with_concrete.txt} |  0
 tests/fixtures/example_data.py                |  2 +-
 tests/worker/test_worker.py                   |  2 +-
 4 files changed, 16 insertions(+), 8 deletions(-)
 rename tests/example_data/input/{document_with_concrete.jsonl => document_with_concrete.txt} (100%)

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index ef85324..8d79c8d 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -20,6 +20,18 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
     It relies on the use of an easymatcher tool which can be found he under -
     https://gitlab.clarin-pl.eu/knowledge-extraction/tools/easymatcher
     """
+    @staticmethod
+    def is_jsonl(document_path: str | Path) -> bool:
+        """Return True if every line of the file is a JSON object with a "text" key."""
+        try:
+            with open(document_path, "r", encoding="utf-8") as file:
+                for line in file:
+                    json_obj = json.loads(line)
+                    if not isinstance(json_obj, dict) or "text" not in json_obj:
+                        return False
+            return True
+        except (ValueError, FileNotFoundError):  # JSON/Unicode decode errors too
+            return False
 
     @staticmethod
     def prepare_and_append_document(
@@ -27,7 +39,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
     ) -> None:
         """Formats and appends plain texts into jsonl file."""
         document = {}
-        if str(document_path).endswith(".jsonl"):
+        if EasymatcherWorker.is_jsonl(document_path):
             with open(file_path, "a", encoding="utf-8") as _f:
                 with open(document_path, "r", encoding="utf-8") as _df:
                     for line in _df:
@@ -77,11 +89,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
 
         if os.path.isdir(input_path):
             for file in os.listdir(input_path):
-                if file.endswith(".jsonl"):
-                    EasymatcherWorker.prepare_and_append_document(
-                        tmpf.name, Path(input_path) / file
-                    )
-                elif file.endswith(".txt"):
+                if file.endswith(".txt"):
                     EasymatcherWorker.prepare_and_append_document(
                         tmpf.name, Path(input_path) / file
                     )
diff --git a/tests/example_data/input/document_with_concrete.jsonl b/tests/example_data/input/document_with_concrete.txt
similarity index 100%
rename from tests/example_data/input/document_with_concrete.jsonl
rename to tests/example_data/input/document_with_concrete.txt
diff --git a/tests/fixtures/example_data.py b/tests/fixtures/example_data.py
index fb78e54..8a76ed0 100644
--- a/tests/fixtures/example_data.py
+++ b/tests/fixtures/example_data.py
@@ -36,7 +36,7 @@ def example_document_path(input_dir: Path, request) -> Path:
 
 @pytest.fixture
 def example_document_path_jsonl(input_dir: Path, request) -> Path:
-    return input_dir / "document_with_concrete.jsonl"
+    return input_dir / "document_with_concrete.txt"
 
 @pytest.fixture
 def example_labels_path_jsonl(input_dir: Path) -> Path:
diff --git a/tests/worker/test_worker.py b/tests/worker/test_worker.py
index 89fb6d0..1149147 100644
--- a/tests/worker/test_worker.py
+++ b/tests/worker/test_worker.py
@@ -51,7 +51,7 @@ def test_easymatcher_process_folder(
     check_and_cleanup(output_path, expected_path)
 
 
-def test_easymatcher_process_jsonl_document(
+def test_easymatcher_process_jsonl_document_structure(
     worker: EasymatcherWorker,
     example_document_path_jsonl: Path,
     example_labels_path_jsonl: Path,
-- 
GitLab