From 73c0b676c4a0ba5efb8f4d363a77b3e1c0d8f667 Mon Sep 17 00:00:00 2001
From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com>
Date: Mon, 26 Jun 2023 17:43:37 +0200
Subject: [PATCH] Add feature to check whether document has a valid json/jsonl format, add tests

---
Review notes (this section is ignored by git-am):
* is_jsonl now requires every line to decode to a JSON object. The first
  version evaluated `"text" not in json_obj` on any decoded value, which
  raises TypeError for scalars such as `5` and does a substring match on
  JSON strings such as `"some text"`.
* Hunk offsets and the diffstat were recomputed for the longer method; the
  post-image blob id on the index line still belongs to the first version.
* Line breaks and indentation were restored from a whitespace-collapsed
  copy of this patch. Context-line whitespace and the wrap point of the
  class docstring are a best guess; check them against the tree before
  applying.

 src/easymatcher_worker.py                     | 25 ++++++++++++++-----
 ...crete.jsonl => document_with_concrete.txt} |  0
 tests/fixtures/example_data.py                |  2 +-
 tests/worker/test_worker.py                   |  2 +-
 4 files changed, 21 insertions(+), 8 deletions(-)
 rename tests/example_data/input/{document_with_concrete.jsonl => document_with_concrete.txt} (100%)

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index ef85324..8d79c8d 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -20,6 +20,23 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
     It relies on the use of an easymatcher tool which can be found he under
     - https://gitlab.clarin-pl.eu/knowledge-extraction/tools/easymatcher
     """
+    @staticmethod
+    def is_jsonl(document_path: str | Path) -> bool:
+        """Check whether every line is a JSON object with a "text" key.
+
+        Returns False when the file is missing or any line fails to parse.
+        """
+        try:
+            with open(document_path, "r", encoding="utf-8") as file:
+                for line in file:
+                    json_obj = json.loads(line)
+                    # json.loads also yields scalars and lists, where
+                    # `in` raises TypeError or substring-matches.
+                    if not isinstance(json_obj, dict) or "text" not in json_obj:
+                        return False
+            return True
+        except (json.JSONDecodeError, FileNotFoundError):
+            return False
 
     @staticmethod
     def prepare_and_append_document(
@@ -27,7 +44,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
     ) -> None:
         """Formats and appends plain texts into jsonl file."""
         document = {}
-        if str(document_path).endswith(".jsonl"):
+        if EasymatcherWorker.is_jsonl(document_path):
             with open(file_path, "a", encoding="utf-8") as _f:
                 with open(document_path, "r", encoding="utf-8") as _df:
                     for line in _df:
@@ -77,11 +94,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
 
             if os.path.isdir(input_path):
                 for file in os.listdir(input_path):
-                    if file.endswith(".jsonl"):
-                        EasymatcherWorker.prepare_and_append_document(
-                            tmpf.name, Path(input_path) / file
-                        )
-                    elif file.endswith(".txt"):
+                    if file.endswith(".txt"):
                         EasymatcherWorker.prepare_and_append_document(
                             tmpf.name, Path(input_path) / file
                         )
diff --git a/tests/example_data/input/document_with_concrete.jsonl b/tests/example_data/input/document_with_concrete.txt
similarity index 100%
rename from tests/example_data/input/document_with_concrete.jsonl
rename to tests/example_data/input/document_with_concrete.txt
diff --git a/tests/fixtures/example_data.py b/tests/fixtures/example_data.py
index fb78e54..8a76ed0 100644
--- a/tests/fixtures/example_data.py
+++ b/tests/fixtures/example_data.py
@@ -36,7 +36,7 @@ def example_document_path(input_dir: Path, request) -> Path:
 
 @pytest.fixture
 def example_document_path_jsonl(input_dir: Path, request) -> Path:
-    return input_dir / "document_with_concrete.jsonl"
+    return input_dir / "document_with_concrete.txt"
 
 @pytest.fixture
 def example_labels_path_jsonl(input_dir: Path) -> Path:
diff --git a/tests/worker/test_worker.py b/tests/worker/test_worker.py
index 89fb6d0..1149147 100644
--- a/tests/worker/test_worker.py
+++ b/tests/worker/test_worker.py
@@ -51,7 +51,7 @@ def test_easymatcher_process_folder(
     check_and_cleanup(output_path, expected_path)
 
 
-def test_easymatcher_process_jsonl_document(
+def test_easymatcher_process_jsonl_document_structure(
     worker: EasymatcherWorker,
     example_document_path_jsonl: Path,
     example_labels_path_jsonl: Path,
-- 
GitLab