diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py index ef85324c8b3f58384adb6fc0ac3509a4539d7965..8d79c8dab9e7d4a4d4ac8653ed30b978c3cc6ad2 100644 --- a/src/easymatcher_worker.py +++ b/src/easymatcher_worker.py @@ -20,6 +20,18 @@ class EasymatcherWorker(nlp_ws.NLPWorker): It relies on the use of an easymatcher tool which can be found he under - https://gitlab.clarin-pl.eu/knowledge-extraction/tools/easymatcher """ + @staticmethod + def is_jsonl(document_path: str | Path) -> bool: + """Validates whether text file has json/jsonl structure and has "text" keyword""" + try: + with open(document_path, 'r', encoding="utf-8") as file: + for line in file: + json_obj = json.loads(line) + if "text" not in json_obj: + return False + return True + except (json.JSONDecodeError, FileNotFoundError): + return False @staticmethod def prepare_and_append_document( @@ -27,7 +39,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker): ) -> None: """Formats and appends plain texts into jsonl file.""" document = {} - if str(document_path).endswith(".jsonl"): + if EasymatcherWorker.is_jsonl(document_path): with open(file_path, "a", encoding="utf-8") as _f: with open(document_path, "r", encoding="utf-8") as _df: for line in _df: @@ -77,11 +89,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker): if os.path.isdir(input_path): for file in os.listdir(input_path): - if file.endswith(".jsonl"): - EasymatcherWorker.prepare_and_append_document( - tmpf.name, Path(input_path) / file - ) - elif file.endswith(".txt"): + if file.endswith(".txt"): EasymatcherWorker.prepare_and_append_document( tmpf.name, Path(input_path) / file ) diff --git a/tests/example_data/input/document_with_concrete.jsonl b/tests/example_data/input/document_with_concrete.txt similarity index 100% rename from tests/example_data/input/document_with_concrete.jsonl rename to tests/example_data/input/document_with_concrete.txt diff --git a/tests/fixtures/example_data.py b/tests/fixtures/example_data.py index 
@pytest.fixture
def example_document_path_jsonl(input_dir: Path, request) -> Path:
    """Return the path of the JSONL-structured sample document kept with a .txt suffix."""
    document_name = "document_with_concrete.txt"
    return input_dir.joinpath(document_name)