From ca09c49e73b4bdcd5321c2af92313e13d48bc6d5 Mon Sep 17 00:00:00 2001 From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com> Date: Mon, 19 Jun 2023 15:17:16 +0200 Subject: [PATCH 01/13] Add option to read jsonl files --- src/easymatcher_worker.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py index e30be43..19da113 100644 --- a/src/easymatcher_worker.py +++ b/src/easymatcher_worker.py @@ -23,21 +23,30 @@ class EasymatcherWorker(nlp_ws.NLPWorker): @staticmethod def prepare_and_append_document( - file_path: str | Path, document_path: str | Path + file_path: str | Path, document_path: str | Path ) -> None: """Formats and appends plain texts into jsonl file.""" document = {} - with open(document_path, "r", encoding="utf-8") as _df: - document["text"] = _df.read() + if document_path.endswith(".jsonl"): + with open(file_path, "a", encoding="utf-8") as _f: + with open(document_path, "r", encoding="utf-8") as _df: + for line in _df: + line_data = json.loads(line) + document['text'] = line_data['text'] + document['label'] = [] + _f.write(json.dumps(document) + "\n") + else: + with open(document_path, "r", encoding="utf-8") as _df: + document["text"] = _df.read() - with open(file_path, "a", encoding="utf-8") as _f: - _f.write(f"{json.dumps(document)}\n") + with open(file_path, "a", encoding="utf-8") as _f: + _f.write(f"{json.dumps(document)}\n") def process( - self, - input_path: str, - task_options: dict[str, str | int | float], - output_path: str, + self, + input_path: str, + task_options: dict[str, str | int | float], + output_path: str, ) -> None: """Called for each request made to the worker. @@ -68,7 +77,11 @@ class EasymatcherWorker(nlp_ws.NLPWorker): if os.path.isdir(input_path): for file in os.listdir(input_path): - if file.endswith(".txt"): + if file.endswith(".jsonl"): + EasymatcherWorker.prepare_and_append_document( + tmpf.name, Path(input_path) / file + ) + elif file.endswith(".txt"): EasymatcherWorker.prepare_and_append_document( tmpf.name, Path(input_path) / file ) -- GitLab From d199d0fb83c40022b8ea53eb4f1d7f0a3dfd250f Mon Sep 17 00:00:00 2001 From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com> Date: Mon, 19 Jun 2023 15:28:20 +0200 Subject: [PATCH 02/13] Change output type from json txt to jsonl --- src/easymatcher_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py index 19da113..8f6e9b2 100644 --- a/src/easymatcher_worker.py +++ b/src/easymatcher_worker.py @@ -27,7 +27,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker): ) -> None: """Formats and appends plain texts into jsonl file.""" document = {} - if document_path.endswith(".jsonl"): + if str(document_path).endswith(".jsonl"): with open(file_path, "a", encoding="utf-8") as _f: with open(document_path, "r", encoding="utf-8") as _df: for line in _df: -- GitLab From b736bb6309c876338590e651de4b8045005cec6d Mon Sep 17 00:00:00 2001 From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com> Date: Mon, 19 Jun 2023 15:47:26 +0200 Subject: [PATCH 03/13] Add option to read jsonl files --- src/easymatcher_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py index 8f6e9b2..19da113 100644 --- a/src/easymatcher_worker.py +++ b/src/easymatcher_worker.py @@ -27,7 +27,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker): ) -> None: """Formats and appends plain texts into jsonl file.""" document = {} - if str(document_path).endswith(".jsonl"): + if document_path.endswith(".jsonl"): with open(file_path, "a", encoding="utf-8") as _f: with open(document_path, "r", encoding="utf-8") as _df: for line in _df: -- GitLab From eb91833937f208201a3690dd9e54fd3f21dd0ad4 Mon Sep 17 00:00:00 2001 From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com> Date: Mon, 19 Jun 2023 15:56:26 +0200 Subject: [PATCH 04/13] Add option to read jsonl files --- src/easymatcher_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py index 19da113..8f6e9b2 100644 --- a/src/easymatcher_worker.py +++ b/src/easymatcher_worker.py @@ -27,7 +27,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker): ) -> None: """Formats and appends plain texts into jsonl file.""" document = {} - if document_path.endswith(".jsonl"): + if str(document_path).endswith(".jsonl"): with open(file_path, "a", encoding="utf-8") as _f: with open(document_path, "r", encoding="utf-8") as _df: for line in _df: -- GitLab From a0865827253143f5c4b1678a4962f17df743983b Mon Sep 17 00:00:00 2001 From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com> Date: Thu, 22 Jun 2023 10:44:37 +0200 Subject: [PATCH 05/13] Add tests to read jsonl files --- src/easymatcher_worker.py | 2 +- .../expected/document_with_concrete.jsonl | 1 + tests/example_data/expected/documents.jsonl | 1 + .../input/document_with_concrete.jsonl | 1 + .../input/labels_with_concrete.json | 332 ++++++++++++++++++ tests/fixtures/example_data.py | 9 + tests/worker/test_worker.py | 14 + 7 files changed, 359 insertions(+), 1 deletion(-) create mode 100644 tests/example_data/expected/document_with_concrete.jsonl create mode 100644 tests/example_data/input/document_with_concrete.jsonl create mode 100644 tests/example_data/input/labels_with_concrete.json diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py index 8f6e9b2..ef85324 100644 --- a/src/easymatcher_worker.py +++ b/src/easymatcher_worker.py @@ -33,7 +33,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker): for line in _df: line_data = json.loads(line) document['text'] = line_data['text'] - document['label'] = [] + # document['label'] = [] _f.write(json.dumps(document) + "\n") else: with open(document_path, "r", encoding="utf-8") as _df: diff --git a/tests/example_data/expected/document_with_concrete.jsonl b/tests/example_data/expected/document_with_concrete.jsonl new file mode 100644 index 0000000..cb94387 --- /dev/null +++ b/tests/example_data/expected/document_with_concrete.jsonl @@ -0,0 +1 @@ +{"text": "8703 Tarcza szlifierska Scanmaskin SC Elastic Metal 125 mm do betonu, lastryka i kamienia naturalnego (ziarnisto\u015b\u0107 30) Tarcza szlifierska SC Elastic Metal 125 mm marki Scanmaskin s\u0142u\u017cy do szlifowania betonu, lastryka i naturalnego kamienia. Powinna byc\u0301 stosowana przed na\u0142oz\u0307eniem cienkiej pow\u0142oki lub polerowaniem posadzki, aby usuna\u0328c\u0301 z powierzchni g\u0142e\u0328bokie zarysowania. Opis wariantu: Ziarnisto\u015b\u0107 30", "label": [[46, 51, "Metal"], [62, 68, "Beton"], [70, 78, "Lastryko"], [81, 89, "Kamie\u0144 naturalny"], [149, 154, "Metal"], [200, 206, "Beton"], [208, 216, "Lastryko"], [219, 239, "Kamie\u0144 naturalny"]]} diff --git a/tests/example_data/expected/documents.jsonl b/tests/example_data/expected/documents.jsonl index 1fce067..b34ea4d 100644 --- a/tests/example_data/expected/documents.jsonl +++ b/tests/example_data/expected/documents.jsonl @@ -1,2 +1,3 @@ {"text": "55calowy telewizor QLED Samsung QE55Q75TA Pozw\\u00f3l sobie na luksus wybieraj\\u0105c telewizor QLED Samsung QE55Q75TA kt\\u00f3ry zabierze Ci\\u0119 w \\u015bwiat wspania\\u0142ej rozrywki bez wychodzenia z domu Umozliwia Aktualizowanie oprogramowania przez USB oraz posiada Aktualizacje oprogramowania sprz\u0119towego przez Internet", "label": [[19, 23, "Typ TV"], [96, 100, "Typ TV"], [219, 258, "Aktualizowanie oprogramowania przez USB"], [272, 326, "Aktualizacja oprogramowania sprz\u0119towego przez Internet"]]} {"text": "55calowy telewizor QLED Samsung QE55Q75TA Pozw\\u00f3l sobie na luksus wybieraj\\u0105c telewizor QLED Samsung QE55Q75TA kt\\u00f3ry zabierze Ci\\u0119 w \\u015bwiat wspania\\u0142ej rozrywki bez wychodzenia z domu", "label": [[19, 23, "Typ TV"], [96, 100, "Typ TV"]]} +{"text": "8703 Tarcza szlifierska Scanmaskin SC Elastic Metal 125 mm do betonu, lastryka i kamienia naturalnego (ziarnisto\u015b\u0107 30) Tarcza szlifierska SC Elastic Metal 125 mm marki Scanmaskin s\u0142u\u017cy do szlifowania betonu, lastryka i naturalnego kamienia. Powinna byc\u0301 stosowana przed na\u0142oz\u0307eniem cienkiej pow\u0142oki lub polerowaniem posadzki, aby usuna\u0328c\u0301 z powierzchni g\u0142e\u0328bokie zarysowania. Opis wariantu: Ziarnisto\u015b\u0107 30", "label": []} diff --git a/tests/example_data/input/document_with_concrete.jsonl b/tests/example_data/input/document_with_concrete.jsonl new file mode 100644 index 0000000..0ac7d3f --- /dev/null +++ b/tests/example_data/input/document_with_concrete.jsonl @@ -0,0 +1 @@ +{"text": "8703 Tarcza szlifierska Scanmaskin SC Elastic Metal 125 mm do betonu, lastryka i kamienia naturalnego (ziarnistość 30) Tarcza szlifierska SC Elastic Metal 125 mm marki Scanmaskin służy do szlifowania betonu, lastryka i naturalnego kamienia. Powinna być stosowana przed nałożeniem cienkiej powłoki lub polerowaniem posadzki, aby usunąć z powierzchni głębokie zarysowania. Opis wariantu: Ziarnistość 30","label":[]} diff --git a/tests/example_data/input/labels_with_concrete.json b/tests/example_data/input/labels_with_concrete.json new file mode 100644 index 0000000..6b48c69 --- /dev/null +++ b/tests/example_data/input/labels_with_concrete.json @@ -0,0 +1,332 @@ +{ + "labels": { + "Asfalt": [ + "asfalt", + "takich jak asfalt", + "Asfalt" + ], + "Beton": [ + "idealna do betonu", + "beton", + "do kostki betonowej", + "Beton", + "beton" + ], + "Beton - miękki": [ + "twardy", + "średni", + "miękki", + "ścierny", + "od miękkich do bardzo twardych kruszyw" + ], + "Beton - ścierny": [ + "ścierny", + "od miękkich do bardzo" + ], + "Beton - średni": [ + "średni", + "od miękkich do bardzo twardych" + ], + "Beton - świeży": [ + "beton świeży", + "świeży beton", + "dla cięcia betonu świeżego", + "betonu próżniowego" + ], + "Beton - twardy": [ + "z betonem utwardzonym", + "bardzo twardych kruszyw betonowych", + "twardego betonu", + "Twardy beton", + "Beton twardy" + ], + "Beton - zbrojony": [ + "betonie zbrojonym", + "idealna do silnie zbrojonego betonu", + "Beton zbrojony", + "także zbrojnego", + "betonu zbrojonego", + "Beton zbrojny", + "beton zbrojny", + "Beton zbrojony", + "beton zbrojony" + ], + "Bloki ścienne": [ + "bloki ścienne" + ], + "Bloki wapienne": [ + "do bloków wapiennych", + "Blok wapienny" + ], + "Cegła": [ + "cegł", + "cegły", + "cegła", + "Cegła", + "cegła", + "cegłą", + "cegły", + "Cegła wapienno-piaskowa" + ], + "Ceramika": [ + "ceramika", + "ceramiki", + "płytki ceramiczne", + "glazurowana ceramika", + "tarcza do płytek", + "twarde płytki ceramiczne", + "płytki ceramiczne", + "Ceramika", + "ceramika" + ], + "Chrommagnezyt": [ + "Chrommagnezyt" + ], + "Dachówka": [ + "Dachówka cementowa", + "dachówek betonowych", + "dachówka" + ], + "Drewno": [ + "Drewno", + "drewno" + ], + "Fuga murarska": [ + "Fuga murarska", + "Fugi murarskie" + ], + "Glazura": [ + "do cięcia glazury" + ], + "Gnejs": [ + "gnejsu", + "gnejs" + ], + "Grafit": [ + "grafit" + ], + "Granit": [ + "Cięcie takich materiałów jak granit", + "marmur", + "naturalny kamień jest dopuszczalne przy użyciu opisywanej tarczy", + "granit", + "granitu", + "rawężniki granitowe", + "granir", + "Granit" + ], + "Gres": [ + "Gres" + ], + "Jastrych": [ + "jastrych" + ], + "Kamień naturalny": [ + "kamienia", + "naturalnego kamienia", + "kamień naturalny", + "naturalny kamień jest dopuszczalne przy użyciu opisywanej tarczy", + "kamień naturalny", + "kamien naturalny", + "kamienia", + "Kamień", + "kamień", + "Kamień naturalny" + ], + "Kamionki": [ + "kamionki" + ], + "Klej": [ + "Klej" + ], + "Klinkier": [ + "Klinkier", + "klinkier", + "linkier" + ], + "Kostka brukowa": [ + "kostki brukowej", + "kostka brukowa" + ], + "Krzemionka": [ + "Krzemionka" + ], + "Kwarcyt": [ + "kwarcyt", + "Kwarcyt" + ], + "Lastryko": [ + "lastryka", + "lastryk" + ], + "Łupek": [ + "łupek" + ], + "Magnezyt": [ + "Magnezyt" + ], + "Marmur": [ + "Cięcie takich materiałów jak granit", + "marmur", + "marmur", + "marmuru", + "Marmur", + "marmur" + ], + "Materiały budowlane": [ + "materiały", + "Materiały budowlane", + "materiały budowlane", + "standardowych materiałów budowlanych" + ], + "Materiały ścierne": [ + "materiały ścierne" + ], + "Materiały twarde": [ + "Materiały twarde" + ], + "Metal": [ + "Metal" + ], + "Miękkie kruszywa": [ + "miękkich do twardych kruszyw", + "innych miękkich materiałów" + ], + "PCV": [ + "PCV" + ], + "Piaskowiec": [ + "iaskow", + "piaskowca", + "piaskowiec" + ], + "Plastik": [ + "Plastik" + ], + "Płyta chodnikowa": [ + "płyta chodnikowa" + ], + "Płytki": [ + "Płytki", + "płytki" + ], + "Płytki betonowe": [ + "płytki betonowe" + ], + "Płytki ceramiczne": [ + "Płytki ceramiczne", + "płytki ceramiczne" + ], + "Płytki marmurowe": [ + "płytek betonowych i marmurowych" + ], + "Polbruk": [ + "polbruk" + ], + "Porcelana": [ + "porcelan", + "Porcelana", + "porcelana" + ], + "Porfir": [ + "Porfir", + "porfir" + ], + "Poroterm": [ + "poroterm" + ], + "Powłoki posadzkowe": [ + "Powłoki posadzkowe" + ], + "Powłoki ścienne": [ + "Powłoki ścienne" + ], + "Rury z żeliwa ciągliwego": [ + "Rury z żeliwa ciągliwego" + ], + "Stal": [ + "Stal", + "stal" + ], + "Stal nierdzewna": [ + "stali nierdzewnej" + ], + "Strunobeton": [ + "Strunobeton" + ], + "Szamot": [ + "szamot", + "Szamot" + ], + "Szkło pancerne": [ + "Szkło pancerne" + ], + "Sztuczny kamień": [ + "sztucznego kamienia" + ], + "Ścierna płyta budowlana": [ + "ścierna płyta budowlana" + ], + "Ścierne": [ + "ścierne" + ], + "Terazzo": [ + "terazzo" + ], + "Tlenek cyrkonu": [ + "tlenek cyrkonu" + ], + "Tlenek glinu": [ + "tlenek glinu" + ], + "Trawertyn": [ + "trawert", + "trawertyn" + ], + "Twarda glazura": [ + "Twarda glazura", + "twarda glazura" + ], + "Twarde kruszywa": [ + "twardych kruszyw" + ], + "Twarde materiały": [ + "twardych materiałach", + "twarde", + "twardych materiałów", + "im twardszy materiał" + ], + "Tworzywo sztuczne": [ + "Tworzywo sztuczne", + "tworzywo sztuczne" + ], + "Tynk": [ + "tynk" + ], + "Wapień": [ + "wapień" + ], + "Zaprawa murarska": [ + "Zaprawa murarska", + "zaprawa murarska" + ], + "Żelazo": [ + "Żelazo" + ], + "Żeliwo": [ + "Żeliwo", + "żeliwo" + ], + "Żeliwo sferoidalne": [ + "żeliwo sferoidalne" + ] + }, + "blackList": { + "Metal": [ + "Elastic Metal" + ], + "Średnica otworu montażowego tarczy - uwaga": [ + "mm" + ] + } +} \ No newline at end of file diff --git a/tests/fixtures/example_data.py b/tests/fixtures/example_data.py index 05e6891..fb78e54 100644 --- a/tests/fixtures/example_data.py +++ b/tests/fixtures/example_data.py @@ -32,3 +32,12 @@ def example_labels_path(input_dir: Path) -> Path: def example_document_path(input_dir: Path, request) -> Path: document_number = request.param return input_dir / f"document{document_number}.txt" + + +@pytest.fixture +def example_document_path_jsonl(input_dir: Path, request) -> Path: + return input_dir / "document_with_concrete.jsonl" + +@pytest.fixture +def example_labels_path_jsonl(input_dir: Path) -> Path: + return input_dir / "labels_with_concrete.json" diff --git a/tests/worker/test_worker.py b/tests/worker/test_worker.py index 868416c..89fb6d0 100644 --- a/tests/worker/test_worker.py +++ b/tests/worker/test_worker.py @@ -49,3 +49,17 @@ def test_easymatcher_process_folder( expected_path = expected_dir / "documents.jsonl" worker.process(input_dir, task_options, output_path) check_and_cleanup(output_path, expected_path) + + +def test_easymatcher_process_jsonl_document( + worker: EasymatcherWorker, + example_document_path_jsonl: Path, + example_labels_path_jsonl: Path, + output_dir: Path, + expected_dir: Path, +): + task_options = {"labels_path": example_labels_path_jsonl} + output_path = output_dir / f"{example_document_path_jsonl.stem}.jsonl" + expected_path = expected_dir / f"{example_document_path_jsonl.stem}.jsonl" + worker.process(example_document_path_jsonl, task_options, output_path) + check_and_cleanup(output_path, expected_path) \ No newline at end of file -- GitLab From 73c0b676c4a0ba5efb8f4d363a77b3e1c0d8f667 Mon Sep 17 00:00:00 2001 From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com> Date: Mon, 26 Jun 2023 17:43:37 +0200 Subject: [PATCH 06/13] Add feature to check whether document has a valid json/jsonl format, add tests --- src/easymatcher_worker.py | 20 +++++++++++++------ ...crete.jsonl => document_with_concrete.txt} | 0 tests/fixtures/example_data.py | 2 +- tests/worker/test_worker.py | 2 +- 4 files changed, 16 insertions(+), 8 deletions(-) rename tests/example_data/input/{document_with_concrete.jsonl => document_with_concrete.txt} (100%) diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py index ef85324..8d79c8d 100644 --- a/src/easymatcher_worker.py +++ b/src/easymatcher_worker.py @@ -20,6 +20,18 @@ class EasymatcherWorker(nlp_ws.NLPWorker): It relies on the use of an easymatcher tool which can be found he under - https://gitlab.clarin-pl.eu/knowledge-extraction/tools/easymatcher """ + @staticmethod + def is_jsonl(document_path: str | Path) -> bool: + """Validates whether text file has json/jsonl structure and has "text" keyword""" + try: + with open(document_path, 'r', encoding="utf-8") as file: + for line in file: + json_obj = json.loads(line) + if "text" not in json_obj: + return False + return True + except (json.JSONDecodeError, FileNotFoundError): + return False @staticmethod def prepare_and_append_document( @@ -27,7 +39,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker): ) -> None: """Formats and appends plain texts into jsonl file.""" document = {} - if str(document_path).endswith(".jsonl"): + if EasymatcherWorker.is_jsonl(document_path): with open(file_path, "a", encoding="utf-8") as _f: with open(document_path, "r", encoding="utf-8") as _df: for line in _df: @@ -77,11 +89,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker): if os.path.isdir(input_path): for file in os.listdir(input_path): - if file.endswith(".jsonl"): - EasymatcherWorker.prepare_and_append_document( - tmpf.name, Path(input_path) / file - ) - elif file.endswith(".txt"): + if file.endswith(".txt"): EasymatcherWorker.prepare_and_append_document( tmpf.name, Path(input_path) / file ) diff --git a/tests/example_data/input/document_with_concrete.jsonl b/tests/example_data/input/document_with_concrete.txt similarity index 100% rename from tests/example_data/input/document_with_concrete.jsonl rename to tests/example_data/input/document_with_concrete.txt diff --git a/tests/fixtures/example_data.py b/tests/fixtures/example_data.py index fb78e54..8a76ed0 100644 --- a/tests/fixtures/example_data.py +++ b/tests/fixtures/example_data.py @@ -36,7 +36,7 @@ def example_document_path(input_dir: Path, request) -> Path: @pytest.fixture def example_document_path_jsonl(input_dir: Path, request) -> Path: - return input_dir / "document_with_concrete.jsonl" + return input_dir / "document_with_concrete.txt" @pytest.fixture def example_labels_path_jsonl(input_dir: Path) -> Path: diff --git a/tests/worker/test_worker.py b/tests/worker/test_worker.py index 89fb6d0..1149147 100644 --- a/tests/worker/test_worker.py +++ b/tests/worker/test_worker.py @@ -51,7 +51,7 @@ def test_easymatcher_process_folder( check_and_cleanup(output_path, expected_path) -def test_easymatcher_process_jsonl_document( +def test_easymatcher_process_jsonl_document_structure( worker: EasymatcherWorker, example_document_path_jsonl: Path, example_labels_path_jsonl: Path, -- GitLab From 8864e2687c1cdf1bd563953f262402e7c9bbaa76 Mon Sep 17 00:00:00 2001 From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com> Date: Mon, 26 Jun 2023 21:50:17 +0200 Subject: [PATCH 07/13] Add feature to check whether document has a valid json/jsonl format, add tests --- src/easymatcher_worker.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py index 8d79c8d..5e59e71 100644 --- a/src/easymatcher_worker.py +++ b/src/easymatcher_worker.py @@ -20,9 +20,12 @@ class EasymatcherWorker(nlp_ws.NLPWorker): It relies on the use of an easymatcher tool which can be found he under - https://gitlab.clarin-pl.eu/knowledge-extraction/tools/easymatcher """ + @staticmethod - def is_jsonl(document_path: str | Path) -> bool: - """Validates whether text file has json/jsonl structure and has "text" keyword""" + def is_jsonl( + document_path: str | Path + ) -> bool: + """Validates whether text file has json/jsonl structure and has "text" keyword.""" try: with open(document_path, 'r', encoding="utf-8") as file: for line in file: -- GitLab From e9bab9a05da19bee7a14defedac3199894ea8123 Mon Sep 17 00:00:00 2001 From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com> Date: Mon, 26 Jun 2023 22:04:48 +0200 Subject: [PATCH 08/13] Add feature to check whether document has a valid json/jsonl format, add tests --- src/easymatcher_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py index 5e59e71..d300a49 100644 --- a/src/easymatcher_worker.py +++ b/src/easymatcher_worker.py @@ -25,7 +25,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker): def is_jsonl( document_path: str | Path ) -> bool: - """Validates whether text file has json/jsonl structure and has "text" keyword.""" + """Validates whether text file has json/jsonl structure.""" try: with open(document_path, 'r', encoding="utf-8") as file: for line in file: -- GitLab From 3130c89d4c06f3850732f5907071f47b8e7b441c Mon Sep 17 00:00:00 2001 From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com> Date: Mon, 26 Jun 2023 22:26:15 +0200 Subject: [PATCH 09/13] Add feature to check whether document has a valid json/jsonl format, add tests --- src/easymatcher_worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py index 076e743..9366608 100644 --- a/src/easymatcher_worker.py +++ b/src/easymatcher_worker.py @@ -54,8 +54,8 @@ class EasymatcherWorker(nlp_ws.NLPWorker): with open(document_path, "r", encoding="utf-8") as _df: document["text"] = _df.read() - with open(file_path, "a", encoding="utf-8") as _f: - _f.write(json.dumps(document) + "\n") + with open(file_path, "a", encoding="utf-8") as _f: + _f.write(json.dumps(document) + "\n") def process( self, -- GitLab From 6a94b42e30491cbc64c69e530ad7edf2d59b521c Mon Sep 17 00:00:00 2001 From: Konrad Wojtasik <pwr200856@e-science.pl> Date: Tue, 4 Jul 2023 08:56:19 +0000 Subject: [PATCH 10/13] Update easymatcher_worker.py --- src/easymatcher_worker.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py index 9366608..f51ad45 100644 --- a/src/easymatcher_worker.py +++ b/src/easymatcher_worker.py @@ -47,9 +47,9 @@ class EasymatcherWorker(nlp_ws.NLPWorker): with open(document_path, "r", encoding="utf-8") as _df: for line in _df: line_data = json.loads(line) - document['text'] = line_data['text'] + # document['text'] = line_data['text'] # document['label'] = [] - _f.write(json.dumps(document) + "\n") + _f.write(json.dumps(line_data) + "\n") else: with open(document_path, "r", encoding="utf-8") as _df: document["text"] = _df.read() @@ -107,5 +107,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker): os.unlink(tmpf.name) with open(output_path, "w", encoding="utf-8") as _f: - for out_document in out_documents: - _f.write(json.dumps(out_document) + "\n") + for out_document, document in zip(out_documents, documents): + # We want to keep content of the original labeled documents + document['label'] = out_document['label'] + _f.write(json.dumps(document) + "\n") -- GitLab From 8a0493152102a398f4b64caf2965f0927071c676 Mon Sep 17 00:00:00 2001 From: Konrad Wojtasik <pwr200856@e-science.pl> Date: Tue, 4 Jul 2023 09:02:19 +0000 Subject: [PATCH 11/13] Update .gitlab-ci.yml --- .gitlab-ci.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3ec1770..ee81085 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -55,13 +55,10 @@ pages: build_master: stage: builds - image: 'docker:18.09.7' + image: docker only: - master - services: - - 'docker:18.09.7-dind' script: - - docker build -t $CI_REGISTRY_IMAGE:latest -f DockerFile . + - docker build --load -t $CI_REGISTRY_IMAGE:latest -f DockerFile . - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY - docker push $CI_REGISTRY_IMAGE:latest - -- GitLab From 05afe1e1476ef0cc0cfc49c74ad1f5bf99d482d3 Mon Sep 17 00:00:00 2001 From: Konrad Wojtasik <pwr200856@e-science.pl> Date: Tue, 4 Jul 2023 09:02:36 +0000 Subject: [PATCH 12/13] Update easymatcher_worker.py --- src/easymatcher_worker.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py index f51ad45..27402bd 100644 --- a/src/easymatcher_worker.py +++ b/src/easymatcher_worker.py @@ -47,8 +47,6 @@ class EasymatcherWorker(nlp_ws.NLPWorker): with open(document_path, "r", encoding="utf-8") as _df: for line in _df: line_data = json.loads(line) - # document['text'] = line_data['text'] - # document['label'] = [] _f.write(json.dumps(line_data) + "\n") else: with open(document_path, "r", encoding="utf-8") as _df: -- GitLab From 4294beedc4d1f435633ae56bac987bfe85624fcc Mon Sep 17 00:00:00 2001 From: Konrad Wojtasik <konrad.wojtasik@pwr.edu.pl> Date: Tue, 4 Jul 2023 12:58:09 +0200 Subject: [PATCH 13/13] Fix tests --- src/easymatcher_worker.py | 1 + tests/utils.py | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py index 27402bd..d8373a4 100644 --- a/src/easymatcher_worker.py +++ b/src/easymatcher_worker.py @@ -108,4 +108,5 @@ class EasymatcherWorker(nlp_ws.NLPWorker): for out_document, document in zip(out_documents, documents): # We want to keep content of the original labeled documents document['label'] = out_document['label'] + document['text'] = out_document['text'] _f.write(json.dumps(document) + "\n") diff --git a/tests/utils.py b/tests/utils.py index 703c07f..4bdde7f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,8 +1,15 @@ import os +import json from filecmp import cmp from pathlib import Path +def load_jsonl(file_path: Path): + with open(file_path) as f: + data = [json.loads(line) for line in f] + return data def check_and_cleanup(output_path: Path, expected_path: Path) -> Path: - assert cmp(output_path, expected_path) + output = load_jsonl(output_path) + expected = load_jsonl(expected_path) + assert sorted(output, key=lambda d : d['text']) == sorted(expected, key=lambda d : d['text']) os.remove(output_path) -- GitLab