From ca09c49e73b4bdcd5321c2af92313e13d48bc6d5 Mon Sep 17 00:00:00 2001
From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com>
Date: Mon, 19 Jun 2023 15:17:16 +0200
Subject: [PATCH 01/13] Add option to read jsonl files

---
 src/easymatcher_worker.py | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index e30be43..19da113 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -23,21 +23,30 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
 
     @staticmethod
     def prepare_and_append_document(
-        file_path: str | Path, document_path: str | Path
+            file_path: str | Path, document_path: str | Path
     ) -> None:
         """Formats and appends plain texts into jsonl file."""
         document = {}
-        with open(document_path, "r", encoding="utf-8") as _df:
-            document["text"] = _df.read()
+        if document_path.endswith(".jsonl"):
+            with open(file_path, "a", encoding="utf-8") as _f:
+                with open(document_path, "r", encoding="utf-8") as _df:
+                    for line in _df:
+                        line_data = json.loads(line)
+                        document['text'] = line_data['text']
+                        document['label'] = []
+                        _f.write(json.dumps(document) + "\n")
+        else:
+            with open(document_path, "r", encoding="utf-8") as _df:
+                document["text"] = _df.read()
 
-        with open(file_path, "a", encoding="utf-8") as _f:
-            _f.write(f"{json.dumps(document)}\n")
+            with open(file_path, "a", encoding="utf-8") as _f:
+                _f.write(f"{json.dumps(document)}\n")
 
     def process(
-        self,
-        input_path: str,
-        task_options: dict[str, str | int | float],
-        output_path: str,
+            self,
+            input_path: str,
+            task_options: dict[str, str | int | float],
+            output_path: str,
     ) -> None:
         """Called for each request made to the worker.
 
@@ -68,7 +77,11 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
 
         if os.path.isdir(input_path):
             for file in os.listdir(input_path):
-                if file.endswith(".txt"):
+                if file.endswith(".jsonl"):
+                    EasymatcherWorker.prepare_and_append_document(
+                        tmpf.name, Path(input_path) / file
+                    )
+                elif file.endswith(".txt"):
                     EasymatcherWorker.prepare_and_append_document(
                         tmpf.name, Path(input_path) / file
                     )
-- 
GitLab


From d199d0fb83c40022b8ea53eb4f1d7f0a3dfd250f Mon Sep 17 00:00:00 2001
From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com>
Date: Mon, 19 Jun 2023 15:28:20 +0200
Subject: [PATCH 02/13] Convert document_path to str before .jsonl suffix check

---
 src/easymatcher_worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index 19da113..8f6e9b2 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -27,7 +27,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
     ) -> None:
         """Formats and appends plain texts into jsonl file."""
         document = {}
-        if document_path.endswith(".jsonl"):
+        if str(document_path).endswith(".jsonl"):
             with open(file_path, "a", encoding="utf-8") as _f:
                 with open(document_path, "r", encoding="utf-8") as _df:
                     for line in _df:
-- 
GitLab


From b736bb6309c876338590e651de4b8045005cec6d Mon Sep 17 00:00:00 2001
From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com>
Date: Mon, 19 Jun 2023 15:47:26 +0200
Subject: [PATCH 03/13] Revert str() conversion in .jsonl suffix check

---
 src/easymatcher_worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index 8f6e9b2..19da113 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -27,7 +27,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
     ) -> None:
         """Formats and appends plain texts into jsonl file."""
         document = {}
-        if str(document_path).endswith(".jsonl"):
+        if document_path.endswith(".jsonl"):
             with open(file_path, "a", encoding="utf-8") as _f:
                 with open(document_path, "r", encoding="utf-8") as _df:
                     for line in _df:
-- 
GitLab


From eb91833937f208201a3690dd9e54fd3f21dd0ad4 Mon Sep 17 00:00:00 2001
From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com>
Date: Mon, 19 Jun 2023 15:56:26 +0200
Subject: [PATCH 04/13] Restore str() conversion in .jsonl suffix check

---
 src/easymatcher_worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index 19da113..8f6e9b2 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -27,7 +27,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
     ) -> None:
         """Formats and appends plain texts into jsonl file."""
         document = {}
-        if document_path.endswith(".jsonl"):
+        if str(document_path).endswith(".jsonl"):
             with open(file_path, "a", encoding="utf-8") as _f:
                 with open(document_path, "r", encoding="utf-8") as _df:
                     for line in _df:
-- 
GitLab


From a0865827253143f5c4b1678a4962f17df743983b Mon Sep 17 00:00:00 2001
From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com>
Date: Thu, 22 Jun 2023 10:44:37 +0200
Subject: [PATCH 05/13] Add tests to read jsonl files

---
 src/easymatcher_worker.py                     |   2 +-
 .../expected/document_with_concrete.jsonl     |   1 +
 tests/example_data/expected/documents.jsonl   |   1 +
 .../input/document_with_concrete.jsonl        |   1 +
 .../input/labels_with_concrete.json           | 332 ++++++++++++++++++
 tests/fixtures/example_data.py                |   9 +
 tests/worker/test_worker.py                   |  14 +
 7 files changed, 359 insertions(+), 1 deletion(-)
 create mode 100644 tests/example_data/expected/document_with_concrete.jsonl
 create mode 100644 tests/example_data/input/document_with_concrete.jsonl
 create mode 100644 tests/example_data/input/labels_with_concrete.json

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index 8f6e9b2..ef85324 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -33,7 +33,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
                     for line in _df:
                         line_data = json.loads(line)
                         document['text'] = line_data['text']
-                        document['label'] = []
+                        # document['label'] = []
                         _f.write(json.dumps(document) + "\n")
         else:
             with open(document_path, "r", encoding="utf-8") as _df:
diff --git a/tests/example_data/expected/document_with_concrete.jsonl b/tests/example_data/expected/document_with_concrete.jsonl
new file mode 100644
index 0000000..cb94387
--- /dev/null
+++ b/tests/example_data/expected/document_with_concrete.jsonl
@@ -0,0 +1 @@
+{"text": "8703 Tarcza szlifierska Scanmaskin SC Elastic Metal 125 mm do betonu, lastryka i kamienia naturalnego (ziarnisto\u015b\u0107 30) Tarcza szlifierska SC Elastic Metal 125 mm marki Scanmaskin s\u0142u\u017cy do szlifowania betonu, lastryka i naturalnego kamienia. Powinna byc\u0301 stosowana przed na\u0142oz\u0307eniem cienkiej pow\u0142oki lub polerowaniem posadzki, aby usuna\u0328c\u0301 z powierzchni g\u0142e\u0328bokie zarysowania. Opis wariantu: Ziarnisto\u015b\u0107 30", "label": [[46, 51, "Metal"], [62, 68, "Beton"], [70, 78, "Lastryko"], [81, 89, "Kamie\u0144 naturalny"], [149, 154, "Metal"], [200, 206, "Beton"], [208, 216, "Lastryko"], [219, 239, "Kamie\u0144 naturalny"]]}
diff --git a/tests/example_data/expected/documents.jsonl b/tests/example_data/expected/documents.jsonl
index 1fce067..b34ea4d 100644
--- a/tests/example_data/expected/documents.jsonl
+++ b/tests/example_data/expected/documents.jsonl
@@ -1,2 +1,3 @@
 {"text": "55calowy telewizor QLED Samsung QE55Q75TA Pozw\\u00f3l sobie na luksus wybieraj\\u0105c telewizor QLED Samsung QE55Q75TA kt\\u00f3ry zabierze Ci\\u0119 w \\u015bwiat wspania\\u0142ej rozrywki bez wychodzenia z domu Umozliwia Aktualizowanie oprogramowania przez USB oraz posiada Aktualizacje oprogramowania sprz\u0119towego przez Internet", "label": [[19, 23, "Typ TV"], [96, 100, "Typ TV"], [219, 258, "Aktualizowanie oprogramowania przez USB"], [272, 326, "Aktualizacja oprogramowania sprz\u0119towego przez Internet"]]}
 {"text": "55calowy telewizor QLED Samsung QE55Q75TA Pozw\\u00f3l sobie na luksus wybieraj\\u0105c telewizor QLED Samsung QE55Q75TA kt\\u00f3ry zabierze Ci\\u0119 w \\u015bwiat wspania\\u0142ej rozrywki bez wychodzenia z domu", "label": [[19, 23, "Typ TV"], [96, 100, "Typ TV"]]}
+{"text": "8703 Tarcza szlifierska Scanmaskin SC Elastic Metal 125 mm do betonu, lastryka i kamienia naturalnego (ziarnisto\u015b\u0107 30) Tarcza szlifierska SC Elastic Metal 125 mm marki Scanmaskin s\u0142u\u017cy do szlifowania betonu, lastryka i naturalnego kamienia. Powinna byc\u0301 stosowana przed na\u0142oz\u0307eniem cienkiej pow\u0142oki lub polerowaniem posadzki, aby usuna\u0328c\u0301 z powierzchni g\u0142e\u0328bokie zarysowania. Opis wariantu: Ziarnisto\u015b\u0107 30", "label": []}
diff --git a/tests/example_data/input/document_with_concrete.jsonl b/tests/example_data/input/document_with_concrete.jsonl
new file mode 100644
index 0000000..0ac7d3f
--- /dev/null
+++ b/tests/example_data/input/document_with_concrete.jsonl
@@ -0,0 +1 @@
+{"text": "8703 Tarcza szlifierska Scanmaskin SC Elastic Metal 125 mm do betonu, lastryka i kamienia naturalnego (ziarnistość 30) Tarcza szlifierska SC Elastic Metal 125 mm marki Scanmaskin służy do szlifowania betonu, lastryka i naturalnego kamienia. Powinna być stosowana przed nałożeniem cienkiej powłoki lub polerowaniem posadzki, aby usunąć z powierzchni głębokie zarysowania. Opis wariantu: Ziarnistość 30","label":[]}
diff --git a/tests/example_data/input/labels_with_concrete.json b/tests/example_data/input/labels_with_concrete.json
new file mode 100644
index 0000000..6b48c69
--- /dev/null
+++ b/tests/example_data/input/labels_with_concrete.json
@@ -0,0 +1,332 @@
+{
+    "labels": {
+        "Asfalt": [
+            "asfalt",
+            "takich jak asfalt",
+            "Asfalt"
+        ],
+        "Beton": [
+            "idealna do betonu",
+            "beton",
+            "do kostki betonowej",
+            "Beton",
+            "beton"
+        ],
+        "Beton - miękki": [
+            "twardy",
+            "średni",
+            "miękki",
+            "ścierny",
+            "od miękkich do bardzo twardych kruszyw"
+        ],
+        "Beton - ścierny": [
+            "ścierny",
+            "od miękkich do bardzo"
+        ],
+        "Beton - średni": [
+            "średni",
+            "od miękkich do bardzo twardych"
+        ],
+        "Beton - świeży": [
+            "beton świeży",
+            "świeży beton",
+            "dla cięcia betonu świeżego",
+            "betonu próżniowego"
+        ],
+        "Beton - twardy": [
+            "z betonem utwardzonym",
+            "bardzo twardych kruszyw betonowych",
+            "twardego betonu",
+            "Twardy beton",
+            "Beton twardy"
+        ],
+        "Beton - zbrojony": [
+            "betonie zbrojonym",
+            "idealna do silnie zbrojonego betonu",
+            "Beton zbrojony",
+            "także zbrojnego",
+            "betonu zbrojonego",
+            "Beton zbrojny",
+            "beton zbrojny",
+            "Beton zbrojony",
+            "beton zbrojony"
+        ],
+        "Bloki ścienne": [
+            "bloki ścienne"
+        ],
+        "Bloki wapienne": [
+            "do bloków wapiennych",
+            "Blok wapienny"
+        ],
+        "Cegła": [
+            "cegł",
+            "cegły",
+            "cegła",
+            "Cegła",
+            "cegła",
+            "cegłą",
+            "cegły",
+            "Cegła wapienno-piaskowa"
+        ],
+        "Ceramika": [
+            "ceramika",
+            "ceramiki",
+            "płytki ceramiczne",
+            "glazurowana ceramika",
+            "tarcza do płytek",
+            "twarde płytki ceramiczne",
+            "płytki ceramiczne",
+            "Ceramika",
+            "ceramika"
+        ],
+        "Chrommagnezyt": [
+            "Chrommagnezyt"
+        ],
+        "Dachówka": [
+            "Dachówka cementowa",
+            "dachówek betonowych",
+            "dachówka"
+        ],
+        "Drewno": [
+            "Drewno",
+            "drewno"
+        ],
+        "Fuga murarska": [
+            "Fuga murarska",
+            "Fugi murarskie"
+        ],
+        "Glazura": [
+            "do cięcia glazury"
+        ],
+        "Gnejs": [
+            "gnejsu",
+            "gnejs"
+        ],
+        "Grafit": [
+            "grafit"
+        ],
+        "Granit": [
+            "Cięcie takich materiałów jak granit",
+            "marmur",
+            "naturalny kamień jest dopuszczalne przy użyciu opisywanej tarczy",
+            "granit",
+            "granitu",
+            "rawężniki granitowe",
+            "granir",
+            "Granit"
+        ],
+        "Gres": [
+            "Gres"
+        ],
+        "Jastrych": [
+            "jastrych"
+        ],
+        "Kamień naturalny": [
+            "kamienia",
+            "naturalnego kamienia",
+            "kamień naturalny",
+            "naturalny kamień jest dopuszczalne przy użyciu opisywanej tarczy",
+            "kamień naturalny",
+            "kamien naturalny",
+            "kamienia",
+            "Kamień",
+            "kamień",
+            "Kamień naturalny"
+        ],
+        "Kamionki": [
+            "kamionki"
+        ],
+        "Klej": [
+            "Klej"
+        ],
+        "Klinkier": [
+            "Klinkier",
+            "klinkier",
+            "linkier"
+        ],
+        "Kostka brukowa": [
+            "kostki brukowej",
+            "kostka brukowa"
+        ],
+        "Krzemionka": [
+            "Krzemionka"
+        ],
+        "Kwarcyt": [
+            "kwarcyt",
+            "Kwarcyt"
+        ],
+        "Lastryko": [
+            "lastryka",
+            "lastryk"
+        ],
+        "Łupek": [
+            "łupek"
+        ],
+        "Magnezyt": [
+            "Magnezyt"
+        ],
+        "Marmur": [
+            "Cięcie takich materiałów jak granit",
+            "marmur",
+            "marmur",
+            "marmuru",
+            "Marmur",
+            "marmur"
+        ],
+        "Materiały budowlane": [
+            "materiały",
+            "Materiały budowlane",
+            "materiały budowlane",
+            "standardowych materiałów budowlanych"
+        ],
+        "Materiały ścierne": [
+            "materiały ścierne"
+        ],
+        "Materiały twarde": [
+            "Materiały twarde"
+        ],
+        "Metal": [
+            "Metal"
+        ],
+        "Miękkie kruszywa": [
+            "miękkich do twardych kruszyw",
+            "innych miękkich materiałów"
+        ],
+        "PCV": [
+            "PCV"
+        ],
+        "Piaskowiec": [
+            "iaskow",
+            "piaskowca",
+            "piaskowiec"
+        ],
+        "Plastik": [
+            "Plastik"
+        ],
+        "Płyta chodnikowa": [
+            "płyta chodnikowa"
+        ],
+        "Płytki": [
+            "Płytki",
+            "płytki"
+        ],
+        "Płytki betonowe": [
+            "płytki betonowe"
+        ],
+        "Płytki ceramiczne": [
+            "Płytki ceramiczne",
+            "płytki ceramiczne"
+        ],
+        "Płytki marmurowe": [
+            "płytek betonowych i marmurowych"
+        ],
+        "Polbruk": [
+            "polbruk"
+        ],
+        "Porcelana": [
+            "porcelan",
+            "Porcelana",
+            "porcelana"
+        ],
+        "Porfir": [
+            "Porfir",
+            "porfir"
+        ],
+        "Poroterm": [
+            "poroterm"
+        ],
+        "Powłoki posadzkowe": [
+            "Powłoki posadzkowe"
+        ],
+        "Powłoki ścienne": [
+            "Powłoki ścienne"
+        ],
+        "Rury z żeliwa ciągliwego": [
+            "Rury z żeliwa ciągliwego"
+        ],
+        "Stal": [
+            "Stal",
+            "stal"
+        ],
+        "Stal nierdzewna": [
+            "stali nierdzewnej"
+        ],
+        "Strunobeton": [
+            "Strunobeton"
+        ],
+        "Szamot": [
+            "szamot",
+            "Szamot"
+        ],
+        "Szkło pancerne": [
+            "Szkło pancerne"
+        ],
+        "Sztuczny kamień": [
+            "sztucznego kamienia"
+        ],
+        "Ścierna płyta budowlana": [
+            "ścierna płyta budowlana"
+        ],
+        "Ścierne": [
+            "ścierne"
+        ],
+        "Terazzo": [
+            "terazzo"
+        ],
+        "Tlenek cyrkonu": [
+            "tlenek cyrkonu"
+        ],
+        "Tlenek glinu": [
+            "tlenek glinu"
+        ],
+        "Trawertyn": [
+            "trawert",
+            "trawertyn"
+        ],
+        "Twarda glazura": [
+            "Twarda glazura",
+            "twarda glazura"
+        ],
+        "Twarde kruszywa": [
+            "twardych kruszyw"
+        ],
+        "Twarde materiały": [
+            "twardych materiałach",
+            "twarde",
+            "twardych materiałów",
+            "im twardszy materiał"
+        ],
+        "Tworzywo sztuczne": [
+            "Tworzywo sztuczne",
+            "tworzywo sztuczne"
+        ],
+        "Tynk": [
+            "tynk"
+        ],
+        "Wapień": [
+            "wapień"
+        ],
+        "Zaprawa murarska": [
+            "Zaprawa murarska",
+            "zaprawa murarska"
+        ],
+        "Żelazo": [
+            "Żelazo"
+        ],
+        "Żeliwo": [
+            "Żeliwo",
+            "żeliwo"
+        ],
+        "Żeliwo sferoidalne": [
+            "żeliwo sferoidalne"
+        ]
+    },
+    "blackList": {
+        "Metal": [
+            "Elastic  Metal"
+        ],
+        "Średnica otworu montażowego tarczy - uwaga": [
+            "mm"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/tests/fixtures/example_data.py b/tests/fixtures/example_data.py
index 05e6891..fb78e54 100644
--- a/tests/fixtures/example_data.py
+++ b/tests/fixtures/example_data.py
@@ -32,3 +32,12 @@ def example_labels_path(input_dir: Path) -> Path:
 def example_document_path(input_dir: Path, request) -> Path:
     document_number = request.param
     return input_dir / f"document{document_number}.txt"
+
+
+@pytest.fixture
+def example_document_path_jsonl(input_dir: Path, request) -> Path:
+    return input_dir / "document_with_concrete.jsonl"
+
+@pytest.fixture
+def example_labels_path_jsonl(input_dir: Path) -> Path:
+    return input_dir / "labels_with_concrete.json"
diff --git a/tests/worker/test_worker.py b/tests/worker/test_worker.py
index 868416c..89fb6d0 100644
--- a/tests/worker/test_worker.py
+++ b/tests/worker/test_worker.py
@@ -49,3 +49,17 @@ def test_easymatcher_process_folder(
     expected_path = expected_dir / "documents.jsonl"
     worker.process(input_dir, task_options, output_path)
     check_and_cleanup(output_path, expected_path)
+
+
+def test_easymatcher_process_jsonl_document(
+    worker: EasymatcherWorker,
+    example_document_path_jsonl: Path,
+    example_labels_path_jsonl: Path,
+    output_dir: Path,
+    expected_dir: Path,
+):
+    task_options = {"labels_path": example_labels_path_jsonl}
+    output_path = output_dir / f"{example_document_path_jsonl.stem}.jsonl"
+    expected_path = expected_dir / f"{example_document_path_jsonl.stem}.jsonl"
+    worker.process(example_document_path_jsonl, task_options, output_path)
+    check_and_cleanup(output_path, expected_path)
\ No newline at end of file
-- 
GitLab


From 73c0b676c4a0ba5efb8f4d363a77b3e1c0d8f667 Mon Sep 17 00:00:00 2001
From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com>
Date: Mon, 26 Jun 2023 17:43:37 +0200
Subject: [PATCH 06/13] Add feature to check whether document has a valid
 json/jsonl format, add tests

---
 src/easymatcher_worker.py                     | 20 +++++++++++++------
 ...crete.jsonl => document_with_concrete.txt} |  0
 tests/fixtures/example_data.py                |  2 +-
 tests/worker/test_worker.py                   |  2 +-
 4 files changed, 16 insertions(+), 8 deletions(-)
 rename tests/example_data/input/{document_with_concrete.jsonl => document_with_concrete.txt} (100%)

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index ef85324..8d79c8d 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -20,6 +20,18 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
     It relies on the use of an easymatcher tool which can be found he under -
     https://gitlab.clarin-pl.eu/knowledge-extraction/tools/easymatcher
     """
+    @staticmethod
+    def is_jsonl(document_path: str | Path) -> bool:
+        """Validates whether text file has json/jsonl structure and has "text" keyword"""
+        try:
+            with open(document_path, 'r', encoding="utf-8") as file:
+                for line in file:
+                    json_obj = json.loads(line)
+                    if "text" not in json_obj:
+                        return False
+            return True
+        except (json.JSONDecodeError, FileNotFoundError):
+            return False
 
     @staticmethod
     def prepare_and_append_document(
@@ -27,7 +39,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
     ) -> None:
         """Formats and appends plain texts into jsonl file."""
         document = {}
-        if str(document_path).endswith(".jsonl"):
+        if EasymatcherWorker.is_jsonl(document_path):
             with open(file_path, "a", encoding="utf-8") as _f:
                 with open(document_path, "r", encoding="utf-8") as _df:
                     for line in _df:
@@ -77,11 +89,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
 
         if os.path.isdir(input_path):
             for file in os.listdir(input_path):
-                if file.endswith(".jsonl"):
-                    EasymatcherWorker.prepare_and_append_document(
-                        tmpf.name, Path(input_path) / file
-                    )
-                elif file.endswith(".txt"):
+                if file.endswith(".txt"):
                     EasymatcherWorker.prepare_and_append_document(
                         tmpf.name, Path(input_path) / file
                     )
diff --git a/tests/example_data/input/document_with_concrete.jsonl b/tests/example_data/input/document_with_concrete.txt
similarity index 100%
rename from tests/example_data/input/document_with_concrete.jsonl
rename to tests/example_data/input/document_with_concrete.txt
diff --git a/tests/fixtures/example_data.py b/tests/fixtures/example_data.py
index fb78e54..8a76ed0 100644
--- a/tests/fixtures/example_data.py
+++ b/tests/fixtures/example_data.py
@@ -36,7 +36,7 @@ def example_document_path(input_dir: Path, request) -> Path:
 
 @pytest.fixture
 def example_document_path_jsonl(input_dir: Path, request) -> Path:
-    return input_dir / "document_with_concrete.jsonl"
+    return input_dir / "document_with_concrete.txt"
 
 @pytest.fixture
 def example_labels_path_jsonl(input_dir: Path) -> Path:
diff --git a/tests/worker/test_worker.py b/tests/worker/test_worker.py
index 89fb6d0..1149147 100644
--- a/tests/worker/test_worker.py
+++ b/tests/worker/test_worker.py
@@ -51,7 +51,7 @@ def test_easymatcher_process_folder(
     check_and_cleanup(output_path, expected_path)
 
 
-def test_easymatcher_process_jsonl_document(
+def test_easymatcher_process_jsonl_document_structure(
     worker: EasymatcherWorker,
     example_document_path_jsonl: Path,
     example_labels_path_jsonl: Path,
-- 
GitLab


From 8864e2687c1cdf1bd563953f262402e7c9bbaa76 Mon Sep 17 00:00:00 2001
From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com>
Date: Mon, 26 Jun 2023 21:50:17 +0200
Subject: [PATCH 07/13] Reformat is_jsonl signature and punctuate its docstring

---
 src/easymatcher_worker.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index 8d79c8d..5e59e71 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -20,9 +20,12 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
     It relies on the use of an easymatcher tool which can be found he under -
     https://gitlab.clarin-pl.eu/knowledge-extraction/tools/easymatcher
     """
+
     @staticmethod
-    def is_jsonl(document_path: str | Path) -> bool:
-        """Validates whether text file has json/jsonl structure and has "text" keyword"""
+    def is_jsonl(
+            document_path: str | Path
+    ) -> bool:
+        """Validates whether text file has json/jsonl structure and has "text" keyword."""
         try:
             with open(document_path, 'r', encoding="utf-8") as file:
                 for line in file:
-- 
GitLab


From e9bab9a05da19bee7a14defedac3199894ea8123 Mon Sep 17 00:00:00 2001
From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com>
Date: Mon, 26 Jun 2023 22:04:48 +0200
Subject: [PATCH 08/13] Shorten is_jsonl docstring

---
 src/easymatcher_worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index 5e59e71..d300a49 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -25,7 +25,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
     def is_jsonl(
             document_path: str | Path
     ) -> bool:
-        """Validates whether text file has json/jsonl structure and has "text" keyword."""
+        """Validates whether text file has json/jsonl structure."""
         try:
             with open(document_path, 'r', encoding="utf-8") as file:
                 for line in file:
-- 
GitLab


From 3130c89d4c06f3850732f5907071f47b8e7b441c Mon Sep 17 00:00:00 2001
From: Jakub-Goluch <99048106+Jakub-Goluch@users.noreply.github.com>
Date: Mon, 26 Jun 2023 22:26:15 +0200
Subject: [PATCH 09/13] Move plain-text document append into the non-jsonl branch

---
 src/easymatcher_worker.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index 076e743..9366608 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -54,8 +54,8 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
             with open(document_path, "r", encoding="utf-8") as _df:
                 document["text"] = _df.read()
 
-        with open(file_path, "a", encoding="utf-8") as _f:
-            _f.write(json.dumps(document) + "\n")
+            with open(file_path, "a", encoding="utf-8") as _f:
+                _f.write(json.dumps(document) + "\n")
 
     def process(
             self,
-- 
GitLab


From 6a94b42e30491cbc64c69e530ad7edf2d59b521c Mon Sep 17 00:00:00 2001
From: Konrad Wojtasik <pwr200856@e-science.pl>
Date: Tue, 4 Jul 2023 08:56:19 +0000
Subject: [PATCH 10/13] Keep original jsonl fields and merge labels into output

---
 src/easymatcher_worker.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index 9366608..f51ad45 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -47,9 +47,9 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
                 with open(document_path, "r", encoding="utf-8") as _df:
                     for line in _df:
                         line_data = json.loads(line)
-                        document['text'] = line_data['text']
+                        # document['text'] = line_data['text'] 
                         # document['label'] = []
-                        _f.write(json.dumps(document) + "\n")
+                        _f.write(json.dumps(line_data) + "\n")
         else:
             with open(document_path, "r", encoding="utf-8") as _df:
                 document["text"] = _df.read()
@@ -107,5 +107,7 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
         os.unlink(tmpf.name)
 
         with open(output_path, "w", encoding="utf-8") as _f:
-            for out_document in out_documents:
-                _f.write(json.dumps(out_document) + "\n")
+            for out_document, document in zip(out_documents, documents):
+                # We want to keep content of the original labeled documents
+                document['label'] = out_document['label']
+                _f.write(json.dumps(document) + "\n")
-- 
GitLab


From 8a0493152102a398f4b64caf2965f0927071c676 Mon Sep 17 00:00:00 2001
From: Konrad Wojtasik <pwr200856@e-science.pl>
Date: Tue, 4 Jul 2023 09:02:19 +0000
Subject: [PATCH 11/13] Update .gitlab-ci.yml

---
 .gitlab-ci.yml | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3ec1770..ee81085 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -55,13 +55,10 @@ pages:
 
 build_master:
   stage: builds
-  image: 'docker:18.09.7'
+  image: docker
   only:
     - master
-  services: 
-    - 'docker:18.09.7-dind'
   script:
-    - docker build -t $CI_REGISTRY_IMAGE:latest -f DockerFile .
+    - docker build --load -t $CI_REGISTRY_IMAGE:latest -f DockerFile .
     - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
     - docker push $CI_REGISTRY_IMAGE:latest
-
-- 
GitLab


From 05afe1e1476ef0cc0cfc49c74ad1f5bf99d482d3 Mon Sep 17 00:00:00 2001
From: Konrad Wojtasik <pwr200856@e-science.pl>
Date: Tue, 4 Jul 2023 09:02:36 +0000
Subject: [PATCH 12/13] Remove commented-out lines from jsonl branch

---
 src/easymatcher_worker.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index f51ad45..27402bd 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -47,8 +47,6 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
                 with open(document_path, "r", encoding="utf-8") as _df:
                     for line in _df:
                         line_data = json.loads(line)
-                        # document['text'] = line_data['text'] 
-                        # document['label'] = []
                         _f.write(json.dumps(line_data) + "\n")
         else:
             with open(document_path, "r", encoding="utf-8") as _df:
-- 
GitLab


From 4294beedc4d1f435633ae56bac987bfe85624fcc Mon Sep 17 00:00:00 2001
From: Konrad Wojtasik <konrad.wojtasik@pwr.edu.pl>
Date: Tue, 4 Jul 2023 12:58:09 +0200
Subject: [PATCH 13/13] Fix tests

---
 src/easymatcher_worker.py | 1 +
 tests/utils.py            | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/easymatcher_worker.py b/src/easymatcher_worker.py
index 27402bd..d8373a4 100644
--- a/src/easymatcher_worker.py
+++ b/src/easymatcher_worker.py
@@ -108,4 +108,5 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
             for out_document, document in zip(out_documents, documents):
                 # We want to keep content of the original labeled documents
                 document['label'] = out_document['label']
+                document['text'] = out_document['text']
                 _f.write(json.dumps(document) + "\n")
diff --git a/tests/utils.py b/tests/utils.py
index 703c07f..4bdde7f 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,8 +1,15 @@
 import os
+import json
 from filecmp import cmp
 from pathlib import Path
 
+def load_jsonl(file_path: Path):
+    with open(file_path) as f:
+        data = [json.loads(line) for line in f]
+        return data
 
 def check_and_cleanup(output_path: Path, expected_path: Path) -> Path:
-    assert cmp(output_path, expected_path)
+    output = load_jsonl(output_path)
+    expected = load_jsonl(expected_path)
+    assert sorted(output, key=lambda d : d['text']) == sorted(expected, key=lambda d : d['text'])
     os.remove(output_path)
-- 
GitLab