[WIP] Style + logic updates, added output to txt

bf05a415 · Michał Pogoda · 831cf50b · bf05a415 · bf05a415 · bf05a415
Commit bf05a415 authored 2 years ago by Michał Pogoda
--- a/config/configuration/wiktorner_jsonl_txt_output.yaml
+++ b/config/configuration/wiktorner_jsonl_txt_output.yaml
+# @package _global_
+
+defaults:
+  - /paths: default
+  - /detectors: all_ner_kpwr_ext
+  - /replacers: tag
+  - /suppressor: order_based
+  - /input_parser: wiktor_ner
+  - /pipeline: sequential_jsonl
+  - _self_
+
+pipeline:
+  concat_to_txt: true
+
+language: "pl"
\ No newline at end of file
--- a/config/pipeline/sequential_jsonl.yaml
+++ b/config/pipeline/sequential_jsonl.yaml
@@ -2,4 +2,5 @@ _target_: src.pipeline.sequential_jsonl.SequentialJSONLPipeline
 input_parser: ${input_parser}
 detectors: ${detectors}
 suppressor: ${suppressor}
-replacers: ${replacers}
\ No newline at end of file
+replacers: ${replacers}
+concat_to_txt: false
\ No newline at end of file
--- a/src/detectors/date/utils.py
+++ b/src/detectors/date/utils.py
@@ -6,47 +6,47 @@ def _parse_day_or_month(re_entry) -> List[Tuple[int, int, DateDetection]]:
    assert re_entry["day_or_month_year"] is not None
    result = []

-    if re_entry["day_month1"] is not None:
-        if len(re_entry["day_month1"]) == 1:
-            result.append(
-                (
-                    DateDetection.AnnotationPart.TWO_DIGITS_DAY,
-                    "0" + re_entry["day_month1"],
-                )
-            )
-        else:
-            result.append(
-                (DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month1"])
-            )
-        result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"]))
-
-        if len(re_entry["day_month2"]) == 1:
-            result.append(
-                (
-                    DateDetection.AnnotationPart.TWO_DIGIT_MONTH,
-                    "0" + re_entry["day_month2"],
-                )
-            )
-        else:
-            result.append(
-                (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])
-            )
-
-        result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"]))
-    elif "day_month2" in re_entry:
-        if len(re_entry["day_month2"]) == 1:
-            result.append(
-                (
-                    DateDetection.AnnotationPart.TWO_DIGIT_MONTH,
-                    "0" + re_entry["day_month2"],
-                )
-            )
-        else:
-            result.append(
-                (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])
-            )
-
-        result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"]))
+    # if re_entry["day_month1"] is not None:
+    if len(re_entry["day_month1"]) == 1:
+        result.append(
+            (
+                DateDetection.AnnotationPart.TWO_DIGITS_DAY,
+                "0" + re_entry["day_month1"],
+            )
+        )
+    else:
+        result.append(
+            (DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month1"])
+        )
+    result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"]))
+
+    if len(re_entry["day_month2"]) == 1:
+        result.append(
+            (
+                DateDetection.AnnotationPart.TWO_DIGIT_MONTH,
+                "0" + re_entry["day_month2"],
+            )
+        )
+    else:
+        result.append(
+            (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])
+        )
+
+    result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"]))
+    # elif "day_month2" in re_entry:
+    #     if len(re_entry["day_month2"]) == 1:
+    #         result.append(
+    #             (
+    #                 DateDetection.AnnotationPart.TWO_DIGIT_MONTH,
+    #                 "0" + re_entry["day_month2"],
+    #             )
+    #         )
+    #     else:
+    #         result.append(
+    #             (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])
+    #         )
+
+    #     result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"]))

    if "year1" in re_entry:
        if len(re_entry["year1"]) == 2:

--- a/src/pipeline/sequential_jsonl.py
+++ b/src/pipeline/sequential_jsonl.py
@@ -14,11 +14,13 @@ class SequentialJSONLPipeline(Pipeline):
        detectors: Dict[str, Detector],
        suppressor: Suppressor,
        replacers: Dict[str, ReplacerInterface],
+        concat_to_txt: bool = False,
    ):
        self._input_parser = input_parser
        self._detectors = detectors
        self._suppressor = suppressor
        self._replacers = replacers
+        self._concat_to_txt = concat_to_txt

    def run(self, input_path) -> str:
        result = []
@@ -45,4 +47,15 @@ class SequentialJSONLPipeline(Pipeline):

                result.append({"text": replaced_input})

-        return "\n".join([json.dumps(item, ensure_ascii=False) for item in result])
+        if self._concat_to_txt:
+            result_text = ""
+            for item in result:
+                text = item["text"]
+                if result_text != "" and result_text.rstrip() == result_text and text.lstrip() == text:
+                    result_text += " " + text
+                else:
+                    result_text += text
+                    
+            return result_text
+        else:
+            return "\n".join([json.dumps(item, ensure_ascii=False) for item in result])
--- a/tests/integration/wiktorner_jsonl_txt_output_configuration/__init__.py
+++ b/tests/integration/wiktorner_jsonl_txt_output_configuration/__init__.py
--- a/tests/integration/wiktorner_jsonl_txt_output_configuration/test_wiktorner_jsonl_txt_output_configuration.py
+++ b/tests/integration/wiktorner_jsonl_txt_output_configuration/test_wiktorner_jsonl_txt_output_configuration.py
+from hydra import initialize, compose
+from hydra.utils import instantiate
+
+
+def test_wiktorner_jsonl_txt_output_configuration():
+    """Run the `wiktorner_jsonl_txt_output` configuration end to end.
+
+    Composes the Hydra config with that configuration selected, instantiates
+    the pipeline from it, runs the pipeline on the JSONL fixture stored next
+    to this test and compares the returned string with the expected text.
+    """
+    with initialize(config_path="../../../config", version_base="1.1"):
+        config = compose(
+            config_name="config",
+            overrides=["paths.root_path=../../../", "configuration=wiktorner_jsonl_txt_output"],
+        )
+        pipeline = instantiate(config.pipeline)
+
+    # The fixture path is relative, so it depends on the working directory
+    # the tests are started from (presumably the repository root - confirm).
+    result = pipeline.run(
+        "./tests/integration/wiktorner_jsonl_txt_output_configuration/wiktorner_jsonl.jsonl"
+    )
+
+    # The selected configuration sets `pipeline.concat_to_txt: true`, so a
+    # single plain-text string is expected here instead of JSON lines. The
+    # expected value appears to be the same anonymized passage twice, joined
+    # by one space.
+    assert (
+        result
+        == 'ROZDZIAŁ I. CO NIECO O SAMEJ PIPIDÓWCE Przede wszystkim muszę uprzedzić z góry czytelników, aby się daremnie nie trudzili nad szukaniem wyżej wyrażonego miasteczka na mapach [MIEJSCE] i [MIEJSCE], bo go tam nie znajdą. Nie dlatego, jakoby [MIEJSCE] nie istniała w rzeczywistości i była tylko wytworem fantazji autora, ale po prostu dlatego, że mieszkańcy owego sławnego grodu, urosłszy z czasem w ambicję, uważali tę nazwę jako ubliżającą ich powadze i podali do c. k. namiestnictwa pokorną prośbę o pozwolenie zamienienia jej na inną. Podobne zamiany nazwisk praktykują się dość często w [MIEJSCE], szczególnie u pojedynczych osób, które nie czując się na siłach uszlachetnienia sobą, swymi czynami własnego nazwiska, chcą nazwiskiem uszlachetnić siebie, i tak np. ROZDZIAŁ I. CO NIECO O SAMEJ PIPIDÓWCE Przede wszystkim muszę uprzedzić z góry czytelników, aby się daremnie nie trudzili nad szukaniem wyżej wyrażonego miasteczka na mapach [MIEJSCE] i [MIEJSCE], bo go tam nie znajdą. Nie dlatego, jakoby [MIEJSCE] nie istniała w rzeczywistości i była tylko wytworem fantazji autora, ale po prostu dlatego, że mieszkańcy owego sławnego grodu, urosłszy z czasem w ambicję, uważali tę nazwę jako ubliżającą ich powadze i podali do c. k. namiestnictwa pokorną prośbę o pozwolenie zamienienia jej na inną. Podobne zamiany nazwisk praktykują się dość często w [MIEJSCE], szczególnie u pojedynczych osób, które nie czując się na siłach uszlachetnienia sobą, swymi czynami własnego nazwiska, chcą nazwiskiem uszlachetnić siebie, i tak np.'
+    )
--- a/tests/integration/wiktorner_jsonl_txt_output_configuration/wiktorner_jsonl.jsonl
+++ b/tests/integration/wiktorner_jsonl_txt_output_configuration/wiktorner_jsonl.jsonl
--- a/tests/unit/detectors/date/test_pl.py
+++ b/tests/unit/detectors/date/test_pl.py
@@ -34,3 +34,28 @@ def test_detect_dates_pl():
        (7, 16, DateDetection(format_date1)),
        (34, 49, DateDetection(format_date2)),
    ]
+
+def test_date_with_different_punctuations():
+    # A date whose two separators differ ("." and ",") is still detected.
+    # There is an ongoing discussion about whether such cases should be
+    # treated as dates at all. For now they are; if this turns out to be
+    # problematic, it can definitely be changed.
+
+    detector = DateDetector("pl")
+
+    text = "1.01,2022"
+    found_dates = detector.detect(text, dict())
+
+    format_date = [
+        (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"),
+        (DateDetection.AnnotationPart.OTHER, "."),
+        (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"),
+        (DateDetection.AnnotationPart.OTHER, ","),
+        (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2022"),
+    ]
+
+    # The input consists of the date only, so the detection has to span the
+    # whole 9-character string: offsets (0, 9), not a position inside it.
+    assert found_dates == [
+        (0, len(text), DateDetection(format_date)),
+    ]
\ No newline at end of file