Commit 29efda6d authored by Michal Pogoda

More refactoring, added less error-prone preprocessing

parent 841d0546
No related tags found
3 merge requests: !7 Merge new models, !6 Full transformer mixed model + accuracy optimizations, !1 Initial Proof of Concept
Pipeline #1235 passed
#!/usr/bin/python3
import argparse
-from argparse import Namespace
import os
+from argparse import Namespace
from src.pipelines.actions_based.processing import apply_actions_punctuation
from src.pipelines.actions_based.utils import load_model
+from src.utils import preprocess


def get_args() -> Namespace:
@@ -42,7 +43,7 @@ if __name__ == "__main__":
    tokenizer, model = load_model(args.model, args.base, "cpu")

    with open(args.input, "r") as f:
-        text = f.read()
+        text = preprocess(f.read())

    text_processed = apply_actions_punctuation(
        text, args.chunk_size, tokenizer, model, args.threshold
    )
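
Taken together, the change routes all raw input through preprocess() before punctuation restoration. A minimal end-to-end sketch of the resulting flow (the file paths, chunk size, and threshold below are hypothetical placeholder values; the function calls themselves come from the diff above):

from src.pipelines.actions_based.processing import apply_actions_punctuation
from src.pipelines.actions_based.utils import load_model
from src.utils import preprocess

# Hypothetical paths and values, standing in for the script's CLI arguments
tokenizer, model = load_model("models/actions.model", "bert-base-multilingual-cased", "cpu")

with open("input.txt", "r") as f:
    # Normalize first, so inference sees the same format as the training data
    text = preprocess(f.read())

text_processed = apply_actions_punctuation(text, 500, tokenizer, model, 0.9)
print(text_processed)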
@@ -6,7 +6,10 @@ import numpy as np
import pandas as pd
from dask.distributed import Client

-from src.pipelines.actions_based.processing import APPLY_FILE_PROCESSING_META, apply_file_processing
+from src.pipelines.actions_based.processing import (
+    APPLY_FILE_PROCESSING_META,
+    apply_file_processing,
+)
from src.utils import PROJECT_ROOT, get_config, prepare_folder

INPUT_FOLDER = f"{PROJECT_ROOT}/data"
@@ -4,7 +4,10 @@ import dask.dataframe as dd
from dask.distributed import Client
from transformers import BertTokenizerFast

-from src.pipelines.actions_based.processing import APPLY_TOKENIZATION_META, apply_tokenization
+from src.pipelines.actions_based.processing import (
+    APPLY_TOKENIZATION_META,
+    apply_tokenization,
+)
from src.utils import PROJECT_ROOT, get_config, prepare_folder

INPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage1_extraction"
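
The *_META constants that these reformatted imports pull in follow dask's meta convention: a custom per-partition function must declare its output schema so dask can build the task graph lazily. A toy, self-contained illustration of that mechanism (tokenize_partition is a stand-in, not the project's apply_tokenization):

import pandas as pd
import dask.dataframe as dd

# dask cannot infer the output schema of a custom function, so an empty
# DataFrame with the right columns and dtypes is passed as `meta`.
TOKENIZE_META = pd.DataFrame({"tokens": pd.Series([], dtype=object)})


def tokenize_partition(df: pd.DataFrame) -> pd.DataFrame:
    # Toy stand-in: whitespace-split every document in the partition
    return pd.DataFrame({"tokens": df["text"].str.split()})


ddf = dd.from_pandas(pd.DataFrame({"text": ["ala ma kota"]}), npartitions=1)
print(ddf.map_partitions(tokenize_partition, meta=TOKENIZE_META).compute())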
@@ -2,7 +2,12 @@
import dask.dataframe as dd
from dask.distributed import Client

-from src.processing import EXPAND_DIMS_META, FLATTEN_DIMS_META, expand_dims, flatten_dims
+from src.processing import (
+    EXPAND_DIMS_META,
+    FLATTEN_DIMS_META,
+    expand_dims,
+    flatten_dims,
+)
from src.utils import PROJECT_ROOT, get_config, prepare_folder

INPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage2_tokenization"
@@ -6,7 +6,10 @@ import numpy as np
import pandas as pd
from dask.distributed import Client

-from src.pipelines.translation_based.processing import RAW_TO_DATAFRAME_META, raw_to_dataframe
+from src.pipelines.translation_based.processing import (
+    RAW_TO_DATAFRAME_META,
+    raw_to_dataframe,
+)
from src.utils import PROJECT_ROOT, get_config, prepare_folder

INPUT_FOLDER = f"{PROJECT_ROOT}/data"
@@ -4,7 +4,10 @@ from dask import delayed
from dask.distributed import Client
from transformers import BertTokenizerFast

-from src.pipelines.translation_based.processing import GENERATE_BATCHES_META, generate_batches
+from src.pipelines.translation_based.processing import (
+    GENERATE_BATCHES_META,
+    generate_batches,
+)
from src.utils import PROJECT_ROOT, get_config, prepare_folder

INPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage1_extraction"
@@ -45,9 +45,31 @@ def remove_punctuation(text: str) -> str:
    Returns:
        str: Text with all punctuation removed
    """
+    # Separating characters: break words joined by "-", "/" or "+"
+    text = text.replace("-", " ").replace("/", " ").replace("+", " ")
+
    return "".join(filter(lambda x: x.isalnum() or x.isspace(), text))


+def preprocess(text: str) -> str:
+    """Makes sure that the input is in the same format as the training data
+    (no non-alphanumeric characters, no double spaces, all lowercase, etc.)
+
+    Args:
+        text (str): Text to be processed
+
+    Returns:
+        str: Text in training-data format
+    """
+    text = remove_punctuation(text)
+    text = remove_multiple_spaces(text)
+    text = text.lower()
+    text = text.strip()
+
+    return text


def prepare_folder(path: str, wipe: bool = False) -> None:
    """Function makes sure that the provided path exists. It can additionally
    remove all files from the path.
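
preprocess() composes remove_punctuation with remove_multiple_spaces, but the latter is outside this diff. A plausible regex-based sketch that is consistent with the tests below (the actual src.utils implementation may differ):

import re


def remove_multiple_spaces(text: str) -> str:
    # Collapse every run of spaces into a single space (sketch only,
    # not the implementation from src.utils)
    return re.sub(r" +", " ", text)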
from src.utils import (
    convert_to_timedelta,
    preprocess,
    remove_multiple_spaces,
    remove_punctuation,
)


def test_remove_multiple_spaces():
    provided = "Ala  ma  Kota.  Kot  ma  Ale  "
    expected = "Ala ma Kota. Kot ma Ale "

    assert remove_multiple_spaces(provided) == expected


def test_remove_punctuation():
    provided = "Ala.. ma-Kota!?.@@$ Kot ma Ale ()*"
    expected = "Ala ma Kota Kot ma Ale "

    assert remove_punctuation(provided) == expected


def test_preprocess():
    provided = "Ala ma-Kota!?.@@$ Kot ma Ale ()*"
    expected = "ala ma kota kot ma ale"

    assert preprocess(provided) == expected


def test_convert_to_timedelta():
    assert convert_to_timedelta("5d").days == 5
    assert convert_to_timedelta("5d").seconds == 0
    assert convert_to_timedelta("5d").microseconds == 0

    assert convert_to_timedelta("4h").days == 0
    assert convert_to_timedelta("4h").seconds == 4 * 60 * 60
    assert convert_to_timedelta("4h").microseconds == 0

    assert convert_to_timedelta("3m").days == 0
    assert convert_to_timedelta("3m").seconds == 3 * 60
    assert convert_to_timedelta("3m").microseconds == 0

    assert convert_to_timedelta("2s").days == 0
    assert convert_to_timedelta("2s").seconds == 2
    assert convert_to_timedelta("2s").microseconds == 0
@@ -6,6 +6,7 @@ import nlp_ws

from src.pipelines.actions_based.processing import apply_actions_punctuation
from src.pipelines.actions_based.utils import load_model
+from src.utils import preprocess


class Worker(nlp_ws.NLPWorker):
@@ -27,7 +28,7 @@ class Worker(nlp_ws.NLPWorker):
        """Implementation of example tasks that copies files."""
        with open(input_file, "r") as f:
-            text = f.read()
+            text = preprocess(f.read())

        text_processed = apply_actions_punctuation(
            text, self.chunk_size, self.tokenizer, self.model, self.threshold
        )
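
For context, workers built on nlp_ws are normally launched by handing the worker class to the service runner. The exact entry point is not shown in this commit, so the snippet below is an assumption based on the usual CLARIN-PL worker pattern:

import nlp_ws

from worker import Worker  # hypothetical module path for the class above

if __name__ == "__main__":
    # Assumed nlp_ws entry point: registers the worker and starts
    # consuming tasks from the service queue
    nlp_ws.NLPService.main(Worker)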