Fix non-space whitespace characters destroying predictions

fd0d1a14 · Michał Pogoda · Szymon Ciombor · a588bad7 · fd0d1a14 · fd0d1a14
Commit fd0d1a14, authored Aug 24, 2020 by Michał Pogoda; committed by Szymon Ciombor on Aug 24, 2020
--- a/src/utils.py
+++ b/src/utils.py
@@ -6,7 +6,8 @@ from typing import Optional

 import yaml

-PROJECT_ROOT = os.path.dirname(os.path.realpath("/".join(__file__.split("/")) + "/.."))
+# "<this file>/.." resolves to this file's directory; dirname() then goes one level up.
+PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__ + "/.."))


 def get_config() -> dict:
@@ -52,6 +53,26 @@ def remove_punctuation(text: str) -> str:
    return "".join(filter(lambda x: x.isalnum() or x.isspace(), text))


+def unify_whitespaces(text: str) -> str:
+    """Map every whitespace character to a plain ASCII space.
+
+    A character counts as whitespace when ``str.isspace`` is true for it
+    (tabs, newlines, non-breaking spaces, ...). Each one is replaced
+    individually, so runs of whitespace are not collapsed here.
+
+    Args:
+        text (str): Text containing multiple forms of whitespace
+
+    Returns:
+        str: Text of the same length with ' ' as its only whitespace
+    """
+    # Build the result with a single join instead of repeated "+=",
+    # which can copy the growing string on every iteration.
+    return "".join(
+        " " if char.isspace() else char for char in text
+    )
+
+
 def preprocess(text: str) -> str:
    """Makes sure that input is in the same format as training data (no non-alphanum chars, no double spaces,
        all lowercase etc.)
@@ -63,6 +84,7 @@ def preprocess(text: str) -> str:
        str: Text in training-data format
    """
    text = remove_punctuation(text)
+    text = unify_whitespaces(text)
    text = remove_multiple_spaces(text)
    text = text.lower()
    text = text.strip()

--- a/worker.py
+++ b/worker.py
@@ -23,11 +23,14 @@ class Worker(nlp_ws.NLPWorker):
            self.config["deployment"]["device"],
        )

+        self.model.train(False)
+
    def process(self, input_file: str, task_options: dict, output_file: str) -> None:
        """Implementation of example tasks that copies files."""

        with open(input_file, "r") as f:
-            text = preprocess(f.read())
+            text = str(f.read())
+            text = preprocess(text)
            text_processed = apply_actions_punctuation(
                text, self.chunk_size, self.tokenizer, self.model, self.threshold
            )