From ab683b9bf02e6e9c996a2383d7ed8b35c88fc0d8 Mon Sep 17 00:00:00 2001
From: Michal Pogoda <michalpogoda@hotmail.com>
Date: Mon, 24 Aug 2020 12:01:53 +0200
Subject: [PATCH 1/3] Added whitespace unification to prevent newline
 corruption

---
 src/utils.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 906de65..f1419ca 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -6,7 +6,8 @@ from typing import Optional
 
 import yaml
 
-PROJECT_ROOT = os.path.dirname(os.path.realpath("/".join(__file__.split("/")) + "/.."))
+PROJECT_ROOT = os.path.dirname(os.path.realpath(
+    "/".join(__file__.split("/")) + "/.."))
 
 
 def get_config() -> dict:
@@ -52,6 +53,18 @@ def remove_punctuation(text: str) -> str:
     return "".join(filter(lambda x: x.isalnum() or x.isspace(), text))
 
 
+def unify_whitespaces(text: str) -> str:
+    """Maps all whitespace characters into a simple ' ' 
+
+    Args:
+        text (str): Text containing multiple forms of whitespace
+
+    Returns:
+        str: Text with a single form of whitespace
+    """
+    return map(lambda c: ' ' if c.isspace() else c, text)
+
+
 def preprocess(text: str) -> str:
     """Makes sure that input is in the same format as training data (no non-alphanum chars, no double spaces,
         all lowercase etc.)
@@ -63,6 +76,7 @@ def preprocess(text: str) -> str:
         str: Text in training-data format
     """
     text = remove_punctuation(text)
+    text = unify_whitespaces(text)
     text = remove_multiple_spaces(text)
     text = text.lower()
     text = text.strip()
-- 
GitLab


From f8cbebaa6c765bd1c89c5ca198bff04261013b0f Mon Sep 17 00:00:00 2001
From: Michal Pogoda <michalpogoda@hotmail.com>
Date: Mon, 24 Aug 2020 12:25:19 +0200
Subject: [PATCH 2/3] More fixes

---
 src/utils.py | 10 +++++++++-
 worker.py    |  5 ++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index f1419ca..2f2bdef 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -62,7 +62,15 @@ def unify_whitespaces(text: str) -> str:
     Returns:
         str: Text with a single form of whitespace
     """
-    return map(lambda c: ' ' if c.isspace() else c, text)
+    result = ""
+
+    for c in text:
+        if c.isspace():
+            result += " "
+        else:
+            result += c
+
+    return result
 
 
 def preprocess(text: str) -> str:
diff --git a/worker.py b/worker.py
index c7132f5..54a25c1 100755
--- a/worker.py
+++ b/worker.py
@@ -23,11 +23,14 @@ class Worker(nlp_ws.NLPWorker):
             self.config["deployment"]["device"],
         )
 
+        self.model.train(False)
+
     def process(self, input_file: str, task_options: dict, output_file: str) -> None:
         """Implementation of example tasks that copies files."""
 
         with open(input_file, "r") as f:
-            text = preprocess(f.read())
+            text = str(f.read())
+            text = preprocess(text)
             text_processed = apply_actions_punctuation(
                 text, self.chunk_size, self.tokenizer, self.model, self.threshold
             )
-- 
GitLab


From fe92504492d205600e95afbfcb05235d9d5c737b Mon Sep 17 00:00:00 2001
From: Michal Pogoda <michalpogoda@hotmail.com>
Date: Mon, 24 Aug 2020 12:27:22 +0200
Subject: [PATCH 3/3] Style fix

---
 src/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 2f2bdef..aef4911 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -54,7 +54,7 @@ def remove_punctuation(text: str) -> str:
 
 
 def unify_whitespaces(text: str) -> str:
-    """Maps all whitespace characters into a simple ' ' 
+    """Maps all whitespace characters into a simple ' '
 
     Args:
         text (str): Text containing multiple forms of whitespace
-- 
GitLab