From ab683b9bf02e6e9c996a2383d7ed8b35c88fc0d8 Mon Sep 17 00:00:00 2001 From: Michal Pogoda <michalpogoda@hotmail.com> Date: Mon, 24 Aug 2020 12:01:53 +0200 Subject: [PATCH 1/3] Added whitespace unification to prevent newline corruption --- src/utils.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/utils.py b/src/utils.py index 906de65..f1419ca 100644 --- a/src/utils.py +++ b/src/utils.py @@ -6,7 +6,8 @@ from typing import Optional import yaml -PROJECT_ROOT = os.path.dirname(os.path.realpath("/".join(__file__.split("/")) + "/..")) +PROJECT_ROOT = os.path.dirname(os.path.realpath( + "/".join(__file__.split("/")) + "/..")) def get_config() -> dict: @@ -52,6 +53,18 @@ def remove_punctuation(text: str) -> str: return "".join(filter(lambda x: x.isalnum() or x.isspace(), text)) +def unify_whitespaces(text: str) -> str: + """Maps all whitespace characters into a simple ' ' + + Args: + text (str): Text containing multiple forms of whitespace + + Returns: + str: Text with a single form of whitespace + """ + return map(lambda c: ' ' if c.isspace() else c, text) + + def preprocess(text: str) -> str: """Makes sure that input is in the same format as training data (no non-alphanum chars, no double spaces, all lowercase etc.) 
@@ -63,6 +76,7 @@ def preprocess(text: str) -> str: str: Text in training-data format """ text = remove_punctuation(text) + text = unify_whitespaces(text) text = remove_multiple_spaces(text) text = text.lower() text = text.strip() -- GitLab From f8cbebaa6c765bd1c89c5ca198bff04261013b0f Mon Sep 17 00:00:00 2001 From: Michal Pogoda <michalpogoda@hotmail.com> Date: Mon, 24 Aug 2020 12:25:19 +0200 Subject: [PATCH 2/3] More fixes --- src/utils.py | 10 +++++++++- worker.py | 5 ++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/utils.py b/src/utils.py index f1419ca..2f2bdef 100644 --- a/src/utils.py +++ b/src/utils.py @@ -62,7 +62,15 @@ def unify_whitespaces(text: str) -> str: Returns: str: Text with a single form of whitespace """ - return map(lambda c: ' ' if c.isspace() else c, text) + result = "" + + for c in text: + if c.isspace(): + result += " " + else: + result += c + + return result def preprocess(text: str) -> str: diff --git a/worker.py b/worker.py index c7132f5..54a25c1 100755 --- a/worker.py +++ b/worker.py @@ -23,11 +23,14 @@ class Worker(nlp_ws.NLPWorker): self.config["deployment"]["device"], ) + self.model.train(False) + def process(self, input_file: str, task_options: dict, output_file: str) -> None: """Implementation of example tasks that copies files.""" with open(input_file, "r") as f: - text = preprocess(f.read()) + text = str(f.read()) + text = preprocess(text) text_processed = apply_actions_punctuation( text, self.chunk_size, self.tokenizer, self.model, self.threshold ) -- GitLab From fe92504492d205600e95afbfcb05235d9d5c737b Mon Sep 17 00:00:00 2001 From: Michal Pogoda <michalpogoda@hotmail.com> Date: Mon, 24 Aug 2020 12:27:22 +0200 Subject: [PATCH 3/3] Style fix --- src/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils.py b/src/utils.py index 2f2bdef..aef4911 100644 --- a/src/utils.py +++ b/src/utils.py @@ -54,7 +54,7 @@ def remove_punctuation(text: str) -> str: def unify_whitespaces(text: 
str) -> str: - """Maps all whitespace characters into a simple ' ' + """Maps all whitespace characters into a simple ' ' Args: text (str): Text containing multiple forms of whitespace -- GitLab