Skip to content
Snippets Groups Projects
Commit fd0d1a14 authored by Michał Pogoda's avatar Michał Pogoda Committed by Szymon Ciombor
Browse files

Fix non-space whitespace characters destroying predictions

parent a588bad7
No related branches found
No related tags found
No related merge requests found
......@@ -6,7 +6,8 @@ from typing import Optional
import yaml
PROJECT_ROOT = os.path.dirname(os.path.realpath("/".join(__file__.split("/")) + "/.."))
PROJECT_ROOT = os.path.dirname(os.path.realpath(
"/".join(__file__.split("/")) + "/.."))
def get_config() -> dict:
......@@ -52,6 +53,26 @@ def remove_punctuation(text: str) -> str:
return "".join(filter(lambda x: x.isalnum() or x.isspace(), text))
def unify_whitespaces(text: str) -> str:
    """Map every whitespace character onto a plain ASCII space ' '.

    Each character for which ``str.isspace()`` is true (tabs, newlines,
    non-breaking spaces, other Unicode spaces, ...) is replaced by exactly
    one ' '. All other characters are kept unchanged. Runs of whitespace
    are NOT collapsed here: the replacement is strictly one-for-one, so
    the output has the same length as the input.

    Args:
        text (str): Text containing multiple forms of whitespace

    Returns:
        str: Text with a single form of whitespace
    """
    # Build the result in one pass with join instead of repeated `+=`,
    # which may degrade to quadratic time on long inputs.
    return "".join(" " if char.isspace() else char for char in text)
def preprocess(text: str) -> str:
"""Makes sure that input is in the same format as training data (no non-alphanum chars, no double spaces,
all lowercase etc.)
......@@ -63,6 +84,7 @@ def preprocess(text: str) -> str:
str: Text in training-data format
"""
text = remove_punctuation(text)
text = unify_whitespaces(text)
text = remove_multiple_spaces(text)
text = text.lower()
text = text.strip()
......
......@@ -23,11 +23,14 @@ class Worker(nlp_ws.NLPWorker):
self.config["deployment"]["device"],
)
self.model.train(False)
def process(self, input_file: str, task_options: dict, output_file: str) -> None:
"""Implementation of example tasks that copies files."""
with open(input_file, "r") as f:
text = preprocess(f.read())
text = str(f.read())
text = preprocess(text)
text_processed = apply_actions_punctuation(
text, self.chunk_size, self.tokenizer, self.model, self.threshold
)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment