Skip to content
Snippets Groups Projects

Fix non-space whitespace characters destroying predictions

2 files
+ 27
2
Compare changes
  • Side-by-side
  • Inline

Files

+ 23
1
@@ -6,7 +6,8 @@ from typing import Optional
import yaml
PROJECT_ROOT = os.path.dirname(os.path.realpath("/".join(__file__.split("/")) + "/.."))
PROJECT_ROOT = os.path.dirname(os.path.realpath(
"/".join(__file__.split("/")) + "/.."))
def get_config() -> dict:
@@ -52,6 +53,26 @@ def remove_punctuation(text: str) -> str:
return "".join(filter(lambda x: x.isalnum() or x.isspace(), text))
def unify_whitespaces(text: str) -> str:
    """Map every whitespace character in the text to a plain space ' '.

    Each character for which ``str.isspace()`` is true (tab, newline,
    non-breaking space, ...) is replaced one-for-one by ' '. Runs of
    whitespace are not collapsed, so the length of the text is unchanged.

    Args:
        text (str): Text containing multiple forms of whitespace

    Returns:
        str: Text with a single form of whitespace
    """
    # A single join avoids the repeated ``result += ...`` concatenation,
    # which may degrade to quadratic time on long inputs.
    return "".join(" " if char.isspace() else char for char in text)
def preprocess(text: str) -> str:
"""Makes sure that input is in the same format as training data (no non-alphanum chars, no double spaces,
all lowercase etc.)
@@ -63,6 +84,7 @@ def preprocess(text: str) -> str:
str: Text in training-data format
"""
text = remove_punctuation(text)
text = unify_whitespaces(text)
text = remove_multiple_spaces(text)
text = text.lower()
text = text.strip()
Loading