Commit 29efda6d authored by Michal Pogoda

More refactoring, added less error-prone preprocessing

parent 841d0546
No related tags found
3 merge requests: !7 Merge new models, !6 Full transformer mixed model + accuracy optimizations, !1 Initial Proof of Concept
Pipeline #1235 passed
#!/usr/bin/python3
import argparse
-from argparse import Namespace
import os
+from argparse import Namespace
from src.pipelines.actions_based.processing import apply_actions_punctuation
from src.pipelines.actions_based.utils import load_model
+from src.utils import preprocess


def get_args() -> Namespace:
@@ -42,7 +43,7 @@ if __name__ == "__main__":
    tokenizer, model = load_model(args.model, args.base, "cpu")

    with open(args.input, "r") as f:
-        text = f.read()
+        text = preprocess(f.read())

    text_processed = apply_actions_punctuation(
        text, args.chunk_size, tokenizer, model, args.threshold
    )
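
Taken together, the change routes all raw input through preprocess() before punctuation restoration. A minimal end-to-end sketch of the resulting flow (the file paths, chunk size, and threshold below are hypothetical placeholder values; the function calls themselves come from the diff above):

from src.pipelines.actions_based.processing import apply_actions_punctuation
from src.pipelines.actions_based.utils import load_model
from src.utils import preprocess

# Hypothetical paths and values, standing in for the script's CLI arguments
tokenizer, model = load_model("models/actions.model", "bert-base-multilingual-cased", "cpu")

with open("input.txt", "r") as f:
    # Normalize first, so inference sees the same format as the training data
    text = preprocess(f.read())

text_processed = apply_actions_punctuation(text, 500, tokenizer, model, 0.9)
print(text_processed)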
@@ -6,7 +6,10 @@ import numpy as np
import pandas as pd
from dask.distributed import Client

-from src.pipelines.actions_based.processing import APPLY_FILE_PROCESSING_META, apply_file_processing
+from src.pipelines.actions_based.processing import (
+    APPLY_FILE_PROCESSING_META,
+    apply_file_processing,
+)
from src.utils import PROJECT_ROOT, get_config, prepare_folder

INPUT_FOLDER = f"{PROJECT_ROOT}/data"
@@ -4,7 +4,10 @@ import dask.dataframe as dd
from dask.distributed import Client
from transformers import BertTokenizerFast

-from src.pipelines.actions_based.processing import APPLY_TOKENIZATION_META, apply_tokenization
+from src.pipelines.actions_based.processing import (
+    APPLY_TOKENIZATION_META,
+    apply_tokenization,
+)
from src.utils import PROJECT_ROOT, get_config, prepare_folder

INPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage1_extraction"
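
The *_META constants that these reformatted imports pull in follow dask's meta convention: a custom per-partition function must declare its output schema so dask can build the task graph lazily. A toy, self-contained illustration of that mechanism (tokenize_partition is a stand-in, not the project's apply_tokenization):

import pandas as pd
import dask.dataframe as dd

# dask cannot infer the output schema of a custom function, so an empty
# DataFrame with the right columns and dtypes is passed as `meta`.
TOKENIZE_META = pd.DataFrame({"tokens": pd.Series([], dtype=object)})


def tokenize_partition(df: pd.DataFrame) -> pd.DataFrame:
    # Toy stand-in: whitespace-split every document in the partition
    return pd.DataFrame({"tokens": df["text"].str.split()})


ddf = dd.from_pandas(pd.DataFrame({"text": ["ala ma kota"]}), npartitions=1)
print(ddf.map_partitions(tokenize_partition, meta=TOKENIZE_META).compute())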
@@ -2,7 +2,12 @@
import dask.dataframe as dd
from dask.distributed import Client

-from src.processing import EXPAND_DIMS_META, FLATTEN_DIMS_META, expand_dims, flatten_dims
+from src.processing import (
+    EXPAND_DIMS_META,
+    FLATTEN_DIMS_META,
+    expand_dims,
+    flatten_dims,
+)
from src.utils import PROJECT_ROOT, get_config, prepare_folder

INPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage2_tokenization"
@@ -6,7 +6,10 @@ import numpy as np
import pandas as pd
from dask.distributed import Client

-from src.pipelines.translation_based.processing import RAW_TO_DATAFRAME_META, raw_to_dataframe
+from src.pipelines.translation_based.processing import (
+    RAW_TO_DATAFRAME_META,
+    raw_to_dataframe,
+)
from src.utils import PROJECT_ROOT, get_config, prepare_folder

INPUT_FOLDER = f"{PROJECT_ROOT}/data"
@@ -4,7 +4,10 @@ from dask import delayed
from dask.distributed import Client
from transformers import BertTokenizerFast

-from src.pipelines.translation_based.processing import GENERATE_BATCHES_META, generate_batches
+from src.pipelines.translation_based.processing import (
+    GENERATE_BATCHES_META,
+    generate_batches,
+)
from src.utils import PROJECT_ROOT, get_config, prepare_folder

INPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage1_extraction"
@@ -45,9 +45,31 @@ def remove_punctuation(text: str) -> str:
    Returns:
        str: Text with all punctuation removed
    """
+    # Separating characters: break words joined by "-", "/" or "+"
+    text = text.replace("-", " ").replace("/", " ").replace("+", " ")
+
    return "".join(filter(lambda x: x.isalnum() or x.isspace(), text))


+def preprocess(text: str) -> str:
+    """Makes sure that the input is in the same format as the training data
+    (no non-alphanumeric characters, no double spaces, all lowercase, etc.)
+
+    Args:
+        text (str): Text to be processed
+
+    Returns:
+        str: Text in training-data format
+    """
+    text = remove_punctuation(text)
+    text = remove_multiple_spaces(text)
+    text = text.lower()
+    text = text.strip()
+
+    return text


def prepare_folder(path: str, wipe: bool = False) -> None:
    """Function makes sure that the provided path exists. It can additionally
    remove all files from the path.
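
preprocess() composes remove_punctuation with remove_multiple_spaces, but the latter is outside this diff. A plausible regex-based sketch that is consistent with the tests below (the actual src.utils implementation may differ):

import re


def remove_multiple_spaces(text: str) -> str:
    # Collapse every run of spaces into a single space (sketch only,
    # not the implementation from src.utils)
    return re.sub(r" +", " ", text)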
from src.utils import (
    convert_to_timedelta,
    preprocess,
    remove_multiple_spaces,
    remove_punctuation,
)


def test_remove_multiple_spaces():
    provided = "Ala  ma  Kota.  Kot  ma  Ale  "
    expected = "Ala ma Kota. Kot ma Ale "

    assert remove_multiple_spaces(provided) == expected


def test_remove_punctuation():
    provided = "Ala.. ma-Kota!?.@@$ Kot ma Ale ()*"
    expected = "Ala ma Kota Kot ma Ale "

    assert remove_punctuation(provided) == expected


def test_preprocess():
    provided = "Ala ma-Kota!?.@@$ Kot ma Ale ()*"
    expected = "ala ma kota kot ma ale"

    assert preprocess(provided) == expected


def test_convert_to_timedelta():
    assert convert_to_timedelta("5d").days == 5
    assert convert_to_timedelta("5d").seconds == 0
    assert convert_to_timedelta("5d").microseconds == 0

    assert convert_to_timedelta("4h").days == 0
    assert convert_to_timedelta("4h").seconds == 4 * 60 * 60
    assert convert_to_timedelta("4h").microseconds == 0

    assert convert_to_timedelta("3m").days == 0
    assert convert_to_timedelta("3m").seconds == 3 * 60
    assert convert_to_timedelta("3m").microseconds == 0

    assert convert_to_timedelta("2s").days == 0
    assert convert_to_timedelta("2s").seconds == 2
    assert convert_to_timedelta("2s").microseconds == 0
@@ -6,6 +6,7 @@ import nlp_ws

from src.pipelines.actions_based.processing import apply_actions_punctuation
from src.pipelines.actions_based.utils import load_model
+from src.utils import preprocess


class Worker(nlp_ws.NLPWorker):
@@ -27,7 +28,7 @@ class Worker(nlp_ws.NLPWorker):
        """Implementation of example tasks that copies files."""
        with open(input_file, "r") as f:
-            text = f.read()
+            text = preprocess(f.read())

        text_processed = apply_actions_punctuation(
            text, self.chunk_size, self.tokenizer, self.model, self.threshold
        )
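
For context, workers built on nlp_ws are normally launched by handing the worker class to the service runner. The exact entry point is not shown in this commit, so the snippet below is an assumption based on the usual CLARIN-PL worker pattern:

import nlp_ws

from worker import Worker  # hypothetical module path for the class above

if __name__ == "__main__":
    # Assumed nlp_ws entry point: registers the worker and starts
    # consuming tasks from the service queue
    nlp_ws.NLPService.main(Worker)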