diff --git a/.dockerignore b/.dockerignore index 19d3acfbdd4924abfe9ebdaf1154f9bb8af3a0b8..06d8c5654460ecac74da67b588aa796e8bc35e2f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -12,4 +12,5 @@ dask-worker-space data generated notebooks -tests \ No newline at end of file +tests +deploy \ No newline at end of file diff --git a/.gitignore b/.gitignore index c929ac9f1d18bbc3f31025b852c0b314088e31b4..31cb7122cf476011b717f36c98500e8f9de01825 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,9 @@ __pycache__ .dvc .tox notebooks -dvc.lock \ No newline at end of file +dvc.lock +dask-worker-space +test_data +.env +deploy +service.log \ No newline at end of file diff --git a/README.md b/README.md index 0b99108079d2776d97c90b456d3a21769ff92f85..e8f9b06b25b944f8fd77838a1f8dea9170e1d206 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,50 @@ # Punctuator -A service that automatically adds punctuation to raw word-stream (eg. from speech2text). +A service that automatically adds punctuation to raw word-stream (eg. from speech2text) for polish language. -## Approaches -1. Token classification (actions): Each token is classified with 4 labels: Uppercase, dot, colon, question mark. The model is based on the stacked encoder part of transformer architecture (Bert), followed by FC-layer that transforms the output into per-token multilabel binary classifications. For now, there is no restriction for taking dot, question_mark and colon labels simultaneously, so that's the are of improvement (hierarchical, multilabel classification) +**Example input**: +> według webometrycznego rankingu uniwersytetów świata ze stycznia 2019 pokazującego zaangażowanie instytucji akademickich w internecie uczelnia zajmuje 5 miejsce w polsce wśród uczelni technicznych a na świecie 964 wśród wszystkich typów uczelni w rankingu szkół wyższych perspektyw politechnika wrocławska zajęła w 2019 roku 3 miejsce wśród uczelni technicznych oraz 6 miejsce spośród wszystkich uczelni akademickich w polsce -2. Sequence-to-Sequence (translations): Full encoder-decoder stack that takes input (unpunctuated text) and the output produced so far to predict the next token. In theory, this model should be able to represent many more cases (eg. all upper, some upper, dashes, ellipsis etc...) without explicit defines. However, the lack of constraints makes it much harder to train. +**Output**: +> Według webometrycznego rankingu uniwersytetów świata ze stycznia 2019, pokazującego zaangażowanie instytucji akademickich w Internecie, uczelnia zajmuje 5. miejsce w Polsce wśród uczelni technicznych, a na świecie 964. Wśród wszystkich typów uczelni w rankingu szkół wyższych perspektyw Politechnika Wrocławska zajęła w 2019 roku 3. miejsce wśród uczelni technicznych oraz 6. miejsce spośród wszystkich uczelni akademickich w Polsce + +## Models +### Action-Based +1. actions_base: A simple model, architecturally based on BERT. It's learned on a task to predict an "Action" for each token in the sentence. Action is described as either uppercasing of the token or adding a punctuation sign at the end of the token. + +2. actions_restricted: The model nearly identical with actions_base, however it predicts punctuation as a categorical distribution (so that punctuation is mutually exclusive in training time). The idea is to better differentiate between each punctuation. + +3. actions_mixed: A model based on the full transformer (encoder + decoder) architecture. It's much less performant, as it only predicts actions for one word at the time. 
However, it can model action probabilities conditioned on both the input and output predicted so far. Because of that, it's much less prone to not uppercasing letters in a new sentence or placing multiple punctuation signs in close proximity. + +### Translation +2. translation (Deprecated): Full encoder-decoder stack that takes input (unpunctuated text) and the output produced so far to predict the next token. The main difference from the actions model is that it's a full text2text model without restriction on tokens. Because of that, in theory, it can represent more cases (eg. all upper, some upper, dashes, ellipsis, etc...), as opposed to only a few explicitly defined actions. However, the lack of constraints makes it much harder to train (both in performance and data size). + +## Usage +To test the model localy you can use `punctuate.py` script. +```bash +punctuate.py [-h] -a {base,restricted,mixed} -d DIRECTORY -i INPUT [-m MODEL] [-l {upper_case,dot,colon,question_mark,none}] [-dv DEVICE] + +Evaluate actions model + +optional arguments: + -h, --help show this help message and exit + -a {base,restricted,mixed}, --architecture {base,restricted,mixed} + Model architecture + -d DIRECTORY, --directory DIRECTORY + Directory where trained model is located, relative to project root + -i INPUT, --input INPUT + Input text file + -m MODEL, --model MODEL + Pretrained model name + -l {upper_case,dot,colon,question_mark,none}, --highlight {upper_case,dot,colon,question_mark,none} + Highlight prediction confidence of selected action per-word + -dv DEVICE, --device DEVICE + Device on which inference will be made +``` +Eg. if you place your model named "production" at `punctuator/checkpoints/actions_base/` and example unpunctuated at `punctuator/test_data/test.txt` you can call + +```bash +python3 punctuate.py -a mixed -d /deploy/actions_mixed -i test_data/text.txt -m production -dv cuda:0 +``` ## Mountpoints -Directory where model will be downloaded (~500Mb) needs to be mounted at /punctuator/deploy +Directory where the model will be downloaded (~500Mb) needs to be mounted at /punctuator/deploy diff --git a/config.ini b/config.ini index 93819e52ab6958cf2e7a936a1acfa92e097c09fb..bcffce8d38872a03f848e96326046cb3c1c05868 100644 --- a/config.ini +++ b/config.ini @@ -1,8 +1,7 @@ [service] -tool = Punctuator - +tool = punctuator_test root = /samba/requests/ -rabbit_host = addr +rabbit_host = test rabbit_user = test rabbit_password = test @@ -15,7 +14,5 @@ local_log_level = INFO [deployment] device = cpu -chunk_size = 500 -threshold = 0.9 -model = deploy/model -base_model = dkleczek/bert-base-polish-cased-v1 \ No newline at end of file +models_dir = deploy +models_enabled = actions_base,actions_mixed,actions_restricted diff --git a/docker/development/Dockerfile b/docker/development/Dockerfile deleted file mode 100644 index 1535758262ef4bd86ae4fa82c0fc661b0d560d35..0000000000000000000000000000000000000000 --- a/docker/development/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -from ubuntu:20.04 - -RUN apt update && apt install -y python3 python3-pip -RUN apt update && apt install -y git -RUN pip3 install ipywidgets - -#### CUDA Installation -RUN apt-get update && apt-get install -y --no-install-recommends \ -gnupg2 curl ca-certificates && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb 
https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ -rm -rf /var/lib/apt/lists/* - -ENV CUDA_VERSION 10.2.89 - -ENV CUDA_PKG_VERSION 10-2=$CUDA_VERSION-1 - -# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION \ -cuda-compat-10-2 && \ -ln -s cuda-10.2 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# Required for nvidia-docker v1 -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=10.2 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411 brand=tesla,driver>=418,driver<419" - -### END CUDA Installation - -RUN pip3 install numpy pandas tqdm seaborn torch dask[complete] transformers pyarrow==0.17.1 pytest lxml -RUN ln -s /usr/bin/pip3 /usr/bin/pip \ No newline at end of file diff --git a/docker/training/Dockerfile b/docker/training/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..b3a7fba6e2858fbd642e44fb65afb553b57671d3 --- /dev/null +++ b/docker/training/Dockerfile @@ -0,0 +1,24 @@ +FROM clarinpl/cuda-python:3.7 + +RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y gcc python3-dev +RUN mkdir /punctuator +WORKDIR /punctuator + +COPY requirements.txt requirements.txt +RUN pip3 install -r requirements.txt && rm requirements.txt + +ARG USERNAME=clarin +ARG USER_UID=1000 +ARG USER_GID=1000 + +# Create the user +RUN groupadd --gid $USER_GID $USERNAME \ + && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \ + && apt-get update \ + && apt-get install -y sudo \ + && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \ + && chmod 0440 /etc/sudoers.d/$USERNAME + +ENV PATH="/home/${USERNAME}/.local/bin:${PATH}" + +USER ${USERNAME} \ No newline at end of file diff --git a/docker/training/requirements.txt b/docker/training/requirements.txt new file mode 120000 index 0000000000000000000000000000000000000000..fd1efae711fc22b83cadac89860444b798eb1aac --- /dev/null +++ b/docker/training/requirements.txt @@ -0,0 +1 @@ +../../requirements.txt \ No newline at end of file diff --git a/dvc.yaml b/dvc.yaml index bc292439f63f2203deb927a933a3cbd5d15d31af..4970d16862ec36453545d39c8dd071e3382df7b0 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -1,4 +1,7 @@ stages: + ###################### + # Action # + ###################### actions_extraction: cmd: python3 -m src.pipelines.actions_based.stage1_extraction deps: @@ -12,7 +15,7 @@ stages: cmd: python3 -m src.pipelines.actions_based.stage2_tokenization deps: - generated/actions/stage1_extraction - - src/pipelines/actions_based/stage2_tokenization.py + - src params: - actions.tokenization.max_tokens - actions.tokenization.min_tokens @@ -23,38 +26,119 @@ stages: cmd: python3 -m src.pipelines.actions_based.stage3_exploding deps: - generated/actions/stage2_tokenization - - src/pipelines/actions_based/stage3_exploding.py + - src outs: - generated/actions/stage3_exploding actions_reindexing: cmd: python3 -m src.pipelines.actions_based.stage4_reindexing deps: - 
generated/actions/stage3_exploding - - src/pipelines/actions_based/stage4_reindexing.py + - src outs: - generated/actions/stage4_reindexing actions_stats: cmd: python3 -m src.pipelines.actions_based.stage5_stats deps: - generated/actions/stage4_reindexing - - src/pipelines/actions_based/stage5_stats.py + - src outs: - generated/actions/stage5_stats - actions_training: - cmd: python3 -m src.pipelines.actions_based.train + + # Base + actions_base_training: + cmd: python3 -m src.pipelines.actions_based.train_base deps: - generated/actions/stage4_reindexing - generated/actions/stage5_stats - - src/pipelines/actions_based/train.py + - src params: - global.base_model - - actions.training.max_training_time - - actions.training.learning_rate - - actions.training.num_epochs - - actions.training.batch_size - - actions.training.save_step + - global.random_seed + - actions.training_base.max_training_time + - actions.training_base.learning_rate + - actions.training_base.num_epochs + - actions.training_base.batch_size + - actions.training_base.save_step outs: - - checkpoints/actions + - checkpoints/actions_base + + actions_base_testing: + cmd: python3 -m src.pipelines.actions_based.test -a base -d checkpoints/actions_base/ -m "final" -ds generated/actions/stage4_reindexing/ -o generated/actions/test_results_base -s testing_base + deps: + - checkpoints/actions_base + - generated/actions/stage4_reindexing + - src + params: + - actions.testing_base.limit + outs: + - generated/actions/test_results_base + + # Restricted + actions_restricted_training: + cmd: python3 -m src.pipelines.actions_based.train_restricted + deps: + - generated/actions/stage4_reindexing + - generated/actions/stage5_stats + - src + params: + - global.base_model + - global.random_seed + - actions.training_restricted.max_training_time + - actions.training_restricted.learning_rate + - actions.training_restricted.num_epochs + - actions.training_restricted.batch_size + - actions.training_restricted.save_step + outs: + - checkpoints/actions_restricted + + actions_restricted_testing: + cmd: python3 -m src.pipelines.actions_based.test -a restricted -d checkpoints/actions_restricted/ -m "final" -ds generated/actions/stage4_reindexing/ -o generated/actions/test_results_restricted -s testing_restricted + deps: + - checkpoints/actions_restricted + - generated/actions/stage4_reindexing + - src + params: + - actions.testing_restricted.limit + outs: + - generated/actions/test_results_restricted + + # Mixed + actions_mixed_training: + cmd: python3 -m src.pipelines.actions_based.train_mixed + deps: + - generated/actions/stage4_reindexing + - generated/actions/stage5_stats + - src + params: + - global.base_model + - global.random_seed + - actions.training_mixed.embedding_size + - actions.training_mixed.num_heads + - actions.training_mixed.num_layers + - actions.training_mixed.dropout + - actions.training_mixed.feedforward_neurons + - actions.training_mixed.max_training_time + - actions.training_mixed.learning_rate + - actions.training_mixed.num_epochs + - actions.training_mixed.batch_size + - actions.training_mixed.save_step + outs: + - checkpoints/actions_mixed + + actions_mixed_testing: + cmd: python3 -m src.pipelines.actions_based.test -a mixed -d checkpoints/actions_mixed/ -m "final" -ds generated/actions/stage4_reindexing/ -o generated/actions/test_results_mixed -s testing_mixed + deps: + - checkpoints/actions_mixed + - generated/actions/stage4_reindexing + - src + params: + - actions.testing_mixed.limit + outs: + - 
generated/actions/test_results_mixed + + ###################### + # Translation # + ###################### translations_extraction: cmd: python3 -m src.pipelines.translation_based.stage1_extraction deps: @@ -63,6 +147,7 @@ stages: - translations.extraction.num_partitions outs: - generated/translations/stage1_extraction + translations_create_batches: cmd: python3 -m src.pipelines.translation_based.stage2_create_batches deps: @@ -90,6 +175,7 @@ stages: - src/pipelines/translation_based/train.py params: - global.base_model + - global.random_seed - translations.training.max_training_time - translations.training.learning_rate - translations.training.num_epochs diff --git a/entrypoint.sh b/entrypoint.sh index a6e06ed1888303023e935d1a1661dd86858ef546..5608c3832e5e2f2876e6f77ddb63c686b3d9f2fd 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -1,8 +1,24 @@ #!/bin/bash -if ! test -f "./deploy/model"; then - mkdir -p ./deploy - wget https://minio.clarin-pl.eu/public/models/punctuation/0-190000.model -O deploy/model +if ! test -d "./deploy/actions_base"; then + mkdir -p ./deploy/actions_base + wget https://minio.clarin-pl.eu/public/models/punctuation/actions_base/production.model -O deploy/actions_base/production.model + wget https://minio.clarin-pl.eu/public/models/punctuation/actions_base/production.config -O deploy/actions_base/production.config + wget https://minio.clarin-pl.eu/public/models/punctuation/actions_base/production.runtime.yaml -O deploy/actions_base/production.runtime.yaml fi -python worker.py \ No newline at end of file +if ! test -d "./deploy/actions_mixed"; then + mkdir -p ./deploy/actions_mixed + wget https://minio.clarin-pl.eu/public/models/punctuation/actions_mixed/production.model -O deploy/actions_mixed/production.model + wget https://minio.clarin-pl.eu/public/models/punctuation/actions_mixed/production.config -O deploy/actions_mixed/production.config + wget https://minio.clarin-pl.eu/public/models/punctuation/actions_mixed/production.runtime.yaml -O deploy/actions_mixed/production.runtime.yaml +fi + +if ! 
test -d "./deploy/actions_restricted"; then + mkdir -p ./deploy/actions_restricted + wget https://minio.clarin-pl.eu/public/models/punctuation/actions_restricted/production.model -O deploy/actions_restricted/production.model + wget https://minio.clarin-pl.eu/public/models/punctuation/actions_restricted/production.config -O deploy/actions_restricted/production.config + wget https://minio.clarin-pl.eu/public/models/punctuation/actions_restricted/production.runtime.yaml -O deploy/actions_restricted/production.runtime.yaml +fi + +python worker.py diff --git a/params.yaml b/params.yaml index 3b9977d9621ab7d8a87e6d8170baedb08bedc817..fd62fe17e52c5c4c8f416ffd32f3e2a57c1f2a07 100644 --- a/params.yaml +++ b/params.yaml @@ -27,15 +27,58 @@ actions: num_workers: 24 worker_memory_limit: "2GB" - training: + training_base: + learning_rate: 0.0001 + num_epochs: 5 + batch_size: 2 + batch_buffer_size: 100 + save_step: 50 + max_training_time: null + loss_averaging_span: 1000 + fresh_start: false + device: "cuda:0" + + testing_base: + limit: None + batch_size: 1 + device: "cuda:0" + + training_restricted: + learning_rate: 0.0001 + num_epochs: 5 + batch_size: 2 + batch_buffer_size: 100 + save_step: 1000 + max_training_time: null + loss_averaging_span: 1000 + fresh_start: true + device: "cuda:0" + + test_restricted: + limit: None + batch_size: 1 + device: "cuda:0" + + training_mixed: + embedding_size: 768 + num_heads: 12 + num_layers: 6 + dropout: 0.1 + feedforward_neurons: 1000 learning_rate: 0.0001 num_epochs: 5 batch_size: 2 - save_step: 100 + batch_buffer_size: 1000 + save_step: 10000 max_training_time: null loss_averaging_span: 1000 fresh_start: true device: "cuda:0" + + test_mixed: + limit: None + batch_size: 1 + device: "cuda:0" translations: extraction: num_partitions: 2_000 @@ -64,4 +107,4 @@ translations: max_training_time: "4h" loss_averaging_span: 1000 fresh_start: false - device: "cuda:1" \ No newline at end of file + device: "cuda:1" diff --git a/punctuate.py b/punctuate.py index 510eaad122f7452287806f57d6ef82f6bf0518e7..8eb4bdc04369776859a6bf4e9540463ed28c8f51 100755 --- a/punctuate.py +++ b/punctuate.py @@ -1,52 +1,113 @@ -#!/usr/bin/python3 - import argparse -import os -from argparse import Namespace +from src.pipelines.actions_based.utils import max_suppression +from src.pipelines.actions_based.processing import ( + ACTIONS_KEYS, + recover_text, + token_labels_to_word_labels, +) +from src.models.interfaces import ActionsModel +from typing import Dict -from src.pipelines.actions_based.processing import apply_actions_punctuation -from src.pipelines.actions_based.utils import load_model -from src.utils import preprocess +import numpy as np +import torch +from src.models.actions_model_base import ActionsModelBase +from src.models.actions_model_mixed import ActionsModelMixed +from src.models.actions_model_restricted import ActionsModelRestricted +from src.utils import ( + PROJECT_ROOT, + input_preprocess, + output_preprocess, +) +import colored -def get_args() -> Namespace: - parser = argparse.ArgumentParser( - description="Adds punctuaiton in to raw text stream." 
- ) +SUPPORTED_MODELS: Dict[str, ActionsModel] = { + "base": ActionsModelBase, + "restricted": ActionsModelRestricted, + "mixed": ActionsModelMixed, +} + + +def print_highlighted(text: str, word_labels: np.ndarray, action_name: str) -> None: + label_id = np.argwhere(np.array(ACTIONS_KEYS) == action_name) + + text = text.split(" ") + for label, word in zip(word_labels, text): + SPAN = 255 - 232 + + bg_color = int(label[label_id] * (SPAN - 1) + 232) + print(colored.bg(bg_color) + colored.fg(2) + word, end=" ") + print("") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Evaluate actions model") parser.add_argument( - "-i", "--input", type=str, required=True, help="Path to input text file", + "-a", + "--architecture", + required=True, + choices=SUPPORTED_MODELS.keys(), + help="Model architecture", ) parser.add_argument( - "-o", "--output", type=str, required=True, help="Path to input text file", + "-d", + "--directory", + required=True, + help="Directory where trained model is located, relative to project root", ) parser.add_argument( - "-m", "--model", required=True, type=str, help="Path to the pretrained model", + "-i", "--input", required=True, type=str, help="Input text file" ) + parser.add_argument("-m", "--model", default="final", help="Pretrained model name") parser.add_argument( - "-b", "--base", required=True, type=str, help="Name of base model", + "-l", + "--highlight", + type=str, + required=False, + choices=ACTIONS_KEYS + ["none"], + default="none", + help="Highlight prediction confidence of selected action per-word", ) parser.add_argument( - "-c", "--chunk_size", default=500, type=int, help="Maximum chunk size" + "-dv", + "--device", + type=str, + required=False, + default="cpu", + help="Device on which inference will be made", ) - parser.add_argument("-t", "--threshold", default=0.9, type=float, help="Threshold") - - return parser.parse_args() + args = parser.parse_args() + print(f"Loading model {args.model}...") + device = torch.device(args.device) + model_location = f"{PROJECT_ROOT}/{args.directory}" + model_type = SUPPORTED_MODELS[args.architecture] + model = model_type.load(model_location, args.model, device) + model.train(False) -if __name__ == "__main__": - args = get_args() - - if not os.path.exists(args.input): - print(f"Error: File '{args.input}' does not exists") - exit(-1) + print("Loading text...") + with open(args.input, "r") as f: + text = f.read() - tokenizer, model = load_model(args.model, args.base, "cpu") + print("Inferencing...") + tokenizer = model.tokenizer() + data = input_preprocess(output_preprocess(text)) + data_tokenized = tokenizer(data, return_tensors="pt") - with open(args.input, "r") as f: - text = preprocess(f.read()) - text_processed = apply_actions_punctuation( - text, args.chunk_size, tokenizer, model, args.threshold + predictions = ( + model.predict_raw( + data_tokenized["input_ids"].to(device), + data_tokenized["attention_mask"].to(device), ) + .detach() + .cpu() + .numpy() + ) + word_labels = token_labels_to_word_labels(data, predictions[0, 1:-1], tokenizer) + word_labels_suppresed = max_suppression(np.expand_dims(word_labels, axis=0), 0.9)[0] + text_recovered = recover_text(data, word_labels_suppresed) - with open(args.output, "w") as f: - f.write(text_processed) + if args.highlight != "none": + print_highlighted(text_recovered, word_labels, args.highlight) + else: + print(text_recovered) diff --git a/requirements.txt b/requirements.txt index 
4e2d19ab860508eab855751c232b65756f57736d..4a63c402fb8008dddeef0e44772538e6b9fa9e08 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +--index-url https://pypi.clarin-pl.eu/simple/ attrs==19.3.0 bokeh==2.1.1 certifi==2020.6.20 @@ -32,10 +33,8 @@ psutil==5.7.2 py==1.9.0 pyarrow==0.17.1 pycurl==7.43.0 -pygobject==3.20.0 pyparsing==2.4.7 pytest==6.0.1 -python-apt==1.1.0b1+ubuntu0.16.4.9 python-dateutil==2.8.1 pytz==2020.1 PyYAML==5.3.1 @@ -51,12 +50,13 @@ tblib==1.7.0 tokenizers==0.8.1rc1 toml==0.10.1 toolz==0.10.0 -torch==1.6.0 +torch==1.5.1 tornado==6.0.4 tqdm==4.48.2 transformers==3.0.2 typing-extensions==3.7.4.2 -unattended-upgrades==0.1 urllib3==1.25.10 zict==2.0.0 -git+https://gitlab.clarin-pl.eu/nlpworkers/nlp_ws.git@fa5f09a2f1447cac2c411c9d9e3d927ecd815ddc#egg=nlp_ws \ No newline at end of file +scikit-learn==0.23.2 +nlp_ws==0.6 +colored==1.4.2 diff --git a/src/models/TransformerSeq2Seq.py b/src/models/TransformerSeq2Seq.py index 753df1e392729766b9984216d24615f1e83098fa..3009faed34fd97b48a4ffb11a64c1cb0c0bef2ca 100644 --- a/src/models/TransformerSeq2Seq.py +++ b/src/models/TransformerSeq2Seq.py @@ -1,47 +1,7 @@ -import math - import torch import torch.nn as nn - -class PositionalEncoding(nn.Module): - """Adds sinsusoidal positional encoding (as in original "Attention is all you need" paper.) - src: https://pytorch.org/tutorials/beginner/transformer_tutorial.html - - """ - - def __init__(self, d_model: int, max_len: int, dropout=0.1): - """Sinusidal positional encodings - - Args: - d_model (int): Embedding dimension - max_len (int): Maximum length of sequence - dropout (float, optional): Dropout ratio. Defaults to 0.1. - """ - super(PositionalEncoding, self).__init__() - self.dropout = nn.Dropout(p=dropout) - - pe = torch.zeros(max_len, d_model) - position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) - div_term = torch.exp( - torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model) - ) - pe[:, 0::2] = torch.sin(position * div_term) - pe[:, 1::2] = torch.cos(position * div_term) - pe = pe.unsqueeze(0).transpose(0, 1) - self.register_buffer("pe", pe) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Applies positional encoding - - Args: - x (torch.Tensor): Word embeddings tensor - - Returns: - torch.Tensor: Word embeddings with added positional encodings - """ - x = x + self.pe[: x.size(0), :] - return self.dropout(x) +from src.models.common import PositionalEncoding class TransformerSeq2Seq(nn.Module): diff --git a/src/models/actions_model_base.py b/src/models/actions_model_base.py new file mode 100644 index 0000000000000000000000000000000000000000..d503f088d01c8e1b2670b139c077fb66202ac792 --- /dev/null +++ b/src/models/actions_model_base.py @@ -0,0 +1,210 @@ +from __future__ import annotations + +import os +from dataclasses import dataclass + +import numpy as np +import torch +import torch.nn as nn +from torch.nn.modules.loss import BCEWithLogitsLoss +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_bert import BertForTokenClassification +from transformers.tokenization_bert import BertTokenizerFast + +from src.models.interfaces import ActionsModel +from src.pipelines.actions_based.processing import ( + ACTIONS_KEYS, + action_vector, + last_stop_label, + recover_text, + token_labels_to_word_labels, +) +from src.pipelines.actions_based.utils import max_suppression +from src.utils import pickle_read, pickle_save, prepare_folder, yaml_serializable + + +@dataclass +class ActionsModelBaseParams: + 
""" + Parameters for ActionsModelBase initialization + + Args: + base_model (str): Name of base model + num_labels (int): Length of action vector + + """ + + base_model: str + num_labels: int = len(ACTIONS_KEYS) + + +@yaml_serializable +@dataclass +class ActionsModelBaseRuntimeParams: + """ + Parameters for ActionsModelBase during runtime interference + + Args: + threshold (float): minimum confidence for applying action + chunksize (int): Maximum number of chunks to perform inference on + """ + + threshold: float = 0.9 + chunksize: int = 500 + + +class ActionsModelBase(ActionsModel): + """Model based on simple multilabel per-token classifiaction. Each token is binarly classified in n-dimensions""" + + def __init__( + self, + params: ActionsModelBaseParams, + runtime: ActionsModelBaseRuntimeParams = ActionsModelBaseRuntimeParams(), + ) -> None: + """Initializes actions model + + Args: + params (ActionsModelBaseParams): Params defining model's structure + runtime (ActionsModelBaseRuntimeParams): Params defining model's runtime inference + """ + super(ActionsModelBase, self).__init__() + self.params = params + self.runtime = runtime + + self._tokenizer = BertTokenizerFast.from_pretrained(params.base_model) + config = PretrainedConfig.from_pretrained(params.base_model) + config.num_labels = params.num_labels + + self.core = BertForTokenClassification(config) + + def forward( + self, input_ids: torch.Tensor, attention_mask: torch.Tensor + ) -> torch.Tensor: + """Computes logits for uppercasing and adding punctuation to a word + + Args: + input_ids (torch.Tensor): Array of ids of tokens. Shape BxL + attention_mask (torch.Tensor): Mask telling if a token should be masked out (ie. Padding). Shape BxL + + Returns: + torch.Tensor: Predicted actions vector + """ + y_pred = self.core(input_ids=input_ids, attention_mask=attention_mask)[0] + + return y_pred + + def predict_raw( + self, input_ids: torch.Tensor, attention_mask: torch.Tensor + ) -> torch.Tensor: + """Function that maps input_ids tensors into per-token labels + + Args: + input_ids (torch.Tensor): Token ids of input. Shape BxL + attention_mask (torch.Tensor): Attention mask of tokens. Shape BxL + + Returns: + torch.Tensor: Per-token action-vector labels. 
Shape BxLxA + """ + + return self.forward(input_ids, attention_mask=attention_mask).sigmoid() + + def predict(self, text: str) -> str: + text = text.strip() + + tokenizer = self.tokenizer() + tokens = tokenizer(text, return_tensors="pt")["input_ids"] + output = None + + index_start = 0 + while index_start < len(tokens[0]): + index_end = min(index_start + self.runtime.chunksize, len(tokens[0])) + + tokens_chunk = tokens[:, index_start:index_end] + + actions = ( + self.predict_raw(tokens_chunk, torch.ones_like(tokens_chunk)) + .detach() + .cpu() + .numpy() + ) + actions_suppresed = max_suppression(actions, self.runtime.threshold)[0] + + offset = last_stop_label(actions_suppresed, action_vector(["dot"])) + + # Prevent infinite loop + if (offset is None) or (offset == 0): + offset = index_end - index_start + + if output is None: + output = actions[0, 0:offset] + else: + output = np.concatenate([output, actions[0, 0:offset]], axis=0) + + index_start += offset + + assert len(output) == len(tokens[0]) + + word_labels = token_labels_to_word_labels(text, output[1:-1], tokenizer) + actions = max_suppression( + np.expand_dims(word_labels, 0), self.runtime.threshold + )[0] + + return recover_text(text, actions) + + def tokenizer(self) -> BertTokenizerFast: + return self._tokenizer + + def save(self, dir: str, name: str, runtime: bool = True) -> None: + prepare_folder(dir) + torch.save(self.state_dict(), f"{dir}/{name}.model") + pickle_save(self.params, f"{dir}/{name}.config") + + if runtime: + self.runtime.save_yaml(f"{dir}/{name}.runtime.yaml") + + @staticmethod + def load(dir: str, name: str, device: torch.device) -> ActionsModelBase: + params = pickle_read(f"{dir}/{name}.config") + if os.path.exists(f"{dir}/{name}.runtime.yaml"): + runtime = ActionsModelBaseRuntimeParams.load_yaml( + f"{dir}/{name}.runtime.yaml" + ) + else: + runtime = ActionsModelBaseRuntimeParams() + + model = ActionsModelBase(params, runtime).to(device) + model.load_state_dict(torch.load(f"{dir}/{name}.model", map_location=device)) + + return model + + +class ActionsModelBaseLoss(nn.Module): + """Proposed loss for ActionsModelBase model""" + + def __init__(self, prior_inverse_odds: torch.Tensor) -> None: + """Initializes ActionsModelBaseLoss + + Args: + prior_odds (torch.Tensor): Negative to positive ratio of each action vector + entry in dataset. Shape A + """ + super(ActionsModelBaseLoss, self).__init__() + + self.core = BCEWithLogitsLoss(pos_weight=prior_inverse_odds) + + def forward( + self, + predicted_action_vector_logits: torch.Tensor, + true_action_vector: torch.Tensor, + ) -> torch.Tensor: + """Computes ActionsModelBase loss + + Args: + true_action_vector (torch.Tensor): Logits predicted by ActionsModelBase model. Shape BxLxA + predicted_action_vector_logits (torch.Tensor): Target labels. Shape BxLxA + + Returns: + torch.Tensor: Computed loss. 
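+
+        Example (a minimal, illustrative sketch; the shapes and the uniform
+        pos_weight below are assumed, not taken from the training pipeline):
+
+            >>> loss_fn = ActionsModelBaseLoss(torch.ones(len(ACTIONS_KEYS)))
+            >>> logits = torch.zeros(2, 10, len(ACTIONS_KEYS))   # BxLxA logits
+            >>> targets = torch.zeros(2, 10, len(ACTIONS_KEYS))  # BxLxA ground truth
+            >>> loss = loss_fn(logits, targets)                  # scalar tensor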
+ """ + + return self.core(predicted_action_vector_logits, true_action_vector) diff --git a/src/models/actions_model_mixed.py b/src/models/actions_model_mixed.py new file mode 100644 index 0000000000000000000000000000000000000000..e8f9a508cff61bdeac8e0c6a8679293b1a8414ad --- /dev/null +++ b/src/models/actions_model_mixed.py @@ -0,0 +1,302 @@ +import os +from dataclasses import dataclass +from typing import Optional + +import numpy as np +import torch +import torch.nn as nn +from torch.nn.modules.loss import BCEWithLogitsLoss +from transformers.tokenization_bert import BertTokenizerFast + +from src.models.common import PositionalEncoding, generate_square_subsequent_mask +from src.models.interfaces import PunctuationModel +from src.pipelines.actions_based.processing import ( + ACTIONS_KEYS, + action_vector, + recover_text, + token_labels_to_word_labels, +) +from src.utils import pickle_read, pickle_save, prepare_folder, yaml_serializable + + +@dataclass +class ActionsModelMixedParams: + """ + Parameters for initializing ActionsModelMixed + + Params: + base_tokenizer (str): Name of pretrained tokenizer + vocab_size (int): Number of tokens in tokenizer dictionary + embedding_size (int, optional): Shape of word and punctuation embeddings. Defaults to 200. + num_heads (int, optional): Number of heads in multiheaded attention. Defaults to 4. + num_layers (int, optional): Number of both decoded and encoder layers. Defaults to 2. + feedforward_neurons (int, optional): Size of feed-forward neural network at the end of encoder/decoder. Defaults to 200. + num_labels (int, optional): Action-vector size. Defaults to len(ACTIONS_KEYS). + max_len (int, optional): Maxium length of sequence. Defaults to 500. + dropout (float, optional): Dropout ratio. Defaults to 0.1. 
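+        threshold (float, optional): Confidence threshold stored with the
+            architecture parameters; inference reads the value from
+            ActionsModelMixedRuntimeParams instead. Defaults to 0.9.
+
+        Example (illustrative only; the tokenizer name appears elsewhere in this
+        repo, the vocabulary size is an assumed placeholder):
+
+            >>> params = ActionsModelMixedParams(
+            ...     base_tokenizer="dkleczek/bert-base-polish-cased-v1",
+            ...     vocab_size=60000,
+            ... )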
+ """ + + base_tokenizer: str + vocab_size: int + threshold: float = 0.9 + embedding_size: int = 200 + num_heads: int = 4 + num_layers: int = 2 + feedforward_neurons: int = 200 + num_labels: int = len(ACTIONS_KEYS) + max_len: int = 500 + dropout: float = 0.1 + + +@yaml_serializable +@dataclass +class ActionsModelMixedRuntimeParams: + """ + Parameters for ActionsModelMixed during runtime interference + + Args: + threshold (float): minimum confidence for applying action + chunksize (int): Maximum number of chunks to perform inference on + """ + + threshold: float = 0.9 + max_cond_len: Optional[int] = 500 + + +class ActionsModelMixed(PunctuationModel): + """Encoder-decoder based model with unpunctuated token sequence as input and array of action-vectors as output""" + + def __init__( + self, + params: ActionsModelMixedParams, + runtime: ActionsModelMixedRuntimeParams = ActionsModelMixedRuntimeParams(), + ) -> None: + """Initializes mixed model + + Args: + params (ActionsModelMixedParams): Parameters for model + """ + super(ActionsModelMixed, self).__init__() + + self.params = params + self.runtime = runtime + self._tokenizer = None + + self.num_labels = params.num_labels + self.device = "cpu" + + # Word embedder + self.word_embedding = nn.Embedding(params.vocab_size, params.embedding_size) + self.punctuation_embedding = nn.Linear(params.num_labels, params.embedding_size) + + # Add positional encoding + self.words_position_embedding = PositionalEncoding( + params.embedding_size, params.max_len, params.dropout + ) + self.punctuation_position_embedding = PositionalEncoding( + params.embedding_size, params.max_len, params.dropout + ) + + # Sentence encoder + sentence_encoder_layer = nn.TransformerEncoderLayer( + params.embedding_size, + params.num_heads, + params.feedforward_neurons, + params.dropout, + ) + self.sentence_encoder = nn.TransformerEncoder( + sentence_encoder_layer, num_layers=params.num_layers + ) + + # Punctuation decoder + punctuation_decoder_layer = nn.TransformerDecoderLayer( + params.embedding_size, + params.num_heads, + params.feedforward_neurons, + params.dropout, + ) + self.punctuation_decoder = nn.TransformerDecoder( + punctuation_decoder_layer, num_layers=params.num_layers + ) + + self.to_labels = nn.Linear(params.embedding_size, params.num_labels) + + def forward( + self, + input_ids: torch.Tensor, + actions: torch.Tensor, + attention_mask: torch.Tensor, + ) -> torch.Tensor: + """Computes action vectors array from array of tokens + + Args: + input_ids (torch.Tensor): Tokens representing unpuctuated text. Shape BxL + actions (torch.Tensor): Actions vector predicted up-till now. Shape BxL-1xA + attention_mask (torch.Tensor): Mask representing if token is padding (True) or Not. Shape BxL + + Returns: + torch.Tensor: Predicted actions shifted one to the left. 
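These are raw logits; apply sigmoid to obtain per-action probabilities.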
Shape BxL-1xA + """ + + # Input to encoder + x = input_ids.transpose(0, 1) + x = self.word_embedding(x) + x = self.words_position_embedding(x) + + # Input to decoder + y = actions.transpose(0, 1) + y = self.punctuation_embedding(y) + y = self.punctuation_position_embedding(y) + + tgt_mask = generate_square_subsequent_mask(y.shape[0]).to(y.device) + + sentence_encoded = self.sentence_encoder(x, src_key_padding_mask=attention_mask) + + actions_decoded = self.punctuation_decoder( + y, sentence_encoded, tgt_mask=tgt_mask + ) + + z = actions_decoded.transpose(1, 0) + + return self.to_labels(z) + + def to(self, device): + self.device = device + + super(ActionsModelMixed, self).to(device) + + def tokenizer(self) -> BertTokenizerFast: + if self._tokenizer is None: + self._tokenizer = BertTokenizerFast.from_pretrained( + self.params.base_tokenizer + ) + return self._tokenizer + + def predict(self, text: str) -> str: + inputs = [action_vector(["upper_case"])] + + tokenizer = self.tokenizer() + text_tokenized = tokenizer(text, return_tensors="pt") + + target_device = self.device + + max_cond_len = self.runtime.max_cond_len + if max_cond_len is None: + max_cond_len = np.iinfo(np.int).max + + for _ in range(text_tokenized["input_ids"].shape[1] - 2): + input_start = max(0, len(inputs) - max_cond_len) + + prediction_raw = self.forward( + text_tokenized["input_ids"][:, input_start:].to(target_device), + torch.tensor(inputs[input_start:], dtype=torch.float) + .reshape(1, -1, self.num_labels) + .to(target_device), + (text_tokenized["attention_mask"][:, input_start:] == 0).to( + target_device + ), + ).sigmoid() + + inputs.append( + ( + prediction_raw.detach().cpu().numpy()[0, -1, :] + > self.runtime.threshold + ).astype(np.float) + ) + + word_labels = token_labels_to_word_labels(text, inputs[1:], tokenizer) + + prediction_binary = word_labels.astype(np.int) + + return recover_text(text, prediction_binary) + + def predict_raw( + self, input_ids: torch.Tensor, attention_mask: torch.Tensor + ) -> torch.Tensor: + """Function that maps input_ids tensors into per-token labels + + Args: + input_ids (torch.Tensor): Token ids of input. Shape BxL + attention_mask (torch.Tensor): Attention mask of tokens. Shape BxL + + Returns: + torch.Tensor: Per-token action-vector labels. 
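Values are binarized against the runtime confidence threshold.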
Shape BxLxA + """ + outputs = torch.tensor(action_vector(["upper_case"]), dtype=torch.float).to( + input_ids.device + ) + outputs = outputs.unsqueeze(0).unsqueeze(0).repeat(input_ids.shape[0], 1, 1) + + for _ in range(input_ids.shape[1] - 1): + prediction_raw = self.forward( + input_ids, outputs, (attention_mask == 0) + ).sigmoid() + + prediction_raw = (prediction_raw[:, -1:, :] > self.runtime.threshold).type( + torch.float + ) + outputs = torch.cat([outputs, prediction_raw], dim=1) + + return outputs + + def save(self, dir: str, name: str, runtime: bool = True) -> None: + prepare_folder(dir) + torch.save(self.state_dict(), f"{dir}/{name}.model") + pickle_save(self.params, f"{dir}/{name}.config") + + if runtime: + self.runtime.save_yaml(f"{dir}/{name}.runtime.yaml") + + @staticmethod + def load(dir: str, name: str, device: torch.device) -> PunctuationModel: + params = pickle_read(f"{dir}/{name}.config") + if os.path.exists(f"{dir}/{name}.runtime.yaml"): + runtime = ActionsModelMixedRuntimeParams.load_yaml( + f"{dir}/{name}.runtime.yaml" + ) + else: + runtime = ActionsModelMixedRuntimeParams() + + model = ActionsModelMixed(params, runtime) + model.to(device) + + model.load_state_dict(torch.load(f"{dir}/{name}.model", map_location=device)) + + return model + + +class ActionsModelMixedLoss(nn.Module): + """Class representing proposed loss for training mixed actions model""" + + def __init__(self, prior_odds: torch.Tensor) -> None: + """Initializes ActionsModelMixedLoss + + Args: + prior_odds (torch.Tensor): Odds representing ratio of positive to negative examples for each label in action vector. Shape A + """ + super(ActionsModelMixedLoss, self).__init__() + + self.core = BCEWithLogitsLoss(pos_weight=prior_odds) + + def forward( + self, + true_action_vector: torch.Tensor, + predicted_action_vector_logits: torch.Tensor, + ) -> torch.Tensor: + """Computes loss for training mixed actions model + + Args: + true_action_vector (torch.Tensor): Action vector that should be + predicted by ActionsModelMixed (shifted by 1 to the left in + regards to inputs). Shape BxL-1xA + + predicted_action_vector_logits (torch.Tensor): Action vector that + was acttualy predicted by ActionsModelMixed (shifted by 1 to + the left in regards to inputs). 
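These are raw logits (BCEWithLogitsLoss applies the sigmoid internally).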
Shape BxL-1xA + + + Returns: + torch.Tensor: Loss of predition in relation to ground truth + """ + + return self.core(predicted_action_vector_logits, true_action_vector) diff --git a/src/models/actions_model_restricted.py b/src/models/actions_model_restricted.py new file mode 100644 index 0000000000000000000000000000000000000000..9239e667baa93c1ff91a954aa019775768ec4301 --- /dev/null +++ b/src/models/actions_model_restricted.py @@ -0,0 +1,266 @@ +from __future__ import annotations + +import os +from dataclasses import dataclass + +import numpy as np +import torch +import torch.nn as nn +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_bert import BertForTokenClassification +from transformers.tokenization_bert import BertTokenizerFast + +from src.models.actions_model_mixed import ActionsModelMixed +from src.models.interfaces import ActionsModel, PunctuationModel +from src.pipelines.actions_based.processing import ( + action_vector, + last_stop_label, + recover_text, + token_labels_to_word_labels, +) +from src.pipelines.actions_based.utils import max_suppression +from src.utils import pickle_read, pickle_save, prepare_folder, yaml_serializable + + +@dataclass +class ActionsModelRestrictedParams: + """ + Parameters for ActionsModelRestricted + + Params: + base_model (str): Name of base model + extended_action_vector_size (int): Action-vector size including additional no-punctuation logit + """ + + base_model: str + extended_action_vector_size: int + + +@yaml_serializable +@dataclass +class ActionsModelRestrictedRuntimeParams: + """ + Parameters for ActionsModelBase during runtime interference + + Args: + threshold (float): minimum confidence for applying action + chunksize (int): Maximum number of chunks to perform inference on + """ + + threshold: float = 0.9 + chunksize: int = 500 + + +class ActionsModelRestricted(ActionsModel): + """Similar to ActionsModelBase, however no-punctuation class is added + and punctuation-related entries are treaded as proper categorical distribution + """ + + def __init__( + self, + params: ActionsModelRestrictedParams, + runtime: ActionsModelRestrictedRuntimeParams = ActionsModelRestrictedRuntimeParams(), + ) -> None: + """Initializes restricted actions model + + Args: + base_model (str): Name of base model + extended_action_vector_size (int): Action-vector size including additional no-punctuation logit + """ + super(ActionsModelRestricted, self).__init__() + + self.params = params + self.runtime = runtime + self._tokenizer = None + + config = PretrainedConfig.from_pretrained(params.base_model) + + config.num_labels = params.extended_action_vector_size + + self.core = BertForTokenClassification(config) + + def forward( + self, input_ids: torch.Tensor, attention_mask: torch.Tensor + ) -> torch.Tensor: + """Computes logits for uppercasing and adding punctuation to a word + + Args: + input_ids (torch.Tensor): Array of ids of tokens. Shape BxL + attention_mask (torch.Tensor): Mask telling if a token should be masked out (ie. Padding). Shape BxL + + Returns: + torch.Tensor: Logit for making each word uppercase and for adding a punctuation mark to each word. Shape BxL + """ + y_pred = self.core(input_ids=input_ids, attention_mask=attention_mask)[0] + + return y_pred + + def predict_raw( + self, input_ids: torch.Tensor, attention_mask: torch.Tensor + ) -> torch.Tensor: + """Function that maps input_ids tensors into per-token labels + + Args: + input_ids (torch.Tensor): Token ids of input. 
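Produced by the model's BERT tokenizer.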
Shape BxL + attention_mask (torch.Tensor): Attention mask of tokens. Shape BxL + + Returns: + torch.Tensor: Per-token action-vector labels. Shape BxLxA + """ + + logits = self.forward(input_ids, attention_mask=attention_mask) + prob_uppercase = logits[:, :, :1].sigmoid() + prob_punctuation = logits[:, :, 1:].softmax(dim=-1) + + no_punctuation = prob_punctuation.argmax(-1) == ( + self.params.extended_action_vector_size - 2 + ) + no_punctuation = ( + no_punctuation.type(torch.float) + .unsqueeze(-1) + .repeat(1, 1, prob_punctuation.shape[-1] - 1) + ) + + prob_punctuation = prob_punctuation[:, :, :-1].softmax(-1) * ( + 1 - no_punctuation + ) + + return torch.cat([prob_uppercase, prob_punctuation], dim=-1) + + def predict(self, text: str) -> str: + chunk_size = self.runtime.chunksize + threshold = self.runtime.threshold + + text = text.strip() + + tokenizer = self.tokenizer() + tokens = tokenizer(text, return_tensors="pt")["input_ids"] + output = None + + index_start = 0 + while index_start < len(tokens[0]): + index_end = min(index_start + chunk_size, len(tokens[0])) + + tokens_chunk = tokens[:, index_start:index_end] + + actions = ( + self.predict_raw(tokens_chunk, torch.ones_like(tokens_chunk)) + .detach() + .cpu() + .numpy() + ) + actions_suppresed = max_suppression(actions, threshold)[0] + + offset = last_stop_label(actions_suppresed, action_vector(["dot"])) + + # Prevent infinite loop + if (offset is None) or (offset == 0): + offset = index_end - index_start + + if output is None: + output = actions[0, 0:offset] + else: + output = np.concatenate([output, actions[0, 0:offset]], axis=0) + + index_start += offset + + assert len(output) == len(tokens[0]) + + word_labels = token_labels_to_word_labels(text, output[1:-1], tokenizer) + actions = max_suppression(np.expand_dims(word_labels, 0), threshold)[0] + + return recover_text(text, actions) + + @staticmethod + def _logit(x: torch.Tensor): + EPS = 1e-5 + + z = torch.clamp(x, EPS, 1.0 - EPS) + + return torch.log(z / (1 - z)) + + def tokenizer(self) -> BertTokenizerFast: + if self._tokenizer is None: + self._tokenizer = BertTokenizerFast.from_pretrained(self.params.base_model) + return self._tokenizer + + def save(self, dir: str, name: str, runtime: bool = True) -> None: + prepare_folder(dir) + torch.save(self.state_dict(), f"{dir}/{name}.model") + pickle_save(self.params, f"{dir}/{name}.config") + + if runtime: + self.runtime.save_yaml(f"{dir}/{name}.runtime.yaml") + + @staticmethod + def load(dir: str, name: str, device: torch.device) -> ActionsModelRestricted: + params = pickle_read(f"{dir}/{name}.config") + if os.path.exists(f"{dir}/{name}.runtime.yaml"): + runtime = ActionsModelRestrictedRuntimeParams.load_yaml( + f"{dir}/{name}.runtime.yaml" + ) + else: + runtime = ActionsModelRestrictedRuntimeParams() + + model = ActionsModelRestricted(params, runtime).to(device) + model.load_state_dict(torch.load(f"{dir}/{name}.model", map_location=device,)) + + return model + + +class ActionsModelRestrictedLoss(nn.Module): + def __init__( + self, prior_uppercase_odds: torch.Tensor, punctuation_weights: torch.Tensor + ) -> None: + """Initializes ActionsModelRestrictedLoss + + Args: + prior_uppercase_odds (torch.Tensor): Odds od positive to negative cases of uppercase in dataset + punctuation_weights (torch.Tensor): Weights for each class in loss function. 
Should be inversly proportional to number of + their occurances in dataset (Shape A+1) + """ + super(ActionsModelRestrictedLoss, self).__init__() + + self.binary_ce = nn.BCEWithLogitsLoss( + pos_weight=prior_uppercase_odds.reshape(1) + ) + self.cat_ce = nn.CrossEntropyLoss(punctuation_weights) + + def forward( + self, + predicted_action_vector_logits: torch.Tensor, + true_extended_action_vector: torch.Tensor, + ) -> torch.Tensor: + """Loss for ActionsModelRestricted model + + Args: + true_extended_action_vector (torch.Tensor): Ground-truth action vectors. Shape BxLxA+1 + predicted_action_vector_logits (torch.Tensor): Action vector-s logits predicted by ActionsModelRestricted model. Shape BxLxA+1 + + Returns: + torch.Tensor: Loss value + """ + + predicted_punc = predicted_action_vector_logits[:, :, 1:].transpose(1, 2) + target_punc_index = torch.argmax(true_extended_action_vector[:, :, 1:], dim=-1) + punc_loss = self.cat_ce(predicted_punc, target_punc_index) + + predicted_uppercase = predicted_action_vector_logits[:, :, 0] + target_uppercase = true_extended_action_vector[:, :, 0] + uppercase_loss = self.binary_ce(predicted_uppercase, target_uppercase) + + return punc_loss + uppercase_loss + + def save(self, dir: str, name: str) -> None: + prepare_folder(dir) + torch.save(self.state_dict(), f"{dir}/{name}.model") + pickle_save(self.params, f"{dir}/{name}.config") + + @staticmethod + def load(dir: str, name: str, device: torch.device) -> PunctuationModel: + params = pickle_read(f"{dir}/{name}.config") + model = ActionsModelMixed(params) + + model.load_state_dict(torch.load(f"{dir}/{name}.model", map_location=device,)) + + return model diff --git a/src/models/common.py b/src/models/common.py new file mode 100644 index 0000000000000000000000000000000000000000..012999f2f7f3461eeae51a10ce8c3f7fcdf5a69f --- /dev/null +++ b/src/models/common.py @@ -0,0 +1,60 @@ +import math + +import torch +import torch.nn as nn + + +def generate_square_subsequent_mask(sz): + r""" + Generate a square mask for the sequence. The masked positions are filled with float('-inf'). + Unmasked positions are filled with float(0.0). + + Source: torch Transformer class + """ + mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) + mask = ( + mask.float() + .masked_fill(mask == 0, float("-inf")) + .masked_fill(mask == 1, float(0.0)) + ) + return mask + + +class PositionalEncoding(nn.Module): + """Adds sinsusoidal positional encoding (as in original "Attention is all you need" paper.) + src: https://pytorch.org/tutorials/beginner/transformer_tutorial.html + + """ + + def __init__(self, d_model: int, max_len: int, dropout=0.1): + """Sinusidal positional encodings + + Args: + d_model (int): Embedding dimension + max_len (int): Maximum length of sequence + dropout (float, optional): Dropout ratio. Defaults to 0.1. 
+ """ + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer("pe", pe) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Applies positional encoding + + Args: + x (torch.Tensor): Word embeddings tensor + + Returns: + torch.Tensor: Word embeddings with added positional encodings + """ + x = x + self.pe[: x.size(0), :] + return self.dropout(x) diff --git a/src/models/interfaces.py b/src/models/interfaces.py new file mode 100644 index 0000000000000000000000000000000000000000..46271456f012fb5c862850771f8329957044456f --- /dev/null +++ b/src/models/interfaces.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +import torch +import torch.nn as nn +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + + +class PunctuationModel(nn.Module, ABC): + def __init__(self) -> None: + super().__init__() + + @abstractmethod + def tokenizer(self) -> PreTrainedTokenizerFast: + pass + + @abstractmethod + def save(self, dir: str, name: str, runtime: bool = False) -> None: + pass + + @staticmethod + @abstractmethod + def load(dir: str, name: str, device: torch.device) -> PunctuationModel: + pass + + +class ActionsModel(PunctuationModel): + def __init__(self) -> None: + super().__init__() + + @abstractmethod + def predict_raw( + self, input_ids: torch.Tensor, attention_mask: torch.Tensor + ) -> torch.Tensor: + """Function that maps input_ids tensors into per-token labels + + Args: + input_ids (torch.Tensor): Token ids of input. Shape BxL + attention_mask (torch.Tensor): Attention mask of tokens. Shape BxL + + Returns: + torch.Tensor: Per-token action-vector labels. 
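Values lie in [0, 1]; whether they are probabilities or thresholded scores depends on the concrete implementation.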
Shape BxLxA + """ + pass + + @abstractmethod + def predict(self, text: str) -> str: + pass diff --git a/src/models/model_factory.py b/src/models/model_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..3d4abcc4af843f1781aeba981fb6f16be2843565 --- /dev/null +++ b/src/models/model_factory.py @@ -0,0 +1,10 @@ +from src.models.actions_model_mixed import ActionsModelMixed +from src.models.actions_model_restricted import ActionsModelRestricted +from src.models.actions_model_base import ActionsModelBase + + +MODELS_MAP = { + "actions_base": ActionsModelBase, + "actions_restricted": ActionsModelRestricted, + "actions_mixed": ActionsModelMixed, +} diff --git a/src/pipelines/actions_based/processing.py b/src/pipelines/actions_based/processing.py index 44cc2a54f1f48425c7c4649e78ab0bfa981ac23e..4f19a1a55f49120521ce7285675822116f7ca981 100644 --- a/src/pipelines/actions_based/processing.py +++ b/src/pipelines/actions_based/processing.py @@ -6,11 +6,11 @@ import numpy as np from transformers import BertTokenizerFast from transformers.tokenization_utils_fast import PreTrainedTokenizerFast -from src.utils import remove_punctuation +from src.utils import input_preprocess, output_preprocess -ACTIONS_KEYS = ["dot", "upper_case", "colon", "question_mark"] -UPPERCASE_INDEX = 1 -PUNCTUATION_INDEXES = [0, 2, 3] +ACTIONS_KEYS = ["upper_case", "dot", "colon", "question_mark"] +UPPERCASE_INDEX = 0 +PUNCTUATION_INDEXES = [1, 2, 3] def apply_file_processing(x: dict) -> dict: @@ -107,8 +107,8 @@ def action_vector(actions: List[str]) -> np.ndarray: """ return encode_actions( { - "dot": "dot" in actions, "upper_case": "upper_case" in actions, + "dot": "dot" in actions, "colon": "colon" in actions, "question_mark": "question_mark" in actions, } @@ -128,6 +128,8 @@ def last_stop_label(labels: np.array, stop_action: np.array) -> Optional[int]: assert len(labels.shape) == 2 assert len(stop_action.shape) == 1 + assert stop_action.shape[0] == labels.shape[-1] + stop_labels = np.argwhere(np.all(labels == stop_action, axis=1)) if len(stop_labels) == 0: @@ -206,8 +208,8 @@ def detect_actions(word: str, next_word: Optional[str]) -> Mapping[str, bool]: return dict(zip(ACTIONS_KEYS, [False] * len(ACTIONS_KEYS))) actions = { - "dot": word[-1] == ".", "upper_case": word[0].isupper(), + "dot": word[-1] == ".", "colon": word[-1] == ",", "question_mark": word[-1] == "?", } @@ -251,7 +253,7 @@ def create_model_input_output(text: str) -> Tuple[str, np.ndarray]: text_cleaned (str): Text without any interpuction and all lowercase actions (np.ndarray): To dimensional array, where each row is aciton vector for each word (columns) """ - words = text.split(" ") + words = output_preprocess(text).split(" ") words_output = [] actions_output = [] @@ -261,7 +263,7 @@ def create_model_input_output(text: str) -> Tuple[str, np.ndarray]: word = words[i] next_word = words[i + 1] if len(words) > i + 1 else None - word_sanitized = remove_punctuation(word).lower() + word_sanitized = input_preprocess(word) if len(word_sanitized) > 0: actions = detect_actions(word, next_word) actions_encoded = encode_actions(actions) @@ -289,8 +291,6 @@ def token_word_mapping(text: str, tokenizer: PreTrainedTokenizerFast) -> np.ndar text_tokenized = tokenizer(text, return_offsets_mapping=True) offset_mappings = text_tokenized["offset_mapping"][1:-1] - offset_mappings = text_tokenized["offset_mapping"][1:-1] - # Create a map where each character is assigned index of it's word words_mapping = [] actual_word = 0 diff --git 
a/src/pipelines/actions_based/scoring.py b/src/pipelines/actions_based/scoring.py new file mode 100644 index 0000000000000000000000000000000000000000..66a267f72ffceab792c2e8997bee79aec535c325 --- /dev/null +++ b/src/pipelines/actions_based/scoring.py @@ -0,0 +1,122 @@ +from typing import List, Optional, Tuple + +import numpy as np +from sklearn.metrics import auc, f1_score, roc_curve + +from src.pipelines.actions_based.processing import ACTIONS_KEYS +from src.utils import prepare_folder + + +class Metrics: + """Class for model metrics calculation and presentation""" + + def __init__(self, name: str, output_dir: Optional[str]) -> None: + """Initializes Metrics + + Args: + name (str): Name of the model that is measured + output_dir (Optional[str]): Directory where measurements will be saved. Can be None if saving is not required + """ + self.name = name + self.message = "" + self.output_dir = output_dir + + def compute_metrics(self, predictions: np.ndarray, targets: np.ndarray): + """Performs metrics calculation on model predictions relative to ground truth + + Args: + predictions (np.ndarray): Predicted, non-thresholded values + targets (np.ndarray): Ground truth values + """ + f1_scores = self._f1_scores(predictions, targets) + + self._log_text(f"Model {self.name} | F1 scores") + self._log_text("----------------------") + self._log_text(f1_scores) + self._log_text("----------------------") + + self._output_message() + + def _f1_scores(self, predictions: np.ndarray, targets: np.ndarray) -> dict: + predictions = predictions_threshold(predictions, 0.0) + f1_scores = f1_score(predictions, targets, average=None) + + return dict(zip(ACTIONS_KEYS, f1_scores)) + + def _output_message(self): + print(self.message) + + if self.output_dir is not None: + prepare_folder(self.output_dir) + + with open(f"{self.output_dir}/{self.name}.txt", "w") as f: + f.write(self.message) + + def _log_text(self, text: str): + self.message += f"{text}\n" + + +def predictions_threshold( + predictions: np.ndarray, threshold: float = 0.9 +) -> np.ndarray: + """Applies thresholding above which all values will be assigned 1.0, otherwise 0.0 + + Args: + predictions (np.ndarray): Unthresholded predictions + threshold (float, optional): Threshold. Defaults to 0.9. + + Returns: + np.ndarray: Binarized predictions + """ + return (predictions > threshold).astype(np.float) + + +def multiclass_roc_curve( + target: np.ndarray, predictions: np.ndarray +) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: + """Computes ROC-curve points for the multiclass/multilabel case + + Args: + target (np.ndarray): Ground-truth values + predictions (np.ndarray): Unthresholded predictions + + Returns: + Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: False positive rates, true positive rates, thresholds. 
All + values are returned as lists, where each entry in the list corresponds to the values for a single class + """ + class_fprs = [] + class_tprs = [] + class_thresholds = [] + + for index in range(predictions.shape[-1]): + fpr, tpr, thresholds = roc_curve(target[:, index], predictions[:, index]) + + class_fprs.append(fpr) + class_tprs.append(tpr) + class_thresholds.append(thresholds) + + return class_fprs, class_tprs, class_thresholds + + +def multiclass_auc( + false_positive_rate: List[np.ndarray], true_positive_rate: List[np.ndarray] +) -> np.ndarray: + """Computes area under curve for each class in the multilabel/multiclass case + + Args: + false_positive_rate (List[np.ndarray]): False positive rates, where each entry in the list corresponds to the values for a single class + true_positive_rate (List[np.ndarray]): True positive rates, where each entry in the list corresponds to the values for a single class + + Returns: + np.ndarray: List of auc values for each class + """ + + assert len(false_positive_rate) == len(true_positive_rate) + + num_classes = len(false_positive_rate) + auc_list = np.zeros(num_classes) + + for i in range(num_classes): + auc_list[i] = auc(false_positive_rate[i], true_positive_rate[i]) + + return auc_list diff --git a/src/pipelines/actions_based/test.py b/src/pipelines/actions_based/test.py new file mode 100644 index 0000000000000000000000000000000000000000..8cfcda93c2e04f18290269c4ae137418670e6108 --- /dev/null +++ b/src/pipelines/actions_based/test.py @@ -0,0 +1,114 @@ +import argparse + +import dask.dataframe as dd +import numpy as np +import torch +from tqdm import trange + +from src.batch_loading import get_ordered_dataframe_len +from src.models.actions_model_base import ActionsModelBase +from src.models.actions_model_mixed import ActionsModelMixed +from src.models.actions_model_restricted import ActionsModelRestricted +from src.pipelines.actions_based.scoring import Metrics +from src.utils import PROJECT_ROOT, get_config, unflattened_column + +SUPPORTED_MODELS = { + "base": ActionsModelBase, + "restricted": ActionsModelRestricted, + "mixed": ActionsModelMixed, +} + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Evaluate actions model") + parser.add_argument( + "-a", + "--architecture", + required=True, + choices=SUPPORTED_MODELS.keys(), + help="Model architecture", + ) + parser.add_argument( + "-d", + "--directory", + required=True, + help="Directory where trained model is located, relative to project root", + ) + parser.add_argument("-m", "--model", default="final", help="Pretrained model name") + parser.add_argument( + "-ds", + "--dataset", + type=str, + required=True, + help="Directory where test dataset is located, relative to project root", + ) + parser.add_argument( + "-o", + "--output", + type=str, + required=True, + help="Directory where output will be stored", + ) + parser.add_argument( + "-s", "--stage", type=str, required=True, help="Stage name in params.yaml" + ) + args = parser.parse_args() + + config = get_config() + limit = config["actions"][args.stage]["limit"] + batch_size = config["actions"][args.stage]["batch_size"] + device_name = config["actions"][args.stage]["device"] + + test_dataset = f"{PROJECT_ROOT}/{args.dataset}" + + print("Getting dataset info...") + df = dd.read_parquet(test_dataset, engine="pyarrow") + + print("Loading dataset to memory...") + df_len = get_ordered_dataframe_len(df) + + data_start = max(df_len - limit, 0) + data_end = df_len + pdf = df.loc[data_start:data_end].compute().reset_index() + + device = 
torch.device(device_name) + + print(f"Loading model {args.model}") + model_location = f"{PROJECT_ROOT}/{args.directory}" + model_type = SUPPORTED_MODELS[args.architecture] + model = model_type.load(model_location, args.model, device) + + true_batches = [] + prediction_batches = [] + + print("Computing...") + num_batches = len(pdf) // batch_size + for batch in trange(num_batches): + + batch_start = batch * batch_size + batch_end = (batch + 1) * batch_size + batch_pdf = pdf.iloc[batch_start:batch_end] + + inputs = unflattened_column(batch_pdf, "source") + outputs = unflattened_column(batch_pdf, "target") + attentions_mask = unflattened_column(batch_pdf, "attention_mask") + + inputs = torch.tensor(inputs, dtype=torch.long).squeeze(dim=-1).to(device) + outputs = torch.tensor(outputs, dtype=torch.float).to(device) + attentions_mask = torch.tensor(attentions_mask).to(device) + + prediction_batch = ( + model.predict_raw(inputs, attentions_mask).detach().cpu().numpy() + ) + prediction_batches.append(prediction_batch) + + true_batches.append(outputs.cpu().numpy()) + + predictions = np.concatenate(prediction_batches, axis=0).reshape( + -1, prediction_batches[0].shape[-1] + ) + trues = np.concatenate(true_batches, axis=0).reshape(-1, true_batches[0].shape[-1]) + + metrics = Metrics("actions-base", args.output) + + print("Calculating metrics...") + metrics.compute_metrics(predictions, trues) diff --git a/src/pipelines/actions_based/train.py b/src/pipelines/actions_based/train.py deleted file mode 100755 index e6ed38e621e363d77408da543109f71eb419dc25..0000000000000000000000000000000000000000 --- a/src/pipelines/actions_based/train.py +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/python3 - -import glob -import pickle -from datetime import datetime - -import dask.dataframe as dd -import numpy as np -import torch -from torch.nn import BCEWithLogitsLoss -from transformers import BertForTokenClassification, BertTokenizerFast - -from src.batch_loading import get_batches -from src.pipelines.actions_based.processing import ACTIONS_KEYS -from src.training import latest_model, save_training_step -from src.utils import PROJECT_ROOT, convert_to_timedelta, get_config, prepare_folder - -INPUT_PATH = f"{PROJECT_ROOT}/generated/actions/stage4_reindexing" -INPUT_STATS_PATH = f"{PROJECT_ROOT}/generated/actions/stage5_stats" -OUTPUT_PATH = f"{PROJECT_ROOT}/checkpoints/actions" - -if __name__ == "__main__": - config = get_config() - learning_rate = config["actions"]["training"]["learning_rate"] - num_epochs = config["actions"]["training"]["num_epochs"] - batch_size = config["actions"]["training"]["batch_size"] - save_step = config["actions"]["training"]["save_step"] - loss_averaging_span = config["actions"]["training"]["loss_averaging_span"] - fresh_start = config["actions"]["training"]["fresh_start"] - device_name = config["actions"]["training"]["device"] - max_train_time = config["actions"]["training"]["max_training_time"] - base_model = config["global"]["base_model"] - seed = config["global"]["random_seed"] - - prepare_folder(OUTPUT_PATH) - np.random.seed(seed=seed) - - if max_train_time is not None: - max_train_time = convert_to_timedelta(max_train_time) - - device = torch.device(device_name if torch.cuda.is_available() else "cpu") - print(f"Training on {device}") - - # Load loss weights - with open(f"{INPUT_STATS_PATH}/stats.pickle", "rb") as f: - stats = pickle.load(f) - pos_examples = stats["class_number"] - neg_examples = stats["num_examples"] - stats["class_number"] - pos_weight = torch.tensor(neg_examples / 
pos_examples) - - df = dd.read_parquet(INPUT_PATH, engine="pyarrow") - tokenizer = BertTokenizerFast.from_pretrained(base_model) - - model = BertForTokenClassification.from_pretrained( - base_model, num_labels=len(ACTIONS_KEYS) - ).to(device) - criterion = BCEWithLogitsLoss(pos_weight=pos_weight).to(device) - optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) - - epoch_start = 0 - sample_start = 0 - if fresh_start is False: - checkpoint_files = glob.glob(f"{OUTPUT_PATH}/*.model") - latest = latest_model(checkpoint_files) - - if latest is not None: - epoch, batch = latest - model.load_state_dict( - torch.load(f"{OUTPUT_PATH}/{epoch}-{batch}.model", map_location=device,) - ) - optimizer.load_state_dict( - torch.load( - f"{OUTPUT_PATH}/{epoch}-{batch}.optimizer", map_location=device, - ) - ) - - epoch_start, sample_start = epoch, batch - print(f"Loaded {epoch}-{batch}") - - model.train() - model.base_model.train() - losses = [] - - num_samples = df.tail(1).index.values[0] + 1 - random_index_shuffle = np.random.permutation(range(num_samples)) - - training_stopped = False - - time_max = datetime.max - if max_train_time is not None: - time_max = datetime.now() + max_train_time - - for epoch in range(epoch_start, num_epochs): - if training_stopped: - break - - i = sample_start - for data_batch in get_batches(df, batch_size, 100, random_index_shuffle, i): - inputs = data_batch.apply( - lambda x: x["source"].reshape(x["source_shape"]), axis=1 - ).values - outputs = data_batch.apply( - lambda x: x["target"].reshape(x["target_shape"]), axis=1 - ).values - attentions_mask = data_batch.apply( - lambda x: x["attention_mask"].reshape(x["attention_mask_shape"]), - axis=1, - ).values - - inputs = torch.tensor(np.stack(inputs).squeeze()).to(device) - outputs = torch.tensor(np.stack(outputs)).to(device) - attentions_mask = torch.tensor(np.stack(attentions_mask)).to(device) - - y_pred = model(input_ids=inputs, attention_mask=attentions_mask)[0] - - loss = criterion(y_pred, outputs) - - losses.append(loss.item()) - if len(losses) > loss_averaging_span: - losses = losses[-loss_averaging_span:] - - print(f"epoch: {epoch} | step: {i} | loss: {np.mean(losses)}") - - optimizer.zero_grad() - - if i % save_step == 0 and (i != sample_start or epoch != epoch_start): - print(f"Saving: Epoch {epoch}, step {i}") - save_training_step(OUTPUT_PATH, f"{epoch}-{i}", model, optimizer) - - if datetime.now() > time_max: - print(f"Max time reached, saving: Epoch {epoch}, step {i}") - save_training_step(OUTPUT_PATH, f"{epoch}-{i}", model, optimizer) - training_stopped = True - break - - loss.backward() - optimizer.step() - - i += 1 - - if not training_stopped: - save_training_step(OUTPUT_PATH, "final", model, optimizer) diff --git a/src/pipelines/actions_based/train_base.py b/src/pipelines/actions_based/train_base.py new file mode 100755 index 0000000000000000000000000000000000000000..c8597f017c1d53582822e8a93559b8fbf95fda27 --- /dev/null +++ b/src/pipelines/actions_based/train_base.py @@ -0,0 +1,120 @@ +#!/usr/bin/python3 + +import pickle + +import dask.dataframe as dd +import numpy as np +import torch +from transformers import BertTokenizerFast + +from src.models.actions_model_base import ( + ActionsModelBase, + ActionsModelBaseLoss, + ActionsModelBaseParams, +) +from src.pipelines.actions_based.processing import ACTIONS_KEYS +from src.utils import ( + PROJECT_ROOT, + Checkpoint, + Loader, + ProgressTracker, + Saver, + Timeout, + convert_to_timedelta, + get_config, + random_indexes, + training_loop, + 
unflattened_column, +) + +INPUT_PATH = f"{PROJECT_ROOT}/generated/actions/stage4_reindexing" +INPUT_STATS_PATH = f"{PROJECT_ROOT}/generated/actions/stage5_stats" +OUTPUT_PATH = f"{PROJECT_ROOT}/checkpoints/actions_base" + + +if __name__ == "__main__": + config = get_config() + learning_rate = config["actions"]["training_base"]["learning_rate"] + num_epochs = config["actions"]["training_base"]["num_epochs"] + batch_size = config["actions"]["training_base"]["batch_size"] + save_step = config["actions"]["training_base"]["save_step"] + batch_buffer_size = config["actions"]["training_base"]["batch_buffer_size"] + loss_averaging_span = config["actions"]["training_base"]["loss_averaging_span"] + fresh_start = config["actions"]["training_base"]["fresh_start"] + device_name = config["actions"]["training_base"]["device"] + max_train_time = convert_to_timedelta( + config["actions"]["training_base"]["max_training_time"] + ) + base_model = config["global"]["base_model"] + seed = config["global"]["random_seed"] + + np.random.seed(seed=seed) + df = dd.read_parquet(INPUT_PATH, engine="pyarrow") + + device = torch.device(device_name if torch.cuda.is_available() else "cpu") + tokenizer = BertTokenizerFast.from_pretrained(base_model) + + loader = Loader(OUTPUT_PATH, ActionsModelBase, torch.optim.AdamW, device) + if loader.has_checkpoints() and not fresh_start: + model, optimizer, epoch_start, sample_start = loader.load_latest() + else: + params = ActionsModelBaseParams(base_model, len(ACTIONS_KEYS)) + model = ActionsModelBase(params) + model.to(device) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) + epoch_start, sample_start = (0, 0) + + model.train() + + # Load loss weights + with open(f"{INPUT_STATS_PATH}/stats.pickle", "rb") as f: + stats = pickle.load(f) + pos_examples = stats["class_number"] + neg_examples = stats["num_examples"] - stats["class_number"] + pos_weight = torch.tensor(neg_examples / pos_examples) + + criterion = ActionsModelBaseLoss(pos_weight).to(device) + + random_index_shuffle = random_indexes(df) + training_stopped = False + + saver = Saver(OUTPUT_PATH, model, optimizer) + checkpoint = Checkpoint(save_step, saver, epoch_start, sample_start) + timer = Timeout(max_train_time, saver) + tracker = ProgressTracker(device, loss_averaging_span) + + timer.start() + for data_batch, epoch, i in training_loop( + epoch_start, + sample_start, + num_epochs, + df, + batch_size, + batch_buffer_size, + random_index_shuffle, + ): + inputs = unflattened_column(data_batch, "source") + outputs = unflattened_column(data_batch, "target") + attentions_mask = unflattened_column(data_batch, "attention_mask") + + inputs = torch.tensor(inputs, dtype=torch.long).squeeze(dim=-1).to(device) + outputs = torch.tensor(outputs, dtype=torch.float).to(device) + attentions_mask = torch.tensor(attentions_mask).type(torch.long).to(device) + + y_pred = model(input_ids=inputs, attention_mask=attentions_mask) + + optimizer.zero_grad() + loss = criterion(y_pred, outputs) + + tracker.step(epoch, i, loss) + checkpoint.step(epoch, i) + if timer.step(epoch, i): + training_stopped = True + break + + loss.backward() + optimizer.step() + + if not training_stopped: + saver.save("final") diff --git a/src/pipelines/actions_based/train_mixed.py b/src/pipelines/actions_based/train_mixed.py new file mode 100755 index 0000000000000000000000000000000000000000..fd44e27b0101501632366e2df949ef4b8a871f08 --- /dev/null +++ b/src/pipelines/actions_based/train_mixed.py @@ -0,0 +1,144 @@ +#!/usr/bin/python3 + +import pickle + 
+import dask.dataframe as dd +import numpy as np +import torch +from transformers import BertTokenizerFast + +from src.models.actions_model_mixed import ( + ActionsModelMixed, + ActionsModelMixedLoss, + ActionsModelMixedParams, +) +from src.pipelines.actions_based.processing import ACTIONS_KEYS +from src.utils import ( + PROJECT_ROOT, + Checkpoint, + Loader, + ProgressTracker, + Saver, + Timeout, + convert_to_timedelta, + get_config, + random_indexes, + training_loop, + unflattened_column, +) + +INPUT_PATH = f"{PROJECT_ROOT}/generated/actions/stage4_reindexing" +INPUT_STATS_PATH = f"{PROJECT_ROOT}/generated/actions/stage5_stats" +OUTPUT_PATH = f"{PROJECT_ROOT}/checkpoints/actions_mixed" + + +if __name__ == "__main__": + config = get_config() + threshold = config["actions"]["training_mixed"]["threshold"] + embedding_size = config["actions"]["training_mixed"]["embedding_size"] + num_heads = config["actions"]["training_mixed"]["num_heads"] + num_layers = config["actions"]["training_mixed"]["num_layers"] + dropout = config["actions"]["training_mixed"]["dropout"] + feedforward_neurons = config["actions"]["training_mixed"]["feedforward_neurons"] + learning_rate = config["actions"]["training_mixed"]["learning_rate"] + num_epochs = config["actions"]["training_mixed"]["num_epochs"] + batch_size = config["actions"]["training_mixed"]["batch_size"] + save_step = config["actions"]["training_mixed"]["save_step"] + batch_buffer_size = config["actions"]["training_mixed"]["batch_buffer_size"] + loss_averaging_span = config["actions"]["training_mixed"]["loss_averaging_span"] + fresh_start = config["actions"]["training_mixed"]["fresh_start"] + device_name = config["actions"]["training_mixed"]["device"] + max_train_time = convert_to_timedelta( + config["actions"]["training_mixed"]["max_training_time"] + ) + base_model = config["global"]["base_model"] + seed = config["global"]["random_seed"] + + np.random.seed(seed=seed) + df = dd.read_parquet(INPUT_PATH, engine="pyarrow") + + device = torch.device(device_name if torch.cuda.is_available() else "cpu") + tokenizer = BertTokenizerFast.from_pretrained(base_model) + + loader = Loader(OUTPUT_PATH, ActionsModelMixed, torch.optim.AdamW, device) + + if loader.has_checkpoints() and not fresh_start: + model, optimizer, epoch_start, sample_start = loader.load_latest() + else: + params = ActionsModelMixedParams( + base_model, + tokenizer.vocab_size, + threshold, + embedding_size, + num_heads, + num_layers, + feedforward_neurons, + len(ACTIONS_KEYS), + 500, + dropout, + ) + model = ActionsModelMixed(params) + model.to(device) + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) + epoch_start, sample_start = (0, 0) + + model.train() + + # Load loss weights + with open(f"{INPUT_STATS_PATH}/stats.pickle", "rb") as f: + stats = pickle.load(f) + pos_examples = stats["class_number"] + neg_examples = stats["num_examples"] - stats["class_number"] + pos_weight = torch.tensor(neg_examples / pos_examples) + + criterion = ActionsModelMixedLoss(pos_weight).to(device) + + random_index_shuffle = random_indexes(df) + training_stopped = False + + saver = Saver(OUTPUT_PATH, model, optimizer) + checkpoint = Checkpoint(save_step, saver, epoch_start, sample_start) + timer = Timeout(max_train_time, saver) + tracker = ProgressTracker(device, loss_averaging_span) + + timer.start() + for data_batch, epoch, i in training_loop( + epoch_start, + sample_start, + num_epochs, + df, + batch_size, + batch_buffer_size, + random_index_shuffle, + ): + inputs = unflattened_column(data_batch, 
"source") + outputs = unflattened_column(data_batch, "target") + attentions_mask = unflattened_column(data_batch, "attention_mask") + + inputs = torch.tensor(inputs, dtype=torch.long).to(device).squeeze(dim=2) + + outputs = torch.tensor(outputs, dtype=torch.float).to(device) + + # Convert to boolean + attentions_mask = torch.tensor(attentions_mask == 0).to(device) + + y_pred = model( + input_ids=inputs, + actions=outputs[:, :-1, :], + attention_mask=attentions_mask, + ) + + loss = criterion(outputs[:, 1:, :], y_pred) + optimizer.zero_grad() + + tracker.step(epoch, i, loss) + checkpoint.step(epoch, i) + if timer.step(epoch, i): + training_stopped = True + break + + loss.backward() + optimizer.step() + + if not training_stopped: + saver.save("final") diff --git a/src/pipelines/actions_based/train_restricted.py b/src/pipelines/actions_based/train_restricted.py new file mode 100755 index 0000000000000000000000000000000000000000..ed43789b0fd7cd7866a41baeccf901293cb0a171 --- /dev/null +++ b/src/pipelines/actions_based/train_restricted.py @@ -0,0 +1,146 @@ +#!/usr/bin/python3 + +import pickle + +import dask.dataframe as dd +import numpy as np +import torch +from transformers import BertTokenizerFast + +from src.models.actions_model_restricted import ( + ActionsModelRestricted, + ActionsModelRestrictedLoss, + ActionsModelRestrictedParams, +) +from src.pipelines.actions_based.processing import ACTIONS_KEYS +from src.utils import ( + PROJECT_ROOT, + Checkpoint, + Loader, + ProgressTracker, + Saver, + Timeout, + convert_to_timedelta, + get_config, + random_indexes, + training_loop, + unflattened_column, +) + +INPUT_PATH = f"{PROJECT_ROOT}/generated/actions/stage4_reindexing" +INPUT_STATS_PATH = f"{PROJECT_ROOT}/generated/actions/stage5_stats" +OUTPUT_PATH = f"{PROJECT_ROOT}/checkpoints/actions_restricted" + + +if __name__ == "__main__": + + config = get_config() + learning_rate = config["actions"]["training_restricted"]["learning_rate"] + num_epochs = config["actions"]["training_restricted"]["num_epochs"] + batch_size = config["actions"]["training_restricted"]["batch_size"] + save_step = config["actions"]["training_restricted"]["save_step"] + batch_buffer_size = config["actions"]["training_restricted"]["batch_buffer_size"] + loss_averaging_span = config["actions"]["training_restricted"][ + "loss_averaging_span" + ] + fresh_start = config["actions"]["training_restricted"]["fresh_start"] + device_name = config["actions"]["training_restricted"]["device"] + max_train_time = convert_to_timedelta( + config["actions"]["training_restricted"]["max_training_time"] + ) + base_model = config["global"]["base_model"] + seed = config["global"]["random_seed"] + + np.random.seed(seed=seed) + df = dd.read_parquet(INPUT_PATH, engine="pyarrow") + + device = torch.device(device_name if torch.cuda.is_available() else "cpu") + tokenizer = BertTokenizerFast.from_pretrained(base_model) + + loader = Loader(OUTPUT_PATH, ActionsModelRestricted, torch.optim.AdamW, device) + if loader.has_checkpoints() and not fresh_start: + model, optimizer, epoch_start, sample_start = loader.load_latest() + else: + params = ActionsModelRestrictedParams(base_model, len(ACTIONS_KEYS) + 1) + model = ActionsModelRestricted(params) + model.to(device) + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) + epoch_start, sample_start = (0, 0) + + model.train() + + # Load loss weights + with open(f"{INPUT_STATS_PATH}/stats.pickle", "rb") as f: + stats = pickle.load(f) + pos_examples = stats["class_number"] + neg_examples = 
stats["num_examples"] - stats["class_number"] + + uppercase_pos_examples = pos_examples[0] + uppercase_neg_examples = neg_examples[0] + uppercase_pos_odds = torch.tensor( + uppercase_pos_examples / uppercase_neg_examples, dtype=torch.float + ) + + has_punctuation_neg_examples = neg_examples[1:] + has_no_punctuation_neg_examples = np.sum(pos_examples[1:]) + + punctuation_neg_examples = np.concatenate( + [has_punctuation_neg_examples, has_no_punctuation_neg_examples.reshape(1)], + -1, + ) + + punctuation_class_weights = torch.tensor( + (punctuation_neg_examples) / np.sum(punctuation_neg_examples), + dtype=torch.float, + ) + + criterion = ActionsModelRestrictedLoss( + uppercase_pos_odds, punctuation_class_weights + ).to(device) + + random_index_shuffle = random_indexes(df) + training_stopped = False + + saver = Saver(OUTPUT_PATH, model, optimizer) + checkpoint = Checkpoint(save_step, saver, epoch_start, sample_start) + timer = Timeout(max_train_time, saver) + tracker = ProgressTracker(device, loss_averaging_span) + + timer.start() + for data_batch, epoch, i in training_loop( + epoch_start, + sample_start, + num_epochs, + df, + batch_size, + batch_buffer_size, + random_index_shuffle, + ): + inputs = unflattened_column(data_batch, "source") + outputs = unflattened_column(data_batch, "target") + attentions_mask = unflattened_column(data_batch, "attention_mask") + + inputs = torch.tensor(inputs, dtype=torch.long).squeeze(dim=-1).to(device) + outputs = torch.tensor(outputs, dtype=torch.float).to(device) + attentions_mask = torch.tensor(attentions_mask).to(device) + + y_pred = model(input_ids=inputs, attention_mask=attentions_mask) + + outputs = torch.cat( + [outputs, (1.0 - outputs[:, :, 1:].max(-1)[0]).unsqueeze(-1)], axis=-1 + ) + + loss = criterion(y_pred, outputs) + optimizer.zero_grad() + + tracker.step(epoch, i, loss) + checkpoint.step(epoch, i) + if timer.step(epoch, i): + training_stopped = True + break + + loss.backward() + optimizer.step() + + if not training_stopped: + saver.save("final") diff --git a/src/pipelines/translation_based/processing.py b/src/pipelines/translation_based/processing.py index 41962da3b1e75641192bb018f3b5d028635a6316..608cf433aef25b6409b8615f8f1819c09474f77c 100644 --- a/src/pipelines/translation_based/processing.py +++ b/src/pipelines/translation_based/processing.py @@ -1,9 +1,10 @@ -from typing import Tuple +from typing import List, Tuple import numpy as np from transformers import BertTokenizerFast -from src.pipelines.actions_based.processing import remove_punctuation, text_from_xml +from src.pipelines.actions_based.processing import text_from_xml +from src.utils import input_preprocess def raw_to_dataframe(entry: dict) -> dict: @@ -125,7 +126,9 @@ def find_new_sentence_right(seq: np.array, pos: int) -> int: return None -def get_batch_indexes(seq: np.array, min_length: int, max_length: int) -> [np.array]: +def get_batch_indexes( + seq: np.array, min_length: int, max_length: int +) -> List[np.array]: """Turns long sequence into array of indices, composing a single batch file. 
Args: @@ -241,7 +244,7 @@ def create_input_output( np.ndarray: Single sample that will serve as expected output from the model """ decoded_str = tokenizer.decode(tokens) - cleaned_str = remove_punctuation(decoded_str).lower() + cleaned_str = input_preprocess(decoded_str).lower() source_batch_entry = tokenizer(cleaned_str)["input_ids"][1:-1] target_batch_entry = tokens @@ -270,10 +273,10 @@ def create_input_output( def crete_input_output_batch( seq: np.ndarray, - batch_indexes: [np.ndarray], + batch_indexes: List[np.ndarray], length: int, tokenizer: BertTokenizerFast, -) -> (np.ndarray, np.ndarray): +) -> Tuple[np.ndarray, np.ndarray]: """Transforms a sequence of tokens into "translation" input and output batch Args: diff --git a/src/pipelines/translation_based/train.py b/src/pipelines/translation_based/train.py index 6e39eccdebf220bff2f60c72a440971fe41ef5b2..67fcf7aa77a334547d3ae9b6ff93d8599fbe7055 100755 --- a/src/pipelines/translation_based/train.py +++ b/src/pipelines/translation_based/train.py @@ -10,8 +10,14 @@ from transformers import BertTokenizerFast from src.batch_loading import get_batches, get_ordered_dataframe_len from src.models.TransformerSeq2Seq import TransformerSeq2Seq -from src.training import latest_model, save_training_step -from src.utils import PROJECT_ROOT, convert_to_timedelta, get_config, prepare_folder +from src.utils import ( + PROJECT_ROOT, + convert_to_timedelta, + get_config, + latest_model, + prepare_folder, + save_training_step, +) INPUT_PATH = f"{PROJECT_ROOT}/generated/translations/stage4_reindexing" OUTPUT_PATH = f"{PROJECT_ROOT}/checkpoints/translations" diff --git a/src/training.py b/src/training.py deleted file mode 100644 index f9ffd92e5702a91f59ed97ca4ac2e9c912824e13..0000000000000000000000000000000000000000 --- a/src/training.py +++ /dev/null @@ -1,66 +0,0 @@ -import re -from typing import Optional, Tuple - -import torch -import torch.nn as nn -import torch.optim as optim - -from src.utils import prepare_folder - - -def latest_model(file_paths: [str]) -> Optional[Tuple[int, int]]: - """Finds newest model in directory - - Args: - files ([str]): List of all file paths that will be considered. File extension is discarded - File names must be in format epoch_num-batch_num.extension - - Returns: - (int, int): Tuple of (latest_batch, latest_step) for latest model - """ - - furthest_epoch = -1 - furthest_batch_num = -1 - for checkpoint_file in file_paths: - filename = checkpoint_file.split("/")[-1].split(".")[0] - - result = re.search(r"^(\d+)-(\d+)$", filename) - if result is not None: - epoch, batch = [int(x) for x in result.groups()] - - if epoch > furthest_epoch: - furthest_epoch = epoch - furthest_batch_num = batch - elif epoch == furthest_epoch: - furthest_batch_num = max(batch, furthest_batch_num) - - if (furthest_epoch == -1) or (furthest_batch_num == -1): - return None - - return furthest_epoch, furthest_batch_num - - -def save_training_step( - dir: str, - name: str, - model: nn.Module, - optimizer: Optional[optim.Optimizer] = None, - create_dir: bool = False, -) -> None: - """Saves a trainig step to a directory - - Args: - dir (str): Directory where step will be saved - name (str): Name of the step (eg. "0-1000") - model (nn.Module): model that will be saved - optimizer (optim.Optimizer): optimizer that will be saved. 
Might be None - """ - if create_dir: - prepare_folder(dir, wipe=False) - - torch.save(model.state_dict(), f"{dir}/{name}.model") - - if optimizer is not None: - torch.save( - optimizer.state_dict(), f"{dir}/{name}.optimizer", - ) diff --git a/src/utils.py b/src/utils.py index 5cd1c5ccd3a30d0a5c3ed21fe0b3294ef6860d08..90a69f5f1ed13cd2f0a933a7ed79134d64ffa4fc 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,14 +1,248 @@ +from __future__ import annotations + import os +import pickle import re import shutil -from datetime import timedelta -from typing import Optional - +from datetime import datetime, timedelta +from glob import glob +from typing import Generator, List, Optional, Tuple, Type + +import dask.dataframe as dd +import numpy as np +import pandas as pd +import torch +import torch.nn as nn import yaml +from torch.optim import Optimizer + +from src.batch_loading import get_batches, get_ordered_dataframe_len +from src.models.interfaces import PunctuationModel PROJECT_ROOT = os.path.dirname(os.path.realpath("/".join(__file__.split("/")) + "/..")) +class Saver: + """Class that allows saving and loading model-optimizer pairs""" + + def __init__( + self, save_dir: str, model: PunctuationModel, optimizer: Optimizer + ) -> None: + """Initializes Saver + + Args: + save_dir (str): Directory where model and optimizer will be saved + model (PunctuationModel): Model to save + optimizer (Optimizer): Optimizer to save + """ + self.save_dir = save_dir + self.model = model + self.optimizer = optimizer + + prepare_folder(self.save_dir) + + def save(self, name: str): + """Saves model and optimizer + + Args: + name (str): Name under which the model will be saved + """ + self.model.save(self.save_dir, name) + torch.save(self.optimizer.state_dict(), f"{self.save_dir}/{name}.optimizer") + + +class Loader: + """Class for loading a model and its optimizer from a checkpoint""" + + def __init__( + self, + save_dir: str, + model_type: Type[PunctuationModel], + optimizer_type: Type[Optimizer], + device: torch.device, + ) -> None: + """Initializes Loader + + Args: + save_dir (str): Directory in which to search for models + model_type (Type[PunctuationModel]): Model class that should be loaded + optimizer_type (Type[Optimizer]): Optimizer class that should be loaded + device (torch.device): Device on which loaded model/optimizer will exist + """ + self.save_dir = save_dir + self.device = device + + self.model_type = model_type + self.optimizer_type = optimizer_type + + def has_checkpoints(self) -> bool: + """Checks if there are any saved checkpoints in model's directory + + Returns: + bool: True if checkpoints were found, False otherwise + """ + files = glob(f"{self.save_dir}/*.model") + + return latest_model(files) is not None + + def load(self, name: str) -> Tuple[PunctuationModel, Optimizer]: + """Loads a model and optimizer from file + + Args: + name (str): Name of the model that will be loaded + + Returns: + Tuple[PunctuationModel, Optimizer]: Model and optimizer + """ + model = self.model_type.load(self.save_dir, name, self.device) + + optimizer = self.optimizer_type(model.parameters()) + optimizer.load_state_dict( + torch.load(f"{self.save_dir}/{name}.optimizer", map_location=self.device) + ) + + print(f"Loaded model {name}") + + return model, optimizer + + def load_latest(self) -> Tuple[PunctuationModel, Optimizer, int, int]: + """Loads latest checkpoint in directory + + Returns: + Tuple[PunctuationModel, Optimizer, int, int]: Model, Optimizer, Epoch at + which checkpoint was made, step at which checkpoint was 
made + """ + files = glob(f"{self.save_dir}/*.model") + + model_id = latest_model(files) + if model_id is None: + return None + + epoch, step = model_id + model, optimizer = self.load(f"{epoch}-{step}") + + return model, optimizer, epoch, step + + +class Checkpoint: + """Utility class to make checkpoints every constant amount of steps""" + + def __init__( + self, save_step: int, saver: Saver, start_epoch: int, start_step: int + ) -> None: + """Initializes Checkpoint. + Starting epoch and step are provided, so that a checkpoint will not be made right after + loading the model. + + Args: + save_step (int): Number of steps after which checkpoints will be saved + saver (Saver): Saver used to save model/optimizer state + start_epoch (int): Epoch at which training was started + start_step (int): Step at which training was started + """ + self.start_step = start_step + self.start_epoch = start_epoch + self.save_step = save_step + + self.saver = saver + + def step(self, epoch: int, step: int) -> None: + """Check if checkpoint should be made, and save it if necessary + + Args: + epoch (int): Epoch num + step (int): Step num + """ + if step % self.save_step == 0 and ( + step != self.start_step or epoch != self.start_epoch + ): + print(f"Saving: Epoch {epoch}, step {step}") + self.saver.save(f"{epoch}-{step}") + + +class Timeout: + """Utility class that prevents training from surpassing a maximum amount of time""" + + def __init__(self, duration: timedelta, saver: Optional[Saver]) -> None: + """Initializes Timeout + + Args: + duration (timedelta): Maximum duration of training + saver (Optional[Saver]): Saver used to save checkpoint if training time is + exceeded + """ + self.saver = saver + self.duration = duration + self.time_max = None + + def start(self, time_now: Optional[datetime] = None): + """Starts counting time from the start of training + + Args: + time_now (Optional[datetime], optional): Point from which time will be measured. + Use current time if None. Defaults to None. + """ + if time_now is None: + time_now = datetime.now() + + self.time_max = datetime.max + if self.duration is not None: + self.time_max = time_now + self.duration + + def step(self, epoch: int, step: int, time: Optional[datetime] = None) -> bool: + """Check if the timeout was exceeded. Saves a checkpoint if time is exceeded + + Args: + epoch (int): Epoch number + step (int): Step number + time (Optional[datetime], optional): Current time. Use current time if None. Defaults to None. + + Returns: + bool: True if time was exceeded, False otherwise + """ + assert self.time_max is not None + + if time is None: + time = datetime.now() + + if time > self.time_max: + if self.saver is not None: + print(f"Max time reached, saving: Epoch {epoch}, step {step}") + self.saver.save(f"{epoch}-{step}") + + return True + + return False + + +class ProgressTracker: + """Utility class used to track loss and display it to the user""" + + def __init__(self, device: torch.device, loss_averaging_span: int) -> None: + """Initializes ProgressTracker + + Args: + device (torch.device): Device on which training is performed + loss_averaging_span (int): Number of latest samples used to calculate average loss + """ + print(f"Training on {device}") + self.loss_averaging_span = loss_averaging_span + self.losses = [] + + def step(self, epoch: int, step: int, loss: float) -> None: + """New loss was calculated. 
Informs user about it + + Args: + epoch (int): Epoch number + step (int): Step number + loss (float): Loss value at provided epoch and step + """ + self.losses.append(loss.item()) + loss_mean, self.losses = moving_average(self.losses, self.loss_averaging_span) + + print(f"epoch: {epoch} | step: {step} | loss: {loss_mean}") + + def get_config() -> dict: """Returns dict with config values @@ -34,7 +268,7 @@ def remove_multiple_spaces(text: str) -> str: return re.sub(r"\s\s+", " ", text) -def remove_punctuation(text: str) -> str: +def remove_punctuation(text: str, whitelist: List[str] = []) -> str: """Removes all non-alphanumeric characters from the text. Might result in multiple spaces while chracters like `-` are used @@ -46,10 +280,7 @@ def remove_punctuation(text: str) -> str: str: Text with all punctuactions removed """ - # Separating characters - text = text.replace("-", " ").replace("/", " ").replace("+", " ") - - return "".join(filter(lambda x: x.isalnum() or x.isspace(), text)) + return "".join(filter(lambda x: x.isalnum() or x.isspace() or x in whitelist, text)) def unify_whitespaces(text: str) -> str: @@ -72,7 +303,30 @@ def unify_whitespaces(text: str) -> str: return result -def preprocess(text: str) -> str: +def output_preprocess(text: str) -> str: + """Cleans the text out of bad formating and removes or replaces symbols that will not be predicted by a model + + Args: + text (str): Arbitrary text + + Returns: + str: Text that could be a direct output of punctuation prediction algorithm + """ + # Whitespace-like characters + text = text.replace("-", " ").replace("/", " ").replace("+", " ") + + # Punctuation-like characters + text = text.replace(";", ".").replace("!", ".") + + text = remove_punctuation(text, [".", ",", "?"]) + text = unify_whitespaces(text) + text = remove_multiple_spaces(text) + text = text.strip() + + return text + + +def input_preprocess(text: str) -> str: """Makes sure that input is in the same format as training data (no non-alphanum chars, no double spaces, all lowercase etc.) @@ -106,7 +360,131 @@ def prepare_folder(path: str, wipe: bool = False) -> None: os.makedirs(path, exist_ok=True) -def convert_to_timedelta(time_val: str) -> Optional[timedelta]: +def unflattened_column(df: pd.DataFrame, name: str) -> np.ndarray: + """Get column from the dataframe that was flattened. Dataframe must have columns + "name" and "name_shape", where name is 1D numpy array and name_shape is target + shape of this numpy array. 
+ + Args: + df (pd.DataFrame): Dataframe from which to extract array + name (str): Name of the column + + Returns: + np.ndarray: Unflattened multidimensional column of shape Lx*(name_shape) + """ + + values = df.apply(lambda x: x[name].reshape(x[f"{name}_shape"]), axis=1).values + + return np.stack(values) + + +def moving_average( + values: List[np.ndarray], average_span: int +) -> Tuple[float, np.ndarray]: + """Computes moving average and keeps only latest records + + Args: + values (List[np.ndarray]): Table containing values over which to compute moving average + average_span (int): Maximum span over which to average + + Returns: + Tuple[float, np.ndarray]: computed average, values array trimmed to last "average_span" entries + """ + + if len(values) > average_span: + values = values[-average_span:] + + return np.mean(values), values + + +def optimizer_step(loss: torch.Tensor, optimizer: torch.optim.Optimizer) -> None: + """Computes and applies a single step of optimization + + Args: + loss (torch.Tensor): Loss that is optimized + optimizer (torch.optim.Optimizer): Optimizer used to optimize loss + """ + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +def training_loop( + epoch_start: int, + sample_start: int, + num_epochs: int, + df: dd.DataFrame, + batch_size: int, + batch_buffer_size: int, + random_index_shuffle: np.ndarray, +) -> Generator[pd.DataFrame, int, int]: + """Generator providing all data necessary to perform training steps. This function handles epoch/step management + + Args: + epoch_start (int): Epoch from which to start training + sample_start (int): Batch in epoch from which to start training + num_epochs (int): Number of epochs to train + df (dd.DataFrame): Dask dataframe with training dataset. Indexes must be continuous from 0 to len + batch_size (int): Batch size + batch_buffer_size (int): Number of batches to load at once to memory + random_index_shuffle (np.ndarray): Shuffled indices of dataset + + Yields: + Generator: batch, epoch_num, step_num + """ + i = sample_start + for epoch in range(epoch_start, num_epochs): + for data_batch in get_batches( + df, batch_size, batch_buffer_size, random_index_shuffle, i + ): + if len(data_batch) == 0: + continue + + yield data_batch, epoch, i + + i += 1 + + i = 0 + + +def random_indexes(df: dd.DataFrame) -> np.ndarray: + """Provides array of randomly shuffled indices for dataset + + Args: + df (dd.DataFrame): Dask dataframe with training dataset. 
Indexes must be continuous from 0 to len + + Returns: + np.ndarray: Shuffled indices + """ + num_samples = get_ordered_dataframe_len(df) + return np.random.permutation(range(num_samples)) + + +def pickle_save(obj: any, path: str) -> None: + """Pickles and saves object to a file + + Args: + obj (any): Object to pickle + path (str): Path to output file + """ + with open(path, "wb") as f: + pickle.dump(obj, f) + + +def pickle_read(path: str) -> any: + """Reads pickled object from a file + + Args: + path (str): Path to input file + + Returns: + any: Unpickled object + """ + with open(path, "rb") as f: + return pickle.load(f) + + +def convert_to_timedelta(time_val: Optional[str]) -> Optional[timedelta]: """ src: https://code.activestate.com/recipes/577894-convert-strings-like-5d-and-60s-to-timedelta-objec/ Given a *time_val* (string) such as '5d', returns a timedelta object @@ -132,6 +510,9 @@ def convert_to_timedelta(time_val: str) -> Optional[timedelta]: >>> convert_to_timedelta('120s') datetime.timedelta(0, 120) """ + if time_val is None: + return None + num = int(time_val[:-1]) if time_val.endswith("s"): return timedelta(seconds=num) @@ -143,3 +524,83 @@ def convert_to_timedelta(time_val: str) -> Optional[timedelta]: return timedelta(days=num) else: return None + + +def latest_model(file_paths: List[str]) -> Optional[Tuple[int, int]]: + """Finds newest model in directory + + Args: + file_paths (List[str]): List of all file paths that will be considered. File extension is discarded + File names must be in format epoch_num-batch_num.extension + + Returns: + (int, int): Tuple of (latest_epoch, latest_batch) for the latest model + """ + + furthest_epoch = -1 + furthest_batch_num = -1 + for checkpoint_file in file_paths: + filename = checkpoint_file.split("/")[-1].split(".")[0] + + result = re.search(r"^(\d+)-(\d+)$", filename) + if result is not None: + epoch, batch = [int(x) for x in result.groups()] + + if epoch > furthest_epoch: + furthest_epoch = epoch + furthest_batch_num = batch + elif epoch == furthest_epoch: + furthest_batch_num = max(batch, furthest_batch_num) + + if (furthest_epoch == -1) or (furthest_batch_num == -1): + return None + + return furthest_epoch, furthest_batch_num + + +def save_training_step( + dir: str, + name: str, + model: nn.Module, + optimizer: Optional[Optimizer] = None, + create_dir: bool = False, +) -> None: + """Saves a training step to a directory + + Args: + dir (str): Directory where step will be saved + name (str): Name of the step (eg. "0-1000") + model (nn.Module): model that will be saved + optimizer (optim.Optimizer): optimizer that will be saved. 
Might be None + """ + if create_dir: + prepare_folder(dir, wipe=False) + + torch.save(model.state_dict(), f"{dir}/{name}.model") + + if optimizer is not None: + torch.save( + optimizer.state_dict(), f"{dir}/{name}.optimizer", + ) + + +def yaml_serializable(cls): + def save_yaml(self, path: str) -> None: + yml = yaml.dump(self.__dict__) + with open(path, "w") as f: + f.write(yml) + + @staticmethod + def load_yaml(path: str) -> cls: + with open(path, "r") as f: + yml = f.read() + + obj = cls() + obj.__dict__ = yaml.load(yml, Loader=yaml.FullLoader) + + return obj + + setattr(cls, "save_yaml", save_yaml) + setattr(cls, "load_yaml", load_yaml) + + return cls diff --git a/tests/models/__init__.py b/tests/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/models/test_actions_model_base.py b/tests/models/test_actions_model_base.py new file mode 100644 index 0000000000000000000000000000000000000000..900cf89a8d9918d64e46c60d19c89bee75e9e0d8 --- /dev/null +++ b/tests/models/test_actions_model_base.py @@ -0,0 +1,59 @@ +import torch +from transformers.tokenization_bert import BertTokenizerFast + +from src.models.actions_model_base import ( + ActionsModelBase, + ActionsModelBaseLoss, + ActionsModelBaseParams, +) +from src.pipelines.actions_based.processing import ACTIONS_KEYS + + +def test_dimensions(): + base_model = "dkleczek/bert-base-polish-cased-v1" + action_vector_size = 5 + + tokens = BertTokenizerFast.from_pretrained(base_model)( + "Ala ma kota", return_tensors="pt" + ) + + params = ActionsModelBaseParams(base_model, action_vector_size) + model = ActionsModelBase(params) + + result = model(tokens["input_ids"], tokens["attention_mask"]) + + assert len(result.shape) == 3 + + assert result.shape[0] == tokens["input_ids"].shape[0] + assert result.shape[1] == tokens["input_ids"].shape[1] + assert result.shape[2] == action_vector_size + + +def test_loss_dimensions(): + batch_size = 5 + sequence_len = 10 + actions_size = 3 + weights = torch.zeros(actions_size) + 0.3 + actions_vector_true = torch.zeros((batch_size, sequence_len, actions_size)) + actions_vector_bad = torch.ones((batch_size, sequence_len, actions_size)) + loss = ActionsModelBaseLoss(weights) + + result = loss(actions_vector_bad, actions_vector_true) + assert len(result.shape) == 0 + + result_perfect = loss(actions_vector_true, actions_vector_true) + result_bad = loss(actions_vector_bad, actions_vector_true) + + assert result_perfect < result_bad + + +def test_predict(): + params = ActionsModelBaseParams( + "dkleczek/bert-base-polish-cased-v1", len(ACTIONS_KEYS) + ) + model = ActionsModelBase(params) + + input_str = "testowy ciag znakow" + result = model.predict(input_str) + + assert len(result) >= len(input_str) diff --git a/tests/models/test_actions_model_mixed.py b/tests/models/test_actions_model_mixed.py new file mode 100644 index 0000000000000000000000000000000000000000..786136dbe6abdb0a09e1dfd3413df8fe485feac7 --- /dev/null +++ b/tests/models/test_actions_model_mixed.py @@ -0,0 +1,94 @@ +import torch +from transformers.tokenization_bert import BertTokenizerFast + +from src.models.actions_model_mixed import ( + ActionsModelMixed, + ActionsModelMixedLoss, + ActionsModelMixedParams, + ActionsModelMixedRuntimeParams, +) +from src.pipelines.actions_based.processing import ACTIONS_KEYS + + +def test_dimensions(): + base_model = "dkleczek/bert-base-polish-cased-v1" + action_vector_size = 5 + + tokenizer = 
BertTokenizerFast.from_pretrained(base_model) + tokens = tokenizer("Ala ma kota", return_tensors="pt") + + embedding_size = 20 + threshold = 0.9 + num_heads = 2 + num_layers = 2 + feedforward_neurons = 10 + max_len = 500 + dropout = 0.1 + + params = ActionsModelMixedParams( + base_model, + tokenizer.vocab_size, + threshold, + embedding_size, + num_heads, + num_layers, + feedforward_neurons, + action_vector_size, + max_len, + dropout, + ) + model = ActionsModelMixed(params) + + actions_len = 3 + actions = torch.distributions.Multinomial( + 1, torch.tensor([0.5] * action_vector_size) + ).sample((tokens["input_ids"].shape[0], actions_len)) + + result = model(tokens["input_ids"], actions, tokens["attention_mask"]) + + assert len(result.shape) == 3 + + assert result.shape[0] == tokens["input_ids"].shape[0] + assert result.shape[1] == actions_len + assert result.shape[2] == action_vector_size + + +def test_loss_dimensions(): + batch_size = 5 + sequence_len = 10 + actions_size = 3 + prior_odds = torch.zeros(actions_size) + 0.3 + actions_vector_true = torch.zeros((batch_size, sequence_len, actions_size)) + actions_vector_bad = torch.ones((batch_size, sequence_len, actions_size)) + loss = ActionsModelMixedLoss(prior_odds) + + result = loss(actions_vector_true, actions_vector_bad) + assert len(result.shape) == 0 + + result_perfect = loss(actions_vector_true, actions_vector_true) + result_bad = loss(actions_vector_true, actions_vector_bad) + + assert result_perfect < result_bad + + +def test_predict(): + tokenizer = BertTokenizerFast.from_pretrained("dkleczek/bert-base-polish-cased-v1") + params = ActionsModelMixedParams( + "dkleczek/bert-base-polish-cased-v1", + tokenizer.vocab_size, + 0.9, + 10, + 2, + 1, + 10, + len(ACTIONS_KEYS), + 500, + 0.1, + ) + runtime = ActionsModelMixedRuntimeParams(0.9, 100) + model = ActionsModelMixed(params, runtime) + + input_str = "testowy ciag znakow" + result = model.predict(input_str) + + assert len(result) >= len(input_str) diff --git a/tests/models/test_actions_model_restricted.py b/tests/models/test_actions_model_restricted.py new file mode 100644 index 0000000000000000000000000000000000000000..b659b271d12bd88f00f9ebfbb5b01ae868fcb735 --- /dev/null +++ b/tests/models/test_actions_model_restricted.py @@ -0,0 +1,74 @@ +import torch +from transformers.tokenization_bert import BertTokenizerFast + +from src.models.actions_model_restricted import ( + ActionsModelRestricted, + ActionsModelRestrictedLoss, + ActionsModelRestrictedParams, +) +from src.pipelines.actions_based.processing import ACTIONS_KEYS + + +def test_dimensions(): + base_model = "dkleczek/bert-base-polish-cased-v1" + action_vector_size = 5 + + tokens = BertTokenizerFast.from_pretrained(base_model)( + "Ala ma kota", return_tensors="pt" + ) + + params = ActionsModelRestrictedParams(base_model, action_vector_size) + model = ActionsModelRestricted(params) + + result = model(tokens["input_ids"], tokens["attention_mask"]) + + assert len(result.shape) == 3 + + assert result.shape[0] == tokens["input_ids"].shape[0] + assert result.shape[1] == tokens["input_ids"].shape[1] + assert result.shape[2] == action_vector_size + + +def test_loss_dimensions(): + batch_size = 5 + sequence_len = 10 + action_vector_size = 4 + uppercase_odds = torch.tensor(0.3, dtype=torch.float) + punctuation_weights = torch.tensor([0.3, 0.3, 0.1], dtype=torch.float) + loss = ActionsModelRestrictedLoss(uppercase_odds, punctuation_weights) + + actions_vector_true = torch.zeros( + (batch_size, sequence_len, action_vector_size), 
dtype=torch.float + ) + actions_vector_true[:, :, -1] = 1.0 + + actions_vector_bad = torch.zeros( + (batch_size, sequence_len, action_vector_size), dtype=torch.float + ) + actions_vector_bad[:, :, :2] = 1.0 + actions_vector_bad[:, :, -1] = 0.0 + + result = loss(actions_vector_true, actions_vector_bad) + assert len(result.shape) == 0 + + result_perfect = loss(actions_vector_true, actions_vector_true) + result_bad = loss(actions_vector_true, actions_vector_bad) + + print(result_perfect) + print(result_bad) + + assert result_perfect < result_bad + assert result_perfect > 0 + assert result_bad > 0 + + +def test_predict(): + params = ActionsModelRestrictedParams( + "dkleczek/bert-base-polish-cased-v1", len(ACTIONS_KEYS) + 1 + ) + model = ActionsModelRestricted(params) + + input_str = "testowy ciag znakow" + result = model.predict(input_str) + + assert len(result) >= len(input_str) diff --git a/tests/pipelines/actions_based/test_processing.py b/tests/pipelines/actions_based/test_processing.py index c626ff25dcd894a7055b14dd8e3ecce45149573b..8e4caaa86f0bfa79a8434866a7d58d22f59779d6 100644 --- a/tests/pipelines/actions_based/test_processing.py +++ b/tests/pipelines/actions_based/test_processing.py @@ -24,24 +24,24 @@ from src.pipelines.actions_based.processing import ( def test_detect_actions(): actions = detect_actions("Janek.", None) assert actions == { - "dot": True, "upper_case": True, + "dot": True, "colon": False, "question_mark": False, } actions = detect_actions("ewka?", None) assert actions == { - "dot": False, "upper_case": False, + "dot": False, "colon": False, "question_mark": True, } actions = detect_actions("Test", None) assert actions == { - "dot": False, "upper_case": True, + "dot": False, "colon": False, "question_mark": False, } @@ -49,21 +49,21 @@ def test_detect_actions(): def test_encode_actions(): x = { - "dot": True, "upper_case": False, + "dot": True, "colon": False, "question_mark": True, } - assert np.all(encode_actions(x) == np.array([1, 0, 0, 1])) + assert np.all(encode_actions(x) == np.array([0, 1, 0, 1])) def test_decode_actions(): - x = np.array([1, 0, 0, 1]) + x = np.array([0, 1, 0, 1]) assert decode_actions(x) == { - "dot": True, "upper_case": False, + "dot": True, "colon": False, "question_mark": True, } @@ -136,8 +136,8 @@ def test_nearest_sentence_l(): def create_dummy_action(end_sentence: bool) -> np.array: return encode_actions( { - "dot": end_sentence, "upper_case": False, + "dot": end_sentence, "colon": False, "question_mark": False, } diff --git a/tests/pipelines/actions_based/test_scoring.py b/tests/pipelines/actions_based/test_scoring.py new file mode 100644 index 0000000000000000000000000000000000000000..62c524ca956855cfcb4041bad4edd3f39dc9f6c7 --- /dev/null +++ b/tests/pipelines/actions_based/test_scoring.py @@ -0,0 +1,60 @@ +import numpy as np +from numpy.testing import assert_allclose, assert_array_equal + +from src.pipelines.actions_based.scoring import ( + multiclass_auc, + multiclass_roc_curve, + predictions_threshold, +) + + +def test_predictions_threshold(): + threshold = 0.5 + predictions = np.array([[[0.3, 0.6, 0.1, 0.2, 0.9], [0.3, 0.6, 0.1, 0.2, 0.9]]]) + expected = np.array([[[0.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0, 1.0]]]) + + got = predictions_threshold(predictions, threshold) + + assert np.all(got == expected) + + +def test_multiclass_roc_curve(): + predictions = np.array([[0.3, 0.2, 0.1, 0.3, 0.1], [0.7, 0.5, 0.1, 0.2, 0.9]]) + expected = np.array([[0.0, 1.0, 0.0, 0.0, 1.0], [1.0, 0.0, 1.0, 1.0, 0.0]]) + + fpr, tpr, thresholds = 
multiclass_roc_curve(expected, predictions) + + assert len(thresholds) == expected.shape[1] + + # Thresholds + assert_allclose(thresholds[0], [1.7, 0.7, 0.3]) + assert_allclose(thresholds[1], [1.5, 0.5, 0.2]) + assert_allclose(thresholds[2], [1.1, 0.1]) + assert_allclose(thresholds[3], [1.3, 0.3, 0.2]) + assert_allclose(thresholds[4], [1.9, 0.9, 0.1]) + + # False positive rate + assert_array_equal(fpr[0], [0.0, 0.0, 1.0]) + assert_array_equal(fpr[1], [0.0, 1.0, 1.0]) + assert_array_equal(fpr[2], [0.0, 1.0]) + assert_array_equal(fpr[3], [0.0, 1.0, 1.0]) + assert_array_equal(fpr[4], [0.0, 1.0, 1.0]) + + # True positive rate + assert_array_equal(tpr[0], [0.0, 1.0, 1.0]) + assert_array_equal(tpr[1], [0.0, 0.0, 1.0]) + assert_array_equal(tpr[2], [0.0, 1.0]) + assert_array_equal(tpr[3], [0.0, 0.0, 1.0]) + assert_array_equal(tpr[4], [0.0, 0.0, 1.0]) + + +def test_multiclass_auc(): + predictions = np.array([[0.3, 0.2, 0.1, 0.3, 0.1], [0.7, 0.5, 0.1, 0.2, 0.9]]) + expected = np.array([[0.0, 1.0, 0.0, 0.0, 1.0], [1.0, 0.0, 1.0, 1.0, 0.0]]) + + fpr, tpr, _ = multiclass_roc_curve(expected, predictions) + result = multiclass_auc(fpr, tpr) + + assert len(result) == 5 + assert np.all(result >= 0) + assert np.all(result <= 1) diff --git a/tests/test_training.py b/tests/test_training.py deleted file mode 100644 index 2aa5d6a2b70c9b197bb72b2438b61c39620a3c49..0000000000000000000000000000000000000000 --- a/tests/test_training.py +++ /dev/null @@ -1,21 +0,0 @@ -from src.training import latest_model - - -def test_latest_model(): - files = [] - assert latest_model(files) is None - - files.append("/path/tam/pam/Wrongformat.b") - assert latest_model(files) is None - - files.append("/path/tam/pam/0-2000.b") - assert latest_model(files) == (0, 2000) - - files.append("/path/tam/pam/0-3000.c") - assert latest_model(files) == (0, 3000) - - files.append("/path/tam/pam/1-1000.a") - assert latest_model(files) == (1, 1000) - - files.append("/path/tam/pam/1-500.a") - assert latest_model(files) == (1, 1000) diff --git a/tests/test_utils.py b/tests/test_utils.py index 38faa617b90a08e7d445ca60fa1c12a92a85e0af..7e52db24549a14ec36b5be258c6e4a980f4ae869 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,8 +1,14 @@ +import os +from dataclasses import dataclass + from src.utils import ( convert_to_timedelta, - preprocess, + input_preprocess, + latest_model, + output_preprocess, remove_multiple_spaces, remove_punctuation, + yaml_serializable, ) @@ -15,16 +21,28 @@ def test_remove_multiple_spaces(): def test_remove_punctuation(): provided = "Ala.. ma-Kota!?.@@$ Kot ma Ale ()*" - expected = "Ala ma Kota Kot ma Ale " + expected = "Ala maKota Kot ma Ale " assert remove_punctuation(provided) == expected + whitelist = [".", "?"] + expected_whitelist = "Ala.. maKota?. Kot ma Ale " + + assert remove_punctuation(provided, whitelist) == expected_whitelist + -def test_preprocess(): +def test_input_preprocess(): provided = "Ala ma-Kota!?.@@$ Kot ma Ale ()*" - expected = "ala ma kota kot ma ale" + expected = "ala makota kot ma ale" - assert preprocess(provided) == expected + assert input_preprocess(provided) == expected + + +def test_output_preprocess(): + provided = "Ala ma-Kota!?.@@$ Kot ma Ale ()*" + expected = "Ala ma Kota.?. 
Kot ma Ale" + + assert output_preprocess(provided) == expected def test_convert_to_timedelta(): @@ -43,3 +61,45 @@ def test_convert_to_timedelta(): assert convert_to_timedelta("2s").days == 0 assert convert_to_timedelta("2s").seconds == 2 assert convert_to_timedelta("2s").microseconds == 0 + + +def test_latest_model(): + files = [] + assert latest_model(files) is None + + files.append("/path/tam/pam/Wrongformat.b") + assert latest_model(files) is None + + files.append("/path/tam/pam/0-2000.b") + assert latest_model(files) == (0, 2000) + + files.append("/path/tam/pam/0-3000.c") + assert latest_model(files) == (0, 3000) + + files.append("/path/tam/pam/1-1000.a") + assert latest_model(files) == (1, 1000) + + files.append("/path/tam/pam/1-500.a") + assert latest_model(files) == (1, 1000) + + +def test_yaml_serializable(tmp_path): + PATH = tmp_path / "test.yaml" + + @yaml_serializable + @dataclass + class Test: + x: int = 3 + y: str = "test1" + + x = Test() + x.x = -1 + x.y = "test2" + x.save_yaml(PATH) + + assert os.path.exists(PATH) + + y = Test.load_yaml(PATH) + + assert y.x == -1 + assert y.y == "test2" diff --git a/tox.ini b/tox.ini index 4326963241d376dcdec8a3b163a80412e667ab9b..02ec5a04fd4446f894bc517a9ca760c2b5a46fc4 100644 --- a/tox.ini +++ b/tox.ini @@ -3,17 +3,7 @@ envlist = unittest,pep8 skipsdist = True [testenv] -deps = - pytest - numpy - pyyaml - pandas - tqdm - torch - dask[complete] - transformers - pyarrow==0.17.1 - lxml +deps = -rrequirements.txt [testenv:unittest] commands = pytest --ignore data --ignore generated @@ -34,9 +24,10 @@ exclude = data generated max-complexity = 10 +min_python_version = 3.8 max-line-length = 80 select = I,C,E,F,W,B,B950,TYP,T -ignore = E501, C901, I201 +ignore = E501, C901, I201, W503 [testenv:pep8] diff --git a/train.sh b/train.sh new file mode 100755 index 0000000000000000000000000000000000000000..3d7da2d1b0ab2910fe3cffd72230bd74a748fcfe --- /dev/null +++ b/train.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# Usage: ./train.sh [module_to_run] [container_name] +# Eg.: ./train.sh src.pipelines.actions_based.train_base base_training + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +docker build . 
-f ./docker/training/Dockerfile -t clarinpl/punctuator_training --build-arg USERNAME=$(whoami) --build-arg USER_UID=$(id -u) --build-arg USER_GID=$(id -u) && \ +docker run -v $DIR:/punctuator --name $2 --gpus all -it --entrypoint python clarinpl/punctuator_training -m $1 diff --git a/worker.py b/worker.py index 54a25c115b7cc46d4160e46be5ba105aa0951430..7c8011be572c280845cf13c54ba46ecde506a20c 100755 --- a/worker.py +++ b/worker.py @@ -1,11 +1,12 @@ #!/usr/bin/python import configparser +from src.models.model_factory import MODELS_MAP +from typing import List import nlp_ws -from src.pipelines.actions_based.utils import apply_actions_punctuation, load_model -from src.utils import preprocess +from src.utils import input_preprocess, output_preprocess class Worker(nlp_ws.NLPWorker): @@ -15,28 +16,42 @@ class Worker(nlp_ws.NLPWorker): self.config = configparser.ConfigParser() self.config.read("config.ini") - self.threshold = float(self.config["deployment"]["threshold"]) - self.chunk_size = int(self.config["deployment"]["chunk_size"]) - self.tokenizer, self.model = load_model( - self.config["deployment"]["model"], - self.config["deployment"]["base_model"], - self.config["deployment"]["device"], - ) + self.device = self.config["deployment"]["device"] + self.models_dir = self.config["deployment"]["models_dir"] + self.models = {} + + models_enabled = self.config["deployment"]["models_enabled"] + models_enabled = models_enabled.split(",") + + self._load_models(models_enabled) + + def _load_models(self, models_list: List[str]): + for model_type in models_list: + self.models[model_type] = MODELS_MAP[model_type].load( + f"{self.models_dir}/{model_type}", "production", self.device + ) + self.models[model_type].train(False) - self.model.train(False) def process(self, input_file: str, task_options: dict, output_file: str) -> None: """Implementation of example tasks that copies files.""" + if ( + "model" in task_options.keys() + and task_options["model"] in MODELS_MAP.keys() + ): + model_type = task_options["model"] + else: + model_type = "actions_base" + with open(input_file, "r") as f: - text = str(f.read()) - text = preprocess(text) - text_processed = apply_actions_punctuation( - text, self.chunk_size, self.tokenizer, self.model, self.threshold - ) + text = input_preprocess(output_preprocess(f.read())) + + result = self.models[model_type].predict(text) with open(output_file, "w") as f: - f.write(text_processed) + f.write(result) if __name__ == "__main__":
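
Below is a minimal, hypothetical sketch of how the pieces introduced in this change fit together outside of the NLP worker: the `MODELS_MAP` factory from `src/models/model_factory.py`, the `load`/`predict` interface of the punctuation models, and the `input_preprocess`/`output_preprocess` helpers from `src/utils.py`. It assumes a checkpoint saved under the name `production` exists in `deploy/actions_base` (the layout `worker.py` and `config.ini` expect); the input file path is purely illustrative.

```python
# Hypothetical smoke test for the new multi-model loading path (not part of this change).
import torch

from src.models.model_factory import MODELS_MAP
from src.utils import input_preprocess, output_preprocess

# Load the "production" checkpoint of the base actions model on CPU,
# the same way Worker._load_models resolves <models_dir>/<architecture>.
model = MODELS_MAP["actions_base"].load(
    "deploy/actions_base", "production", torch.device("cpu")
)
model.train(False)  # inference mode, mirroring Worker._load_models

with open("test_data/text.txt", "r") as f:  # illustrative input file
    # Same normalization chain Worker.process applies before prediction:
    # map unsupported symbols, then lowercase and strip punctuation.
    text = input_preprocess(output_preprocess(f.read()))

print(model.predict(text))
```

The same sketch should work for `actions_restricted` and `actions_mixed` by swapping the key and checkpoint directory; `worker.py` itself loads every model listed in `models_enabled` up front and picks one per request via the `model` task option, defaulting to `actions_base`.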