From 66def97e81c37eb9bdd5559f73e4d7b6b924db3b Mon Sep 17 00:00:00 2001
From: Michal Pogoda <michalpogoda@hotmail.com>
Date: Mon, 15 Feb 2021 11:13:09 +0100
Subject: [PATCH 1/8] Working punctuator V2 version

---
 .dockerignore                                 |  16 -
 .dvc/.gitignore                               |   3 -
 .dvc/config                                   |   6 -
 .dvc/plots/confusion.json                     |  30 -
 .dvc/plots/default.json                       |  29 -
 .dvc/plots/scatter.json                       |  27 -
 .dvc/plots/smooth.json                        |  39 --
 .gitignore                                    |  29 +-
 .gitlab-ci.yml                                |  10 +-
 .isort.cfg                                    |   3 -
 Dockerfile                                    |  15 -
 Dockerfile.worker                             |  18 +
 README.md                                     |  68 --
 config.ini                                    |   8 +-
 data.dvc                                      |   3 -
 docker/training/Dockerfile                    |  24 -
 docker/training/requirements.txt              |   1 -
 download_dataset.sh                           |   4 -
 dvc.yaml                                      | 185 ------
 entrypoint.sh                                 |  29 +-
 generated/.gitignore                          |   2 -
 generated/actions/.gitignore                  |   5 -
 generated/translations/.gitignore             |   4 -
 params.yaml                                   | 110 ---
 punctuate.py                                  | 113 ----
 {src => punctuator}/__init__.py               |   0
 punctuator/punctuator.py                      | 115 ++++
 requirements.txt                              |  66 +-
 src/batch_loading.py                          |  96 ---
 src/models/TransformerSeq2Seq.py              |  73 --
 src/models/__init__.py                        |   0
 src/models/actions_model_base.py              | 216 ------
 src/models/actions_model_mixed.py             | 304 ---------
 src/models/actions_model_restricted.py        | 273 --------
 src/models/common.py                          |  60 --
 src/models/interfaces.py                      |  49 --
 src/models/model_factory.py                   |   9 -
 src/pipelines/__init__.py                     |   0
 src/pipelines/actions_based/__init__.py       |   0
 src/pipelines/actions_based/processing.py     | 628 ------------------
 src/pipelines/actions_based/scoring.py        | 122 ----
 .../actions_based/stage1_extraction.py        |  49 --
 .../actions_based/stage2_tokenization.py      |  43 --
 .../actions_based/stage3_exploding.py         |  34 -
 .../actions_based/stage4_reindexing.py        |  29 -
 src/pipelines/actions_based/stage5_stats.py   |  55 --
 src/pipelines/actions_based/test.py           | 114 ----
 src/pipelines/actions_based/train_base.py     | 120 ----
 src/pipelines/actions_based/train_mixed.py    | 144 ----
 .../actions_based/train_restricted.py         | 146 ----
 src/pipelines/actions_based/utils.py          | 135 ----
 src/pipelines/translation_based/__init__.py   |   0
 src/pipelines/translation_based/processing.py | 302 ---------
 .../translation_based/stage1_extraction.py    |  45 --
 .../stage2_create_batches.py                  |  46 --
 .../translation_based/stage3_exploding.py     |  34 -
 .../translation_based/stage4_reindexing.py    |  37 --
 src/pipelines/translation_based/train.py      | 143 ----
 src/processing.py                             |  65 --
 src/utils.py                                  | 620 -----------------
 tests/__init__.py                             |   0
 tests/models/__init__.py                      |   0
 tests/models/test_actions_model_base.py       |  59 --
 tests/models/test_actions_model_mixed.py      |  94 ---
 tests/models/test_actions_model_restricted.py |  74 ---
 tests/pipelines/__init__.py                   |   0
 tests/pipelines/actions_based/__init__.py     |   0
 .../actions_based/test_processing.py          | 230 -------
 tests/pipelines/actions_based/test_scoring.py |  60 --
 tests/pipelines/translation_based/__init__.py |   0
 .../translation_based/test_processing.py      | 185 ------
 tests/test_batch_loading.py                   |  57 --
 tests/test_chunking.py                        |  72 ++
 tests/test_utils.py                           | 105 ---
 tox.ini                                       |   6 -
 train.sh                                      |   9 -
 worker.py                                     |  72 +-
 77 files changed, 267 insertions(+), 5709 deletions(-)
 delete mode 100644 .dockerignore
 delete mode 100644 .dvc/.gitignore
 delete mode 100644 .dvc/config
 delete mode 100644 .dvc/plots/confusion.json
 delete mode 100644 .dvc/plots/default.json
 delete mode 100644 .dvc/plots/scatter.json
 delete mode 100644 .dvc/plots/smooth.json
 delete mode 100644 .isort.cfg
 delete mode 100644 Dockerfile
 create mode 100644 Dockerfile.worker
 delete mode 100644 README.md
 delete mode 100644 data.dvc
 delete mode 100644 docker/training/Dockerfile
 delete mode 120000 docker/training/requirements.txt
 delete mode 100755 download_dataset.sh
 delete mode 100644 dvc.yaml
 mode change 100755 => 100644 entrypoint.sh
 delete mode 100644 generated/.gitignore
 delete mode 100644 generated/actions/.gitignore
 delete mode 100644 generated/translations/.gitignore
 delete mode 100644 params.yaml
 delete mode 100755 punctuate.py
 rename {src => punctuator}/__init__.py (100%)
 create mode 100644 punctuator/punctuator.py
 delete mode 100644 src/batch_loading.py
 delete mode 100644 src/models/TransformerSeq2Seq.py
 delete mode 100644 src/models/__init__.py
 delete mode 100644 src/models/actions_model_base.py
 delete mode 100644 src/models/actions_model_mixed.py
 delete mode 100644 src/models/actions_model_restricted.py
 delete mode 100644 src/models/common.py
 delete mode 100644 src/models/interfaces.py
 delete mode 100644 src/models/model_factory.py
 delete mode 100644 src/pipelines/__init__.py
 delete mode 100644 src/pipelines/actions_based/__init__.py
 delete mode 100644 src/pipelines/actions_based/processing.py
 delete mode 100644 src/pipelines/actions_based/scoring.py
 delete mode 100644 src/pipelines/actions_based/stage1_extraction.py
 delete mode 100644 src/pipelines/actions_based/stage2_tokenization.py
 delete mode 100644 src/pipelines/actions_based/stage3_exploding.py
 delete mode 100644 src/pipelines/actions_based/stage4_reindexing.py
 delete mode 100644 src/pipelines/actions_based/stage5_stats.py
 delete mode 100644 src/pipelines/actions_based/test.py
 delete mode 100755 src/pipelines/actions_based/train_base.py
 delete mode 100755 src/pipelines/actions_based/train_mixed.py
 delete mode 100755 src/pipelines/actions_based/train_restricted.py
 delete mode 100644 src/pipelines/actions_based/utils.py
 delete mode 100644 src/pipelines/translation_based/__init__.py
 delete mode 100644 src/pipelines/translation_based/processing.py
 delete mode 100644 src/pipelines/translation_based/stage1_extraction.py
 delete mode 100644 src/pipelines/translation_based/stage2_create_batches.py
 delete mode 100644 src/pipelines/translation_based/stage3_exploding.py
 delete mode 100644 src/pipelines/translation_based/stage4_reindexing.py
 delete mode 100755 src/pipelines/translation_based/train.py
 delete mode 100644 src/processing.py
 delete mode 100644 src/utils.py
 delete mode 100644 tests/__init__.py
 delete mode 100644 tests/models/__init__.py
 delete mode 100644 tests/models/test_actions_model_base.py
 delete mode 100644 tests/models/test_actions_model_mixed.py
 delete mode 100644 tests/models/test_actions_model_restricted.py
 delete mode 100644 tests/pipelines/__init__.py
 delete mode 100644 tests/pipelines/actions_based/__init__.py
 delete mode 100644 tests/pipelines/actions_based/test_processing.py
 delete mode 100644 tests/pipelines/actions_based/test_scoring.py
 delete mode 100644 tests/pipelines/translation_based/__init__.py
 delete mode 100644 tests/pipelines/translation_based/test_processing.py
 delete mode 100644 tests/test_batch_loading.py
 create mode 100644 tests/test_chunking.py
 delete mode 100644 tests/test_utils.py
 delete mode 100755 train.sh
 mode change 100755 => 100644 worker.py

diff --git a/.dockerignore b/.dockerignore
deleted file mode 100644
index 06d8c56..0000000
--- a/.dockerignore
+++ /dev/null
@@ -1,16 +0,0 @@
-data
-__pycache__
-.devcontainer
-.dvc
-.idea
-.metals
-.pytest_cache
-.tox
-.vscode
-checkpoints
-dask-worker-space
-data
-generated
-notebooks
-tests
-deploy
\ No newline at end of file
diff --git a/.dvc/.gitignore b/.dvc/.gitignore
deleted file mode 100644
index 528f30c..0000000
--- a/.dvc/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-/config.local
-/tmp
-/cache
diff --git a/.dvc/config b/.dvc/config
deleted file mode 100644
index c30b54e..0000000
--- a/.dvc/config
+++ /dev/null
@@ -1,6 +0,0 @@
-[core]
-    remote = newremote
-['remote "newremote"']
-    url = s3://punctuation/action_based
-    endpointurl = https://minio.clarin-pl.eu
-    profile = clarinpl
diff --git a/.dvc/plots/confusion.json b/.dvc/plots/confusion.json
deleted file mode 100644
index 0d9a333..0000000
--- a/.dvc/plots/confusion.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
-    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
-    "data": {
-        "values": "<DVC_METRIC_DATA>"
-    },
-    "title": "<DVC_METRIC_TITLE>",
-    "mark": "rect",
-    "encoding": {
-        "x": {
-            "field": "<DVC_METRIC_X>",
-            "type": "nominal",
-            "sort": "ascending",
-            "title": "<DVC_METRIC_X_LABEL>"
-        },
-        "y": {
-            "field": "<DVC_METRIC_Y>",
-            "type": "nominal",
-            "sort": "ascending",
-            "title": "<DVC_METRIC_Y_LABEL>"
-        },
-        "color": {
-            "aggregate": "count",
-            "type": "quantitative"
-        },
-        "facet": {
-            "field": "rev",
-            "type": "nominal"
-        }
-    }
-}
diff --git a/.dvc/plots/default.json b/.dvc/plots/default.json
deleted file mode 100644
index d00782a..0000000
--- a/.dvc/plots/default.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
-    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
-    "data": {
-        "values": "<DVC_METRIC_DATA>"
-    },
-    "title": "<DVC_METRIC_TITLE>",
-    "mark": {
-        "type": "line"
-    },
-    "encoding": {
-        "x": {
-            "field": "<DVC_METRIC_X>",
-            "type": "quantitative",
-            "title": "<DVC_METRIC_X_LABEL>"
-        },
-        "y": {
-            "field": "<DVC_METRIC_Y>",
-            "type": "quantitative",
-            "title": "<DVC_METRIC_Y_LABEL>",
-            "scale": {
-                "zero": false
-            }
-        },
-        "color": {
-            "field": "rev",
-            "type": "nominal"
-        }
-    }
-}
diff --git a/.dvc/plots/scatter.json b/.dvc/plots/scatter.json
deleted file mode 100644
index 90165d4..0000000
--- a/.dvc/plots/scatter.json
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
-    "data": {
-        "values": "<DVC_METRIC_DATA>"
-    },
-    "title": "<DVC_METRIC_TITLE>",
-    "mark": "point",
-    "encoding": {
-        "x": {
-            "field": "<DVC_METRIC_X>",
-            "type": "quantitative",
-            "title": "<DVC_METRIC_X_LABEL>"
-        },
-        "y": {
-            "field": "<DVC_METRIC_Y>",
-            "type": "quantitative",
-            "title": "<DVC_METRIC_Y_LABEL>",
-            "scale": {
-                "zero": false
-            }
-        },
-        "color": {
-            "field": "rev",
-            "type": "nominal"
-        }
-    }
-}
diff --git a/.dvc/plots/smooth.json b/.dvc/plots/smooth.json
deleted file mode 100644
index d497ce7..0000000
--- a/.dvc/plots/smooth.json
+++ /dev/null
@@ -1,39 +0,0 @@
-{
-    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
-    "data": {
-        "values": "<DVC_METRIC_DATA>"
-    },
-    "title": "<DVC_METRIC_TITLE>",
-    "mark": {
-        "type": "line"
-    },
-    "encoding": {
-        "x": {
-            "field": "<DVC_METRIC_X>",
-            "type": "quantitative",
-            "title": "<DVC_METRIC_X_LABEL>"
-        },
-        "y": {
-            "field": "<DVC_METRIC_Y>",
-            "type": "quantitative",
-            "title": "<DVC_METRIC_Y_LABEL>",
-            "scale": {
-                "zero": false
-            }
-        },
-        "color": {
-            "field": "rev",
-            "type": "nominal"
-        }
-    },
-    "transform": [
-        {
-            "loess": "<DVC_METRIC_Y>",
-            "on": "<DVC_METRIC_X>",
-            "groupby": [
-                "rev"
-            ],
-            "bandwidth": 0.3
-        }
-    ]
-}
diff --git a/.gitignore b/.gitignore
index 31cb712..007a515 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,21 +1,10 @@
-dane/**
-dataset_simple
-dataset_actions
-**/dask-worker-space
-.vscode
-.devcontainer
-.idea
-.metals
-/data
+/samba
+/.pytest_cache
+/.tox
+/.vscode
+/.env
+/model
+/config.test.ini
+/wandb
 __pycache__
-.pytest_cache
-/checkpoints
-.dvc
-.tox
-notebooks
-dvc.lock
-dask-worker-space
-test_data
-.env
-deploy
-service.log
\ No newline at end of file
+/notebook.ipynb
\ No newline at end of file
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a858b46..3c0266d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -6,7 +6,6 @@ cache:
 
 stages:
   - check_style
-  - testing
   - build
 
 before_script:
@@ -17,11 +16,6 @@ pep8:
   script:
     - tox -v -e pep8
 
-unittest:
-  stage: testing
-  script:
-    - tox -v -e unittest
-
 build_image:
   stage: build
   image: docker:18.09.7
@@ -32,8 +26,8 @@ build_image:
   before_script:
     - ''
   script:
-    - docker build -t clarinpl/punctuator .
+    - docker build -t clarinpl/punctuator -f Dockerfile.worker .
     - echo $DOCKER_PASSWORD > pass.txt
     - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
     - rm pass.txt
-    - docker push clarinpl/punctuator
\ No newline at end of file
+    - docker push clarinpl/punctuator
diff --git a/.isort.cfg b/.isort.cfg
deleted file mode 100644
index 9e5a06c..0000000
--- a/.isort.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-[settings]
-profile=hug
-src_paths=src,test
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index 90eeb9a..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,15 +0,0 @@
-FROM clarinpl/cuda-python:3.7
-
-RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y gcc python3-dev
-RUN mkdir /punctuator
-WORKDIR /punctuator
-
-COPY requirements.txt requirements.txt
-RUN pip3 install -r requirements.txt && rm requirements.txt
-
-COPY src ./src
-COPY config.ini .
-COPY worker.py .
-COPY entrypoint.sh .
-
-ENTRYPOINT [ "./entrypoint.sh" ]
\ No newline at end of file
diff --git a/Dockerfile.worker b/Dockerfile.worker
new file mode 100644
index 0000000..049273f
--- /dev/null
+++ b/Dockerfile.worker
@@ -0,0 +1,18 @@
+FROM clarinpl/cuda-python:3.7
+
+RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y gcc python3-dev
+
+COPY requirements.txt requirements.txt
+RUN pip install -r requirements.txt && rm requirements.txt
+
+RUN mkdir /workspace
+WORKDIR /workspace
+
+RUN pip3 install --index-url https://pypi.clarin-pl.eu/simple/ nlp_ws==0.6
+
+COPY punctuator punctuator
+COPY entrypoint.sh entrypoint.sh
+COPY worker.py worker.py
+COPY config.ini config.ini
+
+ENTRYPOINT ["bash", "entrypoint.sh"]
\ No newline at end of file
diff --git a/README.md b/README.md
deleted file mode 100644
index ffead89..0000000
--- a/README.md
+++ /dev/null
@@ -1,68 +0,0 @@
-# Punctuator
-A service that automatically adds punctuation to a raw word stream (e.g. from speech2text) for the Polish language.
-
-**Example input**:
-> według webometrycznego rankingu uniwersytetów świata ze stycznia 2019 pokazującego zaangażowanie instytucji akademickich w internecie uczelnia zajmuje 5 miejsce w polsce wśród uczelni technicznych a na świecie 964 wśród wszystkich typów uczelni w rankingu szkół wyższych perspektyw politechnika wrocławska zajęła w 2019 roku 3 miejsce wśród uczelni technicznych oraz 6 miejsce spośród wszystkich uczelni akademickich w polsce
-
-**Output**:
-> Według webometrycznego rankingu uniwersytetów świata ze stycznia 2019, pokazującego zaangażowanie instytucji akademickich w Internecie, uczelnia zajmuje 5. miejsce w Polsce wśród uczelni technicznych, a na świecie 964. Wśród wszystkich typów uczelni w rankingu szkół wyższych perspektyw Politechnika Wrocławska zajęła w 2019 roku 3. miejsce wśród uczelni technicznych oraz 6. miejsce spośród wszystkich uczelni akademickich w Polsce
-
-## Models
-### Action-Based
-1. actions_base: A simple model, architecturally based on BERT. It is trained to predict an "Action" for each token in the sentence, where an action is either uppercasing the token or adding a punctuation sign at the end of the token.
-
-2. actions_restricted: A model nearly identical to actions_base; however, it predicts punctuation as a categorical distribution (so that punctuation marks are mutually exclusive at training time). The idea is to better differentiate between punctuation marks.
-
-3. actions_mixed: A model based on the full transformer (encoder + decoder) architecture. It is much slower, as it predicts actions for only one word at a time. However, it can model action probabilities conditioned on both the input and the output predicted so far. Because of that, it is much less prone to missing the uppercase letter at the start of a new sentence or to placing multiple punctuation signs in close proximity.
-
-### Translation
-1. translation (Deprecated): A full encoder-decoder stack that takes the input (unpunctuated text) and the output produced so far to predict the next token. The main difference from the actions models is that it is a full text2text model without restrictions on tokens. Because of that, in theory, it can represent more cases (e.g. all upper, some upper, dashes, ellipses, etc.), as opposed to only a few explicitly defined actions. However, the lack of constraints makes it much harder to train (in terms of both training time and data size).
-
-## Usage
-To test the model locally, you can use the `punctuate.py` script.
-```bash
-punctuate.py [-h] -a {base,restricted,mixed} -d DIRECTORY -i INPUT [-m MODEL] [-l {upper_case,dot,colon,question_mark,none}] [-dv DEVICE]
-
-Evaluate actions model
-
-optional arguments:
-  -h, --help            show this help message and exit
-  -a {base,restricted,mixed}, --architecture {base,restricted,mixed}
-                        Model architecture
-  -d DIRECTORY, --directory DIRECTORY
-                        Directory where trained model is located, relative to project root
-  -i INPUT, --input INPUT
-                        Input text file
-  -m MODEL, --model MODEL
-                        Pretrained model name
-  -l {upper_case,dot,colon,question_mark,none}, --highlight {upper_case,dot,colon,question_mark,none}
-                        Highlight prediction confidence of selected action per-word
-  -dv DEVICE, --device DEVICE
-                        Device on which inference will be made
-```
-E.g. if you place your model named "production" at `punctuator/deploy/actions_mixed/` and an example unpunctuated text at `punctuator/test_data/text.txt`, you can call
-
-```bash
-python3 punctuate.py -a mixed -d /deploy/actions_mixed -i test_data/text.txt -m production -dv cuda:0
-```
-
-## Config
-```ini
-[deployment]
-device = cpu ; Device on which inference will be made (e.g. cpu, cuda:0, etc.)
-models_dir = deploy ; Relative path to the directory where models will be placed
-models_enabled = actions_base,actions_mixed,actions_restricted ; Which models are available
-```
-
-## LPMN
-```
-filedir(/users/michal.pogoda)|any2txt|punctuator_test
-```
-or
-```
-filedir(/users/michal.pogoda)|any2txt|punctuator_test({"model":"model_name"})
-```
-where model_name is one of the models specified in models_enabled. If no model is provided or the requested model is unavailable, actions_base will be used.
-
-## Mountpoints
-The directory where the model will be downloaded (~500 MB) needs to be mounted at /punctuator/deploy.
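For reference, the per-token action scheme described in the removed README can be sketched in a few lines of Python. The action names below come from the `--highlight` choices shown above; everything else is illustrative and not part of the repository's API.

```python
# Illustrative sketch of the action-based labelling scheme: one multi-label
# "action vector" per word (uppercase flag plus optional punctuation sign).
ACTIONS = ["upper_case", "dot", "colon", "question_mark"]

def apply_actions(word, label):
    # `label` is a boolean vector aligned with ACTIONS.
    if label[ACTIONS.index("upper_case")]:
        word = word[0].upper() + word[1:]
    if label[ACTIONS.index("dot")]:
        word += "."
    return word

words = ["uczelnia", "zajmuje", "5", "miejsce", "w", "polsce"]
labels = [
    [True, False, False, False],   # start of sentence -> uppercase
    [False, False, False, False],
    [False, False, False, False],
    [False, False, False, False],
    [False, False, False, False],
    [True, True, False, False],    # uppercase + dot -> "Polsce."
]
print(" ".join(apply_actions(w, l) for w, l in zip(words, labels)))
# Uczelnia zajmuje 5 miejsce w Polsce.
```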
diff --git a/config.ini b/config.ini
index bcffce8..47abace 100644
--- a/config.ini
+++ b/config.ini
@@ -1,5 +1,5 @@
 [service]
-tool = punctuator_test
+tool = textcleaner_test
 root = /samba/requests/
 rabbit_host = test
 rabbit_user = test
@@ -13,6 +13,6 @@ port = 9981
 local_log_level = INFO
 
 [deployment]
-device = cpu
-models_dir = deploy
-models_enabled = actions_base,actions_mixed,actions_restricted
+model_path = /model/punctuator
+max_context_size = 256
+overlap = 20
\ No newline at end of file
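The new `[deployment]` keys replace the per-model settings: `max_context_size` and `overlap` are the chunked-inference parameters used by the new `punctuator` package. A minimal sketch of how they could be read (the actual wiring lives in `worker.py`):

```python
import configparser

# Sketch only: read the new [deployment] settings the worker is expected to use.
config = configparser.ConfigParser()
config.read("config.ini")

deployment = config["deployment"]
model_path = deployment["model_path"]                      # "/model/punctuator"
max_context_size = deployment.getint("max_context_size")   # tokens per inference window
overlap = deployment.getint("overlap")                     # tokens shared between windows
```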
diff --git a/data.dvc b/data.dvc
deleted file mode 100644
index eb543e5..0000000
--- a/data.dvc
+++ /dev/null
@@ -1,3 +0,0 @@
-outs:
-- md5: 1fa175e752af1638dc896838e82a9d7d.dir
-  path: data
diff --git a/docker/training/Dockerfile b/docker/training/Dockerfile
deleted file mode 100644
index b3a7fba..0000000
--- a/docker/training/Dockerfile
+++ /dev/null
@@ -1,24 +0,0 @@
-FROM clarinpl/cuda-python:3.7
-
-RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y gcc python3-dev
-RUN mkdir /punctuator
-WORKDIR /punctuator
-
-COPY requirements.txt requirements.txt
-RUN pip3 install -r requirements.txt && rm requirements.txt
-
-ARG USERNAME=clarin
-ARG USER_UID=1000
-ARG USER_GID=1000
-
-# Create the user
-RUN groupadd --gid $USER_GID $USERNAME \
-    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \
-    && apt-get update \
-    && apt-get install -y sudo \
-    && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
-    && chmod 0440 /etc/sudoers.d/$USERNAME
-
-ENV PATH="/home/${USERNAME}/.local/bin:${PATH}"
-
-USER ${USERNAME}
\ No newline at end of file
diff --git a/docker/training/requirements.txt b/docker/training/requirements.txt
deleted file mode 120000
index fd1efae..0000000
--- a/docker/training/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-../../requirements.txt
\ No newline at end of file
diff --git a/download_dataset.sh b/download_dataset.sh
deleted file mode 100755
index 5c70a48..0000000
--- a/download_dataset.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-wget http://manage.legis.nlp.ipipan.waw.pl/download/ppc-nanno.tar.gz
-tar -xvf ppc-nanno.tar.gz
-rm ppc-nanno.tar.gz
diff --git a/dvc.yaml b/dvc.yaml
deleted file mode 100644
index 4970d16..0000000
--- a/dvc.yaml
+++ /dev/null
@@ -1,185 +0,0 @@
-stages:
-  ######################
-  #       Action       #
-  ######################
-  actions_extraction:
-    cmd: python3 -m src.pipelines.actions_based.stage1_extraction
-    deps:
-    - data
-    - src/pipelines/actions_based/stage1_extraction.py
-    params:
-    - actions.extraction.num_partitions
-    outs:
-    - generated/actions/stage1_extraction
-  actions_tokenization:
-    cmd: python3 -m src.pipelines.actions_based.stage2_tokenization
-    deps:
-    - generated/actions/stage1_extraction
-    - src
-    params:
-    - actions.tokenization.max_tokens
-    - actions.tokenization.min_tokens
-    - global.base_model
-    outs:
-    - generated/actions/stage2_tokenization
-  actions_exploding:
-    cmd: python3 -m src.pipelines.actions_based.stage3_exploding
-    deps:
-    - generated/actions/stage2_tokenization
-    - src
-    outs:
-    - generated/actions/stage3_exploding
-  actions_reindexing:
-    cmd: python3 -m src.pipelines.actions_based.stage4_reindexing
-    deps:
-    - generated/actions/stage3_exploding
-    - src
-    outs:
-    - generated/actions/stage4_reindexing
-  actions_stats:
-    cmd: python3 -m src.pipelines.actions_based.stage5_stats
-    deps:
-    - generated/actions/stage4_reindexing
-    - src
-    outs:
-    - generated/actions/stage5_stats
-
-  # Base
-  actions_base_training:
-    cmd: python3 -m src.pipelines.actions_based.train_base
-    deps:
-    - generated/actions/stage4_reindexing
-    - generated/actions/stage5_stats
-    - src
-    params:
-    - global.base_model
-    - global.random_seed
-    - actions.training_base.max_training_time
-    - actions.training_base.learning_rate
-    - actions.training_base.num_epochs
-    - actions.training_base.batch_size
-    - actions.training_base.save_step
-    outs:
-    - checkpoints/actions_base
-
-  actions_base_testing:
-    cmd: python3 -m src.pipelines.actions_based.test -a base -d checkpoints/actions_base/ -m "final" -ds generated/actions/stage4_reindexing/ -o generated/actions/test_results_base -s testing_base
-    deps:
-    - checkpoints/actions_base
-    - generated/actions/stage4_reindexing
-    - src
-    params:
-    - actions.testing_base.limit
-    outs:
-    - generated/actions/test_results_base
-
-  # Restricted
-  actions_restricted_training:
-    cmd: python3 -m src.pipelines.actions_based.train_restricted
-    deps:
-    - generated/actions/stage4_reindexing
-    - generated/actions/stage5_stats
-    - src
-    params:
-    - global.base_model
-    - global.random_seed
-    - actions.training_restricted.max_training_time
-    - actions.training_restricted.learning_rate
-    - actions.training_restricted.num_epochs
-    - actions.training_restricted.batch_size
-    - actions.training_restricted.save_step
-    outs:
-    - checkpoints/actions_restricted
-
-  actions_restricted_testing:
-    cmd: python3 -m src.pipelines.actions_based.test -a restricted -d checkpoints/actions_restricted/ -m "final" -ds generated/actions/stage4_reindexing/ -o generated/actions/test_results_restricted -s testing_restricted
-    deps:
-    - checkpoints/actions_restricted
-    - generated/actions/stage4_reindexing
-    - src
-    params:
-    - actions.testing_restricted.limit
-    outs:
-    - generated/actions/test_results_restricted
-
-  # Mixed
-  actions_mixed_training:
-    cmd: python3 -m src.pipelines.actions_based.train_mixed
-    deps:
-    - generated/actions/stage4_reindexing
-    - generated/actions/stage5_stats
-    - src
-    params:
-    - global.base_model
-    - global.random_seed
-    - actions.training_mixed.embedding_size
-    - actions.training_mixed.num_heads
-    - actions.training_mixed.num_layers
-    - actions.training_mixed.dropout
-    - actions.training_mixed.feedforward_neurons
-    - actions.training_mixed.max_training_time
-    - actions.training_mixed.learning_rate
-    - actions.training_mixed.num_epochs
-    - actions.training_mixed.batch_size
-    - actions.training_mixed.save_step
-    outs:
-    - checkpoints/actions_mixed
-
-  actions_mixed_testing:
-    cmd: python3 -m src.pipelines.actions_based.test -a mixed -d checkpoints/actions_mixed/ -m "final" -ds generated/actions/stage4_reindexing/ -o generated/actions/test_results_mixed -s testing_mixed
-    deps:
-    - checkpoints/actions_mixed
-    - generated/actions/stage4_reindexing
-    - src
-    params:
-    - actions.testing_mixed.limit
-    outs:
-    - generated/actions/test_results_mixed
-
-  ######################
-  #    Translation     #
-  ######################
-  translations_extraction:
-    cmd: python3 -m src.pipelines.translation_based.stage1_extraction
-    deps:
-    - data
-    params:
-    - translations.extraction.num_partitions
-    outs:
-    - generated/translations/stage1_extraction
-
-  translations_create_batches:
-    cmd: python3 -m src.pipelines.translation_based.stage2_create_batches
-    deps:
-    - generated/translations/stage1_extraction
-    params:
-    - global.base_model
-    outs:
-    - generated/translations/stage2_create_batches
-  translations_exploding:
-    cmd: python3 -m src.pipelines.translation_based.stage3_exploding
-    deps:
-    - generated/translations/stage2_create_batches
-    outs:
-    - generated/translations/stage3_exploding
-  translations_reindexing:
-    cmd: python3 -m src.pipelines.translation_based.stage4_reindexing
-    deps:
-    - generated/translations/stage3_exploding
-    outs:
-    - generated/translations/stage4_reindexing
-  translations_training:
-    cmd: python3 -m src.pipelines.translation_based.train
-    deps:
-    - generated/translations/stage4_reindexing
-    - src/pipelines/translation_based/train.py
-    params:
-    - global.base_model
-    - global.random_seed
-    - translations.training.max_training_time
-    - translations.training.learning_rate
-    - translations.training.num_epochs
-    - translations.training.batch_size
-    - translations.training.save_step
-    outs:
-    - checkpoints/translations
\ No newline at end of file
diff --git a/entrypoint.sh b/entrypoint.sh
old mode 100755
new mode 100644
index 5608c38..bb89756
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -1,24 +1,13 @@
 #!/bin/bash
 
-if ! test -d "./deploy/actions_base"; then
-    mkdir -p ./deploy/actions_base
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_base/production.model -O deploy/actions_base/production.model
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_base/production.config -O deploy/actions_base/production.config
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_base/production.runtime.yaml -O deploy/actions_base/production.runtime.yaml
+if ! test -d "/model/punctuator"; then
+    mkdir -p /model/punctuator
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/pytorch_model.bin -O /model/punctuator/pytorch_model.bin
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/vocab.txt -O /model/punctuator/vocab.txt
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/config.json -O /model/punctuator/config.json
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/tokenizer_config.json -O /model/punctuator/tokenizer_config.json
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/special_tokens_map.json -O /model/punctuator/special_tokens_map.json
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/classes.json -O /model/punctuator/classes.json
 fi
 
-if ! test -d "./deploy/actions_mixed"; then
-    mkdir -p ./deploy/actions_mixed
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_mixed/production.model -O deploy/actions_mixed/production.model
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_mixed/production.config -O deploy/actions_mixed/production.config
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_mixed/production.runtime.yaml -O deploy/actions_mixed/production.runtime.yaml
-fi
-
-if ! test -d "./deploy/actions_restricted"; then
-    mkdir -p ./deploy/actions_restricted
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_restricted/production.model -O deploy/actions_restricted/production.model
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_restricted/production.config -O deploy/actions_restricted/production.config
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_restricted/production.runtime.yaml -O deploy/actions_restricted/production.runtime.yaml
-fi
-
-python worker.py
+python worker.py
\ No newline at end of file
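The files fetched above form a standard Hugging Face model directory (weights, config, tokenizer files) plus a `classes.json` label map. Under the pinned `transformers==4.3.2`, loading such a directory for token classification could look roughly like the sketch below; the exact model class and label handling used by `worker.py` may differ.

```python
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

MODEL_PATH = "/model/punctuator"  # matches model_path in config.ini

# Sketch only: load the downloaded checkpoint and run one forward pass.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
model.eval()

encoded = tokenizer("według rankingu uczelnia zajmuje 5 miejsce", return_tensors="pt")
with torch.no_grad():
    logits = model(**encoded).logits          # shape: (1, num_tokens, num_labels)
predicted_label_ids = logits.argmax(dim=-1)[0].tolist()
```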
diff --git a/generated/.gitignore b/generated/.gitignore
deleted file mode 100644
index c96a04f..0000000
--- a/generated/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*
-!.gitignore
\ No newline at end of file
diff --git a/generated/actions/.gitignore b/generated/actions/.gitignore
deleted file mode 100644
index 49854ca..0000000
--- a/generated/actions/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-/stage1_extraction
-/stage2_tokenization
-/stage3_exploding
-/stage4_reindexing
-/stage5_stats
\ No newline at end of file
diff --git a/generated/translations/.gitignore b/generated/translations/.gitignore
deleted file mode 100644
index c31dad5..0000000
--- a/generated/translations/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-/stage1_extraction
-/stage2_create_batches
-/stage3_exploding
-/stage4_reindexing
diff --git a/params.yaml b/params.yaml
deleted file mode 100644
index fd62fe1..0000000
--- a/params.yaml
+++ /dev/null
@@ -1,110 +0,0 @@
-global:
-    dashboard_port: 8787
-    base_model: "dkleczek/bert-base-polish-cased-v1"
-    random_seed: 44
-
-actions:
-    extraction:
-        num_partitions: 2_000
-        num_workers: 24
-        worker_memory_limit: "2GB"
-
-    tokenization:
-        min_tokens: 10
-        max_tokens: 500
-        num_workers: 24
-        worker_memory_limit: "2GB"
-
-    exploding:
-        num_workers: 24
-        worker_memory_limit: "2GB"
-
-    reindexing:
-        num_workers: 1
-        worker_memory_limit: "60GB"
-
-    stats:
-        num_workers: 24
-        worker_memory_limit: "2GB"
-
-    training_base:
-        learning_rate: 0.0001
-        num_epochs: 5
-        batch_size: 2
-        batch_buffer_size: 100
-        save_step: 50
-        max_training_time: null
-        loss_averaging_span: 1000
-        fresh_start: false
-        device: "cuda:0"
-
-    testing_base:
-        limit: None
-        batch_size: 1
-        device: "cuda:0"
-
-    training_restricted:
-        learning_rate: 0.0001
-        num_epochs: 5
-        batch_size: 2
-        batch_buffer_size: 100
-        save_step: 1000
-        max_training_time: null
-        loss_averaging_span: 1000
-        fresh_start: true
-        device: "cuda:0"
-
-    test_restricted:
-        limit: None
-        batch_size: 1
-        device: "cuda:0"
-
-    training_mixed:
-        embedding_size: 768
-        num_heads: 12
-        num_layers: 6
-        dropout: 0.1
-        feedforward_neurons: 1000
-        learning_rate: 0.0001
-        num_epochs: 5
-        batch_size: 2
-        batch_buffer_size: 1000
-        save_step: 10000
-        max_training_time: null
-        loss_averaging_span: 1000
-        fresh_start: true
-        device: "cuda:0"
-
-    test_mixed:
-        limit: None
-        batch_size: 1
-        device: "cuda:0"
-translations:
-    extraction:
-        num_partitions: 2_000
-        num_workers: 24
-        worker_memory_limit: "2GB"
-
-    create_batches:
-        num_workers: 24
-        worker_memory_limit: "2GB"
-        min_tokens: 5
-        max_tokens: 300
-
-    exploding:
-        num_workers: 24
-        worker_memory_limit: "2GB"
-
-    reindexing:
-        num_workers: 1
-        worker_memory_limit: "60GB"
-
-    training:
-        learning_rate: 0.0001
-        num_epochs: 5
-        batch_size: 10
-        save_step: 1000
-        max_training_time: "4h"
-        loss_averaging_span: 1000
-        fresh_start: false
-        device: "cuda:1"
diff --git a/punctuate.py b/punctuate.py
deleted file mode 100755
index 8eb4bdc..0000000
--- a/punctuate.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import argparse
-from src.pipelines.actions_based.utils import max_suppression
-from src.pipelines.actions_based.processing import (
-    ACTIONS_KEYS,
-    recover_text,
-    token_labels_to_word_labels,
-)
-from src.models.interfaces import ActionsModel
-from typing import Dict
-
-import numpy as np
-import torch
-
-from src.models.actions_model_base import ActionsModelBase
-from src.models.actions_model_mixed import ActionsModelMixed
-from src.models.actions_model_restricted import ActionsModelRestricted
-from src.utils import (
-    PROJECT_ROOT,
-    input_preprocess,
-    output_preprocess,
-)
-import colored
-
-SUPPORTED_MODELS: Dict[str, ActionsModel] = {
-    "base": ActionsModelBase,
-    "restricted": ActionsModelRestricted,
-    "mixed": ActionsModelMixed,
-}
-
-
-def print_highlighted(text: str, word_labels: np.ndarray, action_name: str) -> None:
-    label_id = np.argwhere(np.array(ACTIONS_KEYS) == action_name)
-
-    text = text.split(" ")
-    for label, word in zip(word_labels, text):
-        SPAN = 255 - 232
-
-        bg_color = int(label[label_id] * (SPAN - 1) + 232)
-        print(colored.bg(bg_color) + colored.fg(2) + word, end=" ")
-    print("")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Evaluate actions model")
-    parser.add_argument(
-        "-a",
-        "--architecture",
-        required=True,
-        choices=SUPPORTED_MODELS.keys(),
-        help="Model architecture",
-    )
-    parser.add_argument(
-        "-d",
-        "--directory",
-        required=True,
-        help="Directory where trained model is located, relative to project root",
-    )
-    parser.add_argument(
-        "-i", "--input", required=True, type=str, help="Input text file"
-    )
-    parser.add_argument("-m", "--model", default="final", help="Pretrained model name")
-    parser.add_argument(
-        "-l",
-        "--highlight",
-        type=str,
-        required=False,
-        choices=ACTIONS_KEYS + ["none"],
-        default="none",
-        help="Highlight prediction confidence of selected action per-word",
-    )
-    parser.add_argument(
-        "-dv",
-        "--device",
-        type=str,
-        required=False,
-        default="cpu",
-        help="Device on which inference will be made",
-    )
-    args = parser.parse_args()
-
-    print(f"Loading model {args.model}...")
-    device = torch.device(args.device)
-    model_location = f"{PROJECT_ROOT}/{args.directory}"
-    model_type = SUPPORTED_MODELS[args.architecture]
-    model = model_type.load(model_location, args.model, device)
-    model.train(False)
-
-    print("Loading text...")
-    with open(args.input, "r") as f:
-        text = f.read()
-
-    print("Inferencing...")
-    tokenizer = model.tokenizer()
-    data = input_preprocess(output_preprocess(text))
-    data_tokenized = tokenizer(data, return_tensors="pt")
-
-    predictions = (
-        model.predict_raw(
-            data_tokenized["input_ids"].to(device),
-            data_tokenized["attention_mask"].to(device),
-        )
-        .detach()
-        .cpu()
-        .numpy()
-    )
-    word_labels = token_labels_to_word_labels(data, predictions[0, 1:-1], tokenizer)
-    word_labels_suppresed = max_suppression(np.expand_dims(word_labels, axis=0), 0.9)[0]
-    text_recovered = recover_text(data, word_labels_suppresed)
-
-    if args.highlight != "none":
-        print_highlighted(text_recovered, word_labels, args.highlight)
-    else:
-        print(text_recovered)
diff --git a/src/__init__.py b/punctuator/__init__.py
similarity index 100%
rename from src/__init__.py
rename to punctuator/__init__.py
diff --git a/punctuator/punctuator.py b/punctuator/punctuator.py
new file mode 100644
index 0000000..0baf991
--- /dev/null
+++ b/punctuator/punctuator.py
@@ -0,0 +1,115 @@
+from typing import List
+import numpy as np
+
+
+def decode_labels(results, labels_map) -> List[str]:
+    labels_decoded = list(map(lambda x: labels_map[x], results))
+
+    return labels_decoded
+
+
+def decode(tokens, labels_decoded, tokenizer):
+    text_recovered = []
+    word = []
+    word_end = ""
+
+    for label, token in zip(labels_decoded, tokens):
+        token_str = tokenizer.convert_ids_to_tokens([token])[0]
+
+        if token_str == "[PAD]":
+            break
+
+        if token_str.startswith("##"):
+            word.append(token_str.replace("##", ""))
+        else:
+            if len(word) > 0:
+                word.append(word_end)
+                text_recovered.append("".join(word))
+                word = []
+
+            if label.startswith("__ALL_UPPER__"):
+                # TODO: Make all uppercase
+                word.append(token_str[0].upper() + token_str[1:])
+            elif label.startswith("__UPPER__"):
+                word.append(token_str[0].upper() + token_str[1:])
+            else:
+                word.append(token_str)
+
+            label = label.replace("__UPPER__", "")
+            label = label.replace("__ALL_UPPER__", "")
+            word_end = label
+
+    text_recovered.append("".join(word))
+
+    return "".join(text_recovered)
+
+
+def inference_masks(num_tokens, max_len, overlap):
+    if max_len >= num_tokens:
+        return [[True] * num_tokens], [[False] + [True] * (num_tokens - 2) + [False]]
+
+    # Account for CLS & SEP tokens
+    real_max_len = max_len - 2
+    real_num_tokens = num_tokens - 2
+
+    step_size = real_max_len - 2 * overlap
+
+    masks = []
+    entries = []
+    for start_id in range(0, real_num_tokens, step_size):
+        stop = False
+        if start_id == 0:
+            entry = [True] + [True] * real_max_len + [False] * \
+                (real_num_tokens - real_max_len) + [True]
+            mask = [False] + [True] * \
+                (real_max_len - overlap) + [False] * (overlap + 1)
+        elif start_id + real_max_len >= real_num_tokens:
+            offset_start = real_num_tokens - real_max_len
+            entry = [True] + [False] * \
+                (offset_start) + [True] * real_max_len + [True]
+            mask = [False] * (overlap + 1 + (start_id - offset_start)) + [True] * \
+                (real_max_len - overlap - (start_id - offset_start)) + [False]
+            stop = True
+        else:
+            entry = [True] + [False] * start_id + [True] * real_max_len + \
+                [False] * (real_num_tokens - (start_id + real_max_len)) + [True]
+            mask = [False] * (overlap + 1) + [True] * \
+                (real_max_len - 2 * overlap) + [False] * (overlap + 1)
+
+        masks.append(mask)
+        entries.append(entry)
+
+        if stop:
+            break
+
+    return entries, masks
+
+
+def combine_masks(num_tokens, max_len, overlap):
+    if max_len >= num_tokens:
+        return np.array([[False] + [True] * (num_tokens - 2) + [False]])
+
+    step_size = max_len - 2 - overlap
+
+    entries = []
+    for start in range(0, num_tokens - 2, step_size):
+        stop = False
+
+        if start + max_len - 2 - overlap < num_tokens - 2:
+            entry = [False] + [False] * \
+                (start) + [True] * (max_len - 2 - overlap)
+            entry += [False] * (num_tokens - 2
+                                - (start + max_len - 2 - overlap))
+            entry += [False]
+        else:
+            entry = [False] + [False] * (start)
+            entry += [True] * (num_tokens - 2 - start)
+            entry += [False]
+            stop = True
+
+        entries.append(entry)
+
+        if stop:
+            break
+
+    return entries
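The chunking helpers above are easiest to understand on a concrete example. The sketch below is not part of the patch; it assumes `num_tokens` counts the `[CLS]`/`[SEP]` tokens and that `max_len` and `overlap` play the role of `max_context_size` and `overlap` from `config.ini` (smaller values are used here for readability).

```python
import numpy as np

from punctuator.punctuator import combine_masks, inference_masks

# 10 tokens total ([CLS] + 8 word-piece tokens + [SEP]), window of 6, overlap of 1.
entries, masks = inference_masks(num_tokens=10, max_len=6, overlap=1)
targets = combine_masks(num_tokens=10, max_len=6, overlap=1)

for i, (entry, mask) in enumerate(zip(entries, masks)):
    # `entry` selects which of the 10 tokens are fed to the model in this pass;
    # `mask` selects which predictions inside that window are kept.
    print("pass", i, "feed", np.where(entry)[0].tolist(), "keep", np.where(mask)[0].tolist())

for i, target in enumerate(targets):
    # Each `target` row selects the span of the full sequence covered by one pass.
    print("pass", i, "covers", np.where(np.array(target))[0].tolist())
```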
diff --git a/requirements.txt b/requirements.txt
index 068c99e..1de9709 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,63 +1,3 @@
---index-url https://pypi.clarin-pl.eu/simple/
---find-links https://download.pytorch.org/whl/torch_stable.html
-attrs==19.3.0
-bokeh==2.1.1
-certifi==2020.6.20
-chardet==3.0.4
-click==7.1.2
-cloudpickle==1.5.0
-cycler==0.10.0
-dask==2.22.0
-distributed==2.22.0
-filelock==3.0.12
-fsspec==0.8.0
-future==0.18.2
-HeapDict==1.0.1
-idna==2.10
-iniconfig==1.0.1
-Jinja2==2.11.2
-joblib==0.16.0
-kiwisolver==1.2.0
-locket==0.2.0
-lxml==4.5.2
-MarkupSafe==1.1.1
-matplotlib==3.3.0
-more-itertools==8.4.0
-msgpack==1.0.0
-numpy==1.19.1
-packaging==20.4
-pandas==1.1.0
-partd==1.1.0
-Pillow==7.2.0
-pluggy==0.13.1
-psutil==5.7.2
-py==1.9.0
-pyarrow==0.17.1
-pycurl==7.43.0
-pyparsing==2.4.7
-pytest==6.0.1
-python-dateutil==2.8.1
-pytz==2020.1
-PyYAML==5.3.1
-regex==2020.7.14
-requests==2.24.0
-sacremoses==0.0.43
-scipy==1.5.2
-seaborn==0.10.1
-sentencepiece==0.1.91
-six==1.15.0
-sortedcontainers==2.2.2
-tblib==1.7.0
-tokenizers==0.8.1rc1
-toml==0.10.1
-toolz==0.10.0
-torch==1.4.0+cu100
-tornado==6.0.4
-tqdm==4.48.2
-transformers==3.0.2
-typing-extensions==3.7.4.2
-urllib3==1.25.10
-zict==2.0.0
-scikit-learn==0.23.2
-nlp_ws==0.6
-colored==1.4.2
+numpy==1.19.4
+transformers==4.3.2
+torch==1.7.1
\ No newline at end of file
diff --git a/src/batch_loading.py b/src/batch_loading.py
deleted file mode 100644
index e0527e6..0000000
--- a/src/batch_loading.py
+++ /dev/null
@@ -1,96 +0,0 @@
-from typing import Union
-
-import dask.dataframe as dd
-import numpy as np
-import pandas as pd
-
-
-def calculate_batch_buffer_id(batch_id: int, buffer_batch_num: int) -> int:
-    """Calculate which buffer should be loaded into memory for a given batch
-
-    Args:
-        batch_id (int): Id of the batch, counted from the start
-        buffer_batch_num (int): Number of batches that are loaded at once into memory
-
-    Returns:
-        int: Batch buffer id that needs to be in memory for a given batch
-    """
-    return batch_id // buffer_batch_num
-
-
-def yield_batch_buffer_span(
-    batch_size: int, batch_buffer_len: int, num_samples: int
-) -> np.array:
-    """Calculates which samples should be loaded in a given batch buffer
-
-    Args:
-        batch_size (int): Size of a single batch (in samples)
-        batch_buffer_len (int): Number of batches that are loaded at once into memory
-        num_samples (int): Number of samples in the dataset
-
-    Yields:
-        np.array: Contiguous ids that should be loaded into memory for a given buffer
-    """
-    batch_buffer_size = batch_size * batch_buffer_len
-
-    batch_buffer_id = 0
-
-    while batch_buffer_id < (num_samples / batch_buffer_size):
-        buffer_start = batch_buffer_size * batch_buffer_id
-        buffer_end = min(num_samples, buffer_start + batch_buffer_size)
-
-        yield np.arange(buffer_start, buffer_end, 1, np.long)
-        batch_buffer_id += 1
-
-
-def get_ordered_dataframe_len(df: Union[pd.DataFrame, dd.DataFrame]) -> int:
-    """Gets length of a dataframe, which ids are ORDERED CONTINUOUSLY from 0 to N
-    without counting all the elements
-
-    Args:
-        df (Union[pd.DataFrame, dd.DataFrame]): Dataframe
-
-    Returns:
-        int: Length of the dataframe
-    """
-    return df.tail(1).index.values[0] + 1
-
-
-def get_batches(
-    df: dd.DataFrame,
-    batch_size: int,
-    batch_buffer_len: int,
-    shuffled_ids: np.array,
-    batch_start: int = 0,
-) -> pd.DataFrame:
-    """Generator for getting batches from large Dask dataframe with implemented buffering
-
-    Args:
-        df (dd.DataFrame): Source dask dataframe
-        batch_size (int): Desired size of a batch
-        batch_buffer_len (int): Number of batches to load to memory at once
-        shuffled_ids (np.array): Shuffled order of samples
-
-    Returns:
-        pd.DataFrame: Batch of samples as a pandas DataFrame
-
-    Yields:
-        Iterator[pd.DataFrame]: Consecutive batches of samples from the dataframe
-    """
-    length = get_ordered_dataframe_len(df)
-
-    batch_id = batch_start
-
-    for batch_buffer_span in yield_batch_buffer_span(
-        batch_size, batch_buffer_len, length
-    ):
-        buffer_ids = shuffled_ids[batch_buffer_span]
-        buffer = df.loc[buffer_ids].compute()
-
-        for i in range(batch_buffer_len):
-            batch_ids = buffer_ids[
-                range(i * batch_size, min((i + 1) * batch_size, len(buffer_ids)))
-            ]
-
-            yield buffer.loc[batch_ids]
-            batch_id += 1
diff --git a/src/models/TransformerSeq2Seq.py b/src/models/TransformerSeq2Seq.py
deleted file mode 100644
index 3009fae..0000000
--- a/src/models/TransformerSeq2Seq.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import torch
-import torch.nn as nn
-
-from src.models.common import PositionalEncoding
-
-
-class TransformerSeq2Seq(nn.Module):
-    """Class representing a sequence to sequence transformer, based on original "Attention is all you need" paper."""
-
-    def __init__(
-        self,
-        vocab_size: int,
-        embedding_size: int,
-        max_len: int,
-        num_heads: int = 8,
-        encoder_layers: int = 6,
-        decoder_layers: int = 6,
-        feedforward_neurons: int = 2048,
-        dropout: float = 0.1,
-    ):
-
-        super(TransformerSeq2Seq, self).__init__()
-
-        # Embedd from token to vec space
-        self.word_embedding = nn.Embedding(vocab_size, embedding_size)
-
-        # Add positional encoding
-        self.position_embedding = PositionalEncoding(embedding_size, max_len, dropout)
-
-        # Combined encoder-decoder step
-        self.core = nn.Transformer(
-            embedding_size,
-            num_heads,
-            encoder_layers,
-            decoder_layers,
-            feedforward_neurons,
-            dropout,
-        )
-
-        # Map embedding to word
-        self.embedding_to_words = nn.Linear(embedding_size, vocab_size)
-
-    def forward(
-        self, source: torch.Tensor, target: torch.Tensor, source_mask: torch.Tensor,
-    ) -> torch.Tensor:
-        """Full encoder-decoder pass
-
-        Args:
-            source (torch.Tensor): Tensor with batch of source sentences tokens [BxL shape]
-            target (torch.Tensor): Tensor with batch of target sentences tokens [BxL-1 shape]
-            source_mask (torch.Tensor): Mask applied to source (True if element is padding, False otherwise) [BxL shape]
-
-        Returns:
-            torch.Tensor: Tensor with predicted target sentences tokens [Bx(L-1)xV]
-        """
-        # Input to encoder
-        x = source.transpose(0, 1)
-        x = self.word_embedding(x)
-        x = self.position_embedding(x)
-
-        # Input to decoder
-        y = target.transpose(0, 1)
-        y = self.word_embedding(y)
-        y = self.position_embedding(y)
-
-        tgt_mask = self.core.generate_square_subsequent_mask(y.shape[0]).to(y.device)
-
-        z = self.core(
-            x, y, src_key_padding_mask=source_mask, tgt_mask=tgt_mask
-        ).transpose(1, 0)
-        z = self.embedding_to_words(z)
-
-        return z
diff --git a/src/models/__init__.py b/src/models/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/models/actions_model_base.py b/src/models/actions_model_base.py
deleted file mode 100644
index 339b981..0000000
--- a/src/models/actions_model_base.py
+++ /dev/null
@@ -1,216 +0,0 @@
-from __future__ import annotations
-
-import os
-from dataclasses import dataclass
-
-import numpy as np
-import torch
-import torch.nn as nn
-from torch.nn.modules.loss import BCEWithLogitsLoss
-from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_bert import BertForTokenClassification
-from transformers.tokenization_bert import BertTokenizerFast
-
-from src.models.interfaces import ActionsModel
-from src.pipelines.actions_based.processing import (
-    ACTIONS_KEYS,
-    action_vector,
-    last_stop_label,
-    recover_text,
-    token_labels_to_word_labels,
-)
-from src.pipelines.actions_based.utils import max_suppression
-from src.utils import (
-    get_device,
-    pickle_read,
-    pickle_save,
-    prepare_folder,
-    yaml_serializable,
-)
-
-
-@dataclass
-class ActionsModelBaseParams:
-    """
-    Parameters for ActionsModelBase initialization
-
-    Args:
-        base_model (str): Name of base model
-        num_labels (int): Length of action vector
-
-    """
-
-    base_model: str
-    num_labels: int = len(ACTIONS_KEYS)
-
-
-@yaml_serializable
-@dataclass
-class ActionsModelBaseRuntimeParams:
-    """
-    Parameters for ActionsModelBase during runtime inference
-
-    Args:
-        threshold (float): minimum confidence for applying action
-        chunksize (int): Maximum number of tokens per inference chunk
-    """
-
-    threshold: float = 0.9
-    chunksize: int = 500
-
-
-class ActionsModelBase(ActionsModel):
-    """Model based on simple multilabel per-token classifiaction. Each token is binarly classified in n-dimensions"""
-
-    def __init__(
-        self,
-        params: ActionsModelBaseParams,
-        runtime: ActionsModelBaseRuntimeParams = ActionsModelBaseRuntimeParams(),
-    ) -> None:
-        """Initializes actions model
-
-        Args:
-            params (ActionsModelBaseParams): Params defining model's structure
-            runtime (ActionsModelBaseRuntimeParams): Params defining model's runtime inference
-        """
-        super(ActionsModelBase, self).__init__()
-        self.params = params
-        self.runtime = runtime
-
-        self._tokenizer = BertTokenizerFast.from_pretrained(params.base_model)
-        config = PretrainedConfig.from_pretrained(params.base_model)
-        config.num_labels = params.num_labels
-
-        self.core = BertForTokenClassification(config)
-
-    def forward(
-        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
-    ) -> torch.Tensor:
-        """Computes logits for uppercasing and adding punctuation to a word
-
-        Args:
-            input_ids (torch.Tensor): Array of ids of tokens. Shape BxL
-            attention_mask (torch.Tensor): Mask telling if a token should be masked out (ie. Padding). Shape BxL
-
-        Returns:
-            torch.Tensor: Predicted actions vector
-        """
-        y_pred = self.core(input_ids=input_ids, attention_mask=attention_mask)[0]
-
-        return y_pred
-
-    def predict_raw(
-        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
-    ) -> torch.Tensor:
-        """Function that maps input_ids tensors into per-token labels
-
-        Args:
-            input_ids (torch.Tensor): Token ids of input. Shape BxL
-            attention_mask (torch.Tensor): Attention mask of tokens. Shape BxL
-
-        Returns:
-            torch.Tensor: Per-token action-vector labels. Shape BxLxA
-        """
-
-        return self.forward(input_ids, attention_mask=attention_mask).sigmoid()
-
-    def predict(self, text: str) -> str:
-        text = text.strip()
-
-        device = get_device(self)
-
-        tokenizer = self.tokenizer()
-        tokens = tokenizer(text, return_tensors="pt")["input_ids"].to(device)
-        output = None
-
-        index_start = 0
-        while index_start < len(tokens[0]):
-            index_end = min(index_start + self.runtime.chunksize, len(tokens[0]))
-
-            tokens_chunk = tokens[:, index_start:index_end]
-            attention_mask = torch.ones_like(tokens_chunk).to(device)
-
-            actions = (
-                self.predict_raw(tokens_chunk, attention_mask).detach().cpu().numpy()
-            )
-            actions_suppresed = max_suppression(actions, self.runtime.threshold)[0]
-
-            offset = last_stop_label(actions_suppresed, action_vector(["dot"]))
-
-            # Prevent infinite loop
-            if (offset is None) or (offset == 0):
-                offset = index_end - index_start
-
-            if output is None:
-                output = actions[0, 0:offset]
-            else:
-                output = np.concatenate([output, actions[0, 0:offset]], axis=0)
-
-            index_start += offset
-
-        assert len(output) == len(tokens[0])
-
-        word_labels = token_labels_to_word_labels(text, output[1:-1], tokenizer)
-        actions = max_suppression(
-            np.expand_dims(word_labels, 0), self.runtime.threshold
-        )[0]
-
-        return recover_text(text, actions)
-
-    def tokenizer(self) -> BertTokenizerFast:
-        return self._tokenizer
-
-    def save(self, dir: str, name: str, runtime: bool = True) -> None:
-        prepare_folder(dir)
-        torch.save(self.state_dict(), f"{dir}/{name}.model")
-        pickle_save(self.params, f"{dir}/{name}.config")
-
-        if runtime:
-            self.runtime.save_yaml(f"{dir}/{name}.runtime.yaml")
-
-    @staticmethod
-    def load(dir: str, name: str, device: torch.device) -> ActionsModelBase:
-        params = pickle_read(f"{dir}/{name}.config")
-        if os.path.exists(f"{dir}/{name}.runtime.yaml"):
-            runtime = ActionsModelBaseRuntimeParams.load_yaml(
-                f"{dir}/{name}.runtime.yaml"
-            )
-        else:
-            runtime = ActionsModelBaseRuntimeParams()
-
-        model = ActionsModelBase(params, runtime).to(device)
-        model.load_state_dict(torch.load(f"{dir}/{name}.model", map_location=device))
-
-        return model
-
-
-class ActionsModelBaseLoss(nn.Module):
-    """Proposed loss for ActionsModelBase model"""
-
-    def __init__(self, prior_inverse_odds: torch.Tensor) -> None:
-        """Initializes ActionsModelBaseLoss
-
-        Args:
-            prior_inverse_odds (torch.Tensor): Negative-to-positive ratio of each action vector
-                entry in the dataset. Shape A
-        """
-        super(ActionsModelBaseLoss, self).__init__()
-
-        self.core = BCEWithLogitsLoss(pos_weight=prior_inverse_odds)
-
-    def forward(
-        self,
-        predicted_action_vector_logits: torch.Tensor,
-        true_action_vector: torch.Tensor,
-    ) -> torch.Tensor:
-        """Computes ActionsModelBase loss
-
-        Args:
-            predicted_action_vector_logits (torch.Tensor): Logits predicted by the ActionsModelBase model. Shape BxLxA
-            true_action_vector (torch.Tensor): Target labels. Shape BxLxA
-
-        Returns:
-            torch.Tensor: Computed loss.
-        """
-
-        return self.core(predicted_action_vector_logits, true_action_vector)
diff --git a/src/models/actions_model_mixed.py b/src/models/actions_model_mixed.py
deleted file mode 100644
index e09c0fa..0000000
--- a/src/models/actions_model_mixed.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import os
-from dataclasses import dataclass
-from typing import Optional
-
-import numpy as np
-import torch
-import torch.nn as nn
-from torch.nn.modules.loss import BCEWithLogitsLoss
-from transformers.tokenization_bert import BertTokenizerFast
-
-from src.models.common import PositionalEncoding, generate_square_subsequent_mask
-from src.models.interfaces import PunctuationModel
-from src.pipelines.actions_based.processing import (
-    ACTIONS_KEYS,
-    action_vector,
-    recover_text,
-    token_labels_to_word_labels,
-)
-from src.utils import (
-    get_device,
-    pickle_read,
-    pickle_save,
-    prepare_folder,
-    yaml_serializable,
-)
-
-
-@dataclass
-class ActionsModelMixedParams:
-    """
-    Parameters for initializing ActionsModelMixed
-
-    Params:
-        base_tokenizer (str): Name of pretrained tokenizer
-        vocab_size (int): Number of tokens in tokenizer dictionary
-        threshold (float, optional): Minimum confidence for applying an action. Defaults to 0.9.
-        embedding_size (int, optional): Shape of word and punctuation embeddings. Defaults to 200.
-        num_heads (int, optional): Number of heads in multiheaded attention. Defaults to 4.
-        num_layers (int, optional): Number of both encoder and decoder layers. Defaults to 2.
-        feedforward_neurons (int, optional): Size of feed-forward neural network at the end of encoder/decoder. Defaults to 200.
-        num_labels (int, optional): Action-vector size. Defaults to len(ACTIONS_KEYS).
-        max_len (int, optional): Maximum length of sequence. Defaults to 500.
-        dropout (float, optional): Dropout ratio. Defaults to 0.1.
-    """
-
-    base_tokenizer: str
-    vocab_size: int
-    threshold: float = 0.9
-    embedding_size: int = 200
-    num_heads: int = 4
-    num_layers: int = 2
-    feedforward_neurons: int = 200
-    num_labels: int = len(ACTIONS_KEYS)
-    max_len: int = 500
-    dropout: float = 0.1
-
-
-@yaml_serializable
-@dataclass
-class ActionsModelMixedRuntimeParams:
-    """
-    Parameters for ActionsModelMixed during runtime inference
-
-    Args:
-        threshold (float): Minimum confidence for applying an action
-        max_cond_len (Optional[int]): Maximum number of tokens used as conditioning context during inference
-    """
-
-    threshold: float = 0.9
-    max_cond_len: Optional[int] = 500
-
-
-class ActionsModelMixed(PunctuationModel):
-    """Encoder-decoder based model with unpunctuated token sequence as input and array of action-vectors as output"""
-
-    def __init__(
-        self,
-        params: ActionsModelMixedParams,
-        runtime: ActionsModelMixedRuntimeParams = ActionsModelMixedRuntimeParams(),
-    ) -> None:
-        """Initializes mixed model
-
-        Args:
-            params (ActionsModelMixedParams): Parameters for the model
-            runtime (ActionsModelMixedRuntimeParams, optional): Runtime inference parameters
-        """
-        super(ActionsModelMixed, self).__init__()
-
-        self.params = params
-        self.runtime = runtime
-        self._tokenizer = None
-
-        self.num_labels = params.num_labels
-
-        # Word embedder
-        self.word_embedding = nn.Embedding(params.vocab_size, params.embedding_size)
-        self.punctuation_embedding = nn.Linear(params.num_labels, params.embedding_size)
-
-        # Add positional encoding
-        self.words_position_embedding = PositionalEncoding(
-            params.embedding_size, params.max_len, params.dropout
-        )
-        self.punctuation_position_embedding = PositionalEncoding(
-            params.embedding_size, params.max_len, params.dropout
-        )
-
-        # Sentence encoder
-        sentence_encoder_layer = nn.TransformerEncoderLayer(
-            params.embedding_size,
-            params.num_heads,
-            params.feedforward_neurons,
-            params.dropout,
-        )
-        self.sentence_encoder = nn.TransformerEncoder(
-            sentence_encoder_layer, num_layers=params.num_layers
-        )
-
-        # Punctuation decoder
-        punctuation_decoder_layer = nn.TransformerDecoderLayer(
-            params.embedding_size,
-            params.num_heads,
-            params.feedforward_neurons,
-            params.dropout,
-        )
-        self.punctuation_decoder = nn.TransformerDecoder(
-            punctuation_decoder_layer, num_layers=params.num_layers
-        )
-
-        self.to_labels = nn.Linear(params.embedding_size, params.num_labels)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        actions: torch.Tensor,
-        attention_mask: torch.Tensor,
-    ) -> torch.Tensor:
-        """Computes action vectors array from array of tokens
-
-        Args:
-            input_ids (torch.Tensor): Tokens representing unpunctuated text. Shape BxL
-            actions (torch.Tensor): Action vectors predicted up till now. Shape BxL-1xA
-            attention_mask (torch.Tensor): Mask telling if a token is padding (True) or not (False). Shape BxL
-
-        Returns:
-            torch.Tensor: Predicted actions shifted one to the left. Shape BxL-1xA
-        """
-
-        # Input to encoder
-        x = input_ids.transpose(0, 1)
-        x = self.word_embedding(x)
-        x = self.words_position_embedding(x)
-
-        # Input to decoder
-        y = actions.transpose(0, 1)
-        y = self.punctuation_embedding(y)
-        y = self.punctuation_position_embedding(y)
-
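-        # Causal mask: each target position may only attend to itself and earlier action predictions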
-        tgt_mask = generate_square_subsequent_mask(y.shape[0]).to(y.device)
-
-        sentence_encoded = self.sentence_encoder(x, src_key_padding_mask=attention_mask)
-
-        actions_decoded = self.punctuation_decoder(
-            y, sentence_encoded, tgt_mask=tgt_mask
-        )
-
-        z = actions_decoded.transpose(1, 0)
-
-        return self.to_labels(z)
-
-    def tokenizer(self) -> BertTokenizerFast:
-        if self._tokenizer is None:
-            self._tokenizer = BertTokenizerFast.from_pretrained(
-                self.params.base_tokenizer
-            )
-        return self._tokenizer
-
-    def predict(self, text: str) -> str:
-        # TODO: Optimize for speed
-
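-        # Autoregressive decoding: seed with an initial "upper_case" action vector and predict one
-        # action vector per token, feeding previous predictions back as decoder input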
-        inputs = [action_vector(["upper_case"])]
-
-        tokenizer = self.tokenizer()
-        text_tokenized = tokenizer(text, return_tensors="pt")
-
-        target_device = get_device(self)
-
-        max_cond_len = self.runtime.max_cond_len
-        if max_cond_len is None:
-            max_cond_len = np.iinfo(np.int).max
-
-        for _ in range(text_tokenized["input_ids"].shape[1] - 2):
-            input_start = max(0, len(inputs) - max_cond_len)
-
-            prediction_raw = self.forward(
-                text_tokenized["input_ids"][:, input_start:].to(target_device),
-                torch.tensor(inputs[input_start:], dtype=torch.float)
-                .reshape(1, -1, self.num_labels)
-                .to(target_device),
-                (text_tokenized["attention_mask"][:, input_start:] == 0).to(
-                    target_device
-                ),
-            ).sigmoid()
-
-            inputs.append(
-                (
-                    prediction_raw.detach().cpu().numpy()[0, -1, :]
-                    > self.runtime.threshold
-                ).astype(np.float)
-            )
-
-        word_labels = token_labels_to_word_labels(text, inputs[1:], tokenizer)
-
-        prediction_binary = word_labels.astype(np.int)
-
-        return recover_text(text, prediction_binary)
-
-    def predict_raw(
-        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
-    ) -> torch.Tensor:
-        """Function that maps input_ids tensors into per-token labels
-
-        Args:
-            input_ids (torch.Tensor): Token ids of input. Shape BxL
-            attention_mask (torch.Tensor): Attention mask of tokens. Shape BxL
-
-        Returns:
-            torch.Tensor: Per-token action-vector labels. Shape BxLxA
-        """
-        outputs = torch.tensor(action_vector(["upper_case"]), dtype=torch.float).to(
-            input_ids.device
-        )
-        outputs = outputs.unsqueeze(0).unsqueeze(0).repeat(input_ids.shape[0], 1, 1)
-
-        for _ in range(input_ids.shape[1] - 1):
-            prediction_raw = self.forward(
-                input_ids, outputs, (attention_mask == 0)
-            ).sigmoid()
-
-            prediction_raw = (prediction_raw[:, -1:, :] > self.runtime.threshold).type(
-                torch.float
-            )
-            outputs = torch.cat([outputs, prediction_raw], dim=1)
-
-        return outputs
-
-    def save(self, dir: str, name: str, runtime: bool = True) -> None:
-        prepare_folder(dir)
-        torch.save(self.state_dict(), f"{dir}/{name}.model")
-        pickle_save(self.params, f"{dir}/{name}.config")
-
-        if runtime:
-            self.runtime.save_yaml(f"{dir}/{name}.runtime.yaml")
-
-    @staticmethod
-    def load(dir: str, name: str, device: torch.device) -> PunctuationModel:
-        params = pickle_read(f"{dir}/{name}.config")
-        if os.path.exists(f"{dir}/{name}.runtime.yaml"):
-            runtime = ActionsModelMixedRuntimeParams.load_yaml(
-                f"{dir}/{name}.runtime.yaml"
-            )
-        else:
-            runtime = ActionsModelMixedRuntimeParams()
-
-        model = ActionsModelMixed(params, runtime)
-        model.to(device)
-
-        model.load_state_dict(torch.load(f"{dir}/{name}.model", map_location=device))
-
-        return model
-
-
-class ActionsModelMixedLoss(nn.Module):
-    """Class representing proposed loss for training mixed actions model"""
-
-    def __init__(self, prior_odds: torch.Tensor) -> None:
-        """Initializes ActionsModelMixedLoss
-
-        Args:
-            prior_odds (torch.Tensor): Odds representing ratio of positive to negative examples for each label in action vector. Shape A
-        """
-        super(ActionsModelMixedLoss, self).__init__()
-
-        self.core = BCEWithLogitsLoss(pos_weight=prior_odds)
-
-    def forward(
-        self,
-        true_action_vector: torch.Tensor,
-        predicted_action_vector_logits: torch.Tensor,
-    ) -> torch.Tensor:
-        """Computes loss for training mixed actions model
-
-        Args:
-            true_action_vector (torch.Tensor): Action vector that should be
-                predicted by ActionsModelMixed (shifted by 1 to the left with
-                respect to the inputs). Shape BxL-1xA
-
-            predicted_action_vector_logits (torch.Tensor): Action vector that
-                was actually predicted by ActionsModelMixed (shifted by 1 to
-                the left with respect to the inputs). Shape BxL-1xA
-
-        Returns:
-            torch.Tensor: Loss of the prediction in relation to the ground truth
-        """
-
-        return self.core(predicted_action_vector_logits, true_action_vector)
diff --git a/src/models/actions_model_restricted.py b/src/models/actions_model_restricted.py
deleted file mode 100644
index eb7f859..0000000
--- a/src/models/actions_model_restricted.py
+++ /dev/null
@@ -1,273 +0,0 @@
-from __future__ import annotations
-
-import os
-from dataclasses import dataclass
-
-import numpy as np
-import torch
-import torch.nn as nn
-from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_bert import BertForTokenClassification
-from transformers.tokenization_bert import BertTokenizerFast
-
-from src.models.actions_model_mixed import ActionsModelMixed
-from src.models.interfaces import ActionsModel, PunctuationModel
-from src.pipelines.actions_based.processing import (
-    action_vector,
-    last_stop_label,
-    recover_text,
-    token_labels_to_word_labels,
-)
-from src.pipelines.actions_based.utils import max_suppression
-from src.utils import (
-    get_device,
-    pickle_read,
-    pickle_save,
-    prepare_folder,
-    yaml_serializable,
-)
-
-
-@dataclass
-class ActionsModelRestrictedParams:
-    """
-    Parameters for ActionsModelRestricted
-
-    Params:
-        base_model (str): Name of base model
-        extended_action_vector_size (int): Action-vector size including additional no-punctuation logit
-    """
-
-    base_model: str
-    extended_action_vector_size: int
-
-
-@yaml_serializable
-@dataclass
-class ActionsModelRestrictedRuntimeParams:
-    """
-    Parameters for ActionsModelRestricted during runtime inference
-
-    Args:
-        threshold (float): Minimum confidence for applying an action
-        chunksize (int): Maximum number of tokens processed in a single chunk during inference
-    """
-
-    threshold: float = 0.9
-    chunksize: int = 500
-
-
-class ActionsModelRestricted(ActionsModel):
-    """Similar to ActionsModelBase, however no-punctuation class is added
-    and punctuation-related entries are treaded as proper categorical distribution
-    """
-
-    def __init__(
-        self,
-        params: ActionsModelRestrictedParams,
-        runtime: ActionsModelRestrictedRuntimeParams = ActionsModelRestrictedRuntimeParams(),
-    ) -> None:
-        """Initializes restricted actions model
-
-        Args:
-            params (ActionsModelRestrictedParams): Parameters for the model
-            runtime (ActionsModelRestrictedRuntimeParams, optional): Runtime inference parameters
-        """
-        super(ActionsModelRestricted, self).__init__()
-
-        self.params = params
-        self.runtime = runtime
-        self._tokenizer = None
-
-        config = PretrainedConfig.from_pretrained(params.base_model)
-
-        config.num_labels = params.extended_action_vector_size
-
-        self.core = BertForTokenClassification(config)
-
-    def forward(
-        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
-    ) -> torch.Tensor:
-        """Computes logits for uppercasing and adding punctuation to a word
-
-        Args:
-            input_ids (torch.Tensor): Array of ids of tokens. Shape BxL
-            attention_mask (torch.Tensor): Mask telling if a token should be masked out (i.e. padding). Shape BxL
-
-        Returns:
-            torch.Tensor: Logits for making each word uppercase and for adding a punctuation mark to each word. Shape BxLxA+1
-        """
-        y_pred = self.core(input_ids=input_ids, attention_mask=attention_mask)[0]
-
-        return y_pred
-
-    def predict_raw(
-        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
-    ) -> torch.Tensor:
-        """Function that maps input_ids tensors into per-token labels
-
-        Args:
-            input_ids (torch.Tensor): Token ids of input. Shape BxL
-            attention_mask (torch.Tensor): Attention mask of tokens. Shape BxL
-
-        Returns:
-            torch.Tensor: Per-token action-vector labels. Shape BxLxA
-        """
-
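-        # Split logits: entry 0 -> uppercase (sigmoid), remaining entries -> punctuation
-        # classes including an extra no-punctuation class as the last entry (softmax)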
-        logits = self.forward(input_ids, attention_mask=attention_mask)
-        prob_uppercase = logits[:, :, :1].sigmoid()
-        prob_punctuation = logits[:, :, 1:].softmax(dim=-1)
-
-        no_punctuation = prob_punctuation.argmax(-1) == (
-            self.params.extended_action_vector_size - 2
-        )
-        no_punctuation = (
-            no_punctuation.type(torch.float)
-            .unsqueeze(-1)
-            .repeat(1, 1, prob_punctuation.shape[-1] - 1)
-        )
-
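-        # Re-apply softmax over the real punctuation classes and zero them out wherever the
-        # no-punctuation class was the argmax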
-        prob_punctuation = prob_punctuation[:, :, :-1].softmax(-1) * (
-            1 - no_punctuation
-        )
-
-        return torch.cat([prob_uppercase, prob_punctuation], dim=-1)
-
-    def predict(self, text: str) -> str:
-        chunk_size = self.runtime.chunksize
-        threshold = self.runtime.threshold
-
-        device = get_device(self)
-
-        text = text.strip()
-
-        tokenizer = self.tokenizer()
-        tokens = tokenizer(text, return_tensors="pt")["input_ids"].to(device)
-        output = None
-
-        index_start = 0
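-        # Chunked inference: process at most `chunk_size` tokens at a time and start the next
-        # chunk at the last predicted sentence end so chunk boundaries align with sentences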
-        while index_start < len(tokens[0]):
-            index_end = min(index_start + chunk_size, len(tokens[0]))
-
-            tokens_chunk = tokens[:, index_start:index_end]
-
-            attention_mask = torch.ones_like(tokens_chunk).to(device)
-
-            actions = (
-                self.predict_raw(tokens_chunk, attention_mask).detach().cpu().numpy()
-            )
-            actions_suppresed = max_suppression(actions, threshold)[0]
-
-            offset = last_stop_label(actions_suppresed, action_vector(["dot"]))
-
-            # Prevent infinite loop
-            if (offset is None) or (offset == 0):
-                offset = index_end - index_start
-
-            if output is None:
-                output = actions[0, 0:offset]
-            else:
-                output = np.concatenate([output, actions[0, 0:offset]], axis=0)
-
-            index_start += offset
-
-        assert len(output) == len(tokens[0])
-
-        word_labels = token_labels_to_word_labels(text, output[1:-1], tokenizer)
-        actions = max_suppression(np.expand_dims(word_labels, 0), threshold)[0]
-
-        return recover_text(text, actions)
-
-    @staticmethod
-    def _logit(x: torch.Tensor):
-        EPS = 1e-5
-
-        z = torch.clamp(x, EPS, 1.0 - EPS)
-
-        return torch.log(z / (1 - z))
-
-    def tokenizer(self) -> BertTokenizerFast:
-        if self._tokenizer is None:
-            self._tokenizer = BertTokenizerFast.from_pretrained(self.params.base_model)
-        return self._tokenizer
-
-    def save(self, dir: str, name: str, runtime: bool = True) -> None:
-        prepare_folder(dir)
-        torch.save(self.state_dict(), f"{dir}/{name}.model")
-        pickle_save(self.params, f"{dir}/{name}.config")
-
-        if runtime:
-            self.runtime.save_yaml(f"{dir}/{name}.runtime.yaml")
-
-    @staticmethod
-    def load(dir: str, name: str, device: torch.device) -> ActionsModelRestricted:
-        params = pickle_read(f"{dir}/{name}.config")
-        if os.path.exists(f"{dir}/{name}.runtime.yaml"):
-            runtime = ActionsModelRestrictedRuntimeParams.load_yaml(
-                f"{dir}/{name}.runtime.yaml"
-            )
-        else:
-            runtime = ActionsModelRestrictedRuntimeParams()
-
-        model = ActionsModelRestricted(params, runtime).to(device)
-        model.load_state_dict(torch.load(f"{dir}/{name}.model", map_location=device,))
-
-        return model
-
-
-class ActionsModelRestrictedLoss(nn.Module):
-    def __init__(
-        self, prior_uppercase_odds: torch.Tensor, punctuation_weights: torch.Tensor
-    ) -> None:
-        """Initializes ActionsModelRestrictedLoss
-
-        Args:
-            prior_uppercase_odds (torch.Tensor): Odds of positive to negative cases of uppercase in the dataset
-            punctuation_weights (torch.Tensor): Weights for each class in the loss function. Should be inversely proportional to the number of
-                their occurrences in the dataset (Shape A+1)
-        """
-        super(ActionsModelRestrictedLoss, self).__init__()
-
-        self.binary_ce = nn.BCEWithLogitsLoss(
-            pos_weight=prior_uppercase_odds.reshape(1)
-        )
-        self.cat_ce = nn.CrossEntropyLoss(punctuation_weights)
-
-    def forward(
-        self,
-        predicted_action_vector_logits: torch.Tensor,
-        true_extended_action_vector: torch.Tensor,
-    ) -> torch.Tensor:
-        """Loss for ActionsModelRestricted model
-
-        Args:
-            predicted_action_vector_logits (torch.Tensor): Action-vector logits predicted by the ActionsModelRestricted model. Shape BxLxA+1
-            true_extended_action_vector (torch.Tensor): Ground-truth extended action vectors. Shape BxLxA+1
-
-        Returns:
-            torch.Tensor: Loss value
-        """
-
-        predicted_punc = predicted_action_vector_logits[:, :, 1:].transpose(1, 2)
-        target_punc_index = torch.argmax(true_extended_action_vector[:, :, 1:], dim=-1)
-        punc_loss = self.cat_ce(predicted_punc, target_punc_index)
-
-        predicted_uppercase = predicted_action_vector_logits[:, :, 0]
-        target_uppercase = true_extended_action_vector[:, :, 0]
-        uppercase_loss = self.binary_ce(predicted_uppercase, target_uppercase)
-
-        return punc_loss + uppercase_loss
-
-    def save(self, dir: str, name: str) -> None:
-        prepare_folder(dir)
-        torch.save(self.state_dict(), f"{dir}/{name}.model")
-        pickle_save(self.params, f"{dir}/{name}.config")
-
-    @staticmethod
-    def load(dir: str, name: str, device: torch.device) -> PunctuationModel:
-        params = pickle_read(f"{dir}/{name}.config")
-        model = ActionsModelMixed(params)
-
-        model.load_state_dict(torch.load(f"{dir}/{name}.model", map_location=device,))
-
-        return model
diff --git a/src/models/common.py b/src/models/common.py
deleted file mode 100644
index 012999f..0000000
--- a/src/models/common.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import math
-
-import torch
-import torch.nn as nn
-
-
-def generate_square_subsequent_mask(sz):
-    r"""
-    Generate a square mask for the sequence. The masked positions are filled with float('-inf').
-    Unmasked positions are filled with float(0.0).
-
-    Source: torch Transformer class
-    """
-    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
-    mask = (
-        mask.float()
-        .masked_fill(mask == 0, float("-inf"))
-        .masked_fill(mask == 1, float(0.0))
-    )
-    return mask
-
-
-class PositionalEncoding(nn.Module):
-    """Adds sinsusoidal positional encoding (as in original "Attention is all you need" paper.)
-    src: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
-
-    """
-
-    def __init__(self, d_model: int, max_len: int, dropout=0.1):
-        """Sinusidal positional encodings
-
-        Args:
-            d_model (int): Embedding dimension
-            max_len (int): Maximum length of sequence
-            dropout (float, optional): Dropout ratio. Defaults to 0.1.
-        """
-        super(PositionalEncoding, self).__init__()
-        self.dropout = nn.Dropout(p=dropout)
-
-        pe = torch.zeros(max_len, d_model)
-        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
-        div_term = torch.exp(
-            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
-        )
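-        # Even dimensions get sine, odd dimensions get cosine, at geometrically spaced frequencies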
-        pe[:, 0::2] = torch.sin(position * div_term)
-        pe[:, 1::2] = torch.cos(position * div_term)
-        pe = pe.unsqueeze(0).transpose(0, 1)
-        self.register_buffer("pe", pe)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Applies positional encoding
-
-        Args:
-            x (torch.Tensor): Word embeddings tensor
-
-        Returns:
-            torch.Tensor: Word embeddings with added positional encodings
-        """
-        x = x + self.pe[: x.size(0), :]
-        return self.dropout(x)
diff --git a/src/models/interfaces.py b/src/models/interfaces.py
deleted file mode 100644
index 4627145..0000000
--- a/src/models/interfaces.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from __future__ import annotations
-
-from abc import ABC, abstractmethod
-
-import torch
-import torch.nn as nn
-from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
-
-
-class PunctuationModel(nn.Module, ABC):
-    def __init__(self) -> None:
-        super().__init__()
-
-    @abstractmethod
-    def tokenizer(self) -> PreTrainedTokenizerFast:
-        pass
-
-    @abstractmethod
-    def save(self, dir: str, name: str, runtime: bool = False) -> None:
-        pass
-
-    @staticmethod
-    @abstractmethod
-    def load(dir: str, name: str, device: torch.device) -> PunctuationModel:
-        pass
-
-
-class ActionsModel(PunctuationModel):
-    def __init__(self) -> None:
-        super().__init__()
-
-    @abstractmethod
-    def predict_raw(
-        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
-    ) -> torch.Tensor:
-        """Function that maps input_ids tensors into per-token labels
-
-        Args:
-            input_ids (torch.Tensor): Token ids of input. Shape BxL
-            attention_mask (torch.Tensor): Attention mask of tokens. Shape BxL
-
-        Returns:
-            torch.Tensor: Per-token action-vector labels. Shape BxLxA
-        """
-        pass
-
-    @abstractmethod
-    def predict(self, text: str) -> str:
-        pass
diff --git a/src/models/model_factory.py b/src/models/model_factory.py
deleted file mode 100644
index 5e4a9fc..0000000
--- a/src/models/model_factory.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from src.models.actions_model_base import ActionsModelBase
-from src.models.actions_model_mixed import ActionsModelMixed
-from src.models.actions_model_restricted import ActionsModelRestricted
-
-MODELS_MAP = {
-    "actions_base": ActionsModelBase,
-    "actions_restricted": ActionsModelRestricted,
-    "actions_mixed": ActionsModelMixed,
-}
diff --git a/src/pipelines/__init__.py b/src/pipelines/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/pipelines/actions_based/__init__.py b/src/pipelines/actions_based/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/pipelines/actions_based/processing.py b/src/pipelines/actions_based/processing.py
deleted file mode 100644
index 4f19a1a..0000000
--- a/src/pipelines/actions_based/processing.py
+++ /dev/null
@@ -1,628 +0,0 @@
-from collections import defaultdict
-from typing import List, Mapping, Optional, Tuple
-from xml.etree import ElementTree as ET
-
-import numpy as np
-from transformers import BertTokenizerFast
-from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
-
-from src.utils import input_preprocess, output_preprocess
-
-ACTIONS_KEYS = ["upper_case", "dot", "colon", "question_mark"]
-UPPERCASE_INDEX = 0
-PUNCTUATION_INDEXES = [1, 2, 3]
-
-
-def apply_file_processing(x: dict) -> dict:
-    """Creates input-output pairs from xml file from dataset
-
-    Args:
-        x (dict): Dask dataframe row with columns: file
-
-    Returns:
-        dict: Dask dataframe row with columns: source, target, target_shape
-    """
-    full_text = text_from_xml(x.file)
-
-    if len(full_text) > 0:
-        model_input, model_output = create_model_input_output(full_text)
-
-        output_shape = np.array(model_output.shape, dtype=np.int)
-
-        return {
-            "source": model_input,
-            "target": model_output.reshape(-1),
-            "target_shape": output_shape,
-        }
-    else:
-        return {"source": None, "target": None, "target_shape": None}
-
-
-APPLY_FILE_PROCESSING_META = {
-    "source": object,
-    "target": object,
-    "target_shape": object,
-}
-
-
-def apply_tokenization(
-    df: dict, min_tokens: int, max_tokens: int, tokenizer: BertTokenizerFast
-) -> dict:
-    """Applies tokenization and chunking
-
-    Args:
-        df (dict): Dataframe entry with columns: source, target, target_shape
-        min_tokens (int): Minimum number of tokens in a single training example
-        max_tokens (int): Maximum number of tokens in a single testing example
-        tokenizer (BertTokenizerFast): Tokenizer that will be used for tokenization
-
-    Returns:
-        dict: Dataframe entry with columns: source, target, attention_mask, source_shape
-                , target_shape, attention_mask_shape
-    """
-    text_clean = df.source
-    labels = df.target
-    shape = df.target_shape
-
-    tokens, token_labels = tokenize_labeled_text(
-        text_clean, labels.reshape(shape), tokenizer
-    )
-
-    inputs, outputs, attentions = batchify_data(
-        tokens, token_labels, max_tokens, tokenizer, min_tokens
-    )
-
-    inputs_shape = np.array(inputs.shape)
-    outputs_shape = np.array(outputs.shape)
-    attentions_shape = np.array(attentions.shape)
-
-    return {
-        "source": inputs.reshape(-1),
-        "target": outputs.reshape(-1),
-        "attention_mask": attentions.reshape(-1),
-        "source_shape": inputs_shape,
-        "target_shape": outputs_shape,
-        "attention_mask_shape": attentions_shape,
-    }
-
-
-APPLY_TOKENIZATION_META = {
-    "source": object,
-    "target": object,
-    "attention_mask": object,
-    "source_shape": object,
-    "target_shape": object,
-    "attention_mask_shape": object,
-}
-
-
-def action_vector(actions: List[str]) -> np.ndarray:
-    """Transforms array of label names into an action vector.
-
-    Args:
-        actions ([str]): Actions that should be in the action vector (e.g. ["dot", "upper_case"])
-
-    Returns:
-        np.ndarray: Action vector with provided actions
-    """
-    return encode_actions(
-        {
-            "upper_case": "upper_case" in actions,
-            "dot": "dot" in actions,
-            "colon": "colon" in actions,
-            "question_mark": "question_mark" in actions,
-        }
-    )
-
-
-def last_stop_label(labels: np.array, stop_action: np.array) -> Optional[int]:
-    """Finds the position of the last sentence ending token
-
-    Args:
-        labels (np.array): Array of token-labels in form of action vectors (LxA shape)
-        stop_action (np.array): Action vector that marks a stop token (A shape)
-
-    Returns:
-        int: Index of the last found stop token in a sentence. None if no stop token is found
-    """
-
-    assert len(labels.shape) == 2
-    assert len(stop_action.shape) == 1
-    assert stop_action.shape[0] == labels.shape[-1]
-
-    stop_labels = np.argwhere(np.all(labels == stop_action, axis=1))
-
-    if len(stop_labels) == 0:
-        return None
-
-    return stop_labels[-1][0]
-
-
-def empty_action_vector() -> np.ndarray:
-    """Returns a do-nothing actions vector
-
-    Returns:
-        np.ndarray: Vector of all zeros with length len(ACTIONS_KEYS)
-    """
-    return np.zeros(len(ACTIONS_KEYS))
-
-
-def empty_action_dict() -> dict:
-    """Returns a do-noting unencoded action dict
-
-    Returns:
-        dict: Action dict with all actions set to False
-    """
-
-    return decode_actions(empty_action_vector())
-
-
-def text_from_xml(path: str) -> str:
-    """Extract spoken text from dataset's xml format
-
-    Args:
-        path (str): Path to xml
-
-    Returns:
-        str: Raw text
-    """
-    root = ET.parse(path).getroot()
-
-    full_text = ""
-
-    for node in root.iter("*"):
-        if len(node) == 0:
-            who = node.get("who")
-            text = node.text
-
-            if text is not None and who is not None and who != "#komentarz":
-                full_text = " ".join([full_text, text])
-
-    del root
-
-    return full_text
-
-
-def detect_actions(word: str, next_word: Optional[str]) -> Mapping[str, bool]:
-    """Detect what actions should model perform on a word and returns encoded
-       action vector
-
-    Args:
-        word (str): Word on wich action is decided
-        next_word (Optional[str]): Word that follows considered word. Can be
-            None if nothing follows a word
-
-    Returns:
-        Mapping[str, bool]: Mapping telling if each of possible actions should be performed (True) or not (False)
-    """
-    # Unsupported characters
-    word = word.replace(";", ".")
-    word = word.replace('"', "")
-    word = word.replace("(", "")
-    word = word.replace(")", "")
-
-    while len(word) > 0 and not word[0].isalnum():  # remove leading non-alphanumeric characters
-        word = word[1:]
-
-    if len(word) == 0:
-        return dict(zip(ACTIONS_KEYS, [False] * len(ACTIONS_KEYS)))
-
-    actions = {
-        "upper_case": word[0].isupper(),
-        "dot": word[-1] == ".",
-        "colon": word[-1] == ",",
-        "question_mark": word[-1] == "?",
-    }
-
-    return actions
-
-
-def encode_actions(actions: Mapping[str, bool]) -> np.ndarray:
-    """Transforms actions into vector
-
-    Args:
-        actions (Mapping[str, bool]): Map telling which actions should be made
-
-    Returns:
-        np.ndarray: 1 dimensional action vector
-    """
-    return np.array(list(actions.values())).astype(float)
-
-
-def decode_actions(encoded_actions: np.ndarray) -> Mapping[str, bool]:
-    """Decodes actions
-
-    Args:
-        encoded_actions (np.ndarray): 1 dimensional action vector
-
-    Returns:
-        Mapping[str, bool]: Map telling which actions should be made
-    """
-    assert encoded_actions.shape[0] == len(ACTIONS_KEYS)
-
-    return dict(zip(ACTIONS_KEYS, encoded_actions.astype(np.bool).tolist()))
-
-
-def create_model_input_output(text: str) -> Tuple[str, np.ndarray]:
-    """Returns a pair of input and desired output of the model
-
-    Args:
-        text (str): Correct text sample
-
-    Returns:
-        text_cleaned (str): Text without any punctuation and all lowercase
-        actions (np.ndarray): Two-dimensional array where each row is the action vector of the corresponding word
-    """
-    words = output_preprocess(text).split(" ")
-
-    words_output = []
-    actions_output = []
-
-    i = 0
-    while i < len(words):
-        word = words[i]
-        next_word = words[i + 1] if len(words) > i + 1 else None
-
-        word_sanitized = input_preprocess(word)
-        if len(word_sanitized) > 0:
-            actions = detect_actions(word, next_word)
-            actions_encoded = encode_actions(actions)
-
-            words_output.append(word_sanitized)
-            actions_output.append(actions_encoded)
-
-        i += 1
-
-    assert len(words_output) == len(actions_output)
-
-    return " ".join(words_output), np.array(actions_output)
-
-
-def token_word_mapping(text: str, tokenizer: PreTrainedTokenizerFast) -> np.ndarray:
-    """Returns mapping where each token is labeled with index of word it's part of
-
-    Args:
-        text (str): Input text
-        tokenizer (PreTrainedTokenizerFast): Tokenizer used to tokenize text
-
-    Returns:
-        np.ndarray: Array of length L (number of tokens) where each entry is index of word (cls and sep labels are not counted).
-    """
-    text_tokenized = tokenizer(text, return_offsets_mapping=True)
-    offset_mappings = text_tokenized["offset_mapping"][1:-1]
-
-    # Create a map where each character is assigned the index of its word
-    words_mapping = []
-    actual_word = 0
-    for character in text:
-        words_mapping.append(actual_word)
-        if character == " ":
-            actual_word += 1
-
-    token_mapping = [words_mapping[x[0]] for x in offset_mappings]
-
-    return np.array(token_mapping)
-
-
-def token_labels_to_word_labels(
-    text: str, token_labels: np.ndarray, tokenizer: PreTrainedTokenizerFast
-) -> np.ndarray:
-    mapping = token_word_mapping(text, tokenizer)
-
-    assert len(mapping) == len(token_labels)
-
-    labels = defaultdict(list)
-
-    for i in range(len(mapping)):
-        labels[mapping[i]].append(token_labels[i])
-
-    return np.array([np.mean(labels[x], axis=0) for x in sorted(labels)])
-
-
-def tokenize_labeled_text(
-    text: str, labels: np.ndarray, tokenizer: PreTrainedTokenizerFast
-) -> Tuple[np.ndarray, np.ndarray]:
-    """Transforms text into numerical tokens. Also expand word-level labels into token-level labels
-
-    Args:
-        text (str): Text that will be tokenized (TODO: Change to array)
-        labels (np.ndarray): Word-level labels for text to be tokenized. Word is defined via space separation
-        tokenizer (PreTrainedTokenizerFast): Tokenizer that will be used for tokenization
-
-    Returns:
-        np.ndarray: 2-dimensional array with tokens (without cls and sep tokens!)
-        np.ndarray: 2-dimensional array with token-level labels
-    """
-    text_tokenized = tokenizer(text, return_offsets_mapping=True)
-
-    offset_mappings = text_tokenized["offset_mapping"][1:-1]
-    input_ids = text_tokenized["input_ids"][1:-1]
-
-    # Create a map where each character is assigned the index of its word
-    words_mapping = []
-    actual_word = 0
-    for character in text:
-        words_mapping.append(actual_word)
-        if character == " ":
-            actual_word += 1
-
-    # Assign each token to a word
-    token_mapping = [words_mapping[x[0]] for x in offset_mappings]
-
-    # Expand word-based labels to token-based labels
-    labels_tokenized = [labels[i] for i in token_mapping]
-
-    return np.array(input_ids).reshape(-1, 1), np.array(labels_tokenized)
-
-
-def recover_word(word: str, action: Mapping[str, bool]) -> str:
-    """Applies action to a word
-
-    Args:
-        word (str): word on which action will be applied
-        action (Mapping[str, bool]): Action to be applied
-
-    Returns:
-        str: transformed word
-    """
-    word_result = word
-
-    if action["dot"]:
-        word_result += "."
-    if action["upper_case"]:
-        word_result = word_result.capitalize()
-    if action["colon"]:
-        word_result += ","
-    if action["question_mark"]:
-        word_result += "?"
-
-    return word_result
-
-
-def is_sentence_end(actions_encoded: np.ndarray) -> bool:
-    """Returns if given action would end a sentence
-
-    Args:
-        actions_encoded (np.ndarray): Action vector
-
-    Returns:
-        bool: True if action would end a sentence, False otherwise
-    """
-    actions_decoded = decode_actions(actions_encoded)
-
-    return actions_decoded["dot"] is True
-
-
-def nearest_sentence_l(labels: np.array, index_start: int) -> int:
-    """Find nearest word that begins a sentence that has lower or equal index to index_start
-
-    Args:
-        labels (np.array): 2-dimensonal array of action-vectors
-        index_start (int): Index from which search will be started
-
-    Returns:
-        int: Index of nearest left-oriented start of the sentence. If no sentence is found, first index is assumed to
-             start a sentence
-    """
-    result_index = index_start
-
-    while result_index > 0:
-        if is_sentence_end(labels[result_index, :]):
-            # prevent being in the middle of a token
-            result_index -= 1
-        elif is_sentence_end(labels[result_index - 1, :]):
-            break
-        elif result_index == 1:
-            result_index = 0
-            break
-        else:
-            result_index -= 1
-
-    return result_index
-
-
-def nearest_sentence_r(labels: np.array, index_start: int) -> Optional[int]:
-    """Find nearest word that begins a sentence that has higher or equal index to index_start
-
-    Args:
-        labels (np.array): 2-dimensonal array of action-vectors
-        index_start (int): Index from which search will be started
-
-    Returns:
-        int: Index of nearest right-oriented start of the sentence. None if no later sentence is found
-    """
-    result_index = index_start
-
-    while result_index < len(labels):
-        if is_sentence_end(labels[result_index - 1]) and not is_sentence_end(
-            labels[result_index]
-        ):
-            break
-        else:
-            result_index += 1
-
-    if result_index >= len(labels):
-        return None
-    else:
-        return result_index
-
-
-def batchify_labels(
-    labels: np.ndarray, max_tokens: int, min_tokens: int = 3
-) -> List[np.ndarray]:
-    """Splits long labels array into batches of desired size
-
-    Args:
-        labels (np.ndarray): 2-dimensional array of action-vectors
-        max_tokens (int): Maximum number of labels in a single batch
-        min_tokens (int, optional): Minimum number of labels in a single batch. Defaults to 3.
-
-    Returns:
-        List[np.ndarray]: List of arrays with the indexes composing each batch
-    """
-    assert min_tokens >= 1
-    assert max_tokens >= 1
-
-    labels_batches = []
-
-    index = 0
-    new_index = 0
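-    # Greedily take up to max_tokens labels per batch, cutting at the nearest sentence start
-    # (searching to the left of the cut point first, then to the right)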
-    while index < (labels.shape[0] - min_tokens):
-        num_consumed = min(max_tokens, labels.shape[0] - index)
-
-        assert num_consumed >= min_tokens
-
-        if index + num_consumed < (labels.shape[0] - min_tokens):
-            new_index = nearest_sentence_l(labels, index + num_consumed)
-            if new_index == index:
-                new_index = nearest_sentence_r(labels, index + num_consumed)
-                if new_index is None:
-                    labels_batches.append(
-                        np.array(list(range(index, index + num_consumed)))
-                    )
-                    break
-        else:
-            labels_batches.append(np.array(list(range(index, index + num_consumed))))
-            break
-
-        labels_batches.append(np.array(list(range(index, index + num_consumed))))
-
-        index = new_index
-
-    return labels_batches
-
-
-def add_cls_sep(
-    tokens: np.ndarray, labels: np.ndarray, tokenizer: PreTrainedTokenizerFast
-) -> Tuple[np.ndarray, np.ndarray]:
-    """Adds staring cls and ending sep token ids into tokens & labels
-
-    Args:
-        tokens (np.ndarray): 2-dimensional array (with 1 feature!) of tokens
-        labels (np.ndarray): 2-dimensional array of action vectors
-
-    Returns:
-        np.ndarray: tokens with added cls & sep tokens ids
-        np.ndarray: labels with first and last item duplicated to accommodate cls & sep
-    """
-
-    tokens = np.concatenate(
-        [[[tokenizer.cls_token_id]], tokens, [[tokenizer.sep_token_id]]]
-    )
-    labels = np.concatenate([labels[:1, :], labels, labels[-1:, :]])
-
-    return tokens, labels
-
-
-def add_padding(
-    tokens: np.ndarray,
-    labels: np.ndarray,
-    length: int,
-    tokenizer: PreTrainedTokenizerFast,
-) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """Appends padding to tokens and labels to match desired length
-
-    Args:
-        tokens (np.ndarray): Lx1 array of token ids
-        labels (np.ndarray): LxA array of action vectors
-        length (int): Desired length of a vector. Must be greater than or equal to L
-        tokenizer (PreTrainedTokenizerFast): Tokenizer that was used for tokenization
-
-    Returns:
-        np.ndarray: (L+P)x1 array of token ids with added padding
-        np.ndarray: (L+P)xA array of action vectors with added padding
-        np.ndarray: (L+P)-length mask array where True means token and False means padding
-    """
-
-    pad_length = length - tokens.shape[0]
-    assert pad_length >= 0
-
-    if pad_length > 0:
-        tokens = np.concatenate([tokens, [[tokenizer.pad_token_id]] * pad_length])
-        labels = np.concatenate([labels, [empty_action_vector()] * pad_length])
-
-    mask = np.ones(len(tokens)).astype(np.int)
-
-    if pad_length > 0:
-        mask[-pad_length:] = False
-
-    return tokens, labels, mask
-
-
-def batchify_data(
-    tokens: np.ndarray,
-    labels: np.ndarray,
-    max_tokens: int,
-    tokenizer: PreTrainedTokenizerFast,
-    min_tokens: int = 3,
-) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """Chops a long tokens-labels pair into smaller chunks of equal length (with added padding)
-
-    Args:
-        tokens (np.ndarray): Tokens representing long, unpunctuated text (Shape Lx1)
-        labels (np.ndarray): Action-labels to transform the provided text into a punctuated one (Shape LxA)
-        max_tokens (int): Maximum number of tokens in a single entry
-        tokenizer (PreTrainedTokenizerFast): Tokenizer used to tokenize sentence into tokens
-        min_tokens (int, optional): Minimum number of tokens in a sentence. Defaults to 3.
-
-    Returns:
-        Tuple[np.ndarray, np.ndarray, np.ndarray]:
-            tokens_batch - Tokens array split into smaller chunks. (Shape Bx(max_tokens)x1)
-            labels_batch - Labels array split into smaller chunks. (Shape Bx(max_tokens)xA)
-            mask_batch - Attention mask for each chunk. (Shape Bx(max_tokens))
-
-    """
-
-    assert max_tokens >= min_tokens + 2
-    assert min_tokens >= 1
-
-    tokens_batch = []
-    labels_batch = []
-    mask_batch = []
-
-    idxs = batchify_labels(labels, max_tokens - 2, min_tokens)
-
-    for ids in idxs:
-        tokens_sample = tokens[ids, :]
-        labels_sample = labels[ids, :]
-
-        assert len(ids) >= min_tokens
-        assert len(ids) <= max_tokens - 2
-
-        tokens_sample, labels_sample = add_cls_sep(
-            tokens_sample, labels_sample, tokenizer
-        )
-
-        assert len(tokens_sample) <= max_tokens
-
-        tokens_sample, labels_sample, mask = add_padding(
-            tokens_sample, labels_sample, max_tokens, tokenizer
-        )
-
-        tokens_batch.append(tokens_sample)
-        labels_batch.append(labels_sample)
-        mask_batch.append(mask)
-
-    return np.array(tokens_batch), np.array(labels_batch), np.array(mask_batch)
-
-
-def recover_text(text: str, actions_encoded: np.ndarray) -> str:
-    """Applies per-word actions to unpunctuated text
-
-    Args:
-        text (str): lowercase, unpunctuated text
-        actions_encoded (np.ndarray): Array of per-word action vectors (Shape LxA)
-
-    Returns:
-        str: Punctuated version of the text
-    """
-    words = text.split(" ")
-
-    words_output = []
-
-    for word, action_encoded in zip(words, actions_encoded.tolist()):
-        action_decoded = decode_actions(np.array(action_encoded))
-
-        word_recovered = recover_word(word, action_decoded)
-        words_output.append(word_recovered)
-
-    return " ".join(words_output)
diff --git a/src/pipelines/actions_based/scoring.py b/src/pipelines/actions_based/scoring.py
deleted file mode 100644
index 66a267f..0000000
--- a/src/pipelines/actions_based/scoring.py
+++ /dev/null
@@ -1,122 +0,0 @@
-from typing import List, Optional, Tuple
-
-import numpy as np
-from sklearn.metrics import auc, f1_score, roc_curve
-
-from src.pipelines.actions_based.processing import ACTIONS_KEYS
-from src.utils import prepare_folder
-
-
-class Metrics:
-    """Class for model metrics calcuation and presenting"""
-
-    def __init__(self, name: str, output_dir: Optional[str]) -> None:
-        """Initializes Metrics
-
-        Args:
-            name (str): Name of the model that is measured
-            output_dir (Optional[str]): Directory where measurements will be saved. Can be None if saving is not required
-        """
-        self.name = name
-        self.message = ""
-        self.output_dir = output_dir
-
-    def compute_metrics(self, predictions: np.ndarray, targets: np.ndarray):
-        """Performs metrics calculation on model predictions relative to ground truth
-
-        Args:
-            predictions (np.ndarray): Predicted, non-thresholded values
-            targets (np.ndarray): Ground truth values
-        """
-        f1_scores = self._f1_scores(predictions, targets)
-
-        self._log_text(f"Model {self.name} | F1 scores")
-        self._log_text("----------------------")
-        self._log_text(f1_scores)
-        self._log_text("----------------------")
-
-        self._output_message()
-
-    def _f1_scores(self, predictions: np.ndarray, targets: np.ndarray) -> dict:
-        predictions = predictions_threshold(predictions, 0.0)
-        f1_scores = f1_score(predictions, targets, average=None)
-
-        return dict(zip(ACTIONS_KEYS, f1_scores))
-
-    def _output_message(self):
-        print(self.message)
-
-        if self.output_dir is not None:
-            prepare_folder(self.output_dir)
-
-            with open(f"{self.output_dir}/{self.name}.txt", "w") as f:
-                f.write(self.message)
-
-    def _log_text(self, text: str):
-        self.message += f"{text}\n"
-
-
-def predictions_threshold(
-    predictions: np.ndarray, threshold: float = 0.9
-) -> np.ndarray:
-    """Applies thresholding above which all values will be assigned 1.0, otherwsie 0.0
-
-    Args:
-        predictions (np.ndarray): Unthresholded predictions
-        threshold (float, optional): Threshold. Defaults to 0.9.
-
-    Returns:
-        np.ndarray: Binarized predictions
-    """
-    return (predictions > threshold).astype(np.float)
-
-
-def multiclass_roc_curve(
-    target: np.ndarray, predictions: np.ndarray
-) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
-    """Computes ROC-curve points for multiclass/mutlilabel case
-
-    Args:
-        target (np.ndarray): Ground-truth values
-        predictions (np.ndarray): Unthresholded predictions
-
-    Returns:
-        Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: False positive rates, true positive rates, thresholds. All
-        values are returned as lists, where each entry in the list corresponds to the values for a single class
-    """
-    class_fprs = []
-    class_tprs = []
-    class_thresholds = []
-
-    for index in range(predictions.shape[-1]):
-        fpr, tpr, thresholds = roc_curve(target[:, index], predictions[:, index])
-
-        class_fprs.append(fpr)
-        class_tprs.append(tpr)
-        class_thresholds.append(thresholds)
-
-    return class_fprs, class_tprs, class_thresholds
-
-
-def multiclass_auc(
-    false_positive_rate: List[np.ndarray], true_positive_rate: List[np.ndarray]
-) -> np.ndarray:
-    """Computes area under curve for each class in multilabel/multiclass case
-
-    Args:
-        false_positive_rate (List[np.ndarray]): False positive rates, where each entry in the list corresponds to the values for a single class
-        true_positive_rate (List[np.ndarray]): True positive rates, where each entry in the list corresponds to the values for a single class
-
-    Returns:
-        np.ndarray: List of auc values for each class
-    """
-
-    assert len(false_positive_rate) == len(true_positive_rate)
-
-    num_classes = len(false_positive_rate)
-    auc_list = np.zeros(num_classes)
-
-    for i in range(num_classes):
-        auc_list[i] = auc(false_positive_rate[i], true_positive_rate[i])
-
-    return auc_list
diff --git a/src/pipelines/actions_based/stage1_extraction.py b/src/pipelines/actions_based/stage1_extraction.py
deleted file mode 100644
index 94dc26c..0000000
--- a/src/pipelines/actions_based/stage1_extraction.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# /usr/bin/python3
-import glob
-
-import dask.dataframe as dd
-import numpy as np
-import pandas as pd
-from dask.distributed import Client
-
-from src.pipelines.actions_based.processing import (
-    APPLY_FILE_PROCESSING_META,
-    apply_file_processing,
-)
-from src.utils import PROJECT_ROOT, get_config, prepare_folder
-
-INPUT_FOLDER = f"{PROJECT_ROOT}/data"
-OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage1_extraction"
-
-
-if __name__ == "__main__":
-
-    config = get_config()
-    num_partitions = config["actions"]["extraction"]["num_partitions"]
-    num_workers = config["actions"]["extraction"]["num_workers"]
-    memory_limit = config["actions"]["extraction"]["worker_memory_limit"]
-
-    prepare_folder(OUTPUT_FOLDER)
-
-    file_schema = "data/**/text_structure.xml"
-    files_paths = glob.glob(file_schema, recursive=True)
-
-    # Make sure python memory fragmentation won't go insane
-    np.random.shuffle(files_paths)
-
-    client = Client(n_workers=num_workers, memory_limit=memory_limit)
-    print(f"Dashboard: {client.dashboard_link}")
-
-    # Processing pipeline
-    df = dd.from_pandas(pd.DataFrame({"file": files_paths}), npartitions=num_partitions)
-
-    df = df.apply(
-        apply_file_processing,
-        result_type="expand",
-        axis=1,
-        meta=APPLY_FILE_PROCESSING_META,
-    )
-    df = df.dropna()
-
-    # Export
-    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/actions_based/stage2_tokenization.py b/src/pipelines/actions_based/stage2_tokenization.py
deleted file mode 100644
index b30445f..0000000
--- a/src/pipelines/actions_based/stage2_tokenization.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# /usr/bin/python3
-import dask
-import dask.dataframe as dd
-from dask.distributed import Client
-from transformers import BertTokenizerFast
-
-from src.pipelines.actions_based.processing import (
-    APPLY_TOKENIZATION_META,
-    apply_tokenization,
-)
-from src.utils import PROJECT_ROOT, get_config, prepare_folder
-
-INPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage1_extraction"
-OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage2_tokenization"
-
-if __name__ == "__main__":
-
-    config = get_config()
-    max_tokens = config["actions"]["tokenization"]["max_tokens"]
-    min_tokens = config["actions"]["tokenization"]["min_tokens"]
-    num_workers = config["actions"]["tokenization"]["num_workers"]
-    memory_limit = config["actions"]["tokenization"]["worker_memory_limit"]
-    base_model = config["global"]["base_model"]
-
-    prepare_folder(OUTPUT_FOLDER)
-
-    client = Client(n_workers=num_workers, memory_limit=memory_limit)
-    print(client.dashboard_link)
-
-    tokenizer = BertTokenizerFast.from_pretrained(base_model)
-
-    tokenizer = dask.delayed(tokenizer)
-
-    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
-    df = df.apply(
-        apply_tokenization,
-        args=(min_tokens, max_tokens, tokenizer),
-        result_type="expand",
-        axis=1,
-        meta=APPLY_TOKENIZATION_META,
-    )
-
-    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/actions_based/stage3_exploding.py b/src/pipelines/actions_based/stage3_exploding.py
deleted file mode 100644
index 72ec128..0000000
--- a/src/pipelines/actions_based/stage3_exploding.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# /usr/bin/python3
-import dask.dataframe as dd
-from dask.distributed import Client
-
-from src.processing import (
-    EXPAND_DIMS_META,
-    FLATTEN_DIMS_META,
-    expand_dims,
-    flatten_dims,
-)
-from src.utils import PROJECT_ROOT, get_config, prepare_folder
-
-INPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage2_tokenization"
-OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage3_exploding"
-
-if __name__ == "__main__":
-    config = get_config()
-    num_workers = config["actions"]["exploding"]["num_workers"]
-    memory_limit = config["actions"]["exploding"]["worker_memory_limit"]
-
-    prepare_folder(OUTPUT_FOLDER)
-
-    client = Client(n_workers=num_workers, memory_limit=memory_limit)
-    print(client.dashboard_link)
-
-    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
-
-    df = df.apply(expand_dims, result_type="expand", axis=1, meta=EXPAND_DIMS_META)
-    df = df.map_partitions(
-        lambda x: x.apply(lambda y: y.explode(), axis=0), meta=EXPAND_DIMS_META
-    )
-    df = df.apply(flatten_dims, result_type="expand", axis=1, meta=FLATTEN_DIMS_META)
-
-    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/actions_based/stage4_reindexing.py b/src/pipelines/actions_based/stage4_reindexing.py
deleted file mode 100644
index fade725..0000000
--- a/src/pipelines/actions_based/stage4_reindexing.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# /usr/bin/python3
-import dask.dataframe as dd
-from dask.distributed import Client
-
-from src.utils import PROJECT_ROOT, get_config, prepare_folder
-
-INPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage3_exploding"
-OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage4_reindexing"
-
-if __name__ == "__main__":
-    config = get_config()
-    num_workers = config["actions"]["reindexing"]["num_workers"]
-    memory_limit = config["actions"]["reindexing"]["worker_memory_limit"]
-
-    prepare_folder(OUTPUT_FOLDER)
-
-    client = Client(n_workers=num_workers, memory_limit=memory_limit)
-    print(client.dashboard_link)
-
-    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
-
-    # Add ordered indexes
-    df = df.assign(ones=1)
-    df = df.reset_index(drop=True)
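-    # Cumulative sum over a column of ones gives a 0-based sequential index across all partitions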
-    idx = (df.ones.cumsum() - 1).persist()
-    df = df.assign(ones=idx)
-
-    df = df.set_index("ones", shuffle="tasks")
-    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/actions_based/stage5_stats.py b/src/pipelines/actions_based/stage5_stats.py
deleted file mode 100644
index a91ae2f..0000000
--- a/src/pipelines/actions_based/stage5_stats.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# /usr/bin/python3
-import pickle
-
-import dask.dataframe as dd
-import numpy as np
-from dask.distributed import Client
-
-from src.pipelines.actions_based.processing import ACTIONS_KEYS
-from src.processing import EXPAND_DIMS_META, expand_dims
-from src.utils import PROJECT_ROOT, get_config, prepare_folder
-
-INPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage4_reindexing"
-OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage5_stats"
-
-
-def reduce_fold(fold_value, new_value):
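-    # Accumulate per-class positive-label counts and the total number of label rows seen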
-    return {
-        "class_number": fold_value["class_number"] + np.sum(new_value, axis=0),
-        "num_examples": fold_value["num_examples"] + new_value.shape[0],
-    }
-
-
-def reduce_partitions(x, y):
-    return {
-        "class_number": x["class_number"] + y["class_number"],
-        "num_examples": x["num_examples"] + y["num_examples"],
-    }
-
-
-if __name__ == "__main__":
-    config = get_config()
-    num_workers = config["actions"]["stats"]["num_workers"]
-    memory_limit = config["actions"]["stats"]["worker_memory_limit"]
-
-    prepare_folder(OUTPUT_FOLDER)
-
-    client = Client(n_workers=num_workers, memory_limit=memory_limit)
-    print(client.dashboard_link)
-
-    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
-    df = df.apply(expand_dims, result_type="expand", axis=1, meta=EXPAND_DIMS_META)
-
-    outputs_bag = df["target"].to_bag()
-
-    inital_values = {
-        "class_number": np.array([0] * len(ACTIONS_KEYS)),
-        "num_examples": 0,
-    }
-
-    result = outputs_bag.fold(
-        reduce_fold, reduce_partitions, initial=inital_values
-    ).compute()
-
-    with open(f"{OUTPUT_FOLDER}/stats.pickle", "wb") as f:
-        pickle.dump(result, f)
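
A minimal sketch of the `bag.fold` reduction above, on a toy bag of 0/1 action vectors; the two-class example values are made up:

```python
import dask.bag as db
import numpy as np

targets = db.from_sequence(
    [np.array([[1, 0], [0, 1]]), np.array([[1, 1]])], npartitions=2
)

initial = {"class_number": np.zeros(2), "num_examples": 0}

def fold(acc, batch):
    # Accumulate per-class positive counts and the number of rows seen so far.
    return {
        "class_number": acc["class_number"] + batch.sum(axis=0),
        "num_examples": acc["num_examples"] + batch.shape[0],
    }

def combine(a, b):
    # Merge partial results coming from different partitions.
    return {
        "class_number": a["class_number"] + b["class_number"],
        "num_examples": a["num_examples"] + b["num_examples"],
    }

stats = targets.fold(fold, combine, initial=initial).compute()
print(stats)  # {'class_number': array([2., 2.]), 'num_examples': 3}
```
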
diff --git a/src/pipelines/actions_based/test.py b/src/pipelines/actions_based/test.py
deleted file mode 100644
index 8cfcda9..0000000
--- a/src/pipelines/actions_based/test.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import argparse
-
-import dask.dataframe as dd
-import numpy as np
-import torch
-from tqdm import trange
-
-from src.batch_loading import get_ordered_dataframe_len
-from src.models.actions_model_base import ActionsModelBase
-from src.models.actions_model_mixed import ActionsModelMixed
-from src.models.actions_model_restricted import ActionsModelRestricted
-from src.pipelines.actions_based.scoring import Metrics
-from src.utils import PROJECT_ROOT, get_config, unflattened_column
-
-SUPPORTED_MODELS = {
-    "base": ActionsModelBase,
-    "restricted": ActionsModelRestricted,
-    "mixed": ActionsModelMixed,
-}
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Evaluate actions model")
-    parser.add_argument(
-        "-a",
-        "--architecture",
-        required=True,
-        choices=SUPPORTED_MODELS.keys(),
-        help="Model architecture",
-    )
-    parser.add_argument(
-        "-d",
-        "--directory",
-        required=True,
-        help="Directory where trained model is located, relative to project root",
-    )
-    parser.add_argument("-m", "--model", default="final", help="Pretrained model name")
-    parser.add_argument(
-        "-ds",
-        "--dataset",
-        type=str,
-        required=True,
-        help="Directory where test dataset is located, relative to project root",
-    )
-    parser.add_argument(
-        "-o",
-        "--output",
-        type=str,
-        required=True,
-        help="Directory where output will be stored",
-    )
-    parser.add_argument(
-        "-s", "--stage", type=str, required=True, help="Stage name in params.yaml"
-    )
-    args = parser.parse_args()
-
-    config = get_config()
-    limit = config["actions"][args.stage]["limit"]
-    batch_size = config["actions"][args.stage]["batch_size"]
-    device_name = config["actions"][args.stage]["device"]
-
-    test_dataset = f"{PROJECT_ROOT}/{args.dataset}"
-
-    print("Getting dataset info...")
-    df = dd.read_parquet(test_dataset, engine="pyarrow")
-
-    print("Loading dataset to memory...")
-    df_len = get_ordered_dataframe_len(df)
-
-    data_start = max(df_len - limit, 0)
-    data_end = df_len
-    pdf = df.loc[data_start:data_end].compute().reset_index()
-
-    device = torch.device(device_name)
-
-    print(f"Loading model {args.model}")
-    model_location = f"{PROJECT_ROOT}/{args.directory}"
-    model_type = SUPPORTED_MODELS[args.architecture]
-    model = model_type.load(model_location, args.model, device)
-
-    true_batches = []
-    prediction_batches = []
-
-    print("Computing...")
-    num_batches = len(pdf) // batch_size
-    for batch in trange(num_batches):
-
-        batch_start = batch * batch_size
-        batch_end = (batch + 1) * batch_size
-        batch_pdf = pdf.iloc[batch_start:batch_end]
-
-        inputs = unflattened_column(batch_pdf, "source")
-        outputs = unflattened_column(batch_pdf, "target")
-        attentions_mask = unflattened_column(batch_pdf, "attention_mask")
-
-        inputs = torch.tensor(inputs, dtype=torch.long).squeeze(dim=-1).to(device)
-        outputs = torch.tensor(outputs, dtype=torch.float).to(device)
-        attentions_mask = torch.tensor(attentions_mask).to(device)
-
-        prediction_batch = (
-            model.predict_raw(inputs, attentions_mask).detach().cpu().numpy()
-        )
-        prediction_batches.append(prediction_batch)
-
-        true_batches.append(outputs.cpu().numpy())
-
-    predictions = np.concatenate(prediction_batches, axis=0).reshape(
-        -1, prediction_batches[0].shape[-1]
-    )
-    trues = np.concatenate(true_batches, axis=0).reshape(-1, true_batches[0].shape[-1])
-
-    metrics = Metrics("actions-base", args.output)
-
-    print("Calculating metrics...")
-    metrics.compute_metrics(predictions, trues)
diff --git a/src/pipelines/actions_based/train_base.py b/src/pipelines/actions_based/train_base.py
deleted file mode 100755
index c8597f0..0000000
--- a/src/pipelines/actions_based/train_base.py
+++ /dev/null
@@ -1,120 +0,0 @@
-#!/usr/bin/python3
-
-import pickle
-
-import dask.dataframe as dd
-import numpy as np
-import torch
-from transformers import BertTokenizerFast
-
-from src.models.actions_model_base import (
-    ActionsModelBase,
-    ActionsModelBaseLoss,
-    ActionsModelBaseParams,
-)
-from src.pipelines.actions_based.processing import ACTIONS_KEYS
-from src.utils import (
-    PROJECT_ROOT,
-    Checkpoint,
-    Loader,
-    ProgressTracker,
-    Saver,
-    Timeout,
-    convert_to_timedelta,
-    get_config,
-    random_indexes,
-    training_loop,
-    unflattened_column,
-)
-
-INPUT_PATH = f"{PROJECT_ROOT}/generated/actions/stage4_reindexing"
-INPUT_STATS_PATH = f"{PROJECT_ROOT}/generated/actions/stage5_stats"
-OUTPUT_PATH = f"{PROJECT_ROOT}/checkpoints/actions_base"
-
-
-if __name__ == "__main__":
-    config = get_config()
-    learning_rate = config["actions"]["training_base"]["learning_rate"]
-    num_epochs = config["actions"]["training_base"]["num_epochs"]
-    batch_size = config["actions"]["training_base"]["batch_size"]
-    save_step = config["actions"]["training_base"]["save_step"]
-    batch_buffer_size = config["actions"]["training_base"]["batch_buffer_size"]
-    loss_averaging_span = config["actions"]["training_base"]["loss_averaging_span"]
-    fresh_start = config["actions"]["training_base"]["fresh_start"]
-    device_name = config["actions"]["training_base"]["device"]
-    max_train_time = convert_to_timedelta(
-        config["actions"]["training_base"]["max_training_time"]
-    )
-    base_model = config["global"]["base_model"]
-    seed = config["global"]["random_seed"]
-
-    np.random.seed(seed=seed)
-    df = dd.read_parquet(INPUT_PATH, engine="pyarrow")
-
-    device = torch.device(device_name if torch.cuda.is_available() else "cpu")
-    tokenizer = BertTokenizerFast.from_pretrained(base_model)
-
-    loader = Loader(OUTPUT_PATH, ActionsModelBase, torch.optim.AdamW, device)
-    if loader.has_checkpoints() and not fresh_start:
-        model, optimizer, epoch_start, sample_start = loader.load_latest()
-    else:
-        params = ActionsModelBaseParams(base_model, len(ACTIONS_KEYS))
-        model = ActionsModelBase(params)
-        model.to(device)
-
-        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
-        epoch_start, sample_start = (0, 0)
-
-    model.train()
-
-    # Load loss weights
-    with open(f"{INPUT_STATS_PATH}/stats.pickle", "rb") as f:
-        stats = pickle.load(f)
-        pos_examples = stats["class_number"]
-        neg_examples = stats["num_examples"] - stats["class_number"]
-        pos_weight = torch.tensor(neg_examples / pos_examples)
-
-    criterion = ActionsModelBaseLoss(pos_weight).to(device)
-
-    random_index_shuffle = random_indexes(df)
-    training_stopped = False
-
-    saver = Saver(OUTPUT_PATH, model, optimizer)
-    checkpoint = Checkpoint(save_step, saver, epoch_start, sample_start)
-    timer = Timeout(max_train_time, saver)
-    tracker = ProgressTracker(device, loss_averaging_span)
-
-    timer.start()
-    for data_batch, epoch, i in training_loop(
-        epoch_start,
-        sample_start,
-        num_epochs,
-        df,
-        batch_size,
-        batch_buffer_size,
-        random_index_shuffle,
-    ):
-        inputs = unflattened_column(data_batch, "source")
-        outputs = unflattened_column(data_batch, "target")
-        attentions_mask = unflattened_column(data_batch, "attention_mask")
-
-        inputs = torch.tensor(inputs, dtype=torch.long).squeeze(dim=-1).to(device)
-        outputs = torch.tensor(outputs, dtype=torch.float).to(device)
-        attentions_mask = torch.tensor(attentions_mask).type(torch.long).to(device)
-
-        y_pred = model(input_ids=inputs, attention_mask=attentions_mask)
-
-        optimizer.zero_grad()
-        loss = criterion(y_pred, outputs)
-
-        tracker.step(epoch, i, loss)
-        checkpoint.step(epoch, i)
-        if timer.step(epoch, i):
-            training_stopped = True
-            break
-
-        loss.backward()
-        optimizer.step()
-
-    if not training_stopped:
-        saver.save("final")
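
The per-class statistics gathered in stage 5 become the `pos_weight` of the loss above. A minimal sketch of that rebalancing, assuming the custom loss wraps a BCE-with-logits criterion, with made-up counts:

```python
import torch

class_number = torch.tensor([900.0, 50.0, 10.0])   # positives per action class (made up)
num_examples = 1000.0

pos_examples = class_number
neg_examples = num_examples - class_number
pos_weight = neg_examples / pos_examples            # rare classes get large weights

criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

logits = torch.randn(4, 3)                          # (batch, num_actions)
targets = torch.randint(0, 2, (4, 3)).float()
print(pos_weight, criterion(logits, targets))
```
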
diff --git a/src/pipelines/actions_based/train_mixed.py b/src/pipelines/actions_based/train_mixed.py
deleted file mode 100755
index fd44e27..0000000
--- a/src/pipelines/actions_based/train_mixed.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/python3
-
-import pickle
-
-import dask.dataframe as dd
-import numpy as np
-import torch
-from transformers import BertTokenizerFast
-
-from src.models.actions_model_mixed import (
-    ActionsModelMixed,
-    ActionsModelMixedLoss,
-    ActionsModelMixedParams,
-)
-from src.pipelines.actions_based.processing import ACTIONS_KEYS
-from src.utils import (
-    PROJECT_ROOT,
-    Checkpoint,
-    Loader,
-    ProgressTracker,
-    Saver,
-    Timeout,
-    convert_to_timedelta,
-    get_config,
-    random_indexes,
-    training_loop,
-    unflattened_column,
-)
-
-INPUT_PATH = f"{PROJECT_ROOT}/generated/actions/stage4_reindexing"
-INPUT_STATS_PATH = f"{PROJECT_ROOT}/generated/actions/stage5_stats"
-OUTPUT_PATH = f"{PROJECT_ROOT}/checkpoints/actions_mixed"
-
-
-if __name__ == "__main__":
-    config = get_config()
-    threshold = config["actions"]["training_mixed"]["threshold"]
-    embedding_size = config["actions"]["training_mixed"]["embedding_size"]
-    num_heads = config["actions"]["training_mixed"]["num_heads"]
-    num_layers = config["actions"]["training_mixed"]["num_layers"]
-    dropout = config["actions"]["training_mixed"]["dropout"]
-    feedforward_neurons = config["actions"]["training_mixed"]["feedforward_neurons"]
-    learning_rate = config["actions"]["training_mixed"]["learning_rate"]
-    num_epochs = config["actions"]["training_mixed"]["num_epochs"]
-    batch_size = config["actions"]["training_mixed"]["batch_size"]
-    save_step = config["actions"]["training_mixed"]["save_step"]
-    batch_buffer_size = config["actions"]["training_mixed"]["batch_buffer_size"]
-    loss_averaging_span = config["actions"]["training_mixed"]["loss_averaging_span"]
-    fresh_start = config["actions"]["training_mixed"]["fresh_start"]
-    device_name = config["actions"]["training_mixed"]["device"]
-    max_train_time = convert_to_timedelta(
-        config["actions"]["training_mixed"]["max_training_time"]
-    )
-    base_model = config["global"]["base_model"]
-    seed = config["global"]["random_seed"]
-
-    np.random.seed(seed=seed)
-    df = dd.read_parquet(INPUT_PATH, engine="pyarrow")
-
-    device = torch.device(device_name if torch.cuda.is_available() else "cpu")
-    tokenizer = BertTokenizerFast.from_pretrained(base_model)
-
-    loader = Loader(OUTPUT_PATH, ActionsModelMixed, torch.optim.AdamW, device)
-
-    if loader.has_checkpoints() and not fresh_start:
-        model, optimizer, epoch_start, sample_start = loader.load_latest()
-    else:
-        params = ActionsModelMixedParams(
-            base_model,
-            tokenizer.vocab_size,
-            threshold,
-            embedding_size,
-            num_heads,
-            num_layers,
-            feedforward_neurons,
-            len(ACTIONS_KEYS),
-            500,
-            dropout,
-        )
-        model = ActionsModelMixed(params)
-        model.to(device)
-        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
-        epoch_start, sample_start = (0, 0)
-
-    model.train()
-
-    # Load loss weights
-    with open(f"{INPUT_STATS_PATH}/stats.pickle", "rb") as f:
-        stats = pickle.load(f)
-        pos_examples = stats["class_number"]
-        neg_examples = stats["num_examples"] - stats["class_number"]
-        pos_weight = torch.tensor(neg_examples / pos_examples)
-
-    criterion = ActionsModelMixedLoss(pos_weight).to(device)
-
-    random_index_shuffle = random_indexes(df)
-    training_stopped = False
-
-    saver = Saver(OUTPUT_PATH, model, optimizer)
-    checkpoint = Checkpoint(save_step, saver, epoch_start, sample_start)
-    timer = Timeout(max_train_time, saver)
-    tracker = ProgressTracker(device, loss_averaging_span)
-
-    timer.start()
-    for data_batch, epoch, i in training_loop(
-        epoch_start,
-        sample_start,
-        num_epochs,
-        df,
-        batch_size,
-        batch_buffer_size,
-        random_index_shuffle,
-    ):
-        inputs = unflattened_column(data_batch, "source")
-        outputs = unflattened_column(data_batch, "target")
-        attentions_mask = unflattened_column(data_batch, "attention_mask")
-
-        inputs = torch.tensor(inputs, dtype=torch.long).to(device).squeeze(dim=2)
-
-        outputs = torch.tensor(outputs, dtype=torch.float).to(device)
-
-        # Convert to boolean
-        attentions_mask = torch.tensor(attentions_mask == 0).to(device)
-
-        y_pred = model(
-            input_ids=inputs,
-            actions=outputs[:, :-1, :],
-            attention_mask=attentions_mask,
-        )
-
-        loss = criterion(outputs[:, 1:, :], y_pred)
-        optimizer.zero_grad()
-
-        tracker.step(epoch, i, loss)
-        checkpoint.step(epoch, i)
-        if timer.step(epoch, i):
-            training_stopped = True
-            break
-
-        loss.backward()
-        optimizer.step()
-
-    if not training_stopped:
-        saver.save("final")
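
The mixed model is trained with teacher forcing: the decoder is fed `outputs[:, :-1, :]` and scored against `outputs[:, 1:, :]`, so position t predicts position t+1. A tiny sketch of that shift with made-up shapes and values:

```python
import torch

targets = torch.tensor([[[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]]])  # (batch, seq, actions)

decoder_input = targets[:, :-1, :]   # steps 0..T-2 are fed to the model
expected = targets[:, 1:, :]         # steps 1..T-1 are what it must predict

assert decoder_input.shape == expected.shape == (1, 2, 2)
print(decoder_input, expected, sep="\n")
```
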
diff --git a/src/pipelines/actions_based/train_restricted.py b/src/pipelines/actions_based/train_restricted.py
deleted file mode 100755
index ed43789..0000000
--- a/src/pipelines/actions_based/train_restricted.py
+++ /dev/null
@@ -1,146 +0,0 @@
-#!/usr/bin/python3
-
-import pickle
-
-import dask.dataframe as dd
-import numpy as np
-import torch
-from transformers import BertTokenizerFast
-
-from src.models.actions_model_restricted import (
-    ActionsModelRestricted,
-    ActionsModelRestrictedLoss,
-    ActionsModelRestrictedParams,
-)
-from src.pipelines.actions_based.processing import ACTIONS_KEYS
-from src.utils import (
-    PROJECT_ROOT,
-    Checkpoint,
-    Loader,
-    ProgressTracker,
-    Saver,
-    Timeout,
-    convert_to_timedelta,
-    get_config,
-    random_indexes,
-    training_loop,
-    unflattened_column,
-)
-
-INPUT_PATH = f"{PROJECT_ROOT}/generated/actions/stage4_reindexing"
-INPUT_STATS_PATH = f"{PROJECT_ROOT}/generated/actions/stage5_stats"
-OUTPUT_PATH = f"{PROJECT_ROOT}/checkpoints/actions_restricted"
-
-
-if __name__ == "__main__":
-
-    config = get_config()
-    learning_rate = config["actions"]["training_restricted"]["learning_rate"]
-    num_epochs = config["actions"]["training_restricted"]["num_epochs"]
-    batch_size = config["actions"]["training_restricted"]["batch_size"]
-    save_step = config["actions"]["training_restricted"]["save_step"]
-    batch_buffer_size = config["actions"]["training_restricted"]["batch_buffer_size"]
-    loss_averaging_span = config["actions"]["training_restricted"][
-        "loss_averaging_span"
-    ]
-    fresh_start = config["actions"]["training_restricted"]["fresh_start"]
-    device_name = config["actions"]["training_restricted"]["device"]
-    max_train_time = convert_to_timedelta(
-        config["actions"]["training_restricted"]["max_training_time"]
-    )
-    base_model = config["global"]["base_model"]
-    seed = config["global"]["random_seed"]
-
-    np.random.seed(seed=seed)
-    df = dd.read_parquet(INPUT_PATH, engine="pyarrow")
-
-    device = torch.device(device_name if torch.cuda.is_available() else "cpu")
-    tokenizer = BertTokenizerFast.from_pretrained(base_model)
-
-    loader = Loader(OUTPUT_PATH, ActionsModelRestricted, torch.optim.AdamW, device)
-    if loader.has_checkpoints() and not fresh_start:
-        model, optimizer, epoch_start, sample_start = loader.load_latest()
-    else:
-        params = ActionsModelRestrictedParams(base_model, len(ACTIONS_KEYS) + 1)
-        model = ActionsModelRestricted(params)
-        model.to(device)
-        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
-        epoch_start, sample_start = (0, 0)
-
-    model.train()
-
-    # Load loss weights
-    with open(f"{INPUT_STATS_PATH}/stats.pickle", "rb") as f:
-        stats = pickle.load(f)
-        pos_examples = stats["class_number"]
-        neg_examples = stats["num_examples"] - stats["class_number"]
-
-        uppercase_pos_examples = pos_examples[0]
-        uppercase_neg_examples = neg_examples[0]
-        uppercase_pos_odds = torch.tensor(
-            uppercase_pos_examples / uppercase_neg_examples, dtype=torch.float
-        )
-
-        has_punctuation_neg_examples = neg_examples[1:]
-        has_no_punctuation_neg_examples = np.sum(pos_examples[1:])
-
-        punctuation_neg_examples = np.concatenate(
-            [has_punctuation_neg_examples, has_no_punctuation_neg_examples.reshape(1)],
-            -1,
-        )
-
-        punctuation_class_weights = torch.tensor(
-            (punctuation_neg_examples) / np.sum(punctuation_neg_examples),
-            dtype=torch.float,
-        )
-
-    criterion = ActionsModelRestrictedLoss(
-        uppercase_pos_odds, punctuation_class_weights
-    ).to(device)
-
-    random_index_shuffle = random_indexes(df)
-    training_stopped = False
-
-    saver = Saver(OUTPUT_PATH, model, optimizer)
-    checkpoint = Checkpoint(save_step, saver, epoch_start, sample_start)
-    timer = Timeout(max_train_time, saver)
-    tracker = ProgressTracker(device, loss_averaging_span)
-
-    timer.start()
-    for data_batch, epoch, i in training_loop(
-        epoch_start,
-        sample_start,
-        num_epochs,
-        df,
-        batch_size,
-        batch_buffer_size,
-        random_index_shuffle,
-    ):
-        inputs = unflattened_column(data_batch, "source")
-        outputs = unflattened_column(data_batch, "target")
-        attentions_mask = unflattened_column(data_batch, "attention_mask")
-
-        inputs = torch.tensor(inputs, dtype=torch.long).squeeze(dim=-1).to(device)
-        outputs = torch.tensor(outputs, dtype=torch.float).to(device)
-        attentions_mask = torch.tensor(attentions_mask).to(device)
-
-        y_pred = model(input_ids=inputs, attention_mask=attentions_mask)
-
-        outputs = torch.cat(
-            [outputs, (1.0 - outputs[:, :, 1:].max(-1)[0]).unsqueeze(-1)], axis=-1
-        )
-
-        loss = criterion(y_pred, outputs)
-        optimizer.zero_grad()
-
-        tracker.step(epoch, i, loss)
-        checkpoint.step(epoch, i)
-        if timer.step(epoch, i):
-            training_stopped = True
-            break
-
-        loss.backward()
-        optimizer.step()
-
-    if not training_stopped:
-        saver.save("final")
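
Before computing the restricted loss, the target is extended with an implicit "no punctuation" class derived from the existing columns. A small sketch with made-up values (column 0 is assumed to be the uppercase flag, the rest punctuation signs):

```python
import torch

# (batch=1, seq=3, actions=3): [uppercase, dot, comma]
outputs = torch.tensor([[[1.0, 1.0, 0.0],
                         [0.0, 0.0, 0.0],
                         [1.0, 0.0, 1.0]]])

no_punct = 1.0 - outputs[:, :, 1:].max(-1)[0]        # 1 only where no punctuation fires
extended = torch.cat([outputs, no_punct.unsqueeze(-1)], dim=-1)

print(extended)
# rows: [1, 1, 0, 0], [0, 0, 0, 1], [1, 0, 1, 0]
```
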
diff --git a/src/pipelines/actions_based/utils.py b/src/pipelines/actions_based/utils.py
deleted file mode 100644
index b626357..0000000
--- a/src/pipelines/actions_based/utils.py
+++ /dev/null
@@ -1,135 +0,0 @@
-from typing import Tuple
-
-import numpy as np
-import torch
-import torch.nn as nn
-from transformers import BertForTokenClassification, BertTokenizerFast, PretrainedConfig
-
-from src.pipelines.actions_based.processing import (
-    ACTIONS_KEYS,
-    PUNCTUATION_INDEXES,
-    UPPERCASE_INDEX,
-    action_vector,
-    last_stop_label,
-    recover_text,
-    token_labels_to_word_labels,
-)
-
-
-def load_model(
-    model_path: str, base_model: str, device: str = "cpu"
-) -> Tuple[BertTokenizerFast, nn.Module]:
-    """Load pretrained model and it's tokenizer
-
-    Args:
-        model_path (str): Path to pretrained model
-        base_model (str): Name of base model
-        device (str, optional): Device on which model will be loaded. Defaults to "cpu".
-
-    Returns:
-        (BertTokenizerFast, nn.Module): Tokenizer & model
-    """
-
-    config = PretrainedConfig.from_pretrained(base_model)
-    config.num_labels = len(ACTIONS_KEYS)
-
-    device = torch.device(device)
-    tokenizer = BertTokenizerFast.from_pretrained(base_model)
-    model = BertForTokenClassification(config)
-    model.load_state_dict(torch.load(model_path, map_location=device))
-
-    return tokenizer, model
-
-
-def max_suppression(predictions: np.ndarray, threshold: float) -> np.ndarray:
-    """Converts raw prediction into action-vector with punctuation
-    limited to one sign.
-
-    Args:
-        predictions (np.ndarray): Raw predictions from the model
-        threshold (float): Thresholding value
-
-    Returns:
-        np.ndarray: Suppressed, thresholded action-vector
-    """
-    output = np.zeros_like(predictions)
-
-    output[:, :, 0] = (predictions[:, :, UPPERCASE_INDEX] >= threshold).astype(np.int)
-
-    def assign_most_probable(x):
-        res = np.zeros_like(x)
-
-        if x.max() > threshold:
-            res[x.argmax()] = 1
-
-        return res
-
-    output[:, :, PUNCTUATION_INDEXES] = np.apply_along_axis(
-        assign_most_probable, -1, predictions[:, :, PUNCTUATION_INDEXES]
-    )
-
-    return output
-
-
-def apply_actions_punctuation(
-    text: str,
-    chunk_size: int,
-    tokenizer: BertTokenizerFast,
-    model: nn.Module,
-    threshold: float = 0.9,
-) -> str:
-    """Adds punctuation to text using actions model
-
-    Args:
-        text (str): Raw, unpunctuated text
-        chunk_size (int): Maximum number of tokens to process at once (both memory & compute scale ~O(n^2))
-        tokenizer (BertTokenizerFast): Tokenizer to use
-        model (nn.Module): Trained actions model
-        threshold (float, optional): Threshold after which action will be applied. Defaults to 0.9.
-
-    Returns:
-        str: Text with punctuation and casing applied
-    """
-
-    text = text.strip()
-
-    tokens = tokenizer(text, return_tensors="pt")["input_ids"]
-    output = None
-
-    index_start = 0
-    while index_start < len(tokens[0]):
-        index_end = min(index_start + chunk_size, len(tokens[0]))
-
-        tokens_chunk = tokens[:, index_start:index_end]
-
-        raw_output = (
-            model(
-                input_ids=tokens_chunk,
-                token_type_ids=torch.zeros_like(tokens_chunk),
-                attention_mask=torch.ones_like(tokens_chunk),
-            )[0]
-            .sigmoid()
-            .detach()
-            .numpy()
-        )
-
-        actions = max_suppression(raw_output, threshold)[0]
-        offset = last_stop_label(actions, action_vector("dot"))
-
-        # Prevent infinite loop
-        if (offset is None) or (offset == 0):
-            offset = index_end - index_start
-
-        if output is None:
-            output = raw_output[0, 0:offset]
-        else:
-            output = np.concatenate([output, raw_output[0, 0:offset]], axis=0)
-
-        index_start += offset
-
-    assert len(output) == len(tokens[0])
-
-    word_labels = token_labels_to_word_labels(text, output[1:-1], tokenizer)
-    actions = word_labels > threshold
-
-    return recover_text(text, actions)
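
A small numpy-only demo of the `max_suppression` behaviour above, with made-up scores; the index layout (column 0 = uppercase, remaining columns = punctuation signs) is an assumption for the example:

```python
import numpy as np

predictions = np.array([[[0.95, 0.40, 0.70],    # uppercase + weak punctuation votes
                         [0.10, 0.92, 0.91]]])  # two strong punctuation votes
threshold = 0.9

suppressed = np.zeros_like(predictions)
suppressed[:, :, 0] = (predictions[:, :, 0] >= threshold).astype(int)

def keep_most_probable(scores):
    # Keep only the single strongest punctuation sign, and only if it clears the threshold.
    out = np.zeros_like(scores)
    if scores.max() > threshold:
        out[scores.argmax()] = 1
    return out

suppressed[:, :, 1:] = np.apply_along_axis(keep_most_probable, -1, predictions[:, :, 1:])
print(suppressed)
# token 0 -> uppercase only, no punctuation above threshold
# token 1 -> the single strongest punctuation sign wins
```
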
diff --git a/src/pipelines/translation_based/__init__.py b/src/pipelines/translation_based/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/pipelines/translation_based/processing.py b/src/pipelines/translation_based/processing.py
deleted file mode 100644
index 608cf43..0000000
--- a/src/pipelines/translation_based/processing.py
+++ /dev/null
@@ -1,302 +0,0 @@
-from typing import List, Tuple
-
-import numpy as np
-from transformers import BertTokenizerFast
-
-from src.pipelines.actions_based.processing import text_from_xml
-from src.utils import input_preprocess
-
-
-def raw_to_dataframe(entry: dict) -> dict:
-    """Converts dask datarfame containing files paths into
-    dataframe with content of that files (text only)
-
-    Args:
-        x (dict): Dask dataframe entry with one column ('file')
-
-    Returns:
-        dict: Dask dataframe entry with format {'input': str}. Can have null entries
-    """
-    full_text = text_from_xml(entry.file)
-
-    if len(full_text) > 0:
-        return {"input": full_text}
-    else:
-        return {"input": None}
-
-
-RAW_TO_DATAFRAME_META = {"input": str}
-
-
-def generate_batches(
-    entry: dict,
-    min_len: int,
-    max_len: int,
-    separating_token: int,
-    tokenizer: BertTokenizerFast,
-) -> dict:
-    """Converts raw text entries into list of tokens
-
-    Args:
-        entry (dict): Dask dataframe entry with one column ('input') containing text
-        tokenizer (BertTokenizerFast): Tokenizer used to tokenize. Must be a delayed object to prevent a memory leak!
-
-    Returns:
-        dict: Dask dataframe entry with flattened source/target/attention_mask batches and their shapes
-    """
-    tokens = np.array(tokenizer(entry.input)["input_ids"][1:-1])
-
-    tokens_ending = (tokens == separating_token).astype(np.int)
-    batch_indices = get_batch_indexes(tokens_ending, min_len, max_len - 2)
-
-    source_batch, target_batch = crete_input_output_batch(
-        tokens, batch_indices, max_len, tokenizer
-    )
-    mask_batch = (source_batch != tokenizer.pad_token_id).astype(np.int)
-
-    source_batch_shape = np.array(source_batch.shape)
-    target_batch_shape = np.array(target_batch.shape)
-    mask_batch_shape = np.array(mask_batch.shape)
-
-    source_batch = source_batch.reshape(-1)
-    target_batch = target_batch.reshape(-1)
-    mask_batch = mask_batch.reshape(-1)
-
-    return {
-        "source": source_batch,
-        "target": target_batch,
-        "attention_mask": mask_batch,
-        "source_shape": source_batch_shape,
-        "target_shape": target_batch_shape,
-        "attention_mask_shape": mask_batch_shape,
-    }
-
-
-GENERATE_BATCHES_META = {
-    "source": object,
-    "target": object,
-    "attention_mask": object,
-    "source_shape": object,
-    "target_shape": object,
-    "attention_mask_shape": object,
-}
-
-
-def find_new_sentence_left(seq: np.array, pos: int) -> int:
-    """Finds nerest sentence on the left of the current position (including current position)
-
-    Args:
-        seq (np.array): Array of 0s and 1s of length equal to sequence. 1 means end of sentence (dot, semicolon etc.) and 0 - every other token
-        pos (int): Starting position
-
-    Returns:
-        int: Position of the nearest new sentence on the left. Start of the sequence always counts as a start of sentence
-    """
-    assert pos < len(seq)
-    assert pos >= 0
-
-    while pos > 0:
-        if seq[pos - 1] == 1:
-            return pos
-        else:
-            pos = pos - 1
-
-    return 0
-
-
-def find_new_sentence_right(seq: np.array, pos: int) -> int:
-    """Finds nerest sentence on the right of the current position (including current position)
-
-    Args:
-        seq (np.array): Array of 0s and 1s of length equal to sequence. 1 means end of sentence (dot, semicolon etc.) and 0 - every other token
-        pos (int): [description]
-
-    Returns:
-        int: Position of the nearest new sentence on the right. Returns none if no new sentence is found on the right
-    """
-    assert pos < len(seq)
-    assert pos >= 0
-
-    while pos < len(seq):
-        if seq[pos - 1] == 1:
-            return pos
-        else:
-            pos = pos + 1
-
-    return None
-
-
-def get_batch_indexes(
-    seq: np.array, min_length: int, max_length: int
-) -> List[np.array]:
-    """Turns long sequence into array of indices, composing a single batch file.
-
-    Args:
-        seq (np.array): Input sequence of 1s and 0s, where 1 means end of sequence token (dot, semicolon etc.)
-        min_length (int): Minimum length of sample in a batch
-        max_length (int): Maximum length of sample in a batch
-
-    Returns:
-        [np.array]: List of index arrays, where each entry has length between <min_length, max_length>
-    """
-    pos = 0
-    batch = []
-
-    assert min_length <= max_length
-
-    while pos < len(seq):
-        pos_delta = min(max_length, len(seq) - pos)
-        assert pos + pos_delta <= len(seq)
-
-        if pos_delta >= min_length:
-            new_entry = np.array(list(range(pos, pos + pos_delta)))
-            assert len(new_entry) <= max_length
-
-            batch.append(new_entry)
-
-        if pos + pos_delta >= len(seq):
-            break
-
-        new_pos = find_new_sentence_left(seq, pos + pos_delta)
-        if new_pos == pos:
-            new_pos = find_new_sentence_right(seq, pos + pos_delta)
-            if new_pos is None:
-                break
-
-        pos = new_pos
-
-    return batch
-
-
-def add_padding(seq: np.ndarray, total_length: int, padding_symbol: any) -> np.ndarray:
-    """Pads a sequence with provided symbol, to get array of length total_length in the end
-
-    Args:
-        seq (np.ndarray): Input sequence
-        total_length (int): Desired length of a sequence
-        padding_symbol (any): Symbol that will be inserted at the end (total_length - len(seq)) times
-
-    Returns:
-        np.ndarray: N-dimensional array where first dimension is of length total_length
-    """
-    num_padding = total_length - len(seq)
-    assert num_padding >= 0
-
-    if num_padding > 0:
-        return np.concatenate([seq, np.array([padding_symbol] * num_padding)], axis=0)
-    else:
-        return np.copy(seq)
-
-
-def add_begin_end_tokens(
-    seq: np.ndarray, begin_token: any, end_token: any
-) -> np.ndarray:
-    """Adds preceding and ending special tokens to the sequence
-
-    Args:
-        seq (np.ndarray): Sequence of len L
-        begin_token (any): Token that will be added at the beginning of the sequence
-        end_token (any): Token that will be added at the end of the sequence
-
-    Returns:
-        np.ndarray: Sequence of len L+2
-    """
-
-    return np.concatenate([[begin_token], seq, [end_token]])
-
-
-def standarize_translation_sample(
-    seq: np.ndarray,
-    total_length: int,
-    padding_symbol: any,
-    begin_token: any,
-    end_token: any,
-) -> np.ndarray:
-    """Adds special tokens and padding so that every sample has identical shape
-
-    Args:
-        seq (np.ndarray): Input sequence of len L
-        total_length (int): Desired sequence length
-        padding_symbol (any): Token that will be used for padding
-        begin_token (any): Token that will be used as starting token
-        end_token (any): Token that will be used as ending token
-
-    Returns:
-        np.ndarray: Output sequence of length total_length
-    """
-    return add_padding(
-        add_begin_end_tokens(seq, begin_token, end_token), total_length, padding_symbol,
-    )
-
-
-def create_input_output(
-    tokens: np.ndarray, length: int, tokenizer: BertTokenizerFast
-) -> Tuple[np.ndarray, np.ndarray]:
-    """Transforms a sequence of tokens into "translation" input and output
-
-    Args:
-        tokens (np.ndarray): Input sequence
-        length (int): Maximum output length. Will add padding to match it
-        tokenizer (BertTokenizerFast): Tokenizer that was used to obtain tokens
-
-    Returns:
-        np.ndarray: Single sample that will serve as input to the model
-        np.ndarray: Single sample that will serve as expected output from the model
-    """
-    decoded_str = tokenizer.decode(tokens)
-    cleaned_str = input_preprocess(decoded_str).lower()
-    source_batch_entry = tokenizer(cleaned_str)["input_ids"][1:-1]
-    target_batch_entry = tokens
-
-    # In rare cases (because of encoding) unpunctuated lowercase input might be longer than output and exceed limits
-    # We need to trim in such cases
-    if len(source_batch_entry) > length - 2:
-        source_batch_entry = source_batch_entry[: (length - 2)]
-
-    source_batch_entry = standarize_translation_sample(
-        source_batch_entry,
-        length,
-        tokenizer.pad_token_id,
-        tokenizer.cls_token_id,
-        tokenizer.sep_token_id,
-    )
-    target_batch_entry = standarize_translation_sample(
-        target_batch_entry,
-        length,
-        tokenizer.pad_token_id,
-        tokenizer.cls_token_id,
-        tokenizer.sep_token_id,
-    )
-
-    return source_batch_entry, target_batch_entry
-
-
-def crete_input_output_batch(
-    seq: np.ndarray,
-    batch_indexes: List[np.ndarray],
-    length: int,
-    tokenizer: BertTokenizerFast,
-) -> Tuple[np.ndarray, np.ndarray]:
-    """Transforms a sequence of tokens into "translation" input and output batch
-
-    Args:
-        seq (np.ndarray): Input sequence of tokens
-        batch_indexes ([np.ndarray]): List where every entry is an array of indices representing a batch sample from the tokens array.
-        length (int): Maximum output length. Will add padding to match it
-        tokenizer (BertTokenizerFast): Tokenizer that was used to obtain tokens
-
-    Returns:
-        np.ndarray: Batch of samples that will serve as input to the model
-        np.ndarray: Batch of samples that will serve as expected output from the model
-    """
-    base_batch = [seq[indexes] for indexes in batch_indexes]
-
-    source_batch = []
-    target_batch = []
-    for entry in base_batch:
-        source_entry, target_entry = create_input_output(entry, length, tokenizer)
-
-        source_batch.append(source_entry)
-        target_batch.append(target_entry)
-
-    return np.array(source_batch), np.array(target_batch)
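
A standalone sketch of what `standarize_translation_sample` produces: the token sequence is wrapped in CLS/SEP and padded to a fixed length. The helper and the token ids below are illustrative stand-ins, not the removed implementation:

```python
import numpy as np

def standardize(seq, total_length, pad_id, cls_id, sep_id):
    # Wrap the sequence in special tokens, then pad on the right to total_length.
    wrapped = np.concatenate([[cls_id], seq, [sep_id]])
    padding = [pad_id] * (total_length - len(wrapped))
    return np.concatenate([wrapped, padding]) if padding else wrapped

tokens = np.array([2023, 2003, 1037])
print(standardize(tokens, total_length=8, pad_id=0, cls_id=101, sep_id=102))
# [ 101 2023 2003 1037  102    0    0    0]
```
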
diff --git a/src/pipelines/translation_based/stage1_extraction.py b/src/pipelines/translation_based/stage1_extraction.py
deleted file mode 100644
index 6ffdbf7..0000000
--- a/src/pipelines/translation_based/stage1_extraction.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# /usr/bin/python3
-from glob import glob
-
-import dask.dataframe as dd
-import numpy as np
-import pandas as pd
-from dask.distributed import Client
-
-from src.pipelines.translation_based.processing import (
-    RAW_TO_DATAFRAME_META,
-    raw_to_dataframe,
-)
-from src.utils import PROJECT_ROOT, get_config, prepare_folder
-
-INPUT_FOLDER = f"{PROJECT_ROOT}/data"
-OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage1_extraction"
-
-if __name__ == "__main__":
-
-    config = get_config()
-    num_partitions = config["translations"]["extraction"]["num_partitions"]
-    num_workers = config["translations"]["extraction"]["num_workers"]
-    memory_limit = config["translations"]["extraction"]["worker_memory_limit"]
-
-    prepare_folder(OUTPUT_FOLDER)
-
-    file_schema = f"{INPUT_FOLDER}/**/text_structure.xml"
-    files_paths = glob(file_schema, recursive=True)
-
-    # Make sure python memory fragmentation won't go insane
-    np.random.shuffle(files_paths)
-
-    client = Client(n_workers=num_workers, memory_limit=memory_limit)
-    print(f"Dashboard: {client.dashboard_link}")
-
-    # Processing pipeline
-    df = dd.from_pandas(pd.DataFrame({"file": files_paths}), npartitions=num_partitions)
-
-    df = df.apply(
-        raw_to_dataframe, result_type="expand", axis=1, meta=RAW_TO_DATAFRAME_META,
-    )
-    df = df.dropna()
-
-    # Export
-    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/translation_based/stage2_create_batches.py b/src/pipelines/translation_based/stage2_create_batches.py
deleted file mode 100644
index 83a2edc..0000000
--- a/src/pipelines/translation_based/stage2_create_batches.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# /usr/bin/python3
-import dask.dataframe as dd
-from dask import delayed
-from dask.distributed import Client
-from transformers import BertTokenizerFast
-
-from src.pipelines.translation_based.processing import (
-    GENERATE_BATCHES_META,
-    generate_batches,
-)
-from src.utils import PROJECT_ROOT, get_config, prepare_folder
-
-INPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage1_extraction"
-OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage2_create_batches"
-
-if __name__ == "__main__":
-
-    config = get_config()
-    num_workers = config["translations"]["create_batches"]["num_workers"]
-    memory_limit = config["translations"]["create_batches"]["worker_memory_limit"]
-    min_tokens = config["translations"]["create_batches"]["min_tokens"]
-    max_tokens = config["translations"]["create_batches"]["max_tokens"]
-    base_model = config["global"]["base_model"]
-
-    prepare_folder(OUTPUT_FOLDER)
-
-    client = Client(n_workers=num_workers, memory_limit=memory_limit)
-    print(f"Dashboard: {client.dashboard_link}")
-
-    tokenizer = BertTokenizerFast.from_pretrained(base_model)
-    tokenizer = delayed(tokenizer)
-
-    token_separating = tokenizer(".")["input_ids"][1]
-
-    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
-    df = df.apply(
-        generate_batches,
-        result_type="expand",
-        axis=1,
-        meta=GENERATE_BATCHES_META,
-        args=(min_tokens, max_tokens, token_separating, tokenizer),
-    )
-    df = df.dropna()
-
-    # Export
-    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
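
Note the `delayed(tokenizer)` trick above: wrapping the heavy object in `dask.delayed` ships it to workers as a single task-graph node instead of pickling it into every row's task. A toy sketch of the same pattern, where a lookup table stands in for the tokenizer and all names are made up:

```python
import dask.dataframe as dd
import pandas as pd
from dask import delayed

heavy_object = {"hello": 1, "world": 2}          # imagine a large tokenizer here
heavy_object = delayed(heavy_object)

df = dd.from_pandas(pd.DataFrame({"text": ["hello", "world"]}), npartitions=2)
encoded = df.apply(
    lambda row, table: table.get(row["text"], -1),
    axis=1,
    args=(heavy_object,),                        # resolved once per partition task
    meta=("encoded", "int64"),
)
print(encoded.compute())
```
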
diff --git a/src/pipelines/translation_based/stage3_exploding.py b/src/pipelines/translation_based/stage3_exploding.py
deleted file mode 100644
index d969dd1..0000000
--- a/src/pipelines/translation_based/stage3_exploding.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# /usr/bin/python3
-import dask.dataframe as dd
-from dask.distributed import Client
-
-from src.pipelines.translation_based.processing import (
-    EXPAND_DIMS_META,
-    FLATTEN_DIMS_META,
-    expand_dims,
-    flatten_dims,
-)
-from src.utils import PROJECT_ROOT, get_config, prepare_folder
-
-INPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage2_create_batches"
-OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage3_exploding"
-
-if __name__ == "__main__":
-    config = get_config()
-    num_workers = config["translations"]["exploding"]["num_workers"]
-    memory_limit = config["translations"]["exploding"]["worker_memory_limit"]
-
-    prepare_folder(OUTPUT_FOLDER)
-
-    client = Client(n_workers=num_workers, memory_limit=memory_limit)
-    print(client.dashboard_link)
-
-    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
-
-    df = df.apply(expand_dims, result_type="expand", axis=1, meta=EXPAND_DIMS_META)
-    df = df.map_partitions(
-        lambda x: x.apply(lambda y: y.explode(), axis=0), meta=EXPAND_DIMS_META
-    )
-    df = df.apply(flatten_dims, result_type="expand", axis=1, meta=FLATTEN_DIMS_META)
-
-    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/translation_based/stage4_reindexing.py b/src/pipelines/translation_based/stage4_reindexing.py
deleted file mode 100644
index 6bbb541..0000000
--- a/src/pipelines/translation_based/stage4_reindexing.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# /usr/bin/python3
-import dask.dataframe as dd
-from dask.distributed import Client
-
-from src.utils import PROJECT_ROOT, get_config, prepare_folder
-
-INPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage3_exploding"
-OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage4_reindexing"
-
-if __name__ == "__main__":
-    config = get_config()
-    num_workers = config["translations"]["reindexing"]["num_workers"]
-    memory_limit = config["translations"]["reindexing"]["worker_memory_limit"]
-
-    prepare_folder(OUTPUT_FOLDER)
-
-    client = Client(n_workers=num_workers, memory_limit=memory_limit)
-    print(client.dashboard_link)
-
-    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
-
-    # Add ordered indexes
-    df = df.assign(ones=1)
-    df = df.reset_index(drop=True)
-    idx = (df.ones.cumsum() - 1).persist()
-    df = df.assign(ones=idx)
-
-    # Shuffle
-    shuffled_idx = idx.compute().values
-    shuffled_idx = client.scatter(shuffled_idx)
-    mapped_ones = df.ones.apply(
-        lambda x, idx: idx[x], args=(shuffled_idx,), meta=("ones", "int64")
-    ).persist()
-    df = df.assign(ones=mapped_ones)
-
-    df = df.set_index("ones")
-    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/translation_based/train.py b/src/pipelines/translation_based/train.py
deleted file mode 100755
index 67fcf7a..0000000
--- a/src/pipelines/translation_based/train.py
+++ /dev/null
@@ -1,143 +0,0 @@
-#!/usr/bin/python3
-
-import glob
-from datetime import datetime
-
-import dask.dataframe as dd
-import numpy as np
-import torch
-from transformers import BertTokenizerFast
-
-from src.batch_loading import get_batches, get_ordered_dataframe_len
-from src.models.TransformerSeq2Seq import TransformerSeq2Seq
-from src.utils import (
-    PROJECT_ROOT,
-    convert_to_timedelta,
-    get_config,
-    latest_model,
-    prepare_folder,
-    save_training_step,
-)
-
-INPUT_PATH = f"{PROJECT_ROOT}/generated/translations/stage4_reindexing"
-OUTPUT_PATH = f"{PROJECT_ROOT}/checkpoints/translations"
-
-if __name__ == "__main__":
-    config = get_config()
-    learning_rate = config["translations"]["training"]["learning_rate"]
-    max_len = config["translations"]["create_batches"]["max_tokens"]
-    num_epochs = config["translations"]["training"]["num_epochs"]
-    batch_size = config["translations"]["training"]["batch_size"]
-    save_step = config["translations"]["training"]["save_step"]
-    loss_averaging_span = config["translations"]["training"]["loss_averaging_span"]
-    fresh_start = config["translations"]["training"]["fresh_start"]
-    device_name = config["translations"]["training"]["device"]
-    max_train_time = config["translations"]["training"]["max_training_time"]
-    base_model = config["global"]["base_model"]
-    seed = config["global"]["random_seed"]
-
-    prepare_folder(OUTPUT_PATH)
-    np.random.seed(seed=seed)
-
-    if max_train_time is not None:
-        max_train_time = convert_to_timedelta(max_train_time)
-
-    device = torch.device(device_name if torch.cuda.is_available() else "cpu")
-    print(f"Training on {device}")
-
-    df = dd.read_parquet(INPUT_PATH, engine="pyarrow")
-
-    tokenizer = BertTokenizerFast.from_pretrained(base_model)
-
-    model = TransformerSeq2Seq(tokenizer.vocab_size, 256, max_len, 4, 4, 4,).to(device)
-    criterion = torch.nn.CrossEntropyLoss(reduction="mean").to(device)
-    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
-
-    epoch_start = 0
-    sample_start = 0
-    if fresh_start is False:
-        checkpoint_files = glob.glob(f"{OUTPUT_PATH}/*.model")
-        latest = latest_model(checkpoint_files)
-
-        if latest is not None:
-            epoch, batch = latest
-            model.load_state_dict(
-                torch.load(f"{OUTPUT_PATH}/{epoch}-{batch}.model", map_location=device,)
-            )
-            optimizer.load_state_dict(
-                torch.load(
-                    f"{OUTPUT_PATH}/{epoch}-{batch}.optimizer", map_location=device,
-                )
-            )
-
-            epoch_start, sample_start = epoch, batch
-            print(f"Loaded {epoch}-{batch}")
-
-    model.train()
-    model.base_model.train()
-    losses = []
-
-    num_samples = get_ordered_dataframe_len(df)
-    random_index_shuffle = np.random.permutation(range(num_samples))
-
-    training_stopped = False
-
-    time_max = datetime.max
-    if max_train_time is not None:
-        time_max = datetime.now() + max_train_time
-
-    for epoch in range(epoch_start, num_epochs):
-        if training_stopped:
-            break
-
-        i = sample_start
-        for data_batch in get_batches(df, batch_size, 100, random_index_shuffle, i):
-            inputs = data_batch.apply(
-                lambda x: x["source"].reshape(x["source_shape"]), axis=1
-            ).values
-            outputs = data_batch.apply(
-                lambda x: x["target"].reshape(x["target_shape"]), axis=1
-            ).values
-            attentions_mask = data_batch.apply(
-                lambda x: x["attention_mask"].reshape(x["attention_mask_shape"]),
-                axis=1,
-            ).values
-
-            inputs = torch.tensor(np.stack(inputs, axis=0), dtype=torch.long).to(device)
-            attentions_mask = torch.tensor(np.stack(attentions_mask, axis=0) == 0).to(
-                device
-            )
-            output_indices = torch.tensor(
-                np.stack(outputs, axis=0), dtype=torch.long
-            ).to(device)
-
-            y_pred = model(inputs, output_indices[:, :-1], attentions_mask)
-            y_pred = y_pred.transpose(1, 2)
-
-            loss = criterion(y_pred, output_indices[:, 1:])
-
-            losses.append(loss.item())
-            if len(losses) > loss_averaging_span:
-                losses = losses[-loss_averaging_span:]
-
-            print(f"epoch: {epoch} | step: {i} | loss: {np.mean(losses)}")
-
-            optimizer.zero_grad()
-
-            if i % save_step == 0 and (i != sample_start or epoch != epoch_start):
-                print(f"Saving: Epoch {epoch}, step {i}")
-                save_training_step(OUTPUT_PATH, f"{epoch}-{i}", model, optimizer)
-
-            if datetime.now() > time_max:
-                print(f"Max time reached, saving: Epoch {epoch}, step {i}")
-                save_training_step(OUTPUT_PATH, f"{epoch}-{i}", model, optimizer)
-                training_stopped = True
-                break
-
-            loss.backward()
-            optimizer.step()
-
-            i += 1
-
-    if not training_stopped:
-        save_training_step(OUTPUT_PATH, "final", model, optimizer)
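
The `transpose(1, 2)` before the loss reflects `torch.nn.CrossEntropyLoss`'s shape convention for sequences: logits as (batch, vocab, seq) against target ids as (batch, seq). A minimal sketch with made-up sizes:

```python
import torch

batch, seq_len, vocab = 2, 5, 11
logits = torch.randn(batch, seq_len, vocab)             # decoder output: (N, T, V)
target_ids = torch.randint(0, vocab, (batch, seq_len))  # token ids: (N, T)

criterion = torch.nn.CrossEntropyLoss(reduction="mean")
loss = criterion(logits.transpose(1, 2), target_ids)    # (N, V, T) vs (N, T)
print(loss)
```
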
diff --git a/src/processing.py b/src/processing.py
deleted file mode 100644
index 2777416..0000000
--- a/src/processing.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import numpy as np
-
-
-def expand_dims(entry) -> dict:
-    """Reshapes flat source, target, mask arrays into corresponding shapes
-
-    Args:
-        entry (dict): Dask dataframe row with columns: source, target, attention_mask, source_shape, target_shape, attention_mask_shape
-
-    Returns:
-        dict: Dask dataframe row with columns: source, target, attention_mask
-    """
-    source = entry.source.reshape(entry.source_shape)
-    target = entry.target.reshape(entry.target_shape)
-    mask = entry.attention_mask.reshape(entry.attention_mask_shape)
-
-    return {
-        "source": source,
-        "target": target,
-        "attention_mask": mask,
-    }
-
-
-EXPAND_DIMS_META = {
-    "source": object,
-    "target": object,
-    "attention_mask": object,
-}
-
-
-def flatten_dims(entry: dict) -> dict:
-    """Flattens arrays in dataframe rows into 1D and saves shapes into separate columns
-
-    Args:
-        entry (dict): Dask dataframe row with columns: source, target, attention_mask
-
-    Returns:
-        dict: Dask dataframe row with columns: source, target, attention_mask, source_shape, target_shape, attention_mask_shape
-    """
-    source_shape = np.array(entry.source.shape)
-    target_shape = np.array(entry.target.shape)
-    mask_shape = np.array(entry.attention_mask.shape)
-
-    source = entry.source.reshape(-1)
-    target = entry.target.reshape(-1)
-    mask = entry.attention_mask.reshape(-1)
-
-    return {
-        "source": source,
-        "target": target,
-        "attention_mask": mask,
-        "source_shape": source_shape,
-        "target_shape": target_shape,
-        "attention_mask_shape": mask_shape,
-    }
-
-
-FLATTEN_DIMS_META = {
-    "source": object,
-    "target": object,
-    "attention_mask": object,
-    "source_shape": object,
-    "target_shape": object,
-    "attention_mask_shape": object,
-}
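
A tiny round-trip demo of the `flatten_dims` / `expand_dims` convention above: each sample is stored as a flat array plus its shape (parquet-friendly) and restored losslessly:

```python
import numpy as np

sample = np.arange(12).reshape(3, 4)

flat, shape = sample.reshape(-1), np.array(sample.shape)   # flatten_dims
restored = flat.reshape(shape)                              # expand_dims

assert np.array_equal(sample, restored)
print(shape, flat[:5])
```
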
diff --git a/src/utils.py b/src/utils.py
deleted file mode 100644
index 0bb292f..0000000
--- a/src/utils.py
+++ /dev/null
@@ -1,620 +0,0 @@
-from __future__ import annotations
-
-import os
-import pickle
-import re
-import shutil
-from datetime import datetime, timedelta
-from glob import glob
-from typing import Generator, List, Optional, Tuple, Type
-
-import dask.dataframe as dd
-import numpy as np
-import pandas as pd
-import torch
-import torch.nn as nn
-import yaml
-from torch.optim import Optimizer
-
-from src.batch_loading import get_batches, get_ordered_dataframe_len
-from src.models.interfaces import PunctuationModel
-
-PROJECT_ROOT = os.path.dirname(os.path.realpath("/".join(__file__.split("/")) + "/.."))
-
-
-class Saver:
-    """Class that allows saving and loading mode-optimizer pairs"""
-
-    def __init__(
-        self, save_dir: str, model: PunctuationModel, optimizer: Optimizer
-    ) -> None:
-        """Initializes Saver
-
-        Args:
-            save_dir (str): Directory where model and optimizer will be saved
-            model (PunctuationModel): Model to save
-            optimizer (Optimizer): Optimizer to save
-        """
-        self.save_dir = save_dir
-        self.model = model
-        self.optimizer = optimizer
-
-        prepare_folder(self.save_dir)
-
-    def save(self, name: str):
-        """Saves model and optimizer
-
-        Args:
-            name (str): Name under which the model will be saved
-        """
-        self.model.save(self.save_dir, name)
-        torch.save(self.optimizer.state_dict(), f"{self.save_dir}/{name}.optimizer")
-
-
-class Loader:
-    """Class for loading model and it's optimizer from checkpoint"""
-
-    def __init__(
-        self,
-        save_dir: str,
-        model_type: Type[PunctuationModel],
-        optimizer_type: Type[Optimizer],
-        device: torch.device,
-    ) -> None:
-        """Initializes Loader
-
-        Args:
-            save_dir (str): Directory where to search for models
-            model_type (Type[PunctuationModel]): Model class that should be loaded
-            optimizer_type (Type[Optimizer]): Optimizer class that should be loaded
-            device (torch.device): Device on which loaded model/optimizer will exist
-        """
-        self.save_dir = save_dir
-        self.device = device
-
-        self.model_type = model_type
-        self.optimizer_type = optimizer_type
-
-    def has_checkpoints(self) -> bool:
-        """Checks if there are any saved checkpoints in model's directory
-
-        Returns:
-            bool: True if checkpoints were found, False otherwise
-        """
-        files = glob(f"{self.save_dir}/*.model")
-
-        return latest_model(files) is not None
-
-    def load(self, name: str) -> Tuple[PunctuationModel, Optimizer]:
-        """Loads a model and optimizer from file
-
-        Args:
-            name (str): Name of the model that will be loaded
-
-        Returns:
-            Tuple[PunctuationModel, Optimizer]: Model and optimizer
-        """
-        model = self.model_type.load(self.save_dir, name, self.device)
-
-        optimizer = self.optimizer_type(model.parameters())
-        optimizer.load_state_dict(
-            torch.load(f"{self.save_dir}/{name}.optimizer", map_location=self.device)
-        )
-
-        print(f"Loaded model {name}")
-
-        return model, optimizer
-
-    def load_latest(self) -> Tuple[PunctuationModel, Optimizer, int, int]:
-        """Loads latest checkpoint in directory
-
-        Returns:
-            Tuple[PunctuationModel, Optimizer, int, int]: Model, Optimizer, Epoch at
-            which checkpoint was made, step at which checkpoint was made
-        """
-        files = glob(f"{self.save_dir}/*.model")
-
-        model_id = latest_model(files)
-        if model_id is None:
-            return None
-
-        epoch, step = model_id
-        model, optimizer = self.load(f"{epoch}-{step}")
-
-        return model, optimizer, epoch, step
-
-
-class Checkpoint:
-    """Utility class to make checkpoints every constant ammount of steps"""
-
-    def __init__(
-        self, save_step: int, saver: Saver, start_epoch: int, start_step: int
-    ) -> None:
-        """Initializes Checkpoint.
-        Starting epoch and step are provided so that a checkpoint will not be made right after
-        loading the model.
-
-        Args:
-            save_step (int): Number of steps after which checkpoints will be saved
-            saver (Saver): Saver used to save model/optimizer state
-            start_epoch (int): Epoch at which training was started
-            start_step (int): Step at which training was started
-        """
-        self.start_step = start_step
-        self.start_epoch = start_epoch
-        self.save_step = save_step
-
-        self.saver = saver
-
-    def step(self, epoch: int, step: int) -> None:
-        """Check if checkpoint should be made, and save it if necessary
-
-        Args:
-            epoch (int): Epoch num
-            step (int): Step num
-        """
-        if step % self.save_step == 0 and (
-            step != self.start_step or epoch != self.start_epoch
-        ):
-            print(f"Saving: Epoch {epoch}, step {step}")
-            self.saver.save(f"{epoch}-{step}")
-
-
-class Timeout:
-    """Utility class that prevent training from surpassing maximum ammount of time"""
-
-    def __init__(self, duration: timedelta, saver: Optional[Saver]) -> None:
-        """Initializes Timeout
-
-        Args:
-            duration (timedelta): Maximum duration of training
-            saver (Optional[Saver]): Saver used to save a checkpoint if training time is
-            exceeded
-        """
-        self.saver = saver
-        self.duration = duration
-        self.time_max = None
-
-    def start(self, time_now: Optional[datetime] = None):
-        """Starts counting time from the start of training
-
-        Args:
-            time_now (Optional[datetime], optional): Point from which time will be measured.
-            Use current time if None. Defaults to None.
-        """
-        if time_now is None:
-            time_now = datetime.now()
-
-        self.time_max = datetime.max
-        if self.duration is not None:
-            self.time_max = time_now + self.duration
-
-    def step(self, epoch: int, step: int, time: Optional[datetime] = None) -> bool:
-        """Check if timeout was not exceeded. Saved checkpoint if time is exceeded
-
-        Args:
-            epoch (int): Epoch number
-            step (int): Step number
-            time (Optional[datetime], optional): Current time. Use current time if None. Defaults to None.
-
-        Returns:
-            bool: True if time was exceeded, False otherwise
-        """
-        assert self.time_max is not None
-
-        if time is None:
-            time = datetime.now()
-
-        if time > self.time_max:
-            if self.saver is not None:
-                print(f"Max time reached, saving: Epoch {epoch}, step {step}")
-                self.saver.save(f"{epoch}-{step}")
-
-            return True
-
-        return False
-
-
-class ProgressTracker:
-    """Utility class used to tracking loss and displaying it to user"""
-
-    def __init__(self, device: torch.device, loss_averaging_span: int) -> None:
-        """Initializes ProgressTracker
-
-        Args:
-            device (torch.device): Device on which training is performed
-            loss_averaging_span (int): Number of latest samples used to calculate average loss
-        """
-        print(f"Training on {device}")
-        self.loss_averaging_span = loss_averaging_span
-        self.losses = []
-
-    def step(self, epoch: int, step: int, loss: float) -> None:
-        """New loss was calculated. Informs user about it
-
-        Args:
-            epoch (int): Epoch number
-            step (int): Step number
-            loss (float): Loss value at provided epoch and step
-        """
-        self.losses.append(loss.item())
-        loss_mean, self.losses = moving_average(self.losses, self.loss_averaging_span)
-
-        print(f"epoch: {epoch} | step: {step} | loss: {loss_mean}")
-
-
-def get_config() -> dict:
-    """Returns dict with config values
-
-    Returns:
-        dict: Dict with config values
-    """
-
-    with open(f"{PROJECT_ROOT}/params.yaml", "r") as file:
-        config = yaml.load(file, Loader=yaml.FullLoader)
-
-    return config
-
-
-def remove_multiple_spaces(text: str) -> str:
-    """Replaces multiple spaces by a single one
-
-    Args:
-        text (str): Text potentially containing multiple spaces
-
-    Returns:
-        str: Text with all multiple spaces replaced by one
-    """
-    return re.sub(r"\s\s+", " ", text)
-
-
-def remove_punctuation(text: str, whitelist: List[str] = []) -> str:
-    """Removes all non-alphanumeric characters from the text.
-    Might result in multiple spaces where characters like `-`
-    were removed
-
-    Args:
-        text (str): Text containing punctuation
-
-    Returns:
-        str: Text with all punctuation removed
-    """
-
-    return "".join(filter(lambda x: x.isalnum() or x.isspace() or x in whitelist, text))
-
-
-def unify_whitespaces(text: str) -> str:
-    """Maps all whitespace characters into a simple ' '
-
-    Args:
-        text (str): Text containing multiple forms of whitespace
-
-    Returns:
-        str: Text with a single form of whitespace
-    """
-    result = ""
-
-    for c in text:
-        if c.isspace():
-            result += " "
-        else:
-            result += c
-
-    return result
-
-
-def output_preprocess(text: str) -> str:
-    """Cleans the text out of bad formating and removes or replaces symbols that will not be predicted by a model
-
-    Args:
-        text (str): Arbitrary text
-
-    Returns:
-        str: Text that could be a direct output of punctuation prediction algorithm
-    """
-    # Whitespace-like characters
-    text = text.replace("-", " ").replace("/", " ").replace("+", " ")
-
-    # Punctuation-like characters
-    text = text.replace(";", ".").replace("!", ".")
-
-    text = remove_punctuation(text, [".", ",", "?"])
-    text = unify_whitespaces(text)
-    text = remove_multiple_spaces(text)
-    text = text.strip()
-
-    return text
-
-
-def input_preprocess(text: str) -> str:
-    """Makes sure that input is in the same format as training data (no non-alphanum chars, no double spaces,
-        all lowercase etc.)
-
-    Args:
-        text (str): Text to be processed
-
-    Returns:
-        str: Text in training-data format
-    """
-    text = remove_punctuation(text)
-    text = unify_whitespaces(text)
-    text = remove_multiple_spaces(text)
-    text = text.lower()
-    text = text.strip()
-
-    return text
-
-
-def prepare_folder(path: str, wipe: bool = False) -> None:
-    """Function make sure that provided path exists. Can aditionaly
-    remove all files from the path.
-
-    Args:
-        path (str): Full directory path
-        wipe (bool): Wheter to remove all files in folder
-    """
-
-    if wipe:
-        shutil.rmtree(path)
-
-    os.makedirs(path, exist_ok=True)
-
-
-def unflattened_column(df: pd.DataFrame, name: str) -> np.ndarray:
-    """Get column from the dataframe that was flattened. Dataframe must have columns
-    "name" and "name_shape", where name is 1D numpy array and name_shape is target
-    shape of this numpy array.
-
-    Args:
-        df (pd.DataFrame): Dataframe from which to extract array
-        name (str): Name of the column
-
-    Returns:
-        np.ndarray: Unflattened mutlidiamenional column of shape Lx*(name_shape)
-    """
-
-    values = df.apply(lambda x: x[name].reshape(x[f"{name}_shape"]), axis=1).values
-
-    return np.stack(values)
-
-
-def moving_average(
-    values: List[np.ndarray], average_span: int
-) -> Tuple[float, np.ndarray]:
-    """Computes moving average and keeps only latests records
-
-    Args:
-        values (List[np.ndarray]): Table containing values over which to compute moving averag
-        average_span (int): Maximum span over which to average
-
-    Returns:
-        Tuple[float, np.ndarray]: computetd average, values array trimed to last "average_span" entries
-    """
-
-    if len(values) > average_span:
-        values = values[-average_span:]
-
-    return np.mean(values), values
-
-
-def optimizer_step(loss: torch.Tensor, optimizer: torch.optim.Optimizer) -> None:
-    """Computes and applies a single step of optimization
-
-    Args:
-        loss (torch.Tensor): Loss that is optimized
-        optimizer (torch.optim.optimizer.Optimizer): Optimizer used to optimize loss
-    """
-    optimizer.zero_grad()
-    loss.backward()
-    optimizer.step()
-
-
-def training_loop(
-    epoch_start: int,
-    sample_start: int,
-    num_epochs: int,
-    df: dd.DataFrame,
-    batch_size: int,
-    batch_buffer_size: int,
-    random_index_shuffle: np.ndarray,
-) -> Generator[pd.DataFrame, int, int]:
-    """Generator providing all data necessary to perform a training steps. This function handels epochs/steps management
-
-    Args:
-        epoch_start (int): Epoch from which to start training
-        sample_start (int): Batch in epoch from which to start training
-        num_epochs (int): Number of epochs to train
-        df (dd.DataFrame): Dask dataframe with training dataset. Indexes must be continous from 0 to len
-        batch_size (int): Batch size
-        batch_buffer_size (int): Number of batches to load at once to memory
-        random_index_shuffle (np.ndarray): Shuffled indices of dataset
-
-    Yields:
-        Generator: batch, epoch_num, step_num
-    """
-    i = sample_start
-    for epoch in range(epoch_start, num_epochs):
-        for data_batch in get_batches(
-            df, batch_size, batch_buffer_size, random_index_shuffle, i
-        ):
-            if len(data_batch) == 0:
-                continue
-
-            yield data_batch, epoch, i
-
-            i += 1
-
-        i = 0
-
-
-def random_indexes(df: dd.DataFrame) -> np.ndarray:
-    """Provides array of randomly shuffled indices for dataset
-
-    Args:
-        df (dd.DataFrame): Dask dataframe with training dataset. Indexes must be continous from 0 to len
-
-    Returns:
-        np.ndarray: Shuffled indices
-    """
-    num_samples = get_ordered_dataframe_len(df)
-    return np.random.permutation(range(num_samples))
-
-
-def pickle_save(obj: any, path: str) -> None:
-    """Pickles and saves object to a file
-
-    Args:
-        obj (any): Object to pickle
-        path (str): Path to output file
-    """
-    with open(path, "wb") as f:
-        pickle.dump(obj, f)
-
-
-def pickle_read(path: str) -> any:
-    """Reads pickled objet from a file
-
-    Args:
-        path (str): Path to input file
-
-    Returns:
-        any: Unpickled object
-    """
-    with open(path, "rb") as f:
-        return pickle.load(f)
-
-
-def convert_to_timedelta(time_val: Optional[str]) -> Optional[timedelta]:
-    """
-    src: https://code.activestate.com/recipes/577894-convert-strings-like-5d-and-60s-to-timedelta-objec/
-    Given a *time_val* (string) such as '5d', returns a timedelta object
-    representing the given value (e.g. timedelta(days=5)).
-
-    =========   ======= ===================
-    Character   Meaning Example
-    =========   ======= ===================
-    s           Seconds '60s' -> 60 Seconds
-    m           Minutes '5m'  -> 5 Minutes
-    h           Hours   '24h' -> 24 Hours
-    d           Days    '7d'  -> 7 Days
-    =========   ======= ===================
-
-    Examples::
-
-        >>> convert_to_timedelta('7d')
-        datetime.timedelta(7)
-        >>> convert_to_timedelta('24h')
-        datetime.timedelta(1)
-        >>> convert_to_timedelta('60m')
-        datetime.timedelta(0, 3600)
-        >>> convert_to_timedelta('120s')
-        datetime.timedelta(0, 120)
-    """
-    if time_val is None:
-        return None
-
-    num = int(time_val[:-1])
-    if time_val.endswith("s"):
-        return timedelta(seconds=num)
-    elif time_val.endswith("m"):
-        return timedelta(minutes=num)
-    elif time_val.endswith("h"):
-        return timedelta(hours=num)
-    elif time_val.endswith("d"):
-        return timedelta(days=num)
-    else:
-        return None
-
-
-def latest_model(file_paths: List[str]) -> Optional[Tuple[int, int]]:
-    """Finds newest model in directory
-
-    Args:
-        files ([str]): List of all file paths that will be considered. File extension is discarded
-                       File names must be in format epoch_num-batch_num.extension
-
-    Returns:
-        (int, int): Tuple of (latest_batch, latest_step) for latest model
-    """
-
-    furthest_epoch = -1
-    furthest_batch_num = -1
-    for checkpoint_file in file_paths:
-        filename = checkpoint_file.split("/")[-1].split(".")[0]
-
-        result = re.search(r"^(\d+)-(\d+)$", filename)
-        if result is not None:
-            epoch, batch = [int(x) for x in result.groups()]
-
-            if epoch > furthest_epoch:
-                furthest_epoch = epoch
-                furthest_batch_num = batch
-            elif epoch == furthest_epoch:
-                furthest_batch_num = max(batch, furthest_batch_num)
-
-    if (furthest_epoch == -1) or (furthest_batch_num == -1):
-        return None
-
-    return furthest_epoch, furthest_batch_num
-
-
-def save_training_step(
-    dir: str,
-    name: str,
-    model: nn.Module,
-    optimizer: Optional[Optimizer] = None,
-    create_dir: bool = False,
-) -> None:
-    """Saves a trainig step to a directory
-
-    Args:
-        dir (str): Directory where step will be saved
-        name (str): Name of the step (eg. "0-1000")
-        model (nn.Module): model that will be saved
-        optimizer (optim.Optimizer): optimizer that will be saved. Might be None
-    """
-    if create_dir:
-        prepare_folder(dir, wipe=False)
-
-    torch.save(model.state_dict(), f"{dir}/{name}.model")
-
-    if optimizer is not None:
-        torch.save(
-            optimizer.state_dict(), f"{dir}/{name}.optimizer",
-        )
-
-
-def yaml_serializable(cls):
-    def save_yaml(self, path: str) -> None:
-        yml = yaml.dump(self.__dict__)
-        with open(path, "w") as f:
-            f.write(yml)
-
-    @staticmethod
-    def load_yaml(path: str) -> cls:
-        with open(path, "r") as f:
-            yml = f.read()
-
-        obj = cls()
-        obj.__dict__ = yaml.load(yml, Loader=yaml.FullLoader)
-
-        return obj
-
-    setattr(cls, "save_yaml", save_yaml)
-    setattr(cls, "load_yaml", load_yaml)
-
-    return cls
-
-
-def get_device(model: nn.Module) -> torch.device:
-    """Get device on which the module resides. Works only if all
-    parameters reside on single device.
-
-    Args:
-        model (nn.Module): Module to check
-
-    Returns:
-        torch.device: Device on which module's paraters exists
-    """
-
-    return next(model.parameters()).device
diff --git a/tests/__init__.py b/tests/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/models/__init__.py b/tests/models/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/models/test_actions_model_base.py b/tests/models/test_actions_model_base.py
deleted file mode 100644
index 900cf89..0000000
--- a/tests/models/test_actions_model_base.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import torch
-from transformers.tokenization_bert import BertTokenizerFast
-
-from src.models.actions_model_base import (
-    ActionsModelBase,
-    ActionsModelBaseLoss,
-    ActionsModelBaseParams,
-)
-from src.pipelines.actions_based.processing import ACTIONS_KEYS
-
-
-def test_dimensions():
-    base_model = "dkleczek/bert-base-polish-cased-v1"
-    action_vector_size = 5
-
-    tokens = BertTokenizerFast.from_pretrained(base_model)(
-        "Ala ma kota", return_tensors="pt"
-    )
-
-    params = ActionsModelBaseParams(base_model, action_vector_size)
-    model = ActionsModelBase(params)
-
-    result = model(tokens["input_ids"], tokens["attention_mask"])
-
-    assert len(result.shape) == 3
-
-    assert result.shape[0] == tokens["input_ids"].shape[0]
-    assert result.shape[1] == tokens["input_ids"].shape[1]
-    assert result.shape[2] == action_vector_size
-
-
-def test_loss_dimensions():
-    batch_size = 5
-    sequence_len = 10
-    actions_size = 3
-    weights = torch.zeros(actions_size) + 0.3
-    actions_vector_true = torch.zeros((batch_size, sequence_len, actions_size))
-    actions_vector_bad = torch.ones((batch_size, sequence_len, actions_size))
-    loss = ActionsModelBaseLoss(weights)
-
-    result = loss(actions_vector_bad, actions_vector_true)
-    assert len(result.shape) == 0
-
-    result_perfect = loss(actions_vector_true, actions_vector_true)
-    result_bad = loss(actions_vector_bad, actions_vector_true)
-
-    assert result_perfect < result_bad
-
-
-def test_predict():
-    params = ActionsModelBaseParams(
-        "dkleczek/bert-base-polish-cased-v1", len(ACTIONS_KEYS)
-    )
-    model = ActionsModelBase(params)
-
-    input_str = "testowy ciag znakow"
-    result = model.predict(input_str)
-
-    assert len(result) >= len(input_str)
diff --git a/tests/models/test_actions_model_mixed.py b/tests/models/test_actions_model_mixed.py
deleted file mode 100644
index 786136d..0000000
--- a/tests/models/test_actions_model_mixed.py
+++ /dev/null
@@ -1,94 +0,0 @@
-import torch
-from transformers.tokenization_bert import BertTokenizerFast
-
-from src.models.actions_model_mixed import (
-    ActionsModelMixed,
-    ActionsModelMixedLoss,
-    ActionsModelMixedParams,
-    ActionsModelMixedRuntimeParams,
-)
-from src.pipelines.actions_based.processing import ACTIONS_KEYS
-
-
-def test_dimensions():
-    base_model = "dkleczek/bert-base-polish-cased-v1"
-    action_vector_size = 5
-
-    tokenizer = BertTokenizerFast.from_pretrained(base_model)
-    tokens = tokenizer("Ala ma kota", return_tensors="pt")
-
-    embedding_size = 20
-    threshold = 0.9
-    num_heads = 2
-    num_layers = 2
-    feedforward_neurons = 10
-    max_len = 500
-    dropout = 0.1
-
-    params = ActionsModelMixedParams(
-        base_model,
-        tokenizer.vocab_size,
-        threshold,
-        embedding_size,
-        num_heads,
-        num_layers,
-        feedforward_neurons,
-        action_vector_size,
-        max_len,
-        dropout,
-    )
-    model = ActionsModelMixed(params)
-
-    actions_len = 3
-    actions = torch.distributions.Multinomial(
-        1, torch.tensor([0.5] * action_vector_size)
-    ).sample((tokens["input_ids"].shape[0], actions_len))
-
-    result = model(tokens["input_ids"], actions, tokens["attention_mask"])
-
-    assert len(result.shape) == 3
-
-    assert result.shape[0] == tokens["input_ids"].shape[0]
-    assert result.shape[1] == actions_len
-    assert result.shape[2] == action_vector_size
-
-
-def test_loss_dimensions():
-    batch_size = 5
-    sequence_len = 10
-    actions_size = 3
-    prior_odds = torch.zeros(actions_size) + 0.3
-    actions_vector_true = torch.zeros((batch_size, sequence_len, actions_size))
-    actions_vector_bad = torch.ones((batch_size, sequence_len, actions_size))
-    loss = ActionsModelMixedLoss(prior_odds)
-
-    result = loss(actions_vector_true, actions_vector_bad)
-    assert len(result.shape) == 0
-
-    result_perfect = loss(actions_vector_true, actions_vector_true)
-    result_bad = loss(actions_vector_true, actions_vector_bad)
-
-    assert result_perfect < result_bad
-
-
-def test_predict():
-    tokenizer = BertTokenizerFast.from_pretrained("dkleczek/bert-base-polish-cased-v1")
-    params = ActionsModelMixedParams(
-        "dkleczek/bert-base-polish-cased-v1",
-        tokenizer.vocab_size,
-        0.9,
-        10,
-        2,
-        1,
-        10,
-        len(ACTIONS_KEYS),
-        500,
-        0.1,
-    )
-    runtime = ActionsModelMixedRuntimeParams(0.9, 100)
-    model = ActionsModelMixed(params, runtime)
-
-    input_str = "testowy ciag znakow"
-    result = model.predict(input_str)
-
-    assert len(result) >= len(input_str)
diff --git a/tests/models/test_actions_model_restricted.py b/tests/models/test_actions_model_restricted.py
deleted file mode 100644
index b659b27..0000000
--- a/tests/models/test_actions_model_restricted.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import torch
-from transformers.tokenization_bert import BertTokenizerFast
-
-from src.models.actions_model_restricted import (
-    ActionsModelRestricted,
-    ActionsModelRestrictedLoss,
-    ActionsModelRestrictedParams,
-)
-from src.pipelines.actions_based.processing import ACTIONS_KEYS
-
-
-def test_dimensions():
-    base_model = "dkleczek/bert-base-polish-cased-v1"
-    action_vector_size = 5
-
-    tokens = BertTokenizerFast.from_pretrained(base_model)(
-        "Ala ma kota", return_tensors="pt"
-    )
-
-    params = ActionsModelRestrictedParams(base_model, action_vector_size)
-    model = ActionsModelRestricted(params)
-
-    result = model(tokens["input_ids"], tokens["attention_mask"])
-
-    assert len(result.shape) == 3
-
-    assert result.shape[0] == tokens["input_ids"].shape[0]
-    assert result.shape[1] == tokens["input_ids"].shape[1]
-    assert result.shape[2] == action_vector_size
-
-
-def test_loss_dimensions():
-    batch_size = 5
-    sequence_len = 10
-    action_vector_size = 4
-    uppercase_odds = torch.tensor(0.3, dtype=torch.float)
-    punctuation_weights = torch.tensor([0.3, 0.3, 0.1], dtype=torch.float)
-    loss = ActionsModelRestrictedLoss(uppercase_odds, punctuation_weights)
-
-    actions_vector_true = torch.zeros(
-        (batch_size, sequence_len, action_vector_size), dtype=torch.float
-    )
-    actions_vector_true[:, :, -1] = 1.0
-
-    actions_vector_bad = torch.zeros(
-        (batch_size, sequence_len, action_vector_size), dtype=torch.float
-    )
-    actions_vector_bad[:, :, :2] = 1.0
-    actions_vector_bad[:, :, -1] = 0.0
-
-    result = loss(actions_vector_true, actions_vector_bad)
-    assert len(result.shape) == 0
-
-    result_perfect = loss(actions_vector_true, actions_vector_true)
-    result_bad = loss(actions_vector_true, actions_vector_bad)
-
-    print(result_perfect)
-    print(result_bad)
-
-    assert result_perfect < result_bad
-    assert result_perfect > 0
-    assert result_bad > 0
-
-
-def test_predict():
-    params = ActionsModelRestrictedParams(
-        "dkleczek/bert-base-polish-cased-v1", len(ACTIONS_KEYS) + 1
-    )
-    model = ActionsModelRestricted(params)
-
-    input_str = "testowy ciag znakow"
-    result = model.predict(input_str)
-
-    assert len(result) >= len(input_str)
diff --git a/tests/pipelines/__init__.py b/tests/pipelines/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/pipelines/actions_based/__init__.py b/tests/pipelines/actions_based/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/pipelines/actions_based/test_processing.py b/tests/pipelines/actions_based/test_processing.py
deleted file mode 100644
index 8e4caaa..0000000
--- a/tests/pipelines/actions_based/test_processing.py
+++ /dev/null
@@ -1,230 +0,0 @@
-import numpy as np
-import pytest
-from transformers import BertTokenizerFast
-
-from src.pipelines.actions_based.processing import (
-    ACTIONS_KEYS,
-    action_vector,
-    batchify_data,
-    batchify_labels,
-    create_model_input_output,
-    decode_actions,
-    detect_actions,
-    encode_actions,
-    last_stop_label,
-    nearest_sentence_l,
-    nearest_sentence_r,
-    recover_text,
-    token_labels_to_word_labels,
-    token_word_mapping,
-    tokenize_labeled_text,
-)
-
-
-def test_detect_actions():
-    actions = detect_actions("Janek.", None)
-    assert actions == {
-        "upper_case": True,
-        "dot": True,
-        "colon": False,
-        "question_mark": False,
-    }
-
-    actions = detect_actions("ewka?", None)
-    assert actions == {
-        "upper_case": False,
-        "dot": False,
-        "colon": False,
-        "question_mark": True,
-    }
-
-    actions = detect_actions("Test", None)
-    assert actions == {
-        "upper_case": True,
-        "dot": False,
-        "colon": False,
-        "question_mark": False,
-    }
-
-
-def test_encode_actions():
-    x = {
-        "upper_case": False,
-        "dot": True,
-        "colon": False,
-        "question_mark": True,
-    }
-
-    assert np.all(encode_actions(x) == np.array([0, 1, 0, 1]))
-
-
-def test_decode_actions():
-    x = np.array([0, 1, 0, 1])
-
-    assert decode_actions(x) == {
-        "upper_case": False,
-        "dot": True,
-        "colon": False,
-        "question_mark": True,
-    }
-
-
-def test_token_word_mapping():
-    text = "janek poszedł do ogrodu"
-    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
-
-    text_tokenized = tokenizer(text)
-
-    mapping = token_word_mapping(text, tokenizer)
-
-    assert len(mapping) == (len(text_tokenized["input_ids"]) - 2)
-    assert min(mapping) == 0
-    assert max(mapping) == 3
-
-
-def test_token_labels_to_word_labels():
-    text = "janek poszedł do ogrodu"
-    labels = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])
-    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
-
-    _, token_labels = tokenize_labeled_text(text, labels, tokenizer)
-
-    word_labels = token_labels_to_word_labels(text, token_labels, tokenizer)
-
-    assert np.all(np.vectorize(pytest.approx)(word_labels, labels))
-
-
-def test_tokenize_labeled_text():
-    text = "Janek poszedł do ogrodu. Ogród był zwierzęcy. Spotkał tam Zosię?"
-    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
-
-    text_clean, labels = create_model_input_output(text)
-    tokens, token_labels = tokenize_labeled_text(text_clean, labels, tokenizer)
-
-    assert len(tokens.shape) == 2
-    assert len(token_labels.shape) == 2
-
-    assert tokens.shape[1] == 1
-    assert token_labels.shape[1] == len(ACTIONS_KEYS)
-
-    assert len(tokens) == len(token_labels)
-    assert tokens[0, 0] != tokenizer.cls_token_id
-    assert tokens[-1, 0] != tokenizer.sep_token_id
-
-
-def test_recover_text():
-    text = "Janek poszedł do ogrodu. Ogród był zwierzęcy. Spotkał tam Zosię?"
-    text_clean, word_labels = create_model_input_output(text)
-
-    result_text = recover_text(text_clean, word_labels)
-
-    assert result_text == text
-
-
-def test_nearest_sentence_l():
-    end = create_dummy_action(True)
-    word = create_dummy_action(False)
-
-    entry = np.array([word, word, word, end, end, word, word, end])
-
-    assert nearest_sentence_l(entry, 3) == 0
-    assert nearest_sentence_l(entry, 4) == 0
-    assert nearest_sentence_l(entry, 5) == 5
-    assert nearest_sentence_l(entry, 7) == 5
-
-
-def create_dummy_action(end_sentence: bool) -> np.array:
-    return encode_actions(
-        {
-            "upper_case": False,
-            "dot": end_sentence,
-            "colon": False,
-            "question_mark": False,
-        }
-    )
-
-
-def test_nearest_sentence_r():
-    end = create_dummy_action(True)
-    word = create_dummy_action(False)
-
-    entry = np.array([word, word, word, end, end, word, word, end])
-
-    assert nearest_sentence_r(entry, 0) == 0
-    assert nearest_sentence_r(entry, 4) == 5
-    assert nearest_sentence_r(entry, 5) == 5
-    assert nearest_sentence_r(entry, 6) is None
-    assert nearest_sentence_r(entry, 7) is None
-
-
-def test_batchify_labels():
-    end = create_dummy_action(True)
-    word = create_dummy_action(False)
-    entry = np.array([word, word, word, end, end, word, word, end])
-
-    batches = batchify_labels(entry, 3, 1)
-
-    assert len(batches) == 2
-    assert np.all(batches[0] == range(0, 3))
-    assert np.all(batches[1] == range(5, 8))
-
-
-def test_batchify_data():
-    text = "Janek poszedł do ogrodu. Ogród był zwierzęcy. Spotkał tam niedzwiedzia?"
-    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
-
-    text_clean, labels = create_model_input_output(text)
-    tokens, token_labels = tokenize_labeled_text(text_clean, labels, tokenizer)
-
-    input_batch, output_batch, mask_batch = batchify_data(
-        tokens, token_labels, 5, tokenizer
-    )
-
-    assert len(input_batch.shape) == 3
-    assert len(output_batch.shape) == 3
-    assert len(mask_batch.shape) == 2
-
-    assert input_batch.shape[0] == mask_batch.shape[0]
-    assert input_batch.shape[0] > 1
-
-    # Second dimension should be sequence length
-    assert input_batch.shape[1] == 5
-    assert output_batch.shape[1] == 5
-    assert mask_batch.shape[1] == 5
-
-    # Third dimension should be feature size
-    assert input_batch.shape[2] == 1
-    assert output_batch.shape[2] == len(ACTIONS_KEYS)
-
-    # Mask should be integer (1 - leave, 0 - mask out)
-    assert mask_batch.dtype == np.int
-
-    # Should never be fully masked
-    # TODO: Make sure correct convetions is used
-    assert np.all(mask_batch[:, 0] == 1)
-
-    # Should never be fully masked0
-    for i in range(input_batch.shape[0]):
-        # Should always start from beginning of the sentence
-        assert decode_actions(output_batch[i, 0, :])["upper_case"]
-        assert decode_actions(output_batch[i, 1, :])["upper_case"]
-
-        # Should always end with sep and padding#
-        # TODO: Test it
-
-
-def test_action_vector():
-    expected = encode_actions(
-        {"dot": True, "upper_case": True, "colon": False, "question_mark": False}
-    )
-
-    assert np.all(action_vector(["dot", "upper_case"]) == expected)
-
-
-def test_last_stop_label():
-    stop_action = action_vector(["Dot"])
-    not_stop_action = action_vector(["upper_case"])
-
-    labels = np.array([not_stop_action, not_stop_action, stop_action, not_stop_action])
-
-    assert last_stop_label(labels, stop_action) == 2
diff --git a/tests/pipelines/actions_based/test_scoring.py b/tests/pipelines/actions_based/test_scoring.py
deleted file mode 100644
index 62c524c..0000000
--- a/tests/pipelines/actions_based/test_scoring.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import numpy as np
-from numpy.testing import assert_allclose, assert_array_equal
-
-from src.pipelines.actions_based.scoring import (
-    multiclass_auc,
-    multiclass_roc_curve,
-    predictions_threshold,
-)
-
-
-def test_predictions_threshold():
-    threshold = 0.5
-    predictions = np.array([[[0.3, 0.6, 0.1, 0.2, 0.9], [0.3, 0.6, 0.1, 0.2, 0.9]]])
-    expected = np.array([[[0.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0, 1.0]]])
-
-    got = predictions_threshold(predictions, threshold)
-
-    assert np.all(got == expected)
-
-
-def test_multiclass_roc_curve():
-    predictions = np.array([[0.3, 0.2, 0.1, 0.3, 0.1], [0.7, 0.5, 0.1, 0.2, 0.9]])
-    expected = np.array([[0.0, 1.0, 0.0, 0.0, 1.0], [1.0, 0.0, 1.0, 1.0, 0.0]])
-
-    fpr, tpr, thresholds = multiclass_roc_curve(expected, predictions)
-
-    assert len(thresholds) == expected.shape[1]
-
-    # Thresholds
-    assert_allclose(thresholds[0], [1.7, 0.7, 0.3])
-    assert_allclose(thresholds[1], [1.5, 0.5, 0.2])
-    assert_allclose(thresholds[2], [1.1, 0.1])
-    assert_allclose(thresholds[3], [1.3, 0.3, 0.2])
-    assert_allclose(thresholds[4], [1.9, 0.9, 0.1])
-
-    # False positive rate
-    assert_array_equal(fpr[0], [0.0, 0.0, 1.0])
-    assert_array_equal(fpr[1], [0.0, 1.0, 1.0])
-    assert_array_equal(fpr[2], [0.0, 1.0])
-    assert_array_equal(fpr[3], [0.0, 1.0, 1.0])
-    assert_array_equal(fpr[4], [0.0, 1.0, 1.0])
-
-    # True positive rate
-    assert_array_equal(tpr[0], [0.0, 1.0, 1.0])
-    assert_array_equal(tpr[1], [0.0, 0.0, 1.0])
-    assert_array_equal(tpr[2], [0.0, 1.0])
-    assert_array_equal(tpr[3], [0.0, 0.0, 1.0])
-    assert_array_equal(tpr[4], [0.0, 0.0, 1.0])
-
-
-def test_multiclass_auc():
-    predictions = np.array([[0.3, 0.2, 0.1, 0.3, 0.1], [0.7, 0.5, 0.1, 0.2, 0.9]])
-    expected = np.array([[0.0, 1.0, 0.0, 0.0, 1.0], [1.0, 0.0, 1.0, 1.0, 0.0]])
-
-    fpr, tpr, _ = multiclass_roc_curve(expected, predictions)
-    result = multiclass_auc(fpr, tpr)
-
-    assert len(result) == 5
-    assert np.all(result >= 0)
-    assert np.all(result <= 1)
diff --git a/tests/pipelines/translation_based/__init__.py b/tests/pipelines/translation_based/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/pipelines/translation_based/test_processing.py b/tests/pipelines/translation_based/test_processing.py
deleted file mode 100644
index f7806a1..0000000
--- a/tests/pipelines/translation_based/test_processing.py
+++ /dev/null
@@ -1,185 +0,0 @@
-import numpy as np
-from transformers import BertTokenizerFast
-
-from src.pipelines.translation_based.processing import (
-    add_begin_end_tokens,
-    add_padding,
-    create_input_output,
-    crete_input_output_batch,
-    find_new_sentence_left,
-    find_new_sentence_right,
-    get_batch_indexes,
-    standarize_translation_sample,
-)
-
-
-def test_find_new_sentence_left():
-    test_input = np.array([0, 0, 1, 0, 1, 0])
-    assert find_new_sentence_left(test_input, 0) == 0
-    assert find_new_sentence_left(test_input, 1) == 0
-    assert find_new_sentence_left(test_input, 2) == 0
-    assert find_new_sentence_left(test_input, 3) == 3
-    assert find_new_sentence_left(test_input, 4) == 3
-    assert find_new_sentence_left(test_input, 5) == 5
-
-
-def test_find_new_sentence_right():
-    test_input = np.array([0, 0, 1, 0, 1, 0, 0])
-    assert find_new_sentence_right(test_input, 0) == 3
-    assert find_new_sentence_right(test_input, 1) == 3
-    assert find_new_sentence_right(test_input, 2) == 3
-    assert find_new_sentence_right(test_input, 3) == 3
-    assert find_new_sentence_right(test_input, 4) == 5
-    assert find_new_sentence_right(test_input, 5) == 5
-    assert find_new_sentence_right(test_input, 6) is None
-
-
-def test_split_to_samples():
-    min_len = 3
-    max_len = 5
-    test_input = np.array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0])
-    expeted_output = [np.array([0, 1, 2, 3, 4]), np.array([6, 7, 8, 9, 10])]
-
-    result = get_batch_indexes(test_input, min_len, max_len)
-    assert len(result) == len(expeted_output)
-
-    for got, expected in zip(result, expeted_output):
-        assert np.all(got == expected)
-
-
-def test_add_padding():
-    input_sequence = np.array([1, 2, 3, 4])
-
-    # Works with 0 padding
-    result = add_padding(input_sequence, 4, 9)
-    assert len(result) == 4
-    assert np.all(result == input_sequence)
-
-    # Normal use case
-    result = add_padding(input_sequence, 6, 9)
-    assert len(result) == 6
-    assert np.all(result == [1, 2, 3, 4, 9, 9])
-
-    # multidimensional use-case
-    input_sequence = np.array([[1, 2, 3], [4, 5, 6]])
-    padd = np.array([9, 9, 9])
-    result = add_padding(input_sequence, 4, padd)
-    assert len(result) == 4
-    assert np.all(result == [[1, 2, 3], [4, 5, 6], [9, 9, 9], [9, 9, 9]])
-
-
-def test_add_begin_end_tokens():
-    input_sequence = np.array([1])
-    result = add_begin_end_tokens(input_sequence, 9, 8)
-
-    assert len(result) == 3
-    assert np.all(result == [9, 1, 8])
-
-
-def test_standarize_translation_sample():
-    input_sequence = np.array([1])
-
-    result = standarize_translation_sample(input_sequence, 5, 5, 9, 8)
-
-    assert len(result) == 5
-    assert np.all(result == [9, 1, 8, 5, 5])
-
-
-def test_create_input_output():
-    sequence = [56500, 117, 10824, 30186, 11090, 10113, 119]
-    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
-
-    expected_output_sequence = [
-        tokenizer.cls_token_id,
-        56500,
-        117,
-        10824,
-        30186,
-        11090,
-        10113,
-        119,
-        tokenizer.sep_token_id,
-        tokenizer.pad_token_id,
-        tokenizer.pad_token_id,
-    ]
-    expected_input_sequence = [
-        tokenizer.cls_token_id,
-        21739,
-        10824,
-        16469,
-        tokenizer.sep_token_id,
-        tokenizer.pad_token_id,
-        tokenizer.pad_token_id,
-        tokenizer.pad_token_id,
-        tokenizer.pad_token_id,
-        tokenizer.pad_token_id,
-        tokenizer.pad_token_id,
-    ]
-
-    result_input, result_output = create_input_output(sequence, 11, tokenizer)
-
-    assert len(result_input) == len(expected_input_sequence)
-    assert len(result_output) == len(expected_output_sequence)
-    assert np.all(expected_input_sequence == result_input)
-    assert np.all(expected_output_sequence == result_output)
-
-
-def test_create_input_output_batch():
-    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
-
-    expected_output_1 = np.array(tokenizer("Ala, ma KoTa.")["input_ids"])[1:-1]
-    expected_output_2 = np.array(tokenizer("A kOt nie!")["input_ids"])[1:-1]
-
-    expected_input_1 = np.array(tokenizer("ala ma kota")["input_ids"])[1:-1]
-    expected_input_2 = np.array(tokenizer("a kot nie")["input_ids"])[1:-1]
-
-    input_sequence = np.concatenate([expected_output_1, expected_output_2])
-    batch_ids = [
-        np.array(list(range(len(expected_output_1)))),
-        np.array(list(range(len(expected_output_2)))) + len(expected_output_1),
-    ]
-
-    expected_input_1 = standarize_translation_sample(
-        expected_input_1,
-        20,
-        tokenizer.pad_token_id,
-        tokenizer.cls_token_id,
-        tokenizer.sep_token_id,
-    )
-    expected_input_2 = standarize_translation_sample(
-        expected_input_2,
-        20,
-        tokenizer.pad_token_id,
-        tokenizer.cls_token_id,
-        tokenizer.sep_token_id,
-    )
-    expected_output_1 = standarize_translation_sample(
-        expected_output_1,
-        20,
-        tokenizer.pad_token_id,
-        tokenizer.cls_token_id,
-        tokenizer.sep_token_id,
-    )
-    expected_output_2 = standarize_translation_sample(
-        expected_output_2,
-        20,
-        tokenizer.pad_token_id,
-        tokenizer.cls_token_id,
-        tokenizer.sep_token_id,
-    )
-
-    result_input, result_output = crete_input_output_batch(
-        input_sequence, batch_ids, 20, tokenizer
-    )
-
-    assert result_input.shape[0] == 2
-    assert result_input.shape[1] == 20
-
-    assert result_output.shape[0] == 2
-    assert result_output.shape[1] == 20
-
-    assert np.all(result_input[0] == expected_input_1)
-    assert np.all(result_input[1] == expected_input_2)
-
-    assert np.all(result_output[0] == expected_output_1)
-    assert np.all(result_output[1] == expected_output_2)
diff --git a/tests/test_batch_loading.py b/tests/test_batch_loading.py
deleted file mode 100644
index 641aaa7..0000000
--- a/tests/test_batch_loading.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import dask.dataframe as dd
-import numpy as np
-import pandas as pd
-
-from src.batch_loading import (
-    calculate_batch_buffer_id,
-    get_batches,
-    get_ordered_dataframe_len,
-    yield_batch_buffer_span,
-)
-
-
-def test_calculate_batch_buffer_id():
-    # ids = [0, 1, 2, 3, 4, 5, 6]
-    assert calculate_batch_buffer_id(0, 3) == 0
-    assert calculate_batch_buffer_id(1, 3) == 0
-    assert calculate_batch_buffer_id(2, 3) == 0
-    assert calculate_batch_buffer_id(3, 3) == 1
-    assert calculate_batch_buffer_id(4, 3) == 1
-    assert calculate_batch_buffer_id(5, 3) == 1
-    assert calculate_batch_buffer_id(6, 3) == 2
-
-
-def test_yield_batch_buffer_span():
-    ids = [0, 1, 2, 3, 4, 5, 6]
-
-    result = list(yield_batch_buffer_span(2, 2, len(ids)))
-
-    assert np.all(result[0] == [0, 1, 2, 3])
-    assert np.all(result[1] == [4, 5, 6])
-
-
-def test_get_ordered_dataframe_len():
-    df = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7]})
-
-    assert get_ordered_dataframe_len(df) == 7
-
-
-def test_get_batches():
-    batch_size = 2
-    batch_buffer_len = 2
-    pdf = pd.DataFrame({"a": [1, 0, 2, 3, 4, 5, 6]})
-    shuffled_ids = np.array([1, 0, 2, 3, 4, 5, 6])
-    df = dd.from_pandas(pdf, npartitions=2)
-
-    batches = list(get_batches(df, batch_size, batch_buffer_len, shuffled_ids))
-
-    assert np.all(batches[0]["a"].values == [0, 1])
-    assert np.all(batches[1]["a"].values == [2, 3])
-    assert np.all(batches[2]["a"].values == [4, 5])
-    assert np.all(batches[3]["a"].values == [6])
-
-    batches = list(get_batches(df, batch_size, batch_buffer_len, shuffled_ids, 1))
-
-    assert np.all(batches[1]["a"].values == [2, 3])
-    assert np.all(batches[2]["a"].values == [4, 5])
-    assert np.all(batches[3]["a"].values == [6])
diff --git a/tests/test_chunking.py b/tests/test_chunking.py
new file mode 100644
index 0000000..65e7b4b
--- /dev/null
+++ b/tests/test_chunking.py
@@ -0,0 +1,72 @@
+import numpy as np
+from punctuator.punctuator import combine_masks, inference_masks
+
+
+def test_inference_mask():
+    T = True
+    F = False
+
+    result, mask = inference_masks(11, 8, 2)
+
+    assert np.all(result == np.array([
+        [T, T, T, T, T, T, T, F, F, F, T],
+        [T, F, F, T, T, T, T, T, T, F, T],
+        [T, F, F, F, T, T, T, T, T, T, T],
+    ]))
+    assert np.all(mask == np.array([
+        [F, T, T, T, T, F, F, F],
+        [F, F, F, T, T, F, F, F],
+        [F, F, F, F, T, T, T, F],
+    ]))
+
+    result, mask = inference_masks(10, 8, 2)
+    assert np.all(result == np.array([
+        [T, T, T, T, T, T, T, F, F, T],
+        [T, F, F, T, T, T, T, T, T, T],
+    ]))
+    assert np.all(mask == np.array([
+        [F, T, T, T, T, F, F, F],
+        [F, F, F, T, T, T, T, F],
+    ]))
+
+    result, mask = inference_masks(5, 8, 2)
+    assert np.all(result == np.array([
+        [T, T, T, T, T],
+    ]))
+    assert np.all(mask == np.array([
+        [F, T, T, T, F]
+    ]))
+
+    result, mask = inference_masks(10, 9, 3)
+    assert np.all(result == np.array([
+        [T, T, T, T, T, T, T, T, F, T],
+        [T, F, T, T, T, T, T, T, T, T],
+    ]))
+    assert np.all(mask == np.array([
+        [F, T, T, T, T, F, F, F, F],
+        [F, F, F, F, T, T, T, T, F]
+
+    ]))
+
+
+def test_combine_mask():
+    T = True
+    F = False
+
+    result = combine_masks(11, 8, 2)
+    assert np.all(result == np.array([
+        [F, T, T, T, T, F, F, F, F, F, F],
+        [F, F, F, F, F, T, T, T, T, F, F],
+        [F, F, F, F, F, F, F, F, F, T, F],
+    ]))
+
+    result = combine_masks(10, 8, 2)
+    assert np.all(result == np.array([
+        [F, T, T, T, T, F, F, F, F, F],
+        [F, F, F, F, F, T, T, T, T, F],
+    ]))
+
+    result = combine_masks(5, 8, 2)
+    assert np.all(result == np.array([
+        [F, T, T, T, F],
+    ]))
diff --git a/tests/test_utils.py b/tests/test_utils.py
deleted file mode 100644
index 7e52db2..0000000
--- a/tests/test_utils.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import os
-from dataclasses import dataclass
-
-from src.utils import (
-    convert_to_timedelta,
-    input_preprocess,
-    latest_model,
-    output_preprocess,
-    remove_multiple_spaces,
-    remove_punctuation,
-    yaml_serializable,
-)
-
-
-def test_remove_multiple_spaces():
-    provided = "Ala   ma Kota.      Kot ma Ale "
-    expected = "Ala ma Kota. Kot ma Ale "
-
-    assert remove_multiple_spaces(provided) == expected
-
-
-def test_remove_punctuation():
-    provided = "Ala..  ma-Kota!?.@@$ Kot ma Ale ()*"
-    expected = "Ala  maKota Kot ma Ale "
-
-    assert remove_punctuation(provided) == expected
-
-    whitelist = [".", "?"]
-    expected_whitelist = "Ala..  maKota?. Kot ma Ale "
-
-    assert remove_punctuation(provided, whitelist) == expected_whitelist
-
-
-def test_input_preprocess():
-    provided = "Ala  ma-Kota!?.@@$ Kot ma Ale ()*"
-    expected = "ala makota kot ma ale"
-
-    assert input_preprocess(provided) == expected
-
-
-def test_output_preprocess():
-    provided = "Ala  ma-Kota!?.@@$ Kot ma Ale ()*"
-    expected = "Ala ma Kota.?. Kot ma Ale"
-
-    assert output_preprocess(provided) == expected
-
-
-def test_convert_to_timedelta():
-    assert convert_to_timedelta("5d").days == 5
-    assert convert_to_timedelta("5d").seconds == 0
-    assert convert_to_timedelta("5d").microseconds == 0
-
-    assert convert_to_timedelta("4h").days == 0
-    assert convert_to_timedelta("4h").seconds == 4 * 60 * 60
-    assert convert_to_timedelta("4h").microseconds == 0
-
-    assert convert_to_timedelta("3m").days == 0
-    assert convert_to_timedelta("3m").seconds == 3 * 60
-    assert convert_to_timedelta("3m").microseconds == 0
-
-    assert convert_to_timedelta("2s").days == 0
-    assert convert_to_timedelta("2s").seconds == 2
-    assert convert_to_timedelta("2s").microseconds == 0
-
-
-def test_latest_model():
-    files = []
-    assert latest_model(files) is None
-
-    files.append("/path/tam/pam/Wrongformat.b")
-    assert latest_model(files) is None
-
-    files.append("/path/tam/pam/0-2000.b")
-    assert latest_model(files) == (0, 2000)
-
-    files.append("/path/tam/pam/0-3000.c")
-    assert latest_model(files) == (0, 3000)
-
-    files.append("/path/tam/pam/1-1000.a")
-    assert latest_model(files) == (1, 1000)
-
-    files.append("/path/tam/pam/1-500.a")
-    assert latest_model(files) == (1, 1000)
-
-
-def test_yaml_serializable(tmp_path):
-    PATH = tmp_path / "test.yaml"
-
-    @yaml_serializable
-    @dataclass
-    class Test:
-        x: int = 3
-        y: str = "test1"
-
-    x = Test()
-    x.x = -1
-    x.y = "test2"
-    x.save_yaml(PATH)
-
-    assert os.path.exists(PATH)
-
-    y = Test.load_yaml(PATH)
-
-    assert y.x == -1
-    assert y.y == "test2"
diff --git a/tox.ini b/tox.ini
index 02ec5a0..0735fd9 100644
--- a/tox.ini
+++ b/tox.ini
@@ -2,12 +2,6 @@
 envlist = unittest,pep8
 skipsdist = True
 
-[testenv]
-deps =  -rrequirements.txt
-
-[testenv:unittest]
-commands = pytest --ignore data --ignore generated
-
 [flake8]
 exclude =
      .tox,
diff --git a/train.sh b/train.sh
deleted file mode 100755
index 3d7da2d..0000000
--- a/train.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-# Usage: ./train.sh [module_to_run] [container_name]
-# Eg.: ./train.sh src.pipelines.actions_based.train_base base_training
-
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-docker build . -f ./docker/training/Dockerfile -t clarinpl/punctuator_training --build-arg USERNAME=$(whoami) --build-arg USER_UID=$(id -u) --build-arg USER_GID=$(id -u) && \
-docker run -v $DIR:/punctuator --name $2 --gpus all -it --entrypoint python clarinpl/punctuator_training -m $1
diff --git a/worker.py b/worker.py
old mode 100755
new mode 100644
index 98e5a75..5c5d479
--- a/worker.py
+++ b/worker.py
@@ -1,62 +1,60 @@
 #!/usr/bin/python
 
 import configparser
-import logging
-from src.models.model_factory import MODELS_MAP
-from typing import List
+import json
 
 import nlp_ws
-import torch
+from transformers import AutoModelForTokenClassification, AutoTokenizer
 
-from src.utils import input_preprocess, output_preprocess
+from punctuator.punctuator import (combine_masks, decode, decode_labels,
+                                   inference_masks)
 
 
 class Worker(nlp_ws.NLPWorker):
-    """Class that implements example worker."""
-
     def init(self):
         self.config = configparser.ConfigParser()
         self.config.read("config.ini")
+        self.config = self.config['deployment']
 
-        self.device = torch.device(self.config["deployment"]["device"])
-        self.models_dir = self.config["deployment"]["models_dir"]
-        self.models = {}
-
-        self._log = logging.getLogger(__name__)
+        self.max_context_size = int(self.config['max_context_size'])
+        self.overlap = int(self.config['overlap'])
 
-        models_enabled = self.config["deployment"]["models_enabled"]
-        models_enabled = models_enabled.split(",")
+        model_path = self.config['model_path']
+        self.model = AutoModelForTokenClassification.from_pretrained(
+            model_path)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
 
-        self._load_models(models_enabled)
+        with open(f"{model_path}/classes.json", 'r') as f:
+            mapping = json.load(f)
+            self.mapping = list(mapping.keys())
 
-    def _load_models(self, models_list: List[str]):
-        for model_type in models_list:
-            self.models[model_type] = MODELS_MAP[model_type].load(
-                f"{self.models_dir}/{model_type}", "production", self.device
-            )
-            self.models[model_type].train(False)
+    def process(self, input_path: str, task_options: dict, output_path: str) -> None:
+        with open(input_path, 'r') as f:
+            text = f.read()
 
-    def process(self, input_file: str, task_options: dict, output_file: str) -> None:
-        """Implementation of example tasks that copies files."""
+        tokenized = self.tokenizer(text, return_tensors='pt')
 
-        if (
-            "model" in task_options.keys()
-            and task_options["model"] in MODELS_MAP.keys()
-        ):
-            model_type = task_options["model"]
-        else:
-            model_type = "actions_base"
+        num_tokens = len(tokenized['input_ids'][0])
 
-        with open(input_file, "r") as f:
-            text = input_preprocess(output_preprocess(f.read()))
+        # TODO: Consider adding batching support
+        results = []
+        for inference_mask, mask_mask in zip(*inference_masks(num_tokens, self.max_context_size, self.overlap)):
+            result = self.model(input_ids=tokenized['input_ids'][:, inference_mask],
+                                attention_mask=tokenized['attention_mask'][:, inference_mask])
+            print(result.logits.shape)
+            labels_ids = result.logits.detach().argmax(
+                dim=-1).squeeze().numpy()[mask_mask]
+            results.append(decode_labels(labels_ids, self.mapping))
+        labels = sum(results, [])
 
-        result = self.models[model_type].predict(text)
+        tokens = []
+        for combine_mask in combine_masks(num_tokens, self.max_context_size, self.overlap):
+            tokens += tokenized['input_ids'][0, combine_mask].numpy().tolist()
 
-        with open(output_file, "w") as f:
-            f.write(result)
+        text_out = decode(tokens, labels, self.tokenizer)
 
-        if self.device.type != "cpu":
-            torch.cuda.empty_cache()
+        with open(output_path, 'w') as f:
+            f.write(text_out)
 
 
 if __name__ == "__main__":
-- 
GitLab
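
For orientation, the chunked-inference flow in the new `worker.py` can be summarised as a standalone sketch. This is illustrative only: it assumes a `model`, `tokenizer` and label `mapping` already loaded as in `Worker.init`, and mirrors `Worker.process` rather than adding any new behaviour.

```python
from punctuator.punctuator import (combine_masks, decode, decode_labels,
                                   inference_masks)


def punctuate(text, model, tokenizer, mapping, max_context_size=256, overlap=20):
    """Sketch of Worker.process: predict window by window, keep only the
    unmasked slice of each window, then stitch tokens and labels back together."""
    tokenized = tokenizer(text, return_tensors="pt")
    num_tokens = len(tokenized["input_ids"][0])

    labels = []
    for window_mask, keep_mask in zip(
        *inference_masks(num_tokens, max_context_size, overlap)
    ):
        out = model(
            input_ids=tokenized["input_ids"][:, window_mask],
            attention_mask=tokenized["attention_mask"][:, window_mask],
        )
        label_ids = out.logits.detach().argmax(dim=-1).squeeze().numpy()[keep_mask]
        labels += decode_labels(label_ids, mapping)

    tokens = []
    for combine_mask in combine_masks(num_tokens, max_context_size, overlap):
        tokens += tokenized["input_ids"][0, combine_mask].numpy().tolist()

    return decode(tokens, labels, tokenizer)
```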


From 8ce4bc1a4c4fb2d854450ce9401bfd8574352bc2 Mon Sep 17 00:00:00 2001
From: Michal Pogoda <michalpogoda@hotmail.com>
Date: Mon, 15 Feb 2021 11:16:42 +0100
Subject: [PATCH 2/8] Added preprocessing

---
 worker.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/worker.py b/worker.py
index 5c5d479..cfab5f9 100644
--- a/worker.py
+++ b/worker.py
@@ -8,8 +8,15 @@ from transformers import AutoModelForTokenClassification, AutoTokenizer
 
 from punctuator.punctuator import (combine_masks, decode, decode_labels,
                                    inference_masks)
+import string
 
 
+def preprocess_input(text: str):
+    text = text.translate(str.maketrans('', '', string.punctuation))
+    text = text.lower()
+
+    return text
+
 class Worker(nlp_ws.NLPWorker):
     def init(self):
         self.config = configparser.ConfigParser()
@@ -32,6 +39,9 @@ class Worker(nlp_ws.NLPWorker):
         with open(input_path, 'r') as f:
             text = f.read()
 
+        # Make sure that the text is lowercase & punctuationless
+        text = preprocess_input(text)
+
         tokenized = self.tokenizer(text, return_tensors='pt')
 
         num_tokens = len(tokenized['input_ids'][0])
-- 
GitLab
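
For reference, the preprocessing introduced here strips ASCII punctuation and lowercases the input. A self-contained sketch (the helper is redefined locally so the snippet runs on its own):

```python
import string


def preprocess_input(text: str) -> str:
    # Mirrors the helper added above: drop ASCII punctuation, then lowercase.
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text.lower()


print(preprocess_input("Ala, ma KoTa."))  # -> ala ma kota
```

Note that `string.punctuation` covers ASCII punctuation only, so typographic quotes and dashes pass through unchanged.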


From 1d58fbf53a7830df4372b42da1f544407a2805a1 Mon Sep 17 00:00:00 2001
From: Michal Pogoda <michalpogoda@hotmail.com>
Date: Mon, 15 Feb 2021 11:17:23 +0100
Subject: [PATCH 3/8] Formatting fixes

---
 worker.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/worker.py b/worker.py
index cfab5f9..fd25b92 100644
--- a/worker.py
+++ b/worker.py
@@ -2,13 +2,13 @@
 
 import configparser
 import json
+import string
 
 import nlp_ws
 from transformers import AutoModelForTokenClassification, AutoTokenizer
 
 from punctuator.punctuator import (combine_masks, decode, decode_labels,
                                    inference_masks)
-import string
 
 
 def preprocess_input(text: str):
@@ -17,6 +17,7 @@ def preprocess_input(text: str):
 
     return text
 
+
 class Worker(nlp_ws.NLPWorker):
     def init(self):
         self.config = configparser.ConfigParser()
-- 
GitLab


From 2d4e41f494b1e139d892ce85e87eec017d5bbf72 Mon Sep 17 00:00:00 2001
From: Michal Pogoda <michalpogoda@hotmail.com>
Date: Mon, 15 Feb 2021 11:29:47 +0100
Subject: [PATCH 4/8] Added type hints and docstrings

---
 punctuator/punctuator.py | 45 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 3 deletions(-)

diff --git a/punctuator/punctuator.py b/punctuator/punctuator.py
index 0baf991..2ea5376 100644
--- a/punctuator/punctuator.py
+++ b/punctuator/punctuator.py
@@ -1,14 +1,33 @@
-from typing import List
+from typing import List, Tuple
 import numpy as np
 
 
 def decode_labels(results, labels_map) -> List[str]:
+    """Converts labes from ids to text representations
+
+    Args:
+        results (List[int]): List of label ids
+        labels_map (List[str]): List of class names, ordered to match the label ids
+
+    Returns:
+        List[str]: List of class names
+    """
     labels_decoded = list(map(lambda x: labels_map[x], results))
 
     return labels_decoded
 
 
 def decode(tokens, labels_decoded, tokenizer):
+    """Applies predictions to text in order to get punctuated text representation
+
+    Args:
+        tokens (List[int]): List of token ids
+        labels_decoded (List[str]): Per-token class names
+        tokenizer: Huggingface tokenizer
+
+    Returns:
+        str: Text with punctuation & casing applied
+    """
     text_recovered = []
     word = []
     word_end = ""
@@ -44,7 +63,17 @@ def decode(tokens, labels_decoded, tokenizer):
     return "".join(text_recovered)
 
 
-def inference_masks(num_tokens, max_len, overlap):
+def inference_masks(num_tokens: int, max_len: int, overlap: int) -> Tuple[List[List[bool]], List[List[bool]]]:
+    """ Splits text that is to long for predicting. The function provide list of masks for each prediction chunk
+
+    Args:
+        num_tokens (int): Number of tokens, including CLS & SEP
+        max_len (int): Prediction window (must be less than 512)
+        overlap (int): Amount of overlap between chunking windows
+
+    Returns:
+        Tuple[List[List[bool]], List[List[bool]]]: Masks for tokens provided for inference & for the result of inference
+    """
     if max_len >= num_tokens:
         return [[True] * num_tokens], [[False] + [True] * (num_tokens - 2) + [False]]
 
@@ -85,7 +114,17 @@ def inference_masks(num_tokens, max_len, overlap):
     return entries, masks
 
 
-def combine_masks(num_tokens, max_len, overlap):
+def combine_masks(num_tokens: int, max_len: int, overlap: int) -> List[List[bool]]:
+    """Provides mask which tokens to take for each prediction. It makes sure that each token is only taken once & scored by best chunk.
+
+    Args:
+        num_tokens (int): Number of tokens, including CLS & SEP
+        max_len (int): Prediction window (must be less than 512)
+        overlap (int): Amount of overlap between chunking windows
+
+    Returns:
+        List[List[bool]]: Token mask
+    """
     if max_len >= num_tokens:
         return np.array([[False] + [True] * (num_tokens - 2) + [False]])
 
-- 
GitLab
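
The interplay of the two helpers documented above is easiest to see on a small example. The sketch below only restates expectations already encoded in `tests/test_chunking.py` (10 tokens, a window of 8, an overlap of 2):

```python
import numpy as np

from punctuator.punctuator import combine_masks, inference_masks

num_tokens, max_len, overlap = 10, 8, 2  # token count includes CLS and SEP

window_masks, keep_masks = inference_masks(num_tokens, max_len, overlap)
combined = combine_masks(num_tokens, max_len, overlap)

# Two overlapping windows are needed to cover 10 tokens with an 8-token window.
assert np.array(window_masks).shape == (2, num_tokens)
# Each keep mask selects which of a window's max_len predictions are retained.
assert np.array(keep_masks).shape == (2, max_len)
# Combined, every non-special token is assigned to exactly one window.
assert np.array(combined).sum() == num_tokens - 2
```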


From 527eebb5dcf350ee787c4b27f004c63ee40e95a6 Mon Sep 17 00:00:00 2001
From: Michal Pogoda <michalpogoda@hotmail.com>
Date: Mon, 15 Feb 2021 11:31:56 +0100
Subject: [PATCH 5/8] Added README

---
 README.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2577a4c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,24 @@
+# Punctuator
+A service that automatically adds punctuation to a raw word stream (e.g. from speech-to-text) for the Polish language.
+
+**Example input**:
+> według webometrycznego rankingu uniwersytetów świata ze stycznia 2019 pokazującego zaangażowanie instytucji akademickich w internecie uczelnia zajmuje 5 miejsce w polsce wśród uczelni technicznych a na świecie 964 wśród wszystkich typów uczelni w rankingu szkół wyższych perspektyw politechnika wrocławska zajęła w 2019 roku 3 miejsce wśród uczelni technicznych oraz 6 miejsce spośród wszystkich uczelni akademickich w polsce
+
+**Output**:
+> Według webometrycznego rankingu uniwersytetów świata ze stycznia 2019, pokazującego zaangażowanie instytucji akademickich w Internecie, uczelnia zajmuje 5. miejsce w Polsce wśród uczelni technicznych, a na świecie 964. Wśród wszystkich typów uczelni w rankingu szkół wyższych perspektyw Politechnika Wrocławska zajęła w 2019 roku 3. miejsce wśród uczelni technicznych oraz 6. miejsce spośród wszystkich uczelni akademickich w Polsce
+
+## Config
+```ini
+[deployment]
+device = cpu ; Device on which inference will be made (e.g. cpu, cuda:0)
+models_dir = deploy ; Relative path to directory, where models will be placed
+models_enabled = actions_base,actions_mixed,actions_restricted ; which models are available. 
+```
+
+## LPMN
+```
+filedir(/users/michal.pogoda)|any2txt|punctuator
+```
+
+## Mountpoints
+The directory where the model (~500 MB) will be downloaded needs to be mounted at `/model/punctuator`. Mount `/model` to a host directory if you want to make it persistent.
-- 
GitLab


From 780b82156ccfda6212af7be501382012d271cf60 Mon Sep 17 00:00:00 2001
From: Michal Pogoda <michalpogoda@hotmail.com>
Date: Mon, 15 Feb 2021 11:40:44 +0100
Subject: [PATCH 6/8] Select device in config

---
 README.md  |  5 +++--
 config.ini |  3 ++-
 worker.py  | 11 ++++++-----
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 2577a4c..3534bb9 100644
--- a/README.md
+++ b/README.md
@@ -11,8 +11,9 @@ A service that automatically adds punctuation to a raw word stream (e.g. from sp
 ```ini
 [deployment]
 device = cpu ; Device on which inference will be made (e.g. cpu, cuda:0)
-models_dir = deploy ; Relative path to directory, where models will be placed
-models_enabled = actions_base,actions_mixed,actions_restricted ; which models are available. 
+model_path = /model/punctuator ; Path where the model will be placed
+max_context_size = 256 ; Number of tokens considered in a single prediction. Must be in the range 2*overlap+1 to 512
+overlap = 20 ; Number of surrounding context tokens included at inference for each text fragment
 ```
 
 ## LPMN
diff --git a/config.ini b/config.ini
index 47abace..037ac99 100644
--- a/config.ini
+++ b/config.ini
@@ -15,4 +15,5 @@ local_log_level = INFO
 [deployment]
 model_path = /model/punctuator
 max_context_size = 256
-overlap = 20
\ No newline at end of file
+overlap = 20
+device = cpu
\ No newline at end of file
diff --git a/worker.py b/worker.py
index fd25b92..794828d 100644
--- a/worker.py
+++ b/worker.py
@@ -27,9 +27,11 @@ class Worker(nlp_ws.NLPWorker):
         self.max_context_size = int(self.config['max_context_size'])
         self.overlap = int(self.config['overlap'])
 
+        self.device = self.config['device']
+
         model_path = self.config['model_path']
         self.model = AutoModelForTokenClassification.from_pretrained(
-            model_path)
+            model_path).to(self.device)
         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
 
         with open(f"{model_path}/classes.json", 'r') as f:
@@ -50,10 +52,9 @@ class Worker(nlp_ws.NLPWorker):
         # TODO: Consider adding batching support
         results = []
         for inference_mask, mask_mask in zip(*inference_masks(num_tokens, self.max_context_size, self.overlap)):
-            result = self.model(input_ids=tokenized['input_ids'][:, inference_mask],
-                                attention_mask=tokenized['attention_mask'][:, inference_mask])
-            print(result.logits.shape)
-            labels_ids = result.logits.detach().argmax(
+            result = self.model(input_ids=tokenized['input_ids'][:, inference_mask].to(self.device),
+                                attention_mask=tokenized['attention_mask'][:, inference_mask].to(self.device))
+            labels_ids = result.logits.detach().cpu().argmax(
                 dim=-1).squeeze().numpy()[mask_mask]
             results.append(decode_labels(labels_ids, self.mapping))
         labels = sum(results, [])
-- 
GitLab
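
The device handling added here follows the usual pattern for transformers models: move the model to the configured device once at init, move each input window before the forward pass, and bring logits back to the CPU before converting to numpy. A minimal standalone sketch under the same configuration (the model path is the one from `config.ini` at this point in the series; whether it exists is deployment-specific):

```python
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model_path = "/model/punctuator"  # model_path from config.ini

model = AutoModelForTokenClassification.from_pretrained(model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)

batch = tokenizer("ala ma kota", return_tensors="pt")
with torch.no_grad():
    logits = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
    ).logits
label_ids = logits.argmax(dim=-1).cpu().numpy()
```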


From 0d3e1b422fdaa0b1cdc530a6353822230fb4175f Mon Sep 17 00:00:00 2001
From: Michal Pogoda <michalpogoda@hotmail.com>
Date: Mon, 15 Feb 2021 16:48:54 +0100
Subject: [PATCH 7/8] Style fixes (88-character line limit), moved model into
 /home/worker/model

---
 Dockerfile.worker        |   2 +
 README.md                |   2 +-
 config.ini               |   4 +-
 entrypoint.sh            |  16 +++---
 punctuator/punctuator.py |  78 ++++++++++++++++++++---------
 tests/__init__.py        |   0
 tests/test_chunking.py   | 103 ++++++++++++++++++++++-----------------
 tox.ini                  |  11 ++++-
 worker.py                |  72 +++++++++++++++++----------
 9 files changed, 183 insertions(+), 105 deletions(-)
 create mode 100644 tests/__init__.py

diff --git a/Dockerfile.worker b/Dockerfile.worker
index 049273f..1046391 100644
--- a/Dockerfile.worker
+++ b/Dockerfile.worker
@@ -10,6 +10,8 @@ WORKDIR /workspace
 
 RUN pip3 install --index-url https://pypi.clarin-pl.eu/simple/ nlp_ws==0.6
 
+WORKDIR /home/worker
+
 COPY punctuator punctuator
 COPY entrypoint.sh entrypoint.sh
 COPY worker.py worker.py
diff --git a/README.md b/README.md
index 3534bb9..480c09c 100644
--- a/README.md
+++ b/README.md
@@ -22,4 +22,4 @@ filedir(/users/michal.pogoda)|any2txt|punctuator
 ```
 
 ## Mountpoints
-The directory where the model (~500 MB) will be downloaded needs to be mounted at `/model/punctuator`. Mount `/model` to a host directory if you want to make it persistent.
+The directory where the model (~500 MB) will be downloaded needs to be mounted at `/home/worker/model/punctuator`. Mount `/home/worker/model` to a host directory if you want to make it persistent.
diff --git a/config.ini b/config.ini
index 037ac99..de392b1 100644
--- a/config.ini
+++ b/config.ini
@@ -1,5 +1,5 @@
 [service]
-tool = textcleaner_test
+tool = punctuator_test
 root = /samba/requests/
 rabbit_host = test
 rabbit_user = test
@@ -13,7 +13,7 @@ port = 9981
 local_log_level = INFO
 
 [deployment]
-model_path = /model/punctuator
+model_path = /home/worker/model/punctuator
 max_context_size = 256
 overlap = 20
 device = cpu
\ No newline at end of file
diff --git a/entrypoint.sh b/entrypoint.sh
index bb89756..ecfde1c 100644
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
-if ! test -d "/model/punctuator"; then
-    mkdir -p /model/punctuator
-    wget https://minio.clarin-pl.eu/public/models/punctuator/model/pytorch_model.bin -O /model/punctuator/pytorch_model.bin
-    wget https://minio.clarin-pl.eu/public/models/punctuator/model/vocab.txt -O /model/punctuator/vocab.txt
-    wget https://minio.clarin-pl.eu/public/models/punctuator/model/config.json -O /model/punctuator/config.json
-    wget https://minio.clarin-pl.eu/public/models/punctuator/model/tokenizer_config.json -O /model/punctuator/tokenizer_config.json
-    wget https://minio.clarin-pl.eu/public/models/punctuator/model/special_tokens_map.json -O /model/punctuator/special_tokens_map.json
-    wget https://minio.clarin-pl.eu/public/models/punctuator/model/classes.json -O /model/punctuator/classes.json
+if ! test -d "/home/worker/model/punctuator"; then
+    mkdir -p /home/worker/model/punctuator
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/pytorch_model.bin -O /home/worker/model/punctuator/pytorch_model.bin
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/vocab.txt -O /home/worker/model/punctuator/vocab.txt
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/config.json -O /home/worker/model/punctuator/config.json
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/tokenizer_config.json -O /home/worker/model/punctuator/tokenizer_config.json
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/special_tokens_map.json -O /home/worker/model/punctuator/special_tokens_map.json
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/classes.json -O /home/worker/model/punctuator/classes.json
 fi
 
 python worker.py
\ No newline at end of file
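
For context, a condensed sketch of how worker.py consumes the files fetched above once the model lives under the new path; it mirrors `Worker.init`, and the comment about `classes.json` is an assumption based on how the mapping is used later in the worker:

```
import json

from transformers import AutoModelForTokenClassification, AutoTokenizer

model_path = "/home/worker/model/punctuator"

model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# The keys of classes.json serve as the label mapping passed to decode_labels.
with open(f"{model_path}/classes.json", "r") as f:
    mapping = list(json.load(f).keys())
```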
diff --git a/punctuator/punctuator.py b/punctuator/punctuator.py
index 2ea5376..45f8efa 100644
--- a/punctuator/punctuator.py
+++ b/punctuator/punctuator.py
@@ -63,8 +63,11 @@ def decode(tokens, labels_decoded, tokenizer):
     return "".join(text_recovered)
 
 
-def inference_masks(num_tokens: int, max_len: int, overlap: int) -> Tuple[List[List[bool]], List[List[bool]]]:
-    """ Splits text that is to long for predicting. The function provide list of masks for each prediction chunk
+def inference_masks(
+    num_tokens: int, max_len: int, overlap: int
+) -> Tuple[List[List[bool]], List[List[bool]]]:
+    """Splits text that is to long for predicting. The function provide list
+       of masks for each prediction chunk
 
     Args:
         num_tokens (int): Number of tokens, including CLS & SEP
@@ -72,10 +75,14 @@ def inference_masks(num_tokens: int, max_len: int, overlap: int) -> Tuple[List[L
         overlap (int): Amount of overlap between chunking windows
 
     Returns:
-        Tuple[List[List[bool]], List[List[bool]]]: Masks for tokens provided for inference & for result of inference
+        Tuple[List[List[bool]], List[List[bool]]]: Masks for the tokens
+            provided for inference & for the result of inference
     """
     if max_len >= num_tokens:
-        return [[True] * num_tokens], [[False] + [True] * (num_tokens - 2) + [False]]
+        return (
+            [[True] * num_tokens],
+            [[False] + [True] * (num_tokens - 2) + [False]],
+        )
 
     # Account for CLS & SEP tokens
     real_max_len = max_len - 2
@@ -88,22 +95,44 @@ def inference_masks(num_tokens: int, max_len: int, overlap: int) -> Tuple[List[L
     for start_id in range(0, real_num_tokens, step_size):
         stop = False
         if start_id == 0:
-            entry = [True] + [True] * real_max_len + [False] * \
-                (real_num_tokens - real_max_len) + [True]
-            mask = [False] + [True] * \
-                (real_max_len - overlap) + [False] * (overlap + 1)
+            entry = (
+                [True]
+                + [True] * real_max_len
+                + [False] * (real_num_tokens - real_max_len)
+                + [True]
+            )
+            mask = (
+                [False]
+                + [True] * (real_max_len - overlap)
+                + [False] * (overlap + 1)
+            )
         elif start_id + real_max_len >= real_num_tokens:
             offset_start = real_num_tokens - real_max_len
-            entry = [True] + [False] * \
-                (offset_start) + [True] * real_max_len + [True]
-            mask = [False] * (overlap + 1 + (start_id - offset_start)) + [True] * \
-                (real_max_len - overlap - (start_id - offset_start)) + [False]
+            entry = (
+                [True]
+                + [False] * (offset_start)
+                + [True] * real_max_len
+                + [True]
+            )
+            mask = (
+                [False] * (overlap + 1 + (start_id - offset_start))
+                + [True] * (real_max_len - overlap - (start_id - offset_start))
+                + [False]
+            )
             stop = True
         else:
-            entry = [True] + [False] * start_id + [True] * real_max_len + \
-                [False] * (real_num_tokens - (start_id + real_max_len)) + [True]
-            mask = [False] * (overlap + 1) + [True] * \
-                (real_max_len - 2 * overlap) + [False] * (overlap + 1)
+            entry = (
+                [True]
+                + [False] * start_id
+                + [True] * real_max_len
+                + [False] * (real_num_tokens - (start_id + real_max_len))
+                + [True]
+            )
+            mask = (
+                [False] * (overlap + 1)
+                + [True] * (real_max_len - 2 * overlap)
+                + [False] * (overlap + 1)
+            )
 
         masks.append(mask)
         entries.append(entry)
@@ -114,8 +143,11 @@ def inference_masks(num_tokens: int, max_len: int, overlap: int) -> Tuple[List[L
     return entries, masks
 
 
-def combine_masks(num_tokens: int, max_len: int, overlap: int) -> List[List[bool]]:
-    """Provides mask which tokens to take for each prediction. It makes sure that each token is only taken once & scored by best chunk.
+def combine_masks(
+    num_tokens: int, max_len: int, overlap: int
+) -> List[List[bool]]:
+    """Provides mask which tokens to take for each prediction. It makes sure
+       that each token is only taken once & scored by best chunk.
 
     Args:
         num_tokens (int): Number of tokens, including CLS & SEP
@@ -135,10 +167,12 @@ def combine_masks(num_tokens: int, max_len: int, overlap: int) -> List[List[bool
         stop = False
 
         if start + max_len - 2 - overlap < num_tokens - 2:
-            entry = [False] + [False] * \
-                (start) + [True] * (max_len - 2 - overlap)
-            entry += [False] * (num_tokens - 2
-                                - (start + max_len - 2 - overlap))
+            entry = (
+                [False] + [False] * (start) + [True] * (max_len - 2 - overlap)
+            )
+            entry += [False] * (
+                num_tokens - 2 - (start + max_len - 2 - overlap)
+            )
             entry += [False]
         else:
             entry = [False] + [False] * (start)
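
For readers skimming the reformatted helpers above, a small usage sketch; the token counts and the shapes in the comments mirror the unit tests that follow rather than an independent specification:

```
import numpy as np

from punctuator.punctuator import combine_masks, inference_masks

num_tokens, max_len, overlap = 11, 8, 2

# One (inference mask, result mask) pair per overlapping window.
inference, results = inference_masks(num_tokens, max_len, overlap)
print(np.array(inference).shape)  # (3, 11): which tokens feed each forward pass
print(np.array(results).shape)    # (3, 8): which predictions each chunk keeps

# One mask per window selecting the tokens that window is responsible for,
# so every non-special token is kept exactly once when chunks are merged.
print(np.array(combine_masks(num_tokens, max_len, overlap)).shape)  # (3, 11)
```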
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_chunking.py b/tests/test_chunking.py
index 65e7b4b..17cb854 100644
--- a/tests/test_chunking.py
+++ b/tests/test_chunking.py
@@ -8,45 +8,53 @@ def test_inference_mask():
 
     result, mask = inference_masks(11, 8, 2)
 
-    assert np.all(result == np.array([
-        [T, T, T, T, T, T, T, F, F, F, T],
-        [T, F, F, T, T, T, T, T, T, F, T],
-        [T, F, F, F, T, T, T, T, T, T, T],
-    ]))
-    assert np.all(mask == np.array([
-        [F, T, T, T, T, F, F, F],
-        [F, F, F, T, T, F, F, F],
-        [F, F, F, F, T, T, T, F],
-    ]))
+    assert np.all(
+        result
+        == np.array(
+            [
+                [T, T, T, T, T, T, T, F, F, F, T],
+                [T, F, F, T, T, T, T, T, T, F, T],
+                [T, F, F, F, T, T, T, T, T, T, T],
+            ]
+        )
+    )
+    assert np.all(
+        mask
+        == np.array(
+            [
+                [F, T, T, T, T, F, F, F],
+                [F, F, F, T, T, F, F, F],
+                [F, F, F, F, T, T, T, F],
+            ]
+        )
+    )
 
     result, mask = inference_masks(10, 8, 2)
-    assert np.all(result == np.array([
-        [T, T, T, T, T, T, T, F, F, T],
-        [T, F, F, T, T, T, T, T, T, T],
-    ]))
-    assert np.all(mask == np.array([
-        [F, T, T, T, T, F, F, F],
-        [F, F, F, T, T, T, T, F],
-    ]))
+    assert np.all(
+        result
+        == np.array(
+            [[T, T, T, T, T, T, T, F, F, T], [T, F, F, T, T, T, T, T, T, T], ]
+        )
+    )
+    assert np.all(
+        mask == np.array([[F, T, T, T, T, F, F, F], [F, F, F, T, T, T, T, F], ])
+    )
 
     result, mask = inference_masks(5, 8, 2)
-    assert np.all(result == np.array([
-        [T, T, T, T, T],
-    ]))
-    assert np.all(mask == np.array([
-        [F, T, T, T, F]
-    ]))
+    assert np.all(result == np.array([[T, T, T, T, T], ]))
+    assert np.all(mask == np.array([[F, T, T, T, F]]))
 
     result, mask = inference_masks(10, 9, 3)
-    assert np.all(result == np.array([
-        [T, T, T, T, T, T, T, T, F, T],
-        [T, F, T, T, T, T, T, T, T, T],
-    ]))
-    assert np.all(mask == np.array([
-        [F, T, T, T, T, F, F, F, F],
-        [F, F, F, F, T, T, T, T, F]
-
-    ]))
+    assert np.all(
+        result
+        == np.array(
+            [[T, T, T, T, T, T, T, T, F, T], [T, F, T, T, T, T, T, T, T, T], ]
+        )
+    )
+    assert np.all(
+        mask
+        == np.array([[F, T, T, T, T, F, F, F, F], [F, F, F, F, T, T, T, T, F]])
+    )
 
 
 def test_combine_mask():
@@ -54,19 +62,24 @@ def test_combine_mask():
     F = False
 
     result = combine_masks(11, 8, 2)
-    assert np.all(result == np.array([
-        [F, T, T, T, T, F, F, F, F, F, F],
-        [F, F, F, F, F, T, T, T, T, F, F],
-        [F, F, F, F, F, F, F, F, F, T, F],
-    ]))
+    assert np.all(
+        result
+        == np.array(
+            [
+                [F, T, T, T, T, F, F, F, F, F, F],
+                [F, F, F, F, F, T, T, T, T, F, F],
+                [F, F, F, F, F, F, F, F, F, T, F],
+            ]
+        )
+    )
 
     result = combine_masks(10, 8, 2)
-    assert np.all(result == np.array([
-        [F, T, T, T, T, F, F, F, F, F],
-        [F, F, F, F, F, T, T, T, T, F],
-    ]))
+    assert np.all(
+        result
+        == np.array(
+            [[F, T, T, T, T, F, F, F, F, F], [F, F, F, F, F, T, T, T, T, F], ]
+        )
+    )
 
     result = combine_masks(5, 8, 2)
-    assert np.all(result == np.array([
-        [F, T, T, T, F],
-    ]))
+    assert np.all(result == np.array([[F, T, T, T, F], ]))
diff --git a/tox.ini b/tox.ini
index 0735fd9..5de4ec6 100644
--- a/tox.ini
+++ b/tox.ini
@@ -2,6 +2,13 @@
 envlist = unittest,pep8
 skipsdist = True
 
+[testenv]
+deps =  -rrequirements.txt 
+        pytest >= 6.0.1
+
+[testenv:unittest]
+commands = pytest
+
 [flake8]
 exclude =
      .tox,
@@ -19,9 +26,9 @@ exclude =
     generated
 max-complexity = 10
 min_python_version = 3.8
-max-line-length = 80
+max-line-length = 88
 select = I,C,E,F,W,B,B950,TYP,T
-ignore = E501, C901, I201, W503
+ignore = E231, W503
 
 
 [testenv:pep8]
diff --git a/worker.py b/worker.py
index 794828d..7f2d6e7 100644
--- a/worker.py
+++ b/worker.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+"""Implementation of punctuator service"""
 
 import configparser
 import json
@@ -7,12 +7,16 @@ import string
 import nlp_ws
 from transformers import AutoModelForTokenClassification, AutoTokenizer
 
-from punctuator.punctuator import (combine_masks, decode, decode_labels,
-                                   inference_masks)
+from punctuator.punctuator import (
+    combine_masks,
+    decode,
+    decode_labels,
+    inference_masks,
+)
 
 
-def preprocess_input(text: str):
-    text = text.translate(str.maketrans('', '', string.punctuation))
+def _preprocess_input(text: str):
+    text = text.translate(str.maketrans("", "", string.punctuation))
     text = text.lower()
 
     return text
@@ -22,50 +26,68 @@ class Worker(nlp_ws.NLPWorker):
     def init(self):
         self.config = configparser.ConfigParser()
         self.config.read("config.ini")
-        self.config = self.config['deployment']
+        self.config = self.config["deployment"]
 
-        self.max_context_size = int(self.config['max_context_size'])
-        self.overlap = int(self.config['overlap'])
+        self.max_context_size = int(self.config["max_context_size"])
+        self.overlap = int(self.config["overlap"])
 
-        self.device = self.config['device']
+        self.device = self.config["device"]
 
-        model_path = self.config['model_path']
+        model_path = self.config["model_path"]
         self.model = AutoModelForTokenClassification.from_pretrained(
-            model_path).to(self.device)
+            model_path
+        ).to(self.device)
         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
 
-        with open(f"{model_path}/classes.json", 'r') as f:
+        with open(f"{model_path}/classes.json", "r") as f:
             mapping = json.load(f)
             self.mapping = list(mapping.keys())
 
-    def process(self, input_path: str, task_options: dict, output_path: str) -> None:
-        with open(input_path, 'r') as f:
+    def process(
+        self, input_path: str, task_options: dict, output_path: str
+    ) -> None:
+        with open(input_path, "r") as f:
             text = f.read()
 
         # Make sure that the text is lowercase & punctuationless
-        text = preprocess_input(text)
+        text = _preprocess_input(text)
 
-        tokenized = self.tokenizer(text, return_tensors='pt')
+        tokenized = self.tokenizer(text, return_tensors="pt")
 
-        num_tokens = len(tokenized['input_ids'][0])
+        num_tokens = len(tokenized["input_ids"][0])
 
         # TODO: Consider adding batching support
         results = []
-        for inference_mask, mask_mask in zip(*inference_masks(num_tokens, self.max_context_size, self.overlap)):
-            result = self.model(input_ids=tokenized['input_ids'][:, inference_mask].to(self.device),
-                                attention_mask=tokenized['attention_mask'][:, inference_mask].to(self.device))
-            labels_ids = result.logits.detach().cpu().argmax(
-                dim=-1).squeeze().numpy()[mask_mask]
+        for inference_mask, mask_mask in zip(
+            *inference_masks(num_tokens, self.max_context_size, self.overlap)
+        ):
+            result = self.model(
+                input_ids=tokenized["input_ids"][:, inference_mask].to(
+                    self.device
+                ),
+                attention_mask=tokenized["attention_mask"][
+                    :, inference_mask
+                ].to(self.device),
+            )
+            labels_ids = (
+                result.logits.detach()
+                .cpu()
+                .argmax(dim=-1)
+                .squeeze()
+                .numpy()[mask_mask]
+            )
             results.append(decode_labels(labels_ids, self.mapping))
         labels = sum(results, [])
 
         tokens = []
-        for combine_mask in combine_masks(num_tokens, self.max_context_size, self.overlap):
-            tokens += tokenized['input_ids'][0, combine_mask].numpy().tolist()
+        for combine_mask in combine_masks(
+            num_tokens, self.max_context_size, self.overlap
+        ):
+            tokens += tokenized["input_ids"][0, combine_mask].numpy().tolist()
 
         text_out = decode(tokens, labels, self.tokenizer)
 
-        with open(output_path, 'w') as f:
+        with open(output_path, "w") as f:
             f.write(text_out)
 
 
-- 
GitLab


From c3e69ebd3d55f2f6babd62249bbe62cfb87eba97 Mon Sep 17 00:00:00 2001
From: Michal Pogoda <michalpogoda@hotmail.com>
Date: Mon, 15 Feb 2021 16:50:08 +0100
Subject: [PATCH 8/8] Added unit testing to CI

---
 .gitlab-ci.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3c0266d..fa2e58a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -6,6 +6,7 @@ cache:
 
 stages:
   - check_style
+  - testing
   - build
 
 before_script:
@@ -16,6 +17,11 @@ pep8:
   script:
     - tox -v -e pep8
 
+unittest:
+  stage: testing
+  script:
+    - tox -v -e unittest
+
 build_image:
   stage: build
   image: docker:18.09.7
-- 
GitLab