diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..19d3acfbdd4924abfe9ebdaf1154f9bb8af3a0b8
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,15 @@
+data
+__pycache__
+.devcontainer
+.dvc
+.idea
+.metals
+.pytest_cache
+.tox
+.vscode
+checkpoints
+dask-worker-space
+data
+generated
+notebooks
+tests
\ No newline at end of file
diff --git a/.dvc/.gitignore b/.dvc/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..528f30c71c687de473bbb506c071e902beba6cd9
--- /dev/null
+++ b/.dvc/.gitignore
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
diff --git a/.dvc/config b/.dvc/config
new file mode 100644
index 0000000000000000000000000000000000000000..c30b54e8bb1397015b945d52aff4287518efa2c5
--- /dev/null
+++ b/.dvc/config
@@ -0,0 +1,6 @@
+[core]
+    remote = newremote
+['remote "newremote"']
+    url = s3://punctuation/action_based
+    endpointurl = https://minio.clarin-pl.eu
+    profile = clarinpl
diff --git a/.dvc/plots/confusion.json b/.dvc/plots/confusion.json
new file mode 100644
index 0000000000000000000000000000000000000000..0d9a3336542c4a9219297b878a241e5cae4b5d1f
--- /dev/null
+++ b/.dvc/plots/confusion.json
@@ -0,0 +1,30 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "mark": "rect",
+    "encoding": {
+        "x": {
+            "field": "<DVC_METRIC_X>",
+            "type": "nominal",
+            "sort": "ascending",
+            "title": "<DVC_METRIC_X_LABEL>"
+        },
+        "y": {
+            "field": "<DVC_METRIC_Y>",
+            "type": "nominal",
+            "sort": "ascending",
+            "title": "<DVC_METRIC_Y_LABEL>"
+        },
+        "color": {
+            "aggregate": "count",
+            "type": "quantitative"
+        },
+        "facet": {
+            "field": "rev",
+            "type": "nominal"
+        }
+    }
+}
diff --git a/.dvc/plots/default.json b/.dvc/plots/default.json
new file mode 100644
index 0000000000000000000000000000000000000000..d00782a82919f89fdfd63f2b4be97a5c3c71389d
--- /dev/null
+++ b/.dvc/plots/default.json
@@ -0,0 +1,29 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "mark": {
+        "type": "line"
+    },
+    "encoding": {
+        "x": {
+            "field": "<DVC_METRIC_X>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_X_LABEL>"
+        },
+        "y": {
+            "field": "<DVC_METRIC_Y>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_Y_LABEL>",
+            "scale": {
+                "zero": false
+            }
+        },
+        "color": {
+            "field": "rev",
+            "type": "nominal"
+        }
+    }
+}
diff --git a/.dvc/plots/scatter.json b/.dvc/plots/scatter.json
new file mode 100644
index 0000000000000000000000000000000000000000..90165d4cf67864c94992b83c6c8f027f89fe41c4
--- /dev/null
+++ b/.dvc/plots/scatter.json
@@ -0,0 +1,27 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "mark": "point",
+    "encoding": {
+        "x": {
+            "field": "<DVC_METRIC_X>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_X_LABEL>"
+        },
+        "y": {
+            "field": "<DVC_METRIC_Y>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_Y_LABEL>",
+            "scale": {
+                "zero": false
+            }
+        },
+        "color": {
+            "field": "rev",
+            "type": "nominal"
+        }
+    }
+}
diff --git a/.dvc/plots/smooth.json b/.dvc/plots/smooth.json
new file mode 100644
index 0000000000000000000000000000000000000000..d497ce75e9e5375733781bd3c3b8b936b9bdec0b
--- /dev/null
+++ b/.dvc/plots/smooth.json
@@ -0,0 +1,39 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "mark": {
+        "type": "line"
+    },
+    "encoding": {
+        "x": {
+            "field": "<DVC_METRIC_X>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_X_LABEL>"
+        },
+        "y": {
+            "field": "<DVC_METRIC_Y>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_Y_LABEL>",
+            "scale": {
+                "zero": false
+            }
+        },
+        "color": {
+            "field": "rev",
+            "type": "nominal"
+        }
+    },
+    "transform": [
+        {
+            "loess": "<DVC_METRIC_Y>",
+            "on": "<DVC_METRIC_X>",
+            "groupby": [
+                "rev"
+            ],
+            "bandwidth": 0.3
+        }
+    ]
+}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..c929ac9f1d18bbc3f31025b852c0b314088e31b4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+dane/**
+dataset_simple
+dataset_actions
+**/dask-worker-space
+.vscode
+.devcontainer
+.idea
+.metals
+/data
+__pycache__
+.pytest_cache
+/checkpoints
+.dvc
+.tox
+notebooks
+dvc.lock
\ No newline at end of file
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..4d608510e92abaf9cb192f7fc3d70ef0ee68b9e0
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,39 @@
+image: python:3.8.5
+
+cache:
+  paths:
+    - .tox
+
+stages:
+  - check_style
+  - testing
+  - build
+
+before_script:
+  - pip install tox==3.19.0
+
+pep8:
+  stage: check_style
+  script:
+    - tox -v -e pep8
+
+unittest:
+  stage: testing
+  script:
+    - tox -v -e unittest
+
+build_image:
+  stage: build
+  image: docker:18.09.7
+  only:
+    - master
+  services:
+    - docker:18.09.7-dind
+  before_script:
+    - ''
+  script:
+    - docker build -t clarinpl/punctuator .
+    - echo $DOCKER_PASSWORD > pass.txt
+    - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
+    - rm pass.txt
+    - docker push clarinpl/punctuator
\ No newline at end of file
diff --git a/.isort.cfg b/.isort.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..9e5a06c7bf348e962699f493661b04f32b35f6d2
--- /dev/null
+++ b/.isort.cfg
@@ -0,0 +1,3 @@
+[settings]
+profile=hug
+src_paths=src,test
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..2b3b6a3852dc79edf55914ce01f983c6a2f9b98c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,16 @@
+FROM clarinpl/python:3.8
+
+RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y gcc python3-dev
+RUN mkdir /punctuator
+WORKDIR /punctuator
+
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt && rm requirements.txt
+
+COPY src ./src
+COPY config.ini .
+COPY worker.py .
+  
+RUN pip3 freeze
+
+ENTRYPOINT [ "./worker.py" ]
\ No newline at end of file
diff --git a/README.md b/README.md
index b9e0cc6d557287bf17f7eccd4821ab9264505f83..0b99108079d2776d97c90b456d3a21769ff92f85 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,10 @@
-# punctuator
+# Punctuator
+A service that automatically adds punctuation to a raw word stream (e.g. from speech-to-text).
 
+## Approaches
+1. Token classification (actions): each token is classified with 4 binary labels: upper case, dot, colon, question mark. The model is the stacked encoder part of the transformer architecture (BERT), followed by a fully connected layer that transforms the output into per-token multilabel binary classifications. For now there is no restriction on predicting the dot, question_mark and colon labels simultaneously, so that is an area for improvement (hierarchical, multilabel classification).
+
+2. Sequence-to-sequence (translations): a full encoder-decoder stack that takes the input (unpunctuated text) and the output produced so far to predict the next token. In theory, this model can represent many more cases (e.g. all caps, mixed case, dashes, ellipses, etc.) without explicitly defined actions. However, the lack of constraints makes it much harder to train.
+
+## Mountpoints
+The directory where the model will be downloaded (~500 MB) needs to be mounted at /punctuator/deploy.
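To make the actions encoding concrete, below is a minimal sketch using the helpers added in src/pipelines/actions_based/processing.py later in this diff; the sample word and printed values are illustrative only.

```python
from src.pipelines.actions_based.processing import detect_actions, encode_actions

# "kota." ends a sentence, so only the "dot" action fires for this word.
actions = detect_actions("kota.", next_word=None)
# {'dot': True, 'upper_case': False, 'colon': False, 'question_mark': False}

encode_actions(actions)
# array([1., 0., 0., 0.])  -- order follows ACTIONS_KEYS: dot, upper_case, colon, question_mark
```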
diff --git a/config.ini b/config.ini
new file mode 100644
index 0000000000000000000000000000000000000000..9e5854734140c5933f8f9dcd97bb152cdcd9c5fd
--- /dev/null
+++ b/config.ini
@@ -0,0 +1,21 @@
+[service]
+tool = Punctuator
+
+root = /samba/requests/
+rabbit_host = addr
+rabbit_user = test
+rabbit_password = test
+
+[tool]
+workers_number = 1
+
+[logging]
+port = 9981
+local_log_level = INFO
+
+[deployment]
+device = "cpu"
+chunk_size = 500
+threshold = 0.9
+model = "deploy/model"
+base_model = "dkleczek/bert-base-polish-cased-v1"
\ No newline at end of file
diff --git a/data.dvc b/data.dvc
new file mode 100644
index 0000000000000000000000000000000000000000..eb543e5fe1c58fc1fe1bd7c3bef099098c00dae6
--- /dev/null
+++ b/data.dvc
@@ -0,0 +1,3 @@
+outs:
+- md5: 1fa175e752af1638dc896838e82a9d7d.dir
+  path: data
diff --git a/docker/development/Dockerfile b/docker/development/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..1535758262ef4bd86ae4fa82c0fc661b0d560d35
--- /dev/null
+++ b/docker/development/Dockerfile
@@ -0,0 +1,41 @@
+FROM ubuntu:20.04
+
+RUN apt update && apt install -y python3 python3-pip
+RUN apt update && apt install -y git
+RUN pip3 install ipywidgets
+
+#### CUDA Installation
+RUN apt-get update && apt-get install -y --no-install-recommends \
+gnupg2 curl ca-certificates && \
+    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \
+    echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
+    echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
+rm -rf /var/lib/apt/lists/*
+
+ENV CUDA_VERSION 10.2.89
+
+ENV CUDA_PKG_VERSION 10-2=$CUDA_VERSION-1
+
+# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        cuda-cudart-$CUDA_PKG_VERSION \
+cuda-compat-10-2 && \
+ln -s cuda-10.2 /usr/local/cuda && \
+    rm -rf /var/lib/apt/lists/*
+
+# Required for nvidia-docker v1
+RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
+    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
+
+ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
+ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
+
+# nvidia-container-runtime
+ENV NVIDIA_VISIBLE_DEVICES all
+ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
+ENV NVIDIA_REQUIRE_CUDA "cuda>=10.2 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411 brand=tesla,driver>=418,driver<419"
+
+### END CUDA Installation
+
+RUN pip3 install numpy pandas tqdm seaborn torch dask[complete] transformers pyarrow==0.17.1 pytest lxml
+RUN ln -s /usr/bin/pip3 /usr/bin/pip
\ No newline at end of file
diff --git a/download_dataset.sh b/download_dataset.sh
new file mode 100755
index 0000000000000000000000000000000000000000..5c70a48b7d483afe8d3756a523efc091234add2d
--- /dev/null
+++ b/download_dataset.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+wget http://manage.legis.nlp.ipipan.waw.pl/download/ppc-nanno.tar.gz
+tar -xvf ppc-nanno.tar.gz
+rm ppc-nanno.tar.gz
diff --git a/dvc.yaml b/dvc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bc292439f63f2203deb927a933a3cbd5d15d31af
--- /dev/null
+++ b/dvc.yaml
@@ -0,0 +1,99 @@
+stages:
+  actions_extraction:
+    cmd: python3 -m src.pipelines.actions_based.stage1_extraction
+    deps:
+    - data
+    - src/pipelines/actions_based/stage1_extraction.py
+    params:
+    - actions.extraction.num_partitions
+    outs:
+    - generated/actions/stage1_extraction
+  actions_tokenization:
+    cmd: python3 -m src.pipelines.actions_based.stage2_tokenization
+    deps:
+    - generated/actions/stage1_extraction
+    - src/pipelines/actions_based/stage2_tokenization.py
+    params:
+    - actions.tokenization.max_tokens
+    - actions.tokenization.min_tokens
+    - global.base_model
+    outs:
+    - generated/actions/stage2_tokenization
+  actions_exploding:
+    cmd: python3 -m src.pipelines.actions_based.stage3_exploding
+    deps:
+    - generated/actions/stage2_tokenization
+    - src/pipelines/actions_based/stage3_exploding.py
+    outs:
+    - generated/actions/stage3_exploding
+  actions_reindexing:
+    cmd: python3 -m src.pipelines.actions_based.stage4_reindexing
+    deps:
+    - generated/actions/stage3_exploding
+    - src/pipelines/actions_based/stage4_reindexing.py
+    outs:
+    - generated/actions/stage4_reindexing
+  actions_stats:
+    cmd: python3 -m src.pipelines.actions_based.stage5_stats
+    deps:
+    - generated/actions/stage4_reindexing
+    - src/pipelines/actions_based/stage5_stats.py
+    outs:
+    - generated/actions/stage5_stats
+  actions_training:
+    cmd: python3 -m src.pipelines.actions_based.train
+    deps:
+    - generated/actions/stage4_reindexing
+    - generated/actions/stage5_stats
+    - src/pipelines/actions_based/train.py
+    params:
+    - global.base_model
+    - actions.training.max_training_time
+    - actions.training.learning_rate
+    - actions.training.num_epochs
+    - actions.training.batch_size
+    - actions.training.save_step
+    outs:
+    - checkpoints/actions
+  translations_extraction:
+    cmd: python3 -m src.pipelines.translation_based.stage1_extraction
+    deps:
+    - data
+    params:
+    - translations.extraction.num_partitions
+    outs:
+    - generated/translations/stage1_extraction
+  translations_create_batches:
+    cmd: python3 -m src.pipelines.translation_based.stage2_create_batches
+    deps:
+    - generated/translations/stage1_extraction
+    params:
+    - global.base_model
+    outs:
+    - generated/translations/stage2_create_batches
+  translations_exploding:
+    cmd: python3 -m src.pipelines.translation_based.stage3_exploding
+    deps:
+    - generated/translations/stage2_create_batches
+    outs:
+    - generated/translations/stage3_exploding
+  translations_reindexing:
+    cmd: python3 -m src.pipelines.translation_based.stage4_reindexing
+    deps:
+    - generated/translations/stage3_exploding
+    outs:
+    - generated/translations/stage4_reindexing
+  translations_training:
+    cmd: python3 -m src.pipelines.translation_based.train
+    deps:
+    - generated/translations/stage4_reindexing
+    - src/pipelines/translation_based/train.py
+    params:
+    - global.base_model
+    - translations.training.max_training_time
+    - translations.training.learning_rate
+    - translations.training.num_epochs
+    - translations.training.batch_size
+    - translations.training.save_step
+    outs:
+    - checkpoints/translations
\ No newline at end of file
diff --git a/entrypoint.sh b/entrypoint.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e548dca6707b76c9ba68b7c4709dfd2151b5b34a
--- /dev/null
+++ b/entrypoint.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+if ! test -f "./deploy/model"; then
+    mkdir -p ./deploy
+    wget https://minio.clarin-pl.eu/public/models/punctuation/0-190000.model -O deploy/model
+fi
+
+python3 worker.py
\ No newline at end of file
diff --git a/generated/.gitignore b/generated/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..c96a04f008ee21e260b28f7701595ed59e2839e3
--- /dev/null
+++ b/generated/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
\ No newline at end of file
diff --git a/generated/actions/.gitignore b/generated/actions/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..49854ca837e8da5dd49b36a8ace3a190f7bb6581
--- /dev/null
+++ b/generated/actions/.gitignore
@@ -0,0 +1,5 @@
+/stage1_extraction
+/stage2_tokenization
+/stage3_exploding
+/stage4_reindexing
+/stage5_stats
\ No newline at end of file
diff --git a/generated/translations/.gitignore b/generated/translations/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..c31dad5b4ce3e42b41b9a521290b0bbffa383ebd
--- /dev/null
+++ b/generated/translations/.gitignore
@@ -0,0 +1,4 @@
+/stage1_extraction
+/stage2_create_batches
+/stage3_exploding
+/stage4_reindexing
diff --git a/params.yaml b/params.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3b9977d9621ab7d8a87e6d8170baedb08bedc817
--- /dev/null
+++ b/params.yaml
@@ -0,0 +1,67 @@
+global:
+    dashboard_port: 8787
+    base_model: "dkleczek/bert-base-polish-cased-v1"
+    random_seed: 44
+
+actions:
+    extraction:
+        num_partitions: 2_000
+        num_workers: 24
+        worker_memory_limit: "2GB"
+
+    tokenization:
+        min_tokens: 10
+        max_tokens: 500
+        num_workers: 24
+        worker_memory_limit: "2GB"
+
+    exploding:
+        num_workers: 24
+        worker_memory_limit: "2GB"
+
+    reindexing:
+        num_workers: 1
+        worker_memory_limit: "60GB"
+
+    stats:
+        num_workers: 24
+        worker_memory_limit: "2GB"
+
+    training:
+        learning_rate: 0.0001
+        num_epochs: 5
+        batch_size: 2
+        save_step: 100
+        max_training_time: null
+        loss_averaging_span: 1000
+        fresh_start: true
+        device: "cuda:0"
+translations:
+    extraction:
+        num_partitions: 2_000
+        num_workers: 24
+        worker_memory_limit: "2GB"
+
+    create_batches:
+        num_workers: 24
+        worker_memory_limit: "2GB"
+        min_tokens: 5
+        max_tokens: 300
+
+    exploding:
+        num_workers: 24
+        worker_memory_limit: "2GB"
+
+    reindexing:
+        num_workers: 1
+        worker_memory_limit: "60GB"
+
+    training:
+        learning_rate: 0.0001
+        num_epochs: 5
+        batch_size: 10
+        save_step: 1000
+        max_training_time: "4h"
+        loss_averaging_span: 1000
+        fresh_start: false
+        device: "cuda:1"
\ No newline at end of file
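The params.yaml entries above are the values referenced from the `params:` sections of dvc.yaml. The stages read them through src.utils.get_config(), which is not part of this diff, so the sketch below uses plain PyYAML for illustration.

```python
import yaml

with open("params.yaml") as f:
    params = yaml.safe_load(f)

print(params["global"]["base_model"])                   # dkleczek/bert-base-polish-cased-v1
print(params["actions"]["tokenization"]["max_tokens"])  # 500
```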
diff --git a/punctuate.py b/punctuate.py
new file mode 100755
index 0000000000000000000000000000000000000000..510eaad122f7452287806f57d6ef82f6bf0518e7
--- /dev/null
+++ b/punctuate.py
@@ -0,0 +1,52 @@
+#!/usr/bin/python3
+
+import argparse
+import os
+from argparse import Namespace
+
+from src.pipelines.actions_based.processing import apply_actions_punctuation
+from src.pipelines.actions_based.utils import load_model
+from src.utils import preprocess
+
+
+def get_args() -> Namespace:
+    parser = argparse.ArgumentParser(
+        description="Adds punctuaiton in to raw text stream."
+    )
+    parser.add_argument(
+        "-i", "--input", type=str, required=True, help="Path to input text file",
+    )
+    parser.add_argument(
+        "-o", "--output", type=str, required=True, help="Path to input text file",
+    )
+    parser.add_argument(
+        "-m", "--model", required=True, type=str, help="Path to the pretrained model",
+    )
+    parser.add_argument(
+        "-b", "--base", required=True, type=str, help="Name of base model",
+    )
+    parser.add_argument(
+        "-c", "--chunk_size", default=500, type=int, help="Maximum chunk size"
+    )
+    parser.add_argument("-t", "--threshold", default=0.9, type=float, help="Threshold")
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    if not os.path.exists(args.input):
+        print(f"Error: File '{args.input}' does not exists")
+        exit(-1)
+
+    tokenizer, model = load_model(args.model, args.base, "cpu")
+
+    with open(args.input, "r") as f:
+        text = preprocess(f.read())
+        text_processed = apply_actions_punctuation(
+            text, args.chunk_size, tokenizer, model, args.threshold
+        )
+
+    with open(args.output, "w") as f:
+        f.write(text_processed)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..17154b987755ad39634351de67de2f3166cbe8be
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,61 @@
+attrs==19.3.0
+bokeh==2.1.1
+certifi==2020.6.20
+chardet==3.0.4
+click==7.1.2
+cloudpickle==1.5.0
+cycler==0.10.0
+dask==2.22.0
+distributed==2.22.0
+filelock==3.0.12
+fsspec==0.8.0
+future==0.18.2
+HeapDict==1.0.1
+idna==2.10
+iniconfig==1.0.1
+Jinja2==2.11.2
+joblib==0.16.0
+kiwisolver==1.2.0
+locket==0.2.0
+lxml==4.5.2
+MarkupSafe==1.1.1
+matplotlib==3.3.0
+more-itertools==8.4.0
+msgpack==1.0.0
+numpy==1.19.1
+packaging==20.4
+pandas==1.1.0
+partd==1.1.0
+Pillow==7.2.0
+pluggy==0.13.1
+psutil==5.7.2
+py==1.9.0
+pyarrow==0.17.1
+pycurl==7.43.0
+pygobject==3.20.0
+pyparsing==2.4.7
+pytest==6.0.1
+python-apt==1.1.0b1+ubuntu0.16.4.9
+python-dateutil==2.8.1
+pytz==2020.1
+PyYAML==5.3.1
+regex==2020.7.14
+requests==2.24.0
+sacremoses==0.0.43
+scipy==1.5.2
+seaborn==0.10.1
+sentencepiece==0.1.91
+six==1.15.0
+sortedcontainers==2.2.2
+tblib==1.7.0
+tokenizers==0.8.1rc1
+toml==0.10.1
+toolz==0.10.0
+torch==1.6.0
+tornado==6.0.4
+tqdm==4.48.2
+transformers==3.0.2
+typing-extensions==3.7.4.2
+unattended-upgrades==0.1
+urllib3==1.25.10
+zict==2.0.0
\ No newline at end of file
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/batch_loading.py b/src/batch_loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0527e6a444d98027cc4761872da8f795b0cd5f7
--- /dev/null
+++ b/src/batch_loading.py
@@ -0,0 +1,96 @@
+from typing import Union
+
+import dask.dataframe as dd
+import numpy as np
+import pandas as pd
+
+
+def calculate_batch_buffer_id(batch_id: int, buffer_batch_num: int) -> int:
+    """Calculate which buffer should be loaded into memory for a given batch
+
+    Args:
+        batch_id (int): Id of the batch, counted from the start
+        buffer_batch_num (int): Number of batches that are loaded at once into memory
+
+    Returns:
+        int: Batch buffer id that needs to be in memory for a given batch
+    """
+    return batch_id // buffer_batch_num
+
+
+def yield_batch_buffer_span(
+    batch_size: int, batch_buffer_len: int, num_samples: int
+) -> np.array:
+    """Calculates which samples should be loaded in a given batch buffer
+
+    Args:
+        batch_buffer_id (int): Id of the buffer, counting from beggining
+        batch_buffer_size (int): Size of batch buffer (in number of batches)
+        num_samples (int): Number of samples in a dataset
+
+    Returns:
+        np.array: Contignous ids that should be loaded to memory for a given buffer
+    """
+    batch_buffer_size = batch_size * batch_buffer_len
+
+    batch_buffer_id = 0
+
+    while batch_buffer_id < (num_samples / batch_buffer_size):
+        buffer_start = batch_buffer_size * batch_buffer_id
+        buffer_end = min(num_samples, buffer_start + batch_buffer_size)
+
+        yield np.arange(buffer_start, buffer_end, 1, np.long)
+        batch_buffer_id += 1
+
+
+def get_ordered_dataframe_len(df: Union[pd.DataFrame, dd.DataFrame]) -> int:
+    """Gets length of a dataframe, which ids are ORDERED CONTINUOUSLY from 0 to N
+    without counting all the elements
+
+    Args:
+        df (Union[pd.DataFrame, dd.DataFrame]): Dataframe
+
+    Returns:
+        int: Length of the dataframe
+    """
+    return df.tail(1).index.values[0] + 1
+
+
+def get_batches(
+    df: dd.DataFrame,
+    batch_size: int,
+    batch_buffer_len: int,
+    shuffled_ids: np.array,
+    batch_start: int = 0,
+) -> pd.DataFrame:
+    """Generator for getting batches from large Dask dataframe with implemented buffering
+
+    Args:
+        df (dd.DataFrame): Source dask dataframe
+        batch_size (int): Desired size of a batch
+        batch_buffer_len (int): Number of batches to load to memory at once
+        shuffled_ids (np.array): Shuffled order of samples
+
+    Returns:
+        pd.DataFrame: [description]
+
+    Yields:
+        Iterator[pd.DataFrame]: [description]
+    """
+    length = get_ordered_dataframe_len(df)
+
+    batch_id = batch_start
+
+    for batch_buffer_span in yield_batch_buffer_span(
+        batch_size, batch_buffer_len, length
+    ):
+        buffer_ids = shuffled_ids[batch_buffer_span]
+        buffer = df.loc[buffer_ids].compute()
+
+        for i in range(batch_buffer_len):
+            batch_ids = buffer_ids[
+                range(i * batch_size, min((i + 1) * batch_size, len(buffer_ids)))
+            ]
+
+            yield buffer.loc[batch_ids]
+            batch_id += 1
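A minimal usage sketch of the buffered loader above, on hypothetical toy data; it assumes the dataframe index is already contiguous from 0 to N-1, which is what stage4_reindexing produces for the real datasets.

```python
import dask.dataframe as dd
import numpy as np
import pandas as pd

from src.batch_loading import get_batches

# Toy dataframe with a contiguous 0..N-1 index.
pdf = pd.DataFrame({"source": np.arange(100), "target": np.arange(100)})
df = dd.from_pandas(pdf, npartitions=4)

shuffled_ids = np.random.permutation(100)

# Loads 5 batches (50 samples) into memory at a time, yielding pandas batches of 10 rows.
for batch in get_batches(df, batch_size=10, batch_buffer_len=5, shuffled_ids=shuffled_ids):
    print(len(batch))
```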
diff --git a/src/models/TransformerSeq2Seq.py b/src/models/TransformerSeq2Seq.py
new file mode 100644
index 0000000000000000000000000000000000000000..753df1e392729766b9984216d24615f1e83098fa
--- /dev/null
+++ b/src/models/TransformerSeq2Seq.py
@@ -0,0 +1,113 @@
+import math
+
+import torch
+import torch.nn as nn
+
+
+class PositionalEncoding(nn.Module):
+    """Adds sinsusoidal positional encoding (as in original "Attention is all you need" paper.)
+    src: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
+
+    """
+
+    def __init__(self, d_model: int, max_len: int, dropout=0.1):
+        """Sinusidal positional encodings
+
+        Args:
+            d_model (int): Embedding dimension
+            max_len (int): Maximum length of sequence
+            dropout (float, optional): Dropout ratio. Defaults to 0.1.
+        """
+        super(PositionalEncoding, self).__init__()
+        self.dropout = nn.Dropout(p=dropout)
+
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
+        )
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0).transpose(0, 1)
+        self.register_buffer("pe", pe)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Applies positional encoding
+
+        Args:
+            x (torch.Tensor): Word embeddings tensor
+
+        Returns:
+            torch.Tensor: Word embeddings with added positional encodings
+        """
+        x = x + self.pe[: x.size(0), :]
+        return self.dropout(x)
+
+
+class TransformerSeq2Seq(nn.Module):
+    """Class representing a sequence to sequence transformer, based on original "Attention is all you need" paper."""
+
+    def __init__(
+        self,
+        vocab_size: int,
+        embedding_size: int,
+        max_len: int,
+        num_heads: int = 8,
+        encoder_layers: int = 6,
+        decoder_layers: int = 6,
+        feedforward_neurons: int = 2048,
+        dropout: float = 0.1,
+    ):
+
+        super(TransformerSeq2Seq, self).__init__()
+
+        # Embedd from token to vec space
+        self.word_embedding = nn.Embedding(vocab_size, embedding_size)
+
+        # Add positional encoding
+        self.position_embedding = PositionalEncoding(embedding_size, max_len, dropout)
+
+        # Combined encoder-decoder step
+        self.core = nn.Transformer(
+            embedding_size,
+            num_heads,
+            encoder_layers,
+            decoder_layers,
+            feedforward_neurons,
+            dropout,
+        )
+
+        # Map embedding to word
+        self.embedding_to_words = nn.Linear(embedding_size, vocab_size)
+
+    def forward(
+        self, source: torch.Tensor, target: torch.Tensor, source_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        """Full encoder-decoder pass
+
+        Args:
+            source (torch.Tensor): Tensor with batch of source sentences tokens [BxL shape]
+            target (torch.Tensor): Tensor with batch of target sentences tokens [BxL-1 shape]
+            source_mask (torch.Tensor): Mask applied to source (True if element is padding, False otherwise) [BxL shape]
+
+        Returns:
+            torch.Tensor: Tensor with predicted target sentences tokens [Bx(L-1)xV]
+        """
+        # Input to encoder
+        x = source.transpose(0, 1)
+        x = self.word_embedding(x)
+        x = self.position_embedding(x)
+
+        # Input to decoder
+        y = target.transpose(0, 1)
+        y = self.word_embedding(y)
+        y = self.position_embedding(y)
+
+        tgt_mask = self.core.generate_square_subsequent_mask(y.shape[0]).to(y.device)
+
+        z = self.core(
+            x, y, src_key_padding_mask=source_mask, tgt_mask=tgt_mask
+        ).transpose(1, 0)
+        z = self.embedding_to_words(z)
+
+        return z
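A quick shape check of the seq2seq model above, with hypothetical sizes: batch 2, source length 32, and the target shifted by one as the forward docstring describes.

```python
import torch

from src.models.TransformerSeq2Seq import TransformerSeq2Seq

model = TransformerSeq2Seq(vocab_size=1000, embedding_size=256, max_len=64)

source = torch.randint(0, 1000, (2, 32))            # B x L source token ids
target = torch.randint(0, 1000, (2, 31))            # B x (L-1) target token ids
source_mask = torch.zeros(2, 32, dtype=torch.bool)  # True marks padding positions (none here)

logits = model(source, target, source_mask)
print(logits.shape)  # torch.Size([2, 31, 1000]) -> B x (L-1) x vocab_size
```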
diff --git a/src/models/__init__.py b/src/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/pipelines/__init__.py b/src/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/pipelines/actions_based/__init__.py b/src/pipelines/actions_based/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/pipelines/actions_based/processing.py b/src/pipelines/actions_based/processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc5ac1fe42964b88c7471253ffee441f58ab5115
--- /dev/null
+++ b/src/pipelines/actions_based/processing.py
@@ -0,0 +1,626 @@
+from collections import defaultdict
+from typing import List, Mapping, Optional, Tuple
+from xml.etree import ElementTree as ET
+
+import numpy as np
+from transformers import BertTokenizerFast
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+
+from src.utils import remove_punctuation
+
+ACTIONS_KEYS = ["dot", "upper_case", "colon", "question_mark"]
+
+
+def apply_file_processing(x: dict) -> dict:
+    """Creates input-output pairs from xml file from dataset
+
+    Args:
+        x (dict): Dask dataframe row with columns: file
+
+    Returns:
+        dict: Dask dataframe row with columns: source, target, target_shape
+    """
+    full_text = text_from_xml(x.file)
+
+    if len(full_text) > 0:
+        model_input, model_output = create_model_input_output(full_text)
+
+        output_shape = np.array(model_output.shape, dtype=np.int)
+
+        return {
+            "source": model_input,
+            "target": model_output.reshape(-1),
+            "target_shape": output_shape,
+        }
+    else:
+        return {"source": None, "target": None, "target_shape": None}
+
+
+APPLY_FILE_PROCESSING_META = {
+    "source": object,
+    "target": object,
+    "target_shape": object,
+}
+
+
+def apply_tokenization(
+    df: dict, min_tokens: int, max_tokens: int, tokenizer: BertTokenizerFast
+) -> dict:
+    """Applies tokenization and chunking
+
+    Args:
+        df (dict): Dataframe entry with columns: source, target, target_shape
+        min_tokens (int): Minimum number of tokens in a single training example
+        max_tokens (int): Maximum number of tokens in a single training example
+        tokenizer (BertTokenizerFast): Tokenizer that will be used for tokenization
+
+    Returns:
+        dict: Dataframe entry with columns: source, target, attention_mask, source_shape
+                , target_shape, attention_mask_shape
+    """
+    text_clean = df.source
+    labels = df.target
+    shape = df.target_shape
+
+    tokens, token_labels = tokenize_labeled_text(
+        text_clean, labels.reshape(shape), tokenizer
+    )
+
+    inputs, outputs, attentions = batchify_data(
+        tokens, token_labels, max_tokens, tokenizer, min_tokens
+    )
+
+    inputs_shape = np.array(inputs.shape)
+    outputs_shape = np.array(outputs.shape)
+    attentions_shape = np.array(attentions.shape)
+
+    return {
+        "source": inputs.reshape(-1),
+        "target": outputs.reshape(-1),
+        "attention_mask": attentions.reshape(-1),
+        "source_shape": inputs_shape,
+        "target_shape": outputs_shape,
+        "attention_mask_shape": attentions_shape,
+    }
+
+
+APPLY_TOKENIZATION_META = {
+    "source": object,
+    "target": object,
+    "attention_mask": object,
+    "source_shape": object,
+    "target_shape": object,
+    "attention_mask_shape": object,
+}
+
+
+def action_vector(actions: List[str]) -> np.ndarray:
+    """Transforms array of label names into an action vector.
+
+    Args:
+        actions ([str]): Actions that should be in action vector (eg. ["dot", "upper_case"])
+
+    Returns:
+        np.ndarray: Action vector with provided actions
+    """
+    return encode_actions(
+        {
+            "dot": "dot" in actions,
+            "upper_case": "upper_case" in actions,
+            "colon": "colon" in actions,
+            "question_mark": "question_mark" in actions,
+        }
+    )
+
+
+def last_stop_label(labels: np.array, stop_action: np.array) -> Optional[int]:
+    """Finds the position of the last sentence ending token
+
+    Args:
+        labels (np.array): Array of token-labels in form of action vectors (LxA shape)
+        stop_action (np.array): Action vector that marks a stop token (A shape)
+
+    Returns:
+        int: Index of the last found stop token in a sentence. None if no stop token is found
+    """
+
+    assert len(labels.shape) == 2
+    assert len(stop_action.shape) == 1
+    stop_labels = np.argwhere(np.all(labels == stop_action, axis=1))
+
+    if len(stop_labels) == 0:
+        return None
+
+    return stop_labels[-1][0]
+
+
+def empty_action_vector() -> np.ndarray:
+    """Returns a do-nothing actions vector
+
+    Returns:
+        np.ndarray: Vector with all zeroes and length of ACTION_KEYS
+    """
+    return np.zeros(len(ACTIONS_KEYS))
+
+
+def empty_action_dict() -> dict:
+    """Returns a do-noting unencoded action dict
+
+    Returns:
+        dict: Action dict with all actions set to False
+    """
+
+    return decode_actions(empty_action_vector())
+
+
+def text_from_xml(path: str) -> str:
+    """Extract spoken text from dataset's xml format
+
+    Args:
+        path (str): Path to xml
+
+    Returns:
+        str: Raw text
+    """
+    root = ET.parse(path).getroot()
+
+    full_text = ""
+
+    for node in root.iter("*"):
+        if len(node) == 0:
+            who = node.get("who")
+            text = node.text
+
+            if text is not None and who is not None and who != "#komentarz":
+                full_text = " ".join([full_text, text])
+
+    del root
+
+    return full_text
+
+
+def detect_actions(word: str, next_word: Optional[str]) -> Mapping[str, bool]:
+    """Detect what actions should model perform on a word and returns encoded
+       action vector
+
+    Args:
+        word (str): Word on wich action is decided
+        next_word (Optional[str]): Word that follows considered word. Can be
+            None if nothing follows a word
+
+    Returns:
+        Mapping[str, bool]: Mapping telling if each of possible actions should be performed (True) or not (False)
+    """
+    # Unsupported characters
+    word = word.replace(";", ".")
+    word = word.replace('"', "")
+    word = word.replace("(", "")
+    word = word.replace(")", "")
+
+    while len(word) > 0 and not word[0].isalnum():  # remove leading non-alphanumeric characters
+        word = word[1:]
+
+    if len(word) == 0:
+        return dict(zip(ACTIONS_KEYS, [False] * len(ACTIONS_KEYS)))
+
+    actions = {
+        "dot": word[-1] == ".",
+        "upper_case": word[0].isupper(),
+        "colon": word[-1] == ",",
+        "question_mark": word[-1] == "?",
+    }
+
+    return actions
+
+
+def encode_actions(actions: Mapping[str, bool]) -> np.ndarray:
+    """Transforms actions into vector
+
+    Args:
+        actions (Mapping[str, bool]): Map telling which actions should be made
+
+    Returns:
+        np.ndarray: 1 dimensional action vector
+    """
+    return np.array(list(actions.values())).astype(float)
+
+
+def decode_actions(encoded_actions: np.ndarray) -> Mapping[str, bool]:
+    """Decodes actions
+
+    Args:
+        encoded_actions (np.ndarray): 1 dimensional action vector
+
+    Returns:
+        Mapping[str, bool]: Map telling which actions should be made
+    """
+    assert encoded_actions.shape[0] == len(ACTIONS_KEYS)
+
+    return dict(zip(ACTIONS_KEYS, encoded_actions.astype(np.bool).tolist()))
+
+
+def create_model_input_output(text: str) -> Tuple[str, np.ndarray]:
+    """Returns a pair of input and desired output of the model
+
+    Args:
+        text (str): Correct text sample
+
+    Returns:
+        text_cleaned (str): Text without any punctuation, all lowercase
+        actions (np.ndarray): Two-dimensional array where each row is the action vector of the corresponding word
+    """
+    words = text.split(" ")
+
+    words_output = []
+    actions_output = []
+
+    i = 0
+    while i < len(words):
+        word = words[i]
+        next_word = words[i + 1] if len(words) > i + 1 else None
+
+        word_sanitized = remove_punctuation(word).lower()
+        if len(word_sanitized) > 0:
+            actions = detect_actions(word, next_word)
+            actions_encoded = encode_actions(actions)
+
+            words_output.append(word_sanitized)
+            actions_output.append(actions_encoded)
+
+        i += 1
+
+    assert len(words_output) == len(actions_output)
+
+    return " ".join(words_output), np.array(actions_output)
+
+
+def token_word_mapping(text: str, tokenizer: PreTrainedTokenizerFast) -> np.ndarray:
+    """Returns mapping where each token is labeled with index of word it's part of
+
+    Args:
+        text (str): Input text
+        tokenizer (PreTrainedTokenizerFast): Tokenizer used to tokenize text
+
+    Returns:
+        np.ndarray: Array of length L (number of tokens) where each entry is the index of the word the token belongs to (cls and sep tokens are not counted).
+    """
+    text_tokenized = tokenizer(text, return_offsets_mapping=True)
+    offset_mappings = text_tokenized["offset_mapping"][1:-1]
+
+    # Create a map where each character is assigned the index of its word
+    words_mapping = []
+    actual_word = 0
+    for character in text:
+        words_mapping.append(actual_word)
+        if character == " ":
+            actual_word += 1
+
+    token_mapping = [words_mapping[x[0]] for x in offset_mappings]
+
+    return np.array(token_mapping)
+
+
+def token_labels_to_word_labels(
+    text: str, token_labels: np.ndarray, tokenizer: PreTrainedTokenizerFast
+) -> np.ndarray:
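+    """Collapses token-level labels back to word-level labels by averaging the
+    labels of all tokens that belong to the same word
+
+    Args:
+        text (str): Input text (words separated by single spaces)
+        token_labels (np.ndarray): Per-token labels (without cls/sep entries)
+        tokenizer (PreTrainedTokenizerFast): Tokenizer used to tokenize the text
+
+    Returns:
+        np.ndarray: Per-word labels, one row per word
+    """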
+    mapping = token_word_mapping(text, tokenizer)
+
+    assert len(mapping) == len(token_labels)
+
+    labels = defaultdict(list)
+
+    for i in range(len(mapping)):
+        labels[mapping[i]].append(token_labels[i])
+
+    return np.array([np.mean(labels[x], axis=0) for x in sorted(labels)])
+
+
+def tokenize_labeled_text(
+    text: str, labels: np.ndarray, tokenizer: PreTrainedTokenizerFast
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Transforms text into numerical tokens. Also expand word-level labels into token-level labels
+
+    Args:
+        text (str): Text that will be tokenized (TODO: Change to array)
+        labels (np.ndarray): Word-level labels for the text to be tokenized. Words are defined via space separation
+        tokenizer (PreTrainedTokenizerFast): Tokenizer that will be used for tokenization
+
+    Returns:
+        np.ndarray: 2-dimensional array with tokens (without cls and sep tokens!)
+        np.ndarray 2-dimensional array with token-level labels
+    """
+    text_tokenized = tokenizer(text, return_offsets_mapping=True)
+
+    offset_mappings = text_tokenized["offset_mapping"][1:-1]
+    input_ids = text_tokenized["input_ids"][1:-1]
+
+    # Create a map where each character is assigned the index of its word
+    words_mapping = []
+    actual_word = 0
+    for character in text:
+        words_mapping.append(actual_word)
+        if character == " ":
+            actual_word += 1
+
+    # Assign each token to a word
+    token_mapping = [words_mapping[x[0]] for x in offset_mappings]
+
+    # Expand word-based labels to token-based labels
+    labels_tokenized = [labels[i] for i in token_mapping]
+
+    return np.array(input_ids).reshape(-1, 1), np.array(labels_tokenized)
+
+
+def recover_word(word: str, action: Mapping[str, bool]) -> str:
+    """Applies action to a word
+
+    Args:
+        word (str): word on which action will be applied
+        action (Mapping[str, bool]): Action to be applied
+
+    Returns:
+        str: transformed word
+    """
+    word_result = word
+
+    if action["dot"]:
+        word_result += "."
+    if action["upper_case"]:
+        word_result = word_result.capitalize()
+    if action["colon"]:
+        word_result += ","
+    if action["question_mark"]:
+        word_result += "?"
+
+    return word_result
+
+
+def is_sentence_end(actions_encoded: np.ndarray) -> bool:
+    """Returns if given action would end a sentence
+
+    Args:
+        actions_encoded (np.ndarray): Action vector
+
+    Returns:
+        bool: True if action would end a sentence, False otherwise
+    """
+    actions_decoded = decode_actions(actions_encoded)
+
+    return actions_decoded["dot"] is True
+
+
+def nearest_sentence_l(labels: np.array, index_start: int) -> int:
+    """Find nearest word that begins a sentence that has lower or equal index to index_start
+
+    Args:
+        labels (np.array): 2-dimensional array of action-vectors
+        index_start (int): Index from which search will be started
+
+    Returns:
+        int: Index of nearest left-oriented start of the sentence. If no sentence is found, first index is assumed to
+             start a sentence
+    """
+    result_index = index_start
+
+    while result_index > 0:
+        if is_sentence_end(labels[result_index, :]):
+            # prevent being in the middle of a token
+            result_index -= 1
+        elif is_sentence_end(labels[result_index - 1, :]):
+            break
+        elif result_index == 1:
+            result_index = 0
+            break
+        else:
+            result_index -= 1
+
+    return result_index
+
+
+def nearest_sentence_r(labels: np.array, index_start: int) -> Optional[int]:
+    """Find nearest word that begins a sentence that has higher or equal index to index_start
+
+    Args:
+        labels (np.array): 2-dimensional array of action-vectors
+        index_start (int): Index from which search will be started
+
+    Returns:
+        int: Index of nearest right-oriented start of the sentence. None if no later sentence is found
+    """
+    result_index = index_start
+
+    while result_index < len(labels):
+        if is_sentence_end(labels[result_index - 1]) and not is_sentence_end(
+            labels[result_index]
+        ):
+            break
+        else:
+            result_index += 1
+
+    if result_index >= len(labels):
+        return None
+    else:
+        return result_index
+
+
+def batchify_labels(
+    labels: np.ndarray, max_tokens: int, min_tokens: int = 3
+) -> List[np.ndarray]:
+    """Splits long labels array into batches of desired size
+
+    Args:
+        labels (np.ndarray): 2-dimensional array of action-vectors
+        max_tokens (int): Maximum number of labels in a single batch
+        min_tokens (int, optional): Minimum number of labels in a single batch. Defaults to 3.
+
+    Returns:
+        [np.ndarray]: List of arrays with indexes composing each batch
+    """
+    assert min_tokens >= 1
+    assert max_tokens >= 1
+
+    labels_batches = []
+
+    index = 0
+    new_index = 0
+    while index < (labels.shape[0] - min_tokens):
+        num_consumed = min(max_tokens, labels.shape[0] - index)
+
+        assert num_consumed >= min_tokens
+
+        if index + num_consumed < (labels.shape[0] - min_tokens):
+            new_index = nearest_sentence_l(labels, index + num_consumed)
+            if new_index == index:
+                new_index = nearest_sentence_r(labels, index + num_consumed)
+                if new_index is None:
+                    labels_batches.append(
+                        np.array(list(range(index, index + num_consumed)))
+                    )
+                    break
+        else:
+            labels_batches.append(np.array(list(range(index, index + num_consumed))))
+            break
+
+        labels_batches.append(np.array(list(range(index, index + num_consumed))))
+
+        index = new_index
+
+    return labels_batches
+
+
+def add_cls_sep(
+    tokens: np.ndarray, labels: np.ndarray, tokenizer: PreTrainedTokenizerFast
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Adds staring cls and ending sep token ids into tokens & labels
+
+    Args:
+        tokens (np.ndarray): 2-dimensional array (with 1 feature!) of tokens
+        labels (np.ndarray): 2-dimensional array of action vectors
+
+    Returns:
+        np.ndarray: tokens with added cls & sep tokens ids
+        np.ndarray: labels with first and last item duplicated to accommodate cls & sep
+    """
+
+    tokens = np.concatenate(
+        [[[tokenizer.cls_token_id]], tokens, [[tokenizer.sep_token_id]]]
+    )
+    labels = np.concatenate([labels[:1, :], labels, labels[-1:, :]])
+
+    return tokens, labels
+
+
+def add_padding(
+    tokens: np.ndarray,
+    labels: np.ndarray,
+    length: int,
+    tokenizer: PreTrainedTokenizerFast,
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Appends padding to tokens and labels to match desired length
+
+    Args:
+        tokens (np.ndarray): Lx1 array of token ids
+        labels (np.ndarray): LxA array of action vectors
+        length (int): Desired length of a vector. Must be higher than L
+        tokenizer (PreTrainedTokenizerFast): Tokenizer that was used for tokenization
+
+    Returns:
+        np.ndarray: (L+P)x1 array of token ids with added padding
+        np.ndarray: (L+P)xA array of action vectors with added padding
+        np.ndarray: (L+P)-length mask array where 1 marks a real token and 0 marks padding
+    """
+
+    pad_length = length - tokens.shape[0]
+    assert pad_length >= 0
+
+    if pad_length > 0:
+        tokens = np.concatenate([tokens, [[tokenizer.pad_token_id]] * pad_length])
+        labels = np.concatenate([labels, [empty_action_vector()] * pad_length])
+
+    mask = np.ones(len(tokens)).astype(np.int)
+
+    if pad_length > 0:
+        mask[-pad_length:] = False
+
+    return tokens, labels, mask
+
+
+def batchify_data(
+    tokens: np.ndarray,
+    labels: np.ndarray,
+    max_tokens: int,
+    tokenizer: PreTrainedTokenizerFast,
+    min_tokens: int = 3,
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Chops a long tokens-labels pair into smaller chunks of equal length (with added padding)
+
+    Args:
+        tokens (np.ndarray): Tokens representing long, unpunctuated text (shape Lx1)
+        labels (np.ndarray): Action-labels to transform the provided text into a punctuated one (shape LxA)
+        max_tokens (int): Maximum number of tokens in a single entry
+        tokenizer (PreTrainedTokenizerFast): Tokenizer used to tokenize the sentence into tokens
+        min_tokens (int, optional): Minimum number of tokens in a sentence. Defaults to 3.
+
+    Returns:
+        Tuple[np.ndarray, np.ndarray, np.ndarray]:
+            tokens_batch - Tokens array split into equal-length chunks (shape: num_chunks x max_tokens x 1)
+            labels_batch - Labels array split into equal-length chunks (shape: num_chunks x max_tokens x A)
+            mask_batch - Attention mask for each chunk (shape: num_chunks x max_tokens)
+
+    """
+
+    assert max_tokens >= min_tokens + 2
+    assert min_tokens >= 1
+
+    tokens_batch = []
+    labels_batch = []
+    mask_batch = []
+
+    idxs = batchify_labels(labels, max_tokens - 2, min_tokens)
+
+    for ids in idxs:
+        tokens_sample = tokens[ids, :]
+        labels_sample = labels[ids, :]
+
+        assert len(ids) >= min_tokens
+        assert len(ids) <= max_tokens - 2
+
+        tokens_sample, labels_sample = add_cls_sep(
+            tokens_sample, labels_sample, tokenizer
+        )
+
+        assert len(tokens_sample) <= max_tokens
+
+        tokens_sample, labels_sample, mask = add_padding(
+            tokens_sample, labels_sample, max_tokens, tokenizer
+        )
+
+        tokens_batch.append(tokens_sample)
+        labels_batch.append(labels_sample)
+        mask_batch.append(mask)
+
+    return np.array(tokens_batch), np.array(labels_batch), np.array(mask_batch)
+
+
+def recover_text(text: str, actions_encoded: np.ndarray) -> str:
+    """Applies per-word actions to unpunctuated text
+
+    Args:
+        text (str): lowercase, unpunctuated text
+        actions_encoded (np.ndarray): Array of per-word action vectors (Shape LxA)
+
+    Returns:
+        str: Punctuated version of the text
+    """
+    words = text.split(" ")
+
+    words_output = []
+
+    for word, action_encoded in zip(words, actions_encoded.tolist()):
+        action_decoded = decode_actions(np.array(action_encoded))
+
+        word_recovered = recover_word(word, action_decoded)
+        words_output.append(word_recovered)
+
+    return " ".join(words_output)
diff --git a/src/pipelines/actions_based/stage1_extraction.py b/src/pipelines/actions_based/stage1_extraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..94dc26c7ce1265914989f1c3fa894537666e4535
--- /dev/null
+++ b/src/pipelines/actions_based/stage1_extraction.py
@@ -0,0 +1,49 @@
+# /usr/bin/python3
+import glob
+
+import dask.dataframe as dd
+import numpy as np
+import pandas as pd
+from dask.distributed import Client
+
+from src.pipelines.actions_based.processing import (
+    APPLY_FILE_PROCESSING_META,
+    apply_file_processing,
+)
+from src.utils import PROJECT_ROOT, get_config, prepare_folder
+
+INPUT_FOLDER = f"{PROJECT_ROOT}/data"
+OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage1_extraction"
+
+
+if __name__ == "__main__":
+
+    config = get_config()
+    num_partitions = config["actions"]["extraction"]["num_partitions"]
+    num_workers = config["actions"]["extraction"]["num_workers"]
+    memory_limit = config["actions"]["extraction"]["worker_memory_limit"]
+
+    prepare_folder(OUTPUT_FOLDER)
+
+    file_schema = "data/**/text_structure.xml"
+    files_paths = glob.glob(file_schema, recursive=True)
+
+    # Make sure python memory fragmentation won't go insane
+    np.random.shuffle(files_paths)
+
+    client = Client(n_workers=num_workers, memory_limit=memory_limit)
+    print(f"Dashboard: {client.dashboard_link}")
+
+    # Processing pipeline
+    df = dd.from_pandas(pd.DataFrame({"file": files_paths}), npartitions=num_partitions)
+
+    df = df.apply(
+        apply_file_processing,
+        result_type="expand",
+        axis=1,
+        meta=APPLY_FILE_PROCESSING_META,
+    )
+    df = df.dropna()
+
+    # Export
+    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/actions_based/stage2_tokenization.py b/src/pipelines/actions_based/stage2_tokenization.py
new file mode 100644
index 0000000000000000000000000000000000000000..b30445f601bb75d963a2d7b07dc50506c68e6a3b
--- /dev/null
+++ b/src/pipelines/actions_based/stage2_tokenization.py
@@ -0,0 +1,43 @@
+# /usr/bin/python3
+import dask
+import dask.dataframe as dd
+from dask.distributed import Client
+from transformers import BertTokenizerFast
+
+from src.pipelines.actions_based.processing import (
+    APPLY_TOKENIZATION_META,
+    apply_tokenization,
+)
+from src.utils import PROJECT_ROOT, get_config, prepare_folder
+
+INPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage1_extraction"
+OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage2_tokenization"
+
+if __name__ == "__main__":
+
+    config = get_config()
+    max_tokens = config["actions"]["tokenization"]["max_tokens"]
+    min_tokens = config["actions"]["tokenization"]["min_tokens"]
+    num_workers = config["actions"]["tokenization"]["num_workers"]
+    memory_limit = config["actions"]["tokenization"]["worker_memory_limit"]
+    base_model = config["global"]["base_model"]
+
+    prepare_folder(OUTPUT_FOLDER)
+
+    client = Client(n_workers=num_workers, memory_limit=memory_limit)
+    print(client.dashboard_link)
+
+    tokenizer = BertTokenizerFast.from_pretrained(base_model)
+
+    tokenizer = dask.delayed(tokenizer)
+
+    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
+    df = df.apply(
+        apply_tokenization,
+        args=(min_tokens, max_tokens, tokenizer),
+        result_type="expand",
+        axis=1,
+        meta=APPLY_TOKENIZATION_META,
+    )
+
+    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/actions_based/stage3_exploding.py b/src/pipelines/actions_based/stage3_exploding.py
new file mode 100644
index 0000000000000000000000000000000000000000..72ec12854063e4f73fcfc1dd99ac9f2b356cf70c
--- /dev/null
+++ b/src/pipelines/actions_based/stage3_exploding.py
@@ -0,0 +1,34 @@
+# /usr/bin/python3
+import dask.dataframe as dd
+from dask.distributed import Client
+
+from src.processing import (
+    EXPAND_DIMS_META,
+    FLATTEN_DIMS_META,
+    expand_dims,
+    flatten_dims,
+)
+from src.utils import PROJECT_ROOT, get_config, prepare_folder
+
+INPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage2_tokenization"
+OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage3_exploding"
+
+if __name__ == "__main__":
+    config = get_config()
+    num_workers = config["actions"]["exploding"]["num_workers"]
+    memory_limit = config["actions"]["exploding"]["worker_memory_limit"]
+
+    prepare_folder(OUTPUT_FOLDER)
+
+    client = Client(n_workers=num_workers, memory_limit=memory_limit)
+    print(client.dashboard_link)
+
+    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
+
+    df = df.apply(expand_dims, result_type="expand", axis=1, meta=EXPAND_DIMS_META)
+    df = df.map_partitions(
+        lambda x: x.apply(lambda y: y.explode(), axis=0), meta=EXPAND_DIMS_META
+    )
+    df = df.apply(flatten_dims, result_type="expand", axis=1, meta=FLATTEN_DIMS_META)
+
+    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/actions_based/stage4_reindexing.py b/src/pipelines/actions_based/stage4_reindexing.py
new file mode 100644
index 0000000000000000000000000000000000000000..fade725f0df5baea4e43da8fca19c6005bcf0af6
--- /dev/null
+++ b/src/pipelines/actions_based/stage4_reindexing.py
@@ -0,0 +1,29 @@
+# /usr/bin/python3
+import dask.dataframe as dd
+from dask.distributed import Client
+
+from src.utils import PROJECT_ROOT, get_config, prepare_folder
+
+INPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage3_exploding"
+OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage4_reindexing"
+
+if __name__ == "__main__":
+    config = get_config()
+    num_workers = config["actions"]["reindexing"]["num_workers"]
+    memory_limit = config["actions"]["reindexing"]["worker_memory_limit"]
+
+    prepare_folder(OUTPUT_FOLDER)
+
+    client = Client(n_workers=num_workers, memory_limit=memory_limit)
+    print(client.dashboard_link)
+
+    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
+
+    # Add ordered indexes
+    df = df.assign(ones=1)
+    df = df.reset_index(drop=True)
+    idx = (df.ones.cumsum() - 1).persist()
+    df = df.assign(ones=idx)
+
+    df = df.set_index("ones", shuffle="tasks")
+    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
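A tiny illustration (hypothetical data) of the reindexing trick above: a cumulative sum over a column of ones produces a global, contiguous 0..N-1 index across partitions, which get_batches in src/batch_loading.py relies on during training.

```python
import dask.dataframe as dd
import pandas as pd

df = dd.from_pandas(pd.DataFrame({"x": list("abcdef")}), npartitions=3)
df = df.assign(ones=1).reset_index(drop=True)
idx = (df.ones.cumsum() - 1).persist()
df = df.assign(ones=idx).set_index("ones", shuffle="tasks")

print(df.compute().index.tolist())  # [0, 1, 2, 3, 4, 5]
```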
diff --git a/src/pipelines/actions_based/stage5_stats.py b/src/pipelines/actions_based/stage5_stats.py
new file mode 100644
index 0000000000000000000000000000000000000000..a91ae2f10cbf7741d683108538857c0ee1e10e4b
--- /dev/null
+++ b/src/pipelines/actions_based/stage5_stats.py
@@ -0,0 +1,55 @@
+# /usr/bin/python3
+import pickle
+
+import dask.dataframe as dd
+import numpy as np
+from dask.distributed import Client
+
+from src.pipelines.actions_based.processing import ACTIONS_KEYS
+from src.processing import EXPAND_DIMS_META, expand_dims
+from src.utils import PROJECT_ROOT, get_config, prepare_folder
+
+INPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage4_reindexing"
+OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/actions/stage5_stats"
+
+
+def reduce_fold(fold_value, new_value):
+    return {
+        "class_number": fold_value["class_number"] + np.sum(new_value, axis=0),
+        "num_examples": fold_value["num_examples"] + new_value.shape[0],
+    }
+
+
+def reduce_partitions(x, y):
+    return {
+        "class_number": x["class_number"] + y["class_number"],
+        "num_examples": x["num_examples"] + y["num_examples"],
+    }
+
+
+if __name__ == "__main__":
+    config = get_config()
+    num_workers = config["actions"]["stats"]["num_workers"]
+    memory_limit = config["actions"]["stats"]["worker_memory_limit"]
+
+    prepare_folder(OUTPUT_FOLDER)
+
+    client = Client(n_workers=num_workers, memory_limit=memory_limit)
+    print(client.dashboard_link)
+
+    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
+    df = df.apply(expand_dims, result_type="expand", axis=1, meta=EXPAND_DIMS_META)
+
+    outputs_bag = df["target"].to_bag()
+
+    initial_values = {
+        "class_number": np.array([0] * len(ACTIONS_KEYS)),
+        "num_examples": 0,
+    }
+
+    result = outputs_bag.fold(
+        reduce_fold, reduce_partitions, initial=initial_values
+    ).compute()
+
+    with open(f"{OUTPUT_FOLDER}/stats.pickle", "wb") as f:
+        pickle.dump(result, f)
diff --git a/src/pipelines/actions_based/train.py b/src/pipelines/actions_based/train.py
new file mode 100755
index 0000000000000000000000000000000000000000..e6ed38e621e363d77408da543109f71eb419dc25
--- /dev/null
+++ b/src/pipelines/actions_based/train.py
@@ -0,0 +1,142 @@
+#!/usr/bin/python3
+
+import glob
+import pickle
+from datetime import datetime
+
+import dask.dataframe as dd
+import numpy as np
+import torch
+from torch.nn import BCEWithLogitsLoss
+from transformers import BertForTokenClassification, BertTokenizerFast
+
+from src.batch_loading import get_batches
+from src.pipelines.actions_based.processing import ACTIONS_KEYS
+from src.training import latest_model, save_training_step
+from src.utils import PROJECT_ROOT, convert_to_timedelta, get_config, prepare_folder
+
+INPUT_PATH = f"{PROJECT_ROOT}/generated/actions/stage4_reindexing"
+INPUT_STATS_PATH = f"{PROJECT_ROOT}/generated/actions/stage5_stats"
+OUTPUT_PATH = f"{PROJECT_ROOT}/checkpoints/actions"
+
+if __name__ == "__main__":
+    config = get_config()
+    learning_rate = config["actions"]["training"]["learning_rate"]
+    num_epochs = config["actions"]["training"]["num_epochs"]
+    batch_size = config["actions"]["training"]["batch_size"]
+    save_step = config["actions"]["training"]["save_step"]
+    loss_averaging_span = config["actions"]["training"]["loss_averaging_span"]
+    fresh_start = config["actions"]["training"]["fresh_start"]
+    device_name = config["actions"]["training"]["device"]
+    max_train_time = config["actions"]["training"]["max_training_time"]
+    base_model = config["global"]["base_model"]
+    seed = config["global"]["random_seed"]
+
+    prepare_folder(OUTPUT_PATH)
+    np.random.seed(seed=seed)
+
+    if max_train_time is not None:
+        max_train_time = convert_to_timedelta(max_train_time)
+
+    device = torch.device(device_name if torch.cuda.is_available() else "cpu")
+    print(f"Training on {device}")
+
+    # Load loss weights
+    with open(f"{INPUT_STATS_PATH}/stats.pickle", "rb") as f:
+        stats = pickle.load(f)
+        pos_examples = stats["class_number"]
+        neg_examples = stats["num_examples"] - stats["class_number"]
+        pos_weight = torch.tensor(neg_examples / pos_examples)
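+        # pos_weight compensates for class imbalance: actions with few positive
+        # token labels get a proportionally larger weight in BCEWithLogitsLoss.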
+
+    df = dd.read_parquet(INPUT_PATH, engine="pyarrow")
+    tokenizer = BertTokenizerFast.from_pretrained(base_model)
+
+    model = BertForTokenClassification.from_pretrained(
+        base_model, num_labels=len(ACTIONS_KEYS)
+    ).to(device)
+    criterion = BCEWithLogitsLoss(pos_weight=pos_weight).to(device)
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+
+    epoch_start = 0
+    sample_start = 0
+    if fresh_start is False:
+        checkpoint_files = glob.glob(f"{OUTPUT_PATH}/*.model")
+        latest = latest_model(checkpoint_files)
+
+        if latest is not None:
+            epoch, batch = latest
+            model.load_state_dict(
+                torch.load(f"{OUTPUT_PATH}/{epoch}-{batch}.model", map_location=device,)
+            )
+            optimizer.load_state_dict(
+                torch.load(
+                    f"{OUTPUT_PATH}/{epoch}-{batch}.optimizer", map_location=device,
+                )
+            )
+
+            epoch_start, sample_start = epoch, batch
+            print(f"Loaded {epoch}-{batch}")
+
+    model.train()
+    model.base_model.train()
+    losses = []
+
+    num_samples = df.tail(1).index.values[0] + 1
+    random_index_shuffle = np.random.permutation(range(num_samples))
+
+    training_stopped = False
+
+    time_max = datetime.max
+    if max_train_time is not None:
+        time_max = datetime.now() + max_train_time
+
+    for epoch in range(epoch_start, num_epochs):
+        if training_stopped:
+            break
+
+        i = sample_start
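+        # The third argument (100) is the batch-buffer length: get_batches pulls
+        # 100 batches worth of rows from dask at a time and slices individual
+        # batches out of that in-memory buffer.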
+        for data_batch in get_batches(df, batch_size, 100, random_index_shuffle, i):
+            inputs = data_batch.apply(
+                lambda x: x["source"].reshape(x["source_shape"]), axis=1
+            ).values
+            outputs = data_batch.apply(
+                lambda x: x["target"].reshape(x["target_shape"]), axis=1
+            ).values
+            attentions_mask = data_batch.apply(
+                lambda x: x["attention_mask"].reshape(x["attention_mask_shape"]),
+                axis=1,
+            ).values
+
+            inputs = torch.tensor(np.stack(inputs).squeeze()).to(device)
+            outputs = torch.tensor(np.stack(outputs)).to(device)
+            attentions_mask = torch.tensor(np.stack(attentions_mask)).to(device)
+
+            y_pred = model(input_ids=inputs, attention_mask=attentions_mask)[0]
+
+            loss = criterion(y_pred, outputs)
+
+            losses.append(loss.item())
+            if len(losses) > loss_averaging_span:
+                losses = losses[-loss_averaging_span:]
+
+            print(f"epoch: {epoch} | step: {i} | loss: {np.mean(losses)}")
+
+            optimizer.zero_grad()
+
+            if i % save_step == 0 and (i != sample_start or epoch != epoch_start):
+                print(f"Saving: Epoch {epoch}, step {i}")
+                save_training_step(OUTPUT_PATH, f"{epoch}-{i}", model, optimizer)
+
+            if datetime.now() > time_max:
+                print(f"Max time reached, saving: Epoch {epoch}, step {i}")
+                save_training_step(OUTPUT_PATH, f"{epoch}-{i}", model, optimizer)
+                training_stopped = True
+                break
+
+            loss.backward()
+            optimizer.step()
+
+            i += 1
+
+    if not training_stopped:
+        save_training_step(OUTPUT_PATH, "final", model, optimizer)
diff --git a/src/pipelines/actions_based/utils.py b/src/pipelines/actions_based/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f62b611c4d1e41e53148cd8a77fa36e5a819add
--- /dev/null
+++ b/src/pipelines/actions_based/utils.py
@@ -0,0 +1,99 @@
+from typing import Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+from transformers import BertForTokenClassification, BertTokenizerFast, PretrainedConfig
+
+from src.pipelines.actions_based.processing import (
+    ACTIONS_KEYS,
+    action_vector,
+    last_stop_label,
+    recover_text,
+    token_labels_to_word_labels,
+)
+
+
+def load_model(
+    model_path: str, base_model: str, device: str = "cpu"
+) -> Tuple[BertTokenizerFast, nn.Module]:
+    """Load pretrained model and it's tokenizer
+
+    Args:
+        model_path (str): Path to pretrained model
+        base_model (str): Name of base model
+        device (str, optional): Device on which model will be loaded. Defaults to "cpu".
+
+    Returns:
+        (BertTokenizerFast, nn.Module): Tokenizer & model
+    """
+
+    config = PretrainedConfig.from_pretrained(base_model)
+    config.num_labels = len(ACTIONS_KEYS)
+
+    device = torch.device(device)
+    tokenizer = BertTokenizerFast.from_pretrained(base_model)
+    model = BertForTokenClassification(config)
+    model.load_state_dict(torch.load(model_path, map_location=device))
+
+    return tokenizer, model
+
+
+def apply_actions_punctuation(
+    text: str,
+    chunk_size: int,
+    tokenizer: BertTokenizerFast,
+    model: nn.Module,
+    threshold: float = 0.9,
+) -> str:
+    """Adds punctuation to text using actions model
+
+    Args:
+        text (str): Raw, unpuctuated text
+        chunk_size (int): Maxium number of tokens to precess at once (both memory & computing scales ~O(n^2))
+        tokenizer (BertTokenizerFast): Tokenizer to use
+        model (nn.Module): Trained actions model
+        threshold (float, optional): Threshold after which action will be applied. Defaults to 0.9.
+
+    Returns:
+        str: [description]
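+
+    Example (illustrative; the checkpoint path and chunk size are placeholders):
+        tokenizer, model = load_model("checkpoints/actions/final.model", "bert-base-multilingual-cased")
+        apply_actions_punctuation("ala ma kota kot ma ale", 500, tokenizer, model)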
+    """
+
+    text = text.strip()
+
+    tokens = tokenizer(text, return_tensors="pt")["input_ids"]
+    output = None
+
+    index_start = 0
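+    # The text is processed in a sliding window of at most chunk_size tokens.
+    # After each forward pass only predictions up to the last detected sentence
+    # end ("dot" action) are kept, and the window restarts there, so no sentence
+    # is split across two chunks.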
+    while index_start < len(tokens[0]):
+        index_end = min(index_start + chunk_size, len(tokens[0]))
+
+        tokens_chunk = tokens[:, index_start:index_end]
+
+        raw_output = model(
+            input_ids=tokens_chunk,
+            token_type_ids=torch.zeros_like(tokens_chunk),
+            attention_mask=torch.ones_like(tokens_chunk),
+        )[0].sigmoid()
+        raw_output = raw_output[0].detach().numpy()
+
+        actions = raw_output > threshold
+        offset = last_stop_label(actions, action_vector("dot"))
+
+        # Prevent infinite loop
+        if (offset is None) or (offset == 0):
+            offset = index_end - index_start
+
+        if output is None:
+            output = raw_output[0:offset]
+        else:
+            output = np.concatenate([output, raw_output[0:offset]], axis=0)
+
+        index_start += offset
+
+    assert len(output) == len(tokens[0])
+
+    word_labels = token_labels_to_word_labels(text, output[1:-1], tokenizer)
+    actions = word_labels > threshold
+
+    return recover_text(text, actions)
diff --git a/src/pipelines/translation_based/__init__.py b/src/pipelines/translation_based/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/pipelines/translation_based/processing.py b/src/pipelines/translation_based/processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..41962da3b1e75641192bb018f3b5d028635a6316
--- /dev/null
+++ b/src/pipelines/translation_based/processing.py
@@ -0,0 +1,299 @@
+from typing import Tuple
+
+import numpy as np
+from transformers import BertTokenizerFast
+
+from src.pipelines.actions_based.processing import remove_punctuation, text_from_xml
+
+
+def raw_to_dataframe(entry: dict) -> dict:
+    """Converts dask datarfame containing files paths into
+    dataframe with content of that files (text only)
+
+    Args:
+        x (dict): Dask dataframe entry with one column ('file')
+
+    Returns:
+        dict: Dask dataframe entry with format {'input': str}. Can have null entries
+    """
+    full_text = text_from_xml(entry.file)
+
+    if len(full_text) > 0:
+        return {"input": full_text}
+    else:
+        return {"input": None}
+
+
+RAW_TO_DATAFRAME_META = {"input": str}
+
+
+def generate_batches(
+    entry: dict,
+    min_len: int,
+    max_len: int,
+    separating_token: int,
+    tokenizer: BertTokenizerFast,
+) -> dict:
+    """Converts raw text entries into list of tokens
+
+    Args:
+        x (dict): Dask dataframe entry with one column ['input'] containing text
+        tokenizer (BertTokenizerFast): Tokenizer used to tokenize. Must be a deleayed object to prevent memory leak!
+
+    Returns:
+        dict: Dask dataset entry with one column ('tokens') containing np.array list of tokens
+    """
+    tokens = np.array(tokenizer(entry.input)["input_ids"][1:-1])
+
+    tokens_ending = (tokens == separating_token).astype(np.int)
+    batch_indices = get_batch_indexes(tokens_ending, min_len, max_len - 2)
+
+    source_batch, target_batch = crete_input_output_batch(
+        tokens, batch_indices, max_len, tokenizer
+    )
+    mask_batch = (source_batch != tokenizer.pad_token_id).astype(np.int)
+
+    source_batch_shape = np.array(source_batch.shape)
+    target_batch_shape = np.array(target_batch.shape)
+    mask_batch_shape = np.array(mask_batch.shape)
+
+    source_batch = source_batch.reshape(-1)
+    target_batch = target_batch.reshape(-1)
+    mask_batch = mask_batch.reshape(-1)
+
+    return {
+        "source": source_batch,
+        "target": target_batch,
+        "attention_mask": mask_batch,
+        "source_shape": source_batch_shape,
+        "target_shape": target_batch_shape,
+        "attention_mask_shape": mask_batch_shape,
+    }
+
+
+GENERATE_BATCHES_META = {
+    "source": object,
+    "target": object,
+    "attention_mask": object,
+    "source_shape": object,
+    "target_shape": object,
+    "attention_mask_shape": object,
+}
+
+
+def find_new_sentence_left(seq: np.array, pos: int) -> int:
+    """Finds nerest sentence on the left of the current position (including current position)
+
+    Args:
+        seq (np.array): Array of 0s and 1s of length equal to sequence. 1 means end of sentence (dot, semicolon etc.) and 0 - every other token
+        pos (int): Starting position
+
+    Returns:
+        int: Position of the nearest new sentence on the left. Start of the sequence always counts as a start of sentence
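+
+    Example (mirrors tests/pipelines/translation_based/test_processing.py):
+        find_new_sentence_left(np.array([0, 0, 1, 0, 1, 0]), 4)  # -> 3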
+    """
+    assert pos < len(seq)
+    assert pos >= 0
+
+    while pos > 0:
+        if seq[pos - 1] == 1:
+            return pos
+        else:
+            pos = pos - 1
+
+    return 0
+
+
+def find_new_sentence_right(seq: np.array, pos: int) -> int:
+    """Finds nerest sentence on the right of the current position (including current position)
+
+    Args:
+        seq (np.array): Array of 0s and 1s of length equal to sequence. 1 means end of sentence (dot, semicolon etc.) and 0 - every other token
+        pos (int): [description]
+
+    Returns:
+        int: Position of the nearest new sentence on the right. Returns none if no new sentence is found on the right
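+
+    Example (mirrors the unit tests):
+        find_new_sentence_right(np.array([0, 0, 1, 0, 1, 0, 0]), 1)  # -> 3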
+    """
+    assert pos < len(seq)
+    assert pos >= 0
+
+    while pos < len(seq):
+        if seq[pos - 1] == 1:
+            return pos
+        else:
+            pos = pos + 1
+
+    return None
+
+
+def get_batch_indexes(seq: np.array, min_length: int, max_length: int) -> [np.array]:
+    """Turns long sequence into array of indices, composing a single batch file.
+
+    Args:
+        seq (np.array): Input sequence of 1s and 0s, where 1 means end of sequence token (dot, semicolon etc.)
+        min_length (int): Minimum length of sample in a batch
+        max_length (int): Maximum length of sample in a batch
+
+    Returns:
+        [np.array]: Array of indices, where each entry has length between <min_length, max_length>
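+
+    Example (mirrors the unit tests):
+        get_batch_indexes(np.array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0]), 3, 5)
+        # -> [array([0, 1, 2, 3, 4]), array([6, 7, 8, 9, 10])]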
+    """
+    pos = 0
+    batch = []
+
+    assert min_length <= max_length
+
+    while pos < len(seq):
+        pos_delta = min(max_length, len(seq) - pos)
+        assert pos + pos_delta <= len(seq)
+
+        if pos_delta >= min_length:
+            new_entry = np.array(list(range(pos, pos + pos_delta)))
+            assert len(new_entry) <= max_length
+
+            batch.append(new_entry)
+
+        if pos + pos_delta >= len(seq):
+            break
+
+        new_pos = find_new_sentence_left(seq, pos + pos_delta)
+        if new_pos == pos:
+            new_pos = find_new_sentence_right(seq, pos + pos_delta)
+            if new_pos is None:
+                break
+
+        pos = new_pos
+
+    return batch
+
+
+def add_padding(seq: np.ndarray, total_length: int, padding_symbol: any) -> np.ndarray:
+    """Pads a sequence with provided symbol, to get array of length total_length in the end
+
+    Args:
+        seq (np.ndarray): Input sequence
+        total_length (int): Desired length of a sequence
+        padding_symbol (any): Symbol that will be inserted at the end (total_length - len(seq)) times
+
+    Returns:
+        np.ndarray: N-dimensional array where first dimension is of length total_length
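+
+    Example (mirrors the unit tests):
+        add_padding(np.array([1, 2, 3, 4]), 6, 9)  # -> array([1, 2, 3, 4, 9, 9])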
+    """
+    num_padding = total_length - len(seq)
+    assert num_padding >= 0
+
+    if num_padding > 0:
+        return np.concatenate([seq, np.array([padding_symbol] * num_padding)], axis=0)
+    else:
+        return np.copy(seq)
+
+
+def add_begin_end_tokens(
+    seq: np.ndarray, begin_token: any, end_token: any
+) -> np.ndarray:
+    """Adds preceding and ending special tokens to the sequence
+
+    Args:
+        seq (np.ndarray): Sequence of len L
+        begin_token (any): Token that will be added at the beginning of the sequence
+        end_token (any): Token that will be added at the end of the sequence
+
+    Returns:
+        np.ndarray: Sequence of len L+2
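+
+    Example (mirrors the unit tests):
+        add_begin_end_tokens(np.array([1]), 9, 8)  # -> array([9, 1, 8])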
+    """
+
+    return np.concatenate([[begin_token], seq, [end_token]])
+
+
+def standarize_translation_sample(
+    seq: np.ndarray,
+    total_length: int,
+    padding_symbol: any,
+    begin_token: any,
+    end_token: any,
+) -> np.ndarray:
+    """Adds special tokens and padding so that every sample has identical shape
+
+    Args:
+        seq (np.ndarray): Input sequence of len L
+        total_length (int): Desired sequence length
+        padding_symbol (any): Token that will be used for padding
+        begin_token (any): Token that will be used as starting token
+        end_token (any): Token that will be used as ending token
+
+    Returns:
+        np.ndarray: Output sequence of length total_length
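+
+    Example (mirrors the unit tests):
+        standarize_translation_sample(np.array([1]), 5, 5, 9, 8)  # -> array([9, 1, 8, 5, 5])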
+    """
+    return add_padding(
+        add_begin_end_tokens(seq, begin_token, end_token), total_length, padding_symbol,
+    )
+
+
+def create_input_output(
+    tokens: np.ndarray, length: int, tokenizer: BertTokenizerFast
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Transforms a sequence of tokens into "translation" input and output
+
+    Args:
+        tokens (np.ndarray): Input sequence
+        length (int): Maximum output length. Will add padding to match it
+        tokenizer (BertTokenizerFast): Tokenizer that was used to obtain tokens
+
+    Returns:
+        np.ndarray: Single sample that will serve as input to the model
+        np.ndarray: Single sample that will serve as expected output from the model
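+
+    Example (illustrative, based on the unit tests): for the tokens of
+        "Ala, ma KoTa." the target is the original token sequence (with special
+        tokens and padding) and the input is the tokenization of "ala ma kota"
+        standardized to the same length.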
+    """
+    decoded_str = tokenizer.decode(tokens)
+    cleaned_str = remove_punctuation(decoded_str).lower()
+    source_batch_entry = tokenizer(cleaned_str)["input_ids"][1:-1]
+    target_batch_entry = tokens
+
+    # In rare cases (because of tokenizer encoding) the unpunctuated, lowercase input might be longer than the output and exceed the length limit
+    # We need to trim it in such cases
+    if len(source_batch_entry) > length - 2:
+        source_batch_entry = source_batch_entry[: (length - 2)]
+
+    source_batch_entry = standarize_translation_sample(
+        source_batch_entry,
+        length,
+        tokenizer.pad_token_id,
+        tokenizer.cls_token_id,
+        tokenizer.sep_token_id,
+    )
+    target_batch_entry = standarize_translation_sample(
+        target_batch_entry,
+        length,
+        tokenizer.pad_token_id,
+        tokenizer.cls_token_id,
+        tokenizer.sep_token_id,
+    )
+
+    return source_batch_entry, target_batch_entry
+
+
+def crete_input_output_batch(
+    seq: np.ndarray,
+    batch_indexes: [np.ndarray],
+    length: int,
+    tokenizer: BertTokenizerFast,
+) -> (np.ndarray, np.ndarray):
+    """Transforms a sequence of tokens into "translation" input and output batch
+
+    Args:
+        seq (np.ndarray): Input sequence of tokens
+        batch_indexes ([np.ndarray]): List where every entry is an array of indices representing a batch sample taken from seq
+        length (int): Maximum output length. Will add padding to match it
+        tokenizer (BertTokenizerFast): Tokenizer that was used to obtain tokens
+
+    Returns:
+        np.ndarray: Batch of samples that will serve as input to the model
+        np.ndarray: Batch of samples that will serve as expected output from the model
+    """
+    base_batch = [seq[indexes] for indexes in batch_indexes]
+
+    source_batch = []
+    target_batch = []
+    for entry in base_batch:
+        source_entry, target_entry = create_input_output(entry, length, tokenizer)
+
+        source_batch.append(source_entry)
+        target_batch.append(target_entry)
+
+    return np.array(source_batch), np.array(target_batch)
diff --git a/src/pipelines/translation_based/stage1_extraction.py b/src/pipelines/translation_based/stage1_extraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ffdbf7ac08cafb8db75a8b669b3ed55d6ce6fa9
--- /dev/null
+++ b/src/pipelines/translation_based/stage1_extraction.py
@@ -0,0 +1,45 @@
+#!/usr/bin/python3
+from glob import glob
+
+import dask.dataframe as dd
+import numpy as np
+import pandas as pd
+from dask.distributed import Client
+
+from src.pipelines.translation_based.processing import (
+    RAW_TO_DATAFRAME_META,
+    raw_to_dataframe,
+)
+from src.utils import PROJECT_ROOT, get_config, prepare_folder
+
+INPUT_FOLDER = f"{PROJECT_ROOT}/data"
+OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage1_extraction"
+
+if __name__ == "__main__":
+
+    config = get_config()
+    num_partitions = config["translations"]["extraction"]["num_partitions"]
+    num_workers = config["translations"]["extraction"]["num_workers"]
+    memory_limit = config["translations"]["extraction"]["worker_memory_limit"]
+
+    prepare_folder(OUTPUT_FOLDER)
+
+    file_schema = f"{INPUT_FOLDER}/**/text_structure.xml"
+    files_paths = glob(file_schema, recursive=True)
+
+    # Shuffle the file order so that Python memory fragmentation stays manageable
+    np.random.shuffle(files_paths)
+
+    client = Client(n_workers=num_workers, memory_limit=memory_limit)
+    print(f"Dashboard: {client.dashboard_link}")
+
+    # Processing pipeline
+    df = dd.from_pandas(pd.DataFrame({"file": files_paths}), npartitions=num_partitions)
+
+    df = df.apply(
+        raw_to_dataframe, result_type="expand", axis=1, meta=RAW_TO_DATAFRAME_META,
+    )
+    df = df.dropna()
+
+    # Export
+    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/translation_based/stage2_create_batches.py b/src/pipelines/translation_based/stage2_create_batches.py
new file mode 100644
index 0000000000000000000000000000000000000000..83a2edcc98599c870a0aa26e4afd74ae1e77fd77
--- /dev/null
+++ b/src/pipelines/translation_based/stage2_create_batches.py
@@ -0,0 +1,46 @@
+#!/usr/bin/python3
+import dask.dataframe as dd
+from dask import delayed
+from dask.distributed import Client
+from transformers import BertTokenizerFast
+
+from src.pipelines.translation_based.processing import (
+    GENERATE_BATCHES_META,
+    generate_batches,
+)
+from src.utils import PROJECT_ROOT, get_config, prepare_folder
+
+INPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage1_extraction"
+OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage2_create_batches"
+
+if __name__ == "__main__":
+
+    config = get_config()
+    num_workers = config["translations"]["create_batches"]["num_workers"]
+    memory_limit = config["translations"]["create_batches"]["worker_memory_limit"]
+    min_tokens = config["translations"]["create_batches"]["min_tokens"]
+    max_tokens = config["translations"]["create_batches"]["max_tokens"]
+    base_model = config["global"]["base_model"]
+
+    prepare_folder(OUTPUT_FOLDER)
+
+    client = Client(n_workers=num_workers, memory_limit=memory_limit)
+    print(f"Dashboard: {client.dashboard_link}")
+
+    tokenizer = BertTokenizerFast.from_pretrained(base_model)
+    tokenizer = delayed(tokenizer)
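+    # Wrapping the tokenizer in dask.delayed keeps it from being embedded repeatedly
+    # in the task graph (the generate_batches docstring notes this avoids a memory leak).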
+
+    token_separating = tokenizer(".")["input_ids"][1]
+
+    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
+    df = df.apply(
+        generate_batches,
+        result_type="expand",
+        axis=1,
+        meta=GENERATE_BATCHES_META,
+        args=(min_tokens, max_tokens, token_separating, tokenizer),
+    )
+    df = df.dropna()
+
+    # Export
+    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/translation_based/stage3_exploding.py b/src/pipelines/translation_based/stage3_exploding.py
new file mode 100644
index 0000000000000000000000000000000000000000..d969dd144166b30c83d801d064deae896e573cde
--- /dev/null
+++ b/src/pipelines/translation_based/stage3_exploding.py
@@ -0,0 +1,34 @@
+#!/usr/bin/python3
+import dask.dataframe as dd
+from dask.distributed import Client
+
+from src.processing import (
+    EXPAND_DIMS_META,
+    FLATTEN_DIMS_META,
+    expand_dims,
+    flatten_dims,
+)
+from src.utils import PROJECT_ROOT, get_config, prepare_folder
+
+INPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage2_create_batches"
+OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage3_exploding"
+
+if __name__ == "__main__":
+    config = get_config()
+    num_workers = config["translations"]["exploding"]["num_workers"]
+    memory_limit = config["translations"]["exploding"]["worker_memory_limit"]
+
+    prepare_folder(OUTPUT_FOLDER)
+
+    client = Client(n_workers=num_workers, memory_limit=memory_limit)
+    print(client.dashboard_link)
+
+    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
+
+    df = df.apply(expand_dims, result_type="expand", axis=1, meta=EXPAND_DIMS_META)
+    df = df.map_partitions(
+        lambda x: x.apply(lambda y: y.explode(), axis=0), meta=EXPAND_DIMS_META
+    )
+    df = df.apply(flatten_dims, result_type="expand", axis=1, meta=FLATTEN_DIMS_META)
+
+    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/translation_based/stage4_reindexing.py b/src/pipelines/translation_based/stage4_reindexing.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bbb541cbc735c5e45a0f92c79e7cf4ab11c682c
--- /dev/null
+++ b/src/pipelines/translation_based/stage4_reindexing.py
@@ -0,0 +1,37 @@
+#!/usr/bin/python3
+import dask.dataframe as dd
+from dask.distributed import Client
+
+from src.utils import PROJECT_ROOT, get_config, prepare_folder
+
+INPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage3_exploding"
+OUTPUT_FOLDER = f"{PROJECT_ROOT}/generated/translations/stage4_reindexing"
+
+if __name__ == "__main__":
+    config = get_config()
+    num_workers = config["translations"]["reindexing"]["num_workers"]
+    memory_limit = config["translations"]["reindexing"]["worker_memory_limit"]
+
+    prepare_folder(OUTPUT_FOLDER)
+
+    client = Client(n_workers=num_workers, memory_limit=memory_limit)
+    print(client.dashboard_link)
+
+    df = dd.read_parquet(INPUT_FOLDER, engine="pyarrow")
+
+    # Add ordered indexes
+    df = df.assign(ones=1)
+    df = df.reset_index(drop=True)
+    idx = (df.ones.cumsum() - 1).persist()
+    df = df.assign(ones=idx)
+
+    # Shuffle
+    shuffled_idx = idx.compute().values
+    shuffled_idx = client.scatter(shuffled_idx)
+    mapped_ones = df.ones.apply(
+        lambda x, idx: idx[x], args=(shuffled_idx,), meta=("ones", "int64")
+    ).persist()
+    df = df.assign(ones=mapped_ones)
+
+    df = df.set_index("ones")
+    df.to_parquet(OUTPUT_FOLDER, engine="pyarrow")
diff --git a/src/pipelines/translation_based/train.py b/src/pipelines/translation_based/train.py
new file mode 100755
index 0000000000000000000000000000000000000000..6e39eccdebf220bff2f60c72a440971fe41ef5b2
--- /dev/null
+++ b/src/pipelines/translation_based/train.py
@@ -0,0 +1,137 @@
+#!/usr/bin/python3
+
+import glob
+from datetime import datetime
+
+import dask.dataframe as dd
+import numpy as np
+import torch
+from transformers import BertTokenizerFast
+
+from src.batch_loading import get_batches, get_ordered_dataframe_len
+from src.models.TransformerSeq2Seq import TransformerSeq2Seq
+from src.training import latest_model, save_training_step
+from src.utils import PROJECT_ROOT, convert_to_timedelta, get_config, prepare_folder
+
+INPUT_PATH = f"{PROJECT_ROOT}/generated/translations/stage4_reindexing"
+OUTPUT_PATH = f"{PROJECT_ROOT}/checkpoints/translations"
+
+if __name__ == "__main__":
+    config = get_config()
+    learning_rate = config["translations"]["training"]["learning_rate"]
+    max_len = config["translations"]["create_batches"]["max_tokens"]
+    num_epochs = config["translations"]["training"]["num_epochs"]
+    batch_size = config["translations"]["training"]["batch_size"]
+    save_step = config["translations"]["training"]["save_step"]
+    loss_averaging_span = config["translations"]["training"]["loss_averaging_span"]
+    fresh_start = config["translations"]["training"]["fresh_start"]
+    device_name = config["translations"]["training"]["device"]
+    max_train_time = config["translations"]["training"]["max_training_time"]
+    base_model = config["global"]["base_model"]
+    seed = config["global"]["random_seed"]
+
+    prepare_folder(OUTPUT_PATH)
+    np.random.seed(seed=seed)
+
+    if max_train_time is not None:
+        max_train_time = convert_to_timedelta(max_train_time)
+
+    device = torch.device(device_name if torch.cuda.is_available() else "cpu")
+    print(f"Training on {device}")
+
+    df = dd.read_parquet(INPUT_PATH, engine="pyarrow")
+
+    tokenizer = BertTokenizerFast.from_pretrained(base_model)
+
+    model = TransformerSeq2Seq(tokenizer.vocab_size, 256, max_len, 4, 4, 4,).to(device)
+    criterion = torch.nn.CrossEntropyLoss(reduction="mean").to(device)
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+
+    epoch_start = 0
+    sample_start = 0
+    if fresh_start is False:
+        checkpoint_files = glob.glob(f"{OUTPUT_PATH}/*.model")
+        latest = latest_model(checkpoint_files)
+
+        if latest is not None:
+            epoch, batch = latest
+            model.load_state_dict(
+                torch.load(f"{OUTPUT_PATH}/{epoch}-{batch}.model", map_location=device,)
+            )
+            optimizer.load_state_dict(
+                torch.load(
+                    f"{OUTPUT_PATH}/{epoch}-{batch}.optimizer", map_location=device,
+                )
+            )
+
+            epoch_start, sample_start = epoch, batch
+            print(f"Loaded {epoch}-{batch}")
+
+    model.train()
+    model.base_model.train()
+    losses = []
+
+    num_samples = get_ordered_dataframe_len(df)
+    random_index_shuffle = np.random.permutation(range(num_samples))
+
+    training_stopped = False
+
+    time_max = datetime.max
+    if max_train_time is not None:
+        time_max = datetime.now() + max_train_time
+
+    for epoch in range(epoch_start, num_epochs):
+        if training_stopped:
+            break
+
+        i = sample_start
+        for data_batch in get_batches(df, batch_size, 100, random_index_shuffle, i):
+            inputs = data_batch.apply(
+                lambda x: x["source"].reshape(x["source_shape"]), axis=1
+            ).values
+            outputs = data_batch.apply(
+                lambda x: x["target"].reshape(x["target_shape"]), axis=1
+            ).values
+            attentions_mask = data_batch.apply(
+                lambda x: x["attention_mask"].reshape(x["attention_mask_shape"]),
+                axis=1,
+            ).values
+
+            inputs = torch.tensor(np.stack(inputs, axis=0), dtype=torch.long).to(device)
+            attentions_mask = torch.tensor(np.stack(attentions_mask, axis=0) == 0).to(
+                device
+            )
+            output_indices = torch.tensor(
+                np.stack(outputs, axis=0), dtype=torch.long
+            ).to(device)
+
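+            # Teacher forcing: the decoder sees the target shifted right by one
+            # (output_indices[:, :-1]) and the loss compares predictions against
+            # the target shifted left by one (output_indices[:, 1:]).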
+            y_pred = model(inputs, output_indices[:, :-1], attentions_mask)
+            y_pred = y_pred.transpose(1, 2)
+
+            loss = criterion(y_pred, output_indices[:, 1:])
+
+            losses.append(loss.item())
+            if len(losses) > loss_averaging_span:
+                losses = losses[-loss_averaging_span:]
+
+            print(f"epoch: {epoch} | step: {i} | loss: {np.mean(losses)}")
+
+            optimizer.zero_grad()
+
+            if i % save_step == 0 and (i != sample_start or epoch != epoch_start):
+                print(f"Saving: Epoch {epoch}, step {i}")
+                save_training_step(OUTPUT_PATH, f"{epoch}-{i}", model, optimizer)
+
+            if datetime.now() > time_max:
+                print(f"Max time reached, saving: Epoch {epoch}, step {i}")
+                save_training_step(OUTPUT_PATH, f"{epoch}-{i}", model, optimizer)
+                training_stopped = True
+                break
+
+            loss.backward()
+            optimizer.step()
+
+            i += 1
+
+    if not training_stopped:
+        save_training_step(OUTPUT_PATH, "final", model, optimizer)
diff --git a/src/processing.py b/src/processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..2777416c7a6693854f90bed05a282adcc0254ed4
--- /dev/null
+++ b/src/processing.py
@@ -0,0 +1,65 @@
+import numpy as np
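+
+# The two helpers below are inverses of each other: flatten_dims stores the original
+# shapes alongside flattened 1D arrays (parquet cells cannot hold multidimensional
+# arrays), and expand_dims uses those stored shapes to restore the arrays later.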
+
+
+def expand_dims(entry) -> dict:
+    """Reshapes flat source, target, mask arrays into corresponding shapes
+
+    Args:
+        entry (dict): Dask dataframe row with columns: source, target, attention_mask, source_shape, target_shape, attention_mask_shape
+
+    Returns:
+        dict: Dask dataframe row with columns: source, target, attention_mask
+    """
+    source = entry.source.reshape(entry.source_shape)
+    target = entry.target.reshape(entry.target_shape)
+    mask = entry.attention_mask.reshape(entry.attention_mask_shape)
+
+    return {
+        "source": source,
+        "target": target,
+        "attention_mask": mask,
+    }
+
+
+EXPAND_DIMS_META = {
+    "source": object,
+    "target": object,
+    "attention_mask": object,
+}
+
+
+def flatten_dims(entry: dict) -> dict:
+    """Flattens arrays in dataframe rows into 1D and saves shapes into separate columns
+
+    Args:
+        entry (dict): Dask dataframe row with columns: source, target, attention_mask
+
+    Returns:
+        dict: Dask dataframe row with columns: source, target, attention_mask, source_shape, target_shape, attention_mask_shape
+    """
+    source_shape = np.array(entry.source.shape)
+    target_shape = np.array(entry.target.shape)
+    mask_shape = np.array(entry.attention_mask.shape)
+
+    source = entry.source.reshape(-1)
+    target = entry.target.reshape(-1)
+    mask = entry.attention_mask.reshape(-1)
+
+    return {
+        "source": source,
+        "target": target,
+        "attention_mask": mask,
+        "source_shape": source_shape,
+        "target_shape": target_shape,
+        "attention_mask_shape": mask_shape,
+    }
+
+
+FLATTEN_DIMS_META = {
+    "source": object,
+    "target": object,
+    "attention_mask": object,
+    "source_shape": object,
+    "target_shape": object,
+    "attention_mask_shape": object,
+}
diff --git a/src/training.py b/src/training.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9ffd92e5702a91f59ed97ca4ac2e9c912824e13
--- /dev/null
+++ b/src/training.py
@@ -0,0 +1,66 @@
+import re
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+
+from src.utils import prepare_folder
+
+
+def latest_model(file_paths: [str]) -> Optional[Tuple[int, int]]:
+    """Finds newest model in directory
+
+    Args:
+        files ([str]): List of all file paths that will be considered. File extension is discarded
+                       File names must be in format epoch_num-batch_num.extension
+
+    Returns:
+        (int, int): Tuple of (latest_batch, latest_step) for latest model
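+
+    Example (mirrors tests/test_training.py):
+        latest_model(["/x/0-2000.model", "/x/1-1000.model", "/x/1-500.model"])  # -> (1, 1000)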
+    """
+
+    furthest_epoch = -1
+    furthest_batch_num = -1
+    for checkpoint_file in file_paths:
+        filename = checkpoint_file.split("/")[-1].split(".")[0]
+
+        result = re.search(r"^(\d+)-(\d+)$", filename)
+        if result is not None:
+            epoch, batch = [int(x) for x in result.groups()]
+
+            if epoch > furthest_epoch:
+                furthest_epoch = epoch
+                furthest_batch_num = batch
+            elif epoch == furthest_epoch:
+                furthest_batch_num = max(batch, furthest_batch_num)
+
+    if (furthest_epoch == -1) or (furthest_batch_num == -1):
+        return None
+
+    return furthest_epoch, furthest_batch_num
+
+
+def save_training_step(
+    dir: str,
+    name: str,
+    model: nn.Module,
+    optimizer: Optional[optim.Optimizer] = None,
+    create_dir: bool = False,
+) -> None:
+    """Saves a trainig step to a directory
+
+    Args:
+        dir (str): Directory where step will be saved
+        name (str): Name of the step (eg. "0-1000")
+        model (nn.Module): model that will be saved
+        optimizer (optim.Optimizer): optimizer that will be saved. Might be None
+    """
+    if create_dir:
+        prepare_folder(dir, wipe=False)
+
+    torch.save(model.state_dict(), f"{dir}/{name}.model")
+
+    if optimizer is not None:
+        torch.save(
+            optimizer.state_dict(), f"{dir}/{name}.optimizer",
+        )
diff --git a/src/utils.py b/src/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..906de65e72dbf1561ec8107d2a76cc2b77f0c922
--- /dev/null
+++ b/src/utils.py
@@ -0,0 +1,124 @@
+import os
+import re
+import shutil
+from datetime import timedelta
+from typing import Optional
+
+import yaml
+
+PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+
+
+def get_config() -> dict:
+    """Returns dict with config values
+
+    Returns:
+        dict: Dict with config values
+    """
+
+    with open(f"{PROJECT_ROOT}/params.yaml", "r") as file:
+        config = yaml.load(file, Loader=yaml.FullLoader)
+
+    return config
+
+
+def remove_multiple_spaces(text: str) -> str:
+    """Replaces multiple spaces by a single one
+
+    Args:
+        text (str): Text potentially containing multiple spaces
+
+    Returns:
+        str: Text with all multiple spaces replaced by one
+    """
+    return re.sub(r"\s\s+", " ", text)
+
+
+def remove_punctuation(text: str) -> str:
+    """Removes all non-alphanumeric characters from the text.
+    Might result in multiple spaces where characters like `-`
+    were used
+
+    Args:
+        text (str): Text containing punctuation
+
+    Returns:
+        str: Text with all punctuation removed
+    """
+
+    # Separating characters
+    text = text.replace("-", " ").replace("/", " ").replace("+", " ")
+
+    return "".join(filter(lambda x: x.isalnum() or x.isspace(), text))
+
+
+def preprocess(text: str) -> str:
+    """Makes sure that input is in the same format as training data (no non-alphanum chars, no double spaces,
+        all lowercase etc.)
+
+    Args:
+        text (str): Text to be processed
+
+    Returns:
+        str: Text in training-data format
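+
+    Example (mirrors tests/test_utils.py):
+        preprocess("Ala  ma-Kota!?.@@$ Kot ma Ale ()*")  # -> "ala ma kota kot ma ale"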
+    """
+    text = remove_punctuation(text)
+    text = remove_multiple_spaces(text)
+    text = text.lower()
+    text = text.strip()
+
+    return text
+
+
+def prepare_folder(path: str, wipe: bool = False) -> None:
+    """Function make sure that provided path exists. Can aditionaly
+    remove all files from the path.
+
+    Args:
+        path (str): Full directory path
+        wipe (bool): Whether to remove all files in the folder
+    """
+
+    if wipe:
+        # ignore_errors so that wiping a folder that does not exist yet is a no-op
+        shutil.rmtree(path, ignore_errors=True)
+
+    os.makedirs(path, exist_ok=True)
+
+
+def convert_to_timedelta(time_val: str) -> Optional[timedelta]:
+    """
+    src: https://code.activestate.com/recipes/577894-convert-strings-like-5d-and-60s-to-timedelta-objec/
+    Given a *time_val* (string) such as '5d', returns a timedelta object
+    representing the given value (e.g. timedelta(days=5)).
+
+    =========   ======= ===================
+    Character   Meaning Example
+    =========   ======= ===================
+    s           Seconds '60s' -> 60 Seconds
+    m           Minutes '5m'  -> 5 Minutes
+    h           Hours   '24h' -> 24 Hours
+    d           Days    '7d'  -> 7 Days
+    =========   ======= ===================
+
+    Examples::
+
+        >>> convert_to_timedelta('7d')
+        datetime.timedelta(7)
+        >>> convert_to_timedelta('24h')
+        datetime.timedelta(1)
+        >>> convert_to_timedelta('60m')
+        datetime.timedelta(0, 3600)
+        >>> convert_to_timedelta('120s')
+        datetime.timedelta(0, 120)
+    """
+    num = int(time_val[:-1])
+    if time_val.endswith("s"):
+        return timedelta(seconds=num)
+    elif time_val.endswith("m"):
+        return timedelta(minutes=num)
+    elif time_val.endswith("h"):
+        return timedelta(hours=num)
+    elif time_val.endswith("d"):
+        return timedelta(days=num)
+    else:
+        return None
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/pipelines/__init__.py b/tests/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/pipelines/actions_based/__init__.py b/tests/pipelines/actions_based/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/pipelines/actions_based/test_processing.py b/tests/pipelines/actions_based/test_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..c626ff25dcd894a7055b14dd8e3ecce45149573b
--- /dev/null
+++ b/tests/pipelines/actions_based/test_processing.py
@@ -0,0 +1,230 @@
+import numpy as np
+import pytest
+from transformers import BertTokenizerFast
+
+from src.pipelines.actions_based.processing import (
+    ACTIONS_KEYS,
+    action_vector,
+    batchify_data,
+    batchify_labels,
+    create_model_input_output,
+    decode_actions,
+    detect_actions,
+    encode_actions,
+    last_stop_label,
+    nearest_sentence_l,
+    nearest_sentence_r,
+    recover_text,
+    token_labels_to_word_labels,
+    token_word_mapping,
+    tokenize_labeled_text,
+)
+
+
+def test_detect_actions():
+    actions = detect_actions("Janek.", None)
+    assert actions == {
+        "dot": True,
+        "upper_case": True,
+        "colon": False,
+        "question_mark": False,
+    }
+
+    actions = detect_actions("ewka?", None)
+    assert actions == {
+        "dot": False,
+        "upper_case": False,
+        "colon": False,
+        "question_mark": True,
+    }
+
+    actions = detect_actions("Test", None)
+    assert actions == {
+        "dot": False,
+        "upper_case": True,
+        "colon": False,
+        "question_mark": False,
+    }
+
+
+def test_encode_actions():
+    x = {
+        "dot": True,
+        "upper_case": False,
+        "colon": False,
+        "question_mark": True,
+    }
+
+    assert np.all(encode_actions(x) == np.array([1, 0, 0, 1]))
+
+
+def test_decode_actions():
+    x = np.array([1, 0, 0, 1])
+
+    assert decode_actions(x) == {
+        "dot": True,
+        "upper_case": False,
+        "colon": False,
+        "question_mark": True,
+    }
+
+
+def test_token_word_mapping():
+    text = "janek poszedł do ogrodu"
+    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
+
+    text_tokenized = tokenizer(text)
+
+    mapping = token_word_mapping(text, tokenizer)
+
+    assert len(mapping) == (len(text_tokenized["input_ids"]) - 2)
+    assert min(mapping) == 0
+    assert max(mapping) == 3
+
+
+def test_token_labels_to_word_labels():
+    text = "janek poszedł do ogrodu"
+    labels = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
+
+    _, token_labels = tokenize_labeled_text(text, labels, tokenizer)
+
+    word_labels = token_labels_to_word_labels(text, token_labels, tokenizer)
+
+    assert np.all(np.vectorize(pytest.approx)(word_labels, labels))
+
+
+def test_tokenize_labeled_text():
+    text = "Janek poszedł do ogrodu. Ogród był zwierzęcy. Spotkał tam Zosię?"
+    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
+
+    text_clean, labels = create_model_input_output(text)
+    tokens, token_labels = tokenize_labeled_text(text_clean, labels, tokenizer)
+
+    assert len(tokens.shape) == 2
+    assert len(token_labels.shape) == 2
+
+    assert tokens.shape[1] == 1
+    assert token_labels.shape[1] == len(ACTIONS_KEYS)
+
+    assert len(tokens) == len(token_labels)
+    assert tokens[0, 0] != tokenizer.cls_token_id
+    assert tokens[-1, 0] != tokenizer.sep_token_id
+
+
+def test_recover_text():
+    text = "Janek poszedł do ogrodu. Ogród był zwierzęcy. Spotkał tam Zosię?"
+    text_clean, word_labels = create_model_input_output(text)
+
+    result_text = recover_text(text_clean, word_labels)
+
+    assert result_text == text
+
+
+def test_nearest_sentence_l():
+    end = create_dummy_action(True)
+    word = create_dummy_action(False)
+
+    entry = np.array([word, word, word, end, end, word, word, end])
+
+    assert nearest_sentence_l(entry, 3) == 0
+    assert nearest_sentence_l(entry, 4) == 0
+    assert nearest_sentence_l(entry, 5) == 5
+    assert nearest_sentence_l(entry, 7) == 5
+
+
+def create_dummy_action(end_sentence: bool) -> np.array:
+    return encode_actions(
+        {
+            "dot": end_sentence,
+            "upper_case": False,
+            "colon": False,
+            "question_mark": False,
+        }
+    )
+
+
+def test_nearest_sentence_r():
+    end = create_dummy_action(True)
+    word = create_dummy_action(False)
+
+    entry = np.array([word, word, word, end, end, word, word, end])
+
+    assert nearest_sentence_r(entry, 0) == 0
+    assert nearest_sentence_r(entry, 4) == 5
+    assert nearest_sentence_r(entry, 5) == 5
+    assert nearest_sentence_r(entry, 6) is None
+    assert nearest_sentence_r(entry, 7) is None
+
+
+def test_batchify_labels():
+    end = create_dummy_action(True)
+    word = create_dummy_action(False)
+    entry = np.array([word, word, word, end, end, word, word, end])
+
+    batches = batchify_labels(entry, 3, 1)
+
+    assert len(batches) == 2
+    assert np.all(batches[0] == range(0, 3))
+    assert np.all(batches[1] == range(5, 8))
+
+
+def test_batchify_data():
+    text = "Janek poszedł do ogrodu. Ogród był zwierzęcy. Spotkał tam niedzwiedzia?"
+    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
+
+    text_clean, labels = create_model_input_output(text)
+    tokens, token_labels = tokenize_labeled_text(text_clean, labels, tokenizer)
+
+    input_batch, output_batch, mask_batch = batchify_data(
+        tokens, token_labels, 5, tokenizer
+    )
+
+    assert len(input_batch.shape) == 3
+    assert len(output_batch.shape) == 3
+    assert len(mask_batch.shape) == 2
+
+    assert input_batch.shape[0] == mask_batch.shape[0]
+    assert input_batch.shape[0] > 1
+
+    # Second dimension should be sequence length
+    assert input_batch.shape[1] == 5
+    assert output_batch.shape[1] == 5
+    assert mask_batch.shape[1] == 5
+
+    # Third dimension should be feature size
+    assert input_batch.shape[2] == 1
+    assert output_batch.shape[2] == len(ACTIONS_KEYS)
+
+    # Mask should be integer (1 - leave, 0 - mask out)
+    assert mask_batch.dtype == np.int
+
+    # Should never be fully masked
+    # TODO: Make sure the correct convention is used
+    assert np.all(mask_batch[:, 0] == 1)
+
+    # Check every sample in the batch
+    for i in range(input_batch.shape[0]):
+        # Should always start from beginning of the sentence
+        assert decode_actions(output_batch[i, 0, :])["upper_case"]
+        assert decode_actions(output_batch[i, 1, :])["upper_case"]
+
+        # Should always end with sep and padding
+        # TODO: Test it
+
+
+def test_action_vector():
+    expected = encode_actions(
+        {"dot": True, "upper_case": True, "colon": False, "question_mark": False}
+    )
+
+    assert np.all(action_vector(["dot", "upper_case"]) == expected)
+
+
+def test_last_stop_label():
+    stop_action = action_vector(["Dot"])
+    not_stop_action = action_vector(["upper_case"])
+
+    labels = np.array([not_stop_action, not_stop_action, stop_action, not_stop_action])
+
+    assert last_stop_label(labels, stop_action) == 2
diff --git a/tests/pipelines/translation_based/__init__.py b/tests/pipelines/translation_based/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/pipelines/translation_based/test_processing.py b/tests/pipelines/translation_based/test_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7806a150e7f07372e4c928c33f5547b5efe3f31
--- /dev/null
+++ b/tests/pipelines/translation_based/test_processing.py
@@ -0,0 +1,185 @@
+import numpy as np
+from transformers import BertTokenizerFast
+
+from src.pipelines.translation_based.processing import (
+    add_begin_end_tokens,
+    add_padding,
+    create_input_output,
+    crete_input_output_batch,
+    find_new_sentence_left,
+    find_new_sentence_right,
+    get_batch_indexes,
+    standarize_translation_sample,
+)
+
+
+def test_find_new_sentence_left():
+    test_input = np.array([0, 0, 1, 0, 1, 0])
+    assert find_new_sentence_left(test_input, 0) == 0
+    assert find_new_sentence_left(test_input, 1) == 0
+    assert find_new_sentence_left(test_input, 2) == 0
+    assert find_new_sentence_left(test_input, 3) == 3
+    assert find_new_sentence_left(test_input, 4) == 3
+    assert find_new_sentence_left(test_input, 5) == 5
+
+
+def test_find_new_sentence_right():
+    test_input = np.array([0, 0, 1, 0, 1, 0, 0])
+    assert find_new_sentence_right(test_input, 0) == 3
+    assert find_new_sentence_right(test_input, 1) == 3
+    assert find_new_sentence_right(test_input, 2) == 3
+    assert find_new_sentence_right(test_input, 3) == 3
+    assert find_new_sentence_right(test_input, 4) == 5
+    assert find_new_sentence_right(test_input, 5) == 5
+    assert find_new_sentence_right(test_input, 6) is None
+
+
+def test_split_to_samples():
+    min_len = 3
+    max_len = 5
+    test_input = np.array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0])
+    expeted_output = [np.array([0, 1, 2, 3, 4]), np.array([6, 7, 8, 9, 10])]
+
+    result = get_batch_indexes(test_input, min_len, max_len)
+    assert len(result) == len(expeted_output)
+
+    for got, expected in zip(result, expeted_output):
+        assert np.all(got == expected)
+
+
+def test_add_padding():
+    input_sequence = np.array([1, 2, 3, 4])
+
+    # Works with 0 padding
+    result = add_padding(input_sequence, 4, 9)
+    assert len(result) == 4
+    assert np.all(result == input_sequence)
+
+    # Normal use case
+    result = add_padding(input_sequence, 6, 9)
+    assert len(result) == 6
+    assert np.all(result == [1, 2, 3, 4, 9, 9])
+
+    # multidimensional use-case
+    input_sequence = np.array([[1, 2, 3], [4, 5, 6]])
+    padd = np.array([9, 9, 9])
+    result = add_padding(input_sequence, 4, padd)
+    assert len(result) == 4
+    assert np.all(result == [[1, 2, 3], [4, 5, 6], [9, 9, 9], [9, 9, 9]])
+
+
+def test_add_begin_end_tokens():
+    input_sequence = np.array([1])
+    result = add_begin_end_tokens(input_sequence, 9, 8)
+
+    assert len(result) == 3
+    assert np.all(result == [9, 1, 8])
+
+
+def test_standarize_translation_sample():
+    input_sequence = np.array([1])
+
+    result = standarize_translation_sample(input_sequence, 5, 5, 9, 8)
+
+    assert len(result) == 5
+    assert np.all(result == [9, 1, 8, 5, 5])
+
+
+def test_create_input_output():
+    sequence = [56500, 117, 10824, 30186, 11090, 10113, 119]
+    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
+
+    expected_output_sequence = [
+        tokenizer.cls_token_id,
+        56500,
+        117,
+        10824,
+        30186,
+        11090,
+        10113,
+        119,
+        tokenizer.sep_token_id,
+        tokenizer.pad_token_id,
+        tokenizer.pad_token_id,
+    ]
+    expected_input_sequence = [
+        tokenizer.cls_token_id,
+        21739,
+        10824,
+        16469,
+        tokenizer.sep_token_id,
+        tokenizer.pad_token_id,
+        tokenizer.pad_token_id,
+        tokenizer.pad_token_id,
+        tokenizer.pad_token_id,
+        tokenizer.pad_token_id,
+        tokenizer.pad_token_id,
+    ]
+
+    result_input, result_output = create_input_output(sequence, 11, tokenizer)
+
+    assert len(result_input) == len(expected_input_sequence)
+    assert len(result_output) == len(expected_output_sequence)
+    assert np.all(expected_input_sequence == result_input)
+    assert np.all(expected_output_sequence == result_output)
+
+
+def test_create_input_output_batch():
+    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
+
+    expected_output_1 = np.array(tokenizer("Ala, ma KoTa.")["input_ids"])[1:-1]
+    expected_output_2 = np.array(tokenizer("A kOt nie!")["input_ids"])[1:-1]
+
+    expected_input_1 = np.array(tokenizer("ala ma kota")["input_ids"])[1:-1]
+    expected_input_2 = np.array(tokenizer("a kot nie")["input_ids"])[1:-1]
+
+    input_sequence = np.concatenate([expected_output_1, expected_output_2])
+    batch_ids = [
+        np.array(list(range(len(expected_output_1)))),
+        np.array(list(range(len(expected_output_2)))) + len(expected_output_1),
+    ]
+
+    expected_input_1 = standarize_translation_sample(
+        expected_input_1,
+        20,
+        tokenizer.pad_token_id,
+        tokenizer.cls_token_id,
+        tokenizer.sep_token_id,
+    )
+    expected_input_2 = standarize_translation_sample(
+        expected_input_2,
+        20,
+        tokenizer.pad_token_id,
+        tokenizer.cls_token_id,
+        tokenizer.sep_token_id,
+    )
+    expected_output_1 = standarize_translation_sample(
+        expected_output_1,
+        20,
+        tokenizer.pad_token_id,
+        tokenizer.cls_token_id,
+        tokenizer.sep_token_id,
+    )
+    expected_output_2 = standarize_translation_sample(
+        expected_output_2,
+        20,
+        tokenizer.pad_token_id,
+        tokenizer.cls_token_id,
+        tokenizer.sep_token_id,
+    )
+
+    result_input, result_output = crete_input_output_batch(
+        input_sequence, batch_ids, 20, tokenizer
+    )
+
+    assert result_input.shape[0] == 2
+    assert result_input.shape[1] == 20
+
+    assert result_output.shape[0] == 2
+    assert result_output.shape[1] == 20
+
+    assert np.all(result_input[0] == expected_input_1)
+    assert np.all(result_input[1] == expected_input_2)
+
+    assert np.all(result_output[0] == expected_output_1)
+    assert np.all(result_output[1] == expected_output_2)
diff --git a/tests/test_batch_loading.py b/tests/test_batch_loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..641aaa7652d71cd0110e66c12fe84c3b6fc0b937
--- /dev/null
+++ b/tests/test_batch_loading.py
@@ -0,0 +1,57 @@
+import dask.dataframe as dd
+import numpy as np
+import pandas as pd
+
+from src.batch_loading import (
+    calculate_batch_buffer_id,
+    get_batches,
+    get_ordered_dataframe_len,
+    yield_batch_buffer_span,
+)
+
+
+def test_calculate_batch_buffer_id():
+    # ids = [0, 1, 2, 3, 4, 5, 6]
+    assert calculate_batch_buffer_id(0, 3) == 0
+    assert calculate_batch_buffer_id(1, 3) == 0
+    assert calculate_batch_buffer_id(2, 3) == 0
+    assert calculate_batch_buffer_id(3, 3) == 1
+    assert calculate_batch_buffer_id(4, 3) == 1
+    assert calculate_batch_buffer_id(5, 3) == 1
+    assert calculate_batch_buffer_id(6, 3) == 2
+
+
+def test_yield_batch_buffer_span():
+    ids = [0, 1, 2, 3, 4, 5, 6]
+
+    result = list(yield_batch_buffer_span(2, 2, len(ids)))
+
+    assert np.all(result[0] == [0, 1, 2, 3])
+    assert np.all(result[1] == [4, 5, 6])
+
+
+def test_get_ordered_dataframe_len():
+    df = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7]})
+
+    assert get_ordered_dataframe_len(df) == 7
+
+
+def test_get_batches():
+    batch_size = 2
+    batch_buffer_len = 2
+    pdf = pd.DataFrame({"a": [1, 0, 2, 3, 4, 5, 6]})
+    shuffled_ids = np.array([1, 0, 2, 3, 4, 5, 6])
+    df = dd.from_pandas(pdf, npartitions=2)
+
+    batches = list(get_batches(df, batch_size, batch_buffer_len, shuffled_ids))
+
+    assert np.all(batches[0]["a"].values == [0, 1])
+    assert np.all(batches[1]["a"].values == [2, 3])
+    assert np.all(batches[2]["a"].values == [4, 5])
+    assert np.all(batches[3]["a"].values == [6])
+
+    batches = list(get_batches(df, batch_size, batch_buffer_len, shuffled_ids, 1))
+
+    assert np.all(batches[1]["a"].values == [2, 3])
+    assert np.all(batches[2]["a"].values == [4, 5])
+    assert np.all(batches[3]["a"].values == [6])
diff --git a/tests/test_training.py b/tests/test_training.py
new file mode 100644
index 0000000000000000000000000000000000000000..2aa5d6a2b70c9b197bb72b2438b61c39620a3c49
--- /dev/null
+++ b/tests/test_training.py
@@ -0,0 +1,21 @@
+from src.training import latest_model
+
+
+def test_latest_model():
+    files = []
+    assert latest_model(files) is None
+
+    files.append("/path/tam/pam/Wrongformat.b")
+    assert latest_model(files) is None
+
+    files.append("/path/tam/pam/0-2000.b")
+    assert latest_model(files) == (0, 2000)
+
+    files.append("/path/tam/pam/0-3000.c")
+    assert latest_model(files) == (0, 3000)
+
+    files.append("/path/tam/pam/1-1000.a")
+    assert latest_model(files) == (1, 1000)
+
+    files.append("/path/tam/pam/1-500.a")
+    assert latest_model(files) == (1, 1000)
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9887354d0d1a728744d00a1a4b921a36877e110f
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,40 @@
+from src.utils import (
+    convert_to_timedelta,
+    preprocess,
+    remove_multiple_spaces,
+    remove_punctuation,
+)
+
+
+def test_remove_multiple_spaces():
+    provided = "Ala   ma Kota.      Kot ma Ale "
+    expected = "Ala ma Kota. Kot ma Ale "
+
+    assert remove_multiple_spaces(provided) == expected
+
+
+def test_remove_punctuation():
+    provided = "Ala..  ma-Kota!?.@@$ Kot ma Ale ()*"
+    expected = "Ala  ma Kota Kot ma Ale "
+
+    assert remove_punctuation(provided) == expected
+
+
+def test_preprocess():
+    provided = "Ala  ma-Kota!?.@@$ Kot ma Ale ()*"
+    expected = "ala ma kota kot ma ale"
+
+    assert preprocess(provided) == expected
+
+
+def test_convert_to_timedelta():
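+    # supported suffixes: d = days, h = hours, m = minutes, s = seconds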
+    assert convert_to_timedelta("5d").days == 5
+    assert convert_to_timedelta("5d").seconds == 0
+    assert convert_to_timedelta("5d").microseconds == 0
+
+    assert convert_to_timedelta("4h").days == 0
+    assert convert_to_timedelta("4h").seconds == 4 * 60 * 60
+    assert convert_to_timedelta("4h").microseconds == 0
+
+    assert convert_to_timedelta("3m").days == 0
+    assert convert_to_timedelta("3m").seconds == 3 * 60
+    assert convert_to_timedelta("3m").microseconds == 0
+
+    assert convert_to_timedelta("2s").days == 0
+    assert convert_to_timedelta("2s").seconds == 2
+    assert convert_to_timedelta("2s").microseconds == 0
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000000000000000000000000000000000000..4326963241d376dcdec8a3b163a80412e667ab9b
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,50 @@
+[tox]
+envlist = unittest,pep8
+skipsdist = True
+
+[testenv]
+deps =
+    pytest
+    numpy
+    pyyaml
+    pandas
+    tqdm
+    torch
+    dask[complete]
+    transformers
+    pyarrow==0.17.1
+    lxml
+
+[testenv:unittest]
+commands = pytest --ignore data --ignore generated
+
+[flake8]
+exclude =
+    .tox,
+    .git,
+    __pycache__,
+    docs/source/conf.py,
+    build,
+    dist,
+    tests/fixtures/*,
+    *.pyc,
+    *.egg-info,
+    .cache,
+    .eggs,
+    data,
+    generated
+max-complexity = 10
+max-line-length = 80
+select = I,C,E,F,W,B,B950,TYP,T
+ignore = E501, C901, I201
+
+
+[testenv:pep8]
+deps =
+    flake8
+    flake8-type-annotations
+    flake8-typing-imports
+basepython = python
+commands =
+    flake8 {posargs}
+
diff --git a/worker.py b/worker.py
new file mode 100755
index 0000000000000000000000000000000000000000..5bf6e0c88a423ec27cb04f0728b5eeeac71406e6
--- /dev/null
+++ b/worker.py
@@ -0,0 +1,41 @@
+#!/usr/bin/python
+
+import configparser
+
+import nlp_ws
+
+from src.pipelines.actions_based.processing import apply_actions_punctuation
+from src.pipelines.actions_based.utils import load_model
+from src.utils import preprocess
+
+
+class Worker(nlp_ws.NLPWorker):
+    """Class that implements example worker."""
+
+    def init(self):
+        self.config = configparser.ConfigParser()
+        self.config.read("config.ini")
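+        # Expected config.ini layout (the values below are only illustrative placeholders):
+        #   [deployment]
+        #   model = <path to trained checkpoint>
+        #   base_model = <name of the pretrained base model>
+        #   device = cpu
+        #   chunk_size = 500
+        #   threshold = 0.9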
+
+        # configparser returns raw strings, so cast the numeric options explicitly
+        self.threshold = float(self.config["deployment"]["threshold"])
+        self.chunk_size = int(self.config["deployment"]["chunk_size"])
+        self.tokenizer, self.model = load_model(
+            self.config["deployment"]["model"],
+            self.config["deployment"]["base_model"],
+            self.config["deployment"]["device"],
+        )
+
+    def process(self, input_file: str, task_options: dict, output_file: str) -> None:
+        """Implementation of example tasks that copies files."""
+
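+        # the raw text is normalized first (punctuation stripped, lowercased); the model then
+        # re-inserts punctuation, processing the text in chunks controlled by chunk_size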
+        with open(input_file, "r") as f:
+            text = preprocess(f.read())
+            text_processed = apply_actions_punctuation(
+                text, self.chunk_size, self.tokenizer, self.model, self.threshold
+            )
+
+        with open(output_file, "w") as f:
+            f.write(text_processed)
+
+
+if __name__ == "__main__":
+    nlp_ws.NLPService.main(Worker)