From 2b98e2b1cd735a066a436b2c5c5d661b7871d1e2 Mon Sep 17 00:00:00 2001
From: Jarema Radom <jaremaradom@gmail.com>
Date: Tue, 13 Jul 2021 13:44:09 +0200
Subject: [PATCH 1/6] Fixes described in merge request

---
 Dockerfile.worker | 2 +-
 worker.py         | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/Dockerfile.worker b/Dockerfile.worker
index 54cbf34..abaecf3 100644
--- a/Dockerfile.worker
+++ b/Dockerfile.worker
@@ -1,4 +1,4 @@
-FROM clarinpl/cuda-python:3.7 AS base
+FROM clarinpl/cuda-python:3.7
 
 RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y gcc python3-dev
 
diff --git a/worker.py b/worker.py
index 4275942..c6ad6b9 100644
--- a/worker.py
+++ b/worker.py
@@ -89,9 +89,8 @@ class Worker(nlp_ws.NLPWorker):
                 input_ids=tokenized["input_ids"][:, inference_mask].to(
                     self.device
                 ),
-                attention_mask=tokenized["attention_mask"][
-                    :, inference_mask
-                ].to(self.device),
+                attention_mask=tokenized["attention_mask"][:, inference_mask]
+                .to(self.device),
             )
             labels_ids = (
                 result.logits.detach()
-- 
GitLab


From 39b3f106b83bf94af6ce843c7fd82eb5e3e3d47d Mon Sep 17 00:00:00 2001
From: Jarema Radom <jaremaradom@gmail.com>
Date: Wed, 28 Jul 2021 16:20:53 +0200
Subject: [PATCH 2/6] fix for bpe related decoding

---
 punctuator/punctuator.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/punctuator/punctuator.py b/punctuator/punctuator.py
index 1bea097..72041e0 100644
--- a/punctuator/punctuator.py
+++ b/punctuator/punctuator.py
@@ -35,6 +35,8 @@ def decode(tokens, labels_decoded, tokenizer, bpe=False):
     for label, token in zip(labels_decoded, tokens):
         if bpe:
             token_str = tokenizer.decode(token)
+            if token_str.startswith(" "):
+                token_str = token_str[1:]
         else:
             token_str = tokenizer.convert_ids_to_tokens([token])[0]
         if token_str == "[PAD]":
@@ -43,8 +45,7 @@ def decode(tokens, labels_decoded, tokenizer, bpe=False):
             word.append(token_str.replace("##", ""))
         else:
             if len(word) > 0:
-                if not bpe or word_end != ' ':
-                    word.append(word_end)
+                word.append(word_end)
                 text_recovered.append("".join(word))
                 word = []
             if label.startswith("__ALL_UPPER__"):
-- 
GitLab


From a04db9cfbc73f58c306b6f35024b2262916011a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl>
Date: Wed, 13 Oct 2021 17:06:47 +0200
Subject: [PATCH 3/6] Adding s3 synchronization

- Changed manual model downloading to s3 sync
- Added an option to run the docker image in a test version, to make debugging easier
---
 .dockerignore                        |   2 +
 .dvc/.gitignore                      |   2 +
 .dvc/config                          |   0
 .dvc/plots/confusion.json            | 107 ++++++++++
 .dvc/plots/confusion_normalized.json | 112 +++++++++++
 .dvc/plots/default.json              |  31 +++
 .dvc/plots/linear.json               | 116 +++++++++++
 .dvc/plots/scatter.json              | 104 ++++++++++
 .dvc/plots/smooth.json               |  39 ++++
 .dvcignore                           |   3 +
 .gitignore                           |   3 +-
 Dockerfile.worker => Dockerfile      |   0
 README.md                            |  30 ++-
 config.ini                           |   8 +-
 entrypoint.sh                        |  39 +---
 example_texts/expected/en.txt        |   1 +
 example_texts/expected/pl.txt        |   1 +
 example_texts/expected/ru.txt        |   1 +
 example_texts/input/en.txt           |   1 +
 example_texts/input/pl.txt           |   1 +
 example_texts/input/ru.txt           |   1 +
 example_texts/output/.gitignore      |   1 +
 models/.gitignore                    |   3 +
 models/en.dvc                        |  12 ++
 models/pl.dvc                        |   5 +
 models/ru.dvc                        |  12 ++
 punctuator/punctuator.py             | 285 +++++++++++----------------
 punctuator/utils.py                  | 190 ++++++++++++++++++
 requirements.txt                     |   3 +-
 sync_to_s3.sh                        |   5 +
 worker.py                            | 152 +++-----------
 31 files changed, 936 insertions(+), 334 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvc/plots/confusion.json
 create mode 100644 .dvc/plots/confusion_normalized.json
 create mode 100644 .dvc/plots/default.json
 create mode 100644 .dvc/plots/linear.json
 create mode 100644 .dvc/plots/scatter.json
 create mode 100644 .dvc/plots/smooth.json
 create mode 100644 .dvcignore
 rename Dockerfile.worker => Dockerfile (100%)
 create mode 100644 example_texts/expected/en.txt
 create mode 100644 example_texts/expected/pl.txt
 create mode 100644 example_texts/expected/ru.txt
 create mode 100644 example_texts/input/en.txt
 create mode 100644 example_texts/input/pl.txt
 create mode 100644 example_texts/input/ru.txt
 create mode 100644 example_texts/output/.gitignore
 create mode 100644 models/.gitignore
 create mode 100644 models/en.dvc
 create mode 100644 models/pl.dvc
 create mode 100644 models/ru.dvc
 create mode 100644 punctuator/utils.py
 create mode 100755 sync_to_s3.sh

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..73cf2f1
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,2 @@
+models
+example_texts
\ No newline at end of file
diff --git a/.dvc/.gitignore b/.dvc/.gitignore
new file mode 100644
index 0000000..5ecbd4c
--- /dev/null
+++ b/.dvc/.gitignore
@@ -0,0 +1,2 @@
+/config.local
+/tmp
diff --git a/.dvc/config b/.dvc/config
new file mode 100644
index 0000000..e69de29
diff --git a/.dvc/plots/confusion.json b/.dvc/plots/confusion.json
new file mode 100644
index 0000000..af1b48d
--- /dev/null
+++ b/.dvc/plots/confusion.json
@@ -0,0 +1,107 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "facet": {
+        "field": "rev",
+        "type": "nominal"
+    },
+    "spec": {
+        "transform": [
+            {
+                "aggregate": [
+                    {
+                        "op": "count",
+                        "as": "xy_count"
+                    }
+                ],
+                "groupby": [
+                    "<DVC_METRIC_Y>",
+                    "<DVC_METRIC_X>"
+                ]
+            },
+            {
+                "impute": "xy_count",
+                "groupby": [
+                    "rev",
+                    "<DVC_METRIC_Y>"
+                ],
+                "key": "<DVC_METRIC_X>",
+                "value": 0
+            },
+            {
+                "impute": "xy_count",
+                "groupby": [
+                    "rev",
+                    "<DVC_METRIC_X>"
+                ],
+                "key": "<DVC_METRIC_Y>",
+                "value": 0
+            },
+            {
+                "joinaggregate": [
+                    {
+                        "op": "max",
+                        "field": "xy_count",
+                        "as": "max_count"
+                    }
+                ],
+                "groupby": []
+            },
+            {
+                "calculate": "datum.xy_count / datum.max_count",
+                "as": "percent_of_max"
+            }
+        ],
+        "encoding": {
+            "x": {
+                "field": "<DVC_METRIC_X>",
+                "type": "nominal",
+                "sort": "ascending",
+                "title": "<DVC_METRIC_X_LABEL>"
+            },
+            "y": {
+                "field": "<DVC_METRIC_Y>",
+                "type": "nominal",
+                "sort": "ascending",
+                "title": "<DVC_METRIC_Y_LABEL>"
+            }
+        },
+        "layer": [
+            {
+                "mark": "rect",
+                "width": 300,
+                "height": 300,
+                "encoding": {
+                    "color": {
+                        "field": "xy_count",
+                        "type": "quantitative",
+                        "title": "",
+                        "scale": {
+                            "domainMin": 0,
+                            "nice": true
+                        }
+                    }
+                }
+            },
+            {
+                "mark": "text",
+                "encoding": {
+                    "text": {
+                        "field": "xy_count",
+                        "type": "quantitative"
+                    },
+                    "color": {
+                        "condition": {
+                            "test": "datum.percent_of_max > 0.5",
+                            "value": "white"
+                        },
+                        "value": "black"
+                    }
+                }
+            }
+        ]
+    }
+}
diff --git a/.dvc/plots/confusion_normalized.json b/.dvc/plots/confusion_normalized.json
new file mode 100644
index 0000000..1d38849
--- /dev/null
+++ b/.dvc/plots/confusion_normalized.json
@@ -0,0 +1,112 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "facet": {
+        "field": "rev",
+        "type": "nominal"
+    },
+    "spec": {
+        "transform": [
+            {
+                "aggregate": [
+                    {
+                        "op": "count",
+                        "as": "xy_count"
+                    }
+                ],
+                "groupby": [
+                    "<DVC_METRIC_Y>",
+                    "<DVC_METRIC_X>"
+                ]
+            },
+            {
+                "impute": "xy_count",
+                "groupby": [
+                    "rev",
+                    "<DVC_METRIC_Y>"
+                ],
+                "key": "<DVC_METRIC_X>",
+                "value": 0
+            },
+            {
+                "impute": "xy_count",
+                "groupby": [
+                    "rev",
+                    "<DVC_METRIC_X>"
+                ],
+                "key": "<DVC_METRIC_Y>",
+                "value": 0
+            },
+            {
+                "joinaggregate": [
+                    {
+                        "op": "sum",
+                        "field": "xy_count",
+                        "as": "sum_y"
+                    }
+                ],
+                "groupby": [
+                    "<DVC_METRIC_Y>"
+                ]
+            },
+            {
+                "calculate": "datum.xy_count / datum.sum_y",
+                "as": "percent_of_y"
+            }
+        ],
+        "encoding": {
+            "x": {
+                "field": "<DVC_METRIC_X>",
+                "type": "nominal",
+                "sort": "ascending",
+                "title": "<DVC_METRIC_X_LABEL>"
+            },
+            "y": {
+                "field": "<DVC_METRIC_Y>",
+                "type": "nominal",
+                "sort": "ascending",
+                "title": "<DVC_METRIC_Y_LABEL>"
+            }
+        },
+        "layer": [
+            {
+                "mark": "rect",
+                "width": 300,
+                "height": 300,
+                "encoding": {
+                    "color": {
+                        "field": "percent_of_y",
+                        "type": "quantitative",
+                        "title": "",
+                        "scale": {
+                            "domain": [
+                                0,
+                                1
+                            ]
+                        }
+                    }
+                }
+            },
+            {
+                "mark": "text",
+                "encoding": {
+                    "text": {
+                        "field": "percent_of_y",
+                        "type": "quantitative",
+                        "format": ".2f"
+                    },
+                    "color": {
+                        "condition": {
+                            "test": "datum.percent_of_y > 0.5",
+                            "value": "white"
+                        },
+                        "value": "black"
+                    }
+                }
+            }
+        ]
+    }
+}
diff --git a/.dvc/plots/default.json b/.dvc/plots/default.json
new file mode 100644
index 0000000..9cf71ce
--- /dev/null
+++ b/.dvc/plots/default.json
@@ -0,0 +1,31 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "width": 300,
+    "height": 300,
+    "mark": {
+        "type": "line"
+    },
+    "encoding": {
+        "x": {
+            "field": "<DVC_METRIC_X>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_X_LABEL>"
+        },
+        "y": {
+            "field": "<DVC_METRIC_Y>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_Y_LABEL>",
+            "scale": {
+                "zero": false
+            }
+        },
+        "color": {
+            "field": "rev",
+            "type": "nominal"
+        }
+    }
+}
diff --git a/.dvc/plots/linear.json b/.dvc/plots/linear.json
new file mode 100644
index 0000000..65549f9
--- /dev/null
+++ b/.dvc/plots/linear.json
@@ -0,0 +1,116 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "width": 300,
+    "height": 300,
+    "layer": [
+        {
+            "encoding": {
+                "x": {
+                    "field": "<DVC_METRIC_X>",
+                    "type": "quantitative",
+                    "title": "<DVC_METRIC_X_LABEL>"
+                },
+                "y": {
+                    "field": "<DVC_METRIC_Y>",
+                    "type": "quantitative",
+                    "title": "<DVC_METRIC_Y_LABEL>",
+                    "scale": {
+                        "zero": false
+                    }
+                },
+                "color": {
+                    "field": "rev",
+                    "type": "nominal"
+                }
+            },
+            "layer": [
+                {
+                    "mark": "line"
+                },
+                {
+                    "selection": {
+                        "label": {
+                            "type": "single",
+                            "nearest": true,
+                            "on": "mouseover",
+                            "encodings": [
+                                "x"
+                            ],
+                            "empty": "none",
+                            "clear": "mouseout"
+                        }
+                    },
+                    "mark": "point",
+                    "encoding": {
+                        "opacity": {
+                            "condition": {
+                                "selection": "label",
+                                "value": 1
+                            },
+                            "value": 0
+                        }
+                    }
+                }
+            ]
+        },
+        {
+            "transform": [
+                {
+                    "filter": {
+                        "selection": "label"
+                    }
+                }
+            ],
+            "layer": [
+                {
+                    "mark": {
+                        "type": "rule",
+                        "color": "gray"
+                    },
+                    "encoding": {
+                        "x": {
+                            "field": "<DVC_METRIC_X>",
+                            "type": "quantitative"
+                        }
+                    }
+                },
+                {
+                    "encoding": {
+                        "text": {
+                            "type": "quantitative",
+                            "field": "<DVC_METRIC_Y>"
+                        },
+                        "x": {
+                            "field": "<DVC_METRIC_X>",
+                            "type": "quantitative"
+                        },
+                        "y": {
+                            "field": "<DVC_METRIC_Y>",
+                            "type": "quantitative"
+                        }
+                    },
+                    "layer": [
+                        {
+                            "mark": {
+                                "type": "text",
+                                "align": "left",
+                                "dx": 5,
+                                "dy": -5
+                            },
+                            "encoding": {
+                                "color": {
+                                    "type": "nominal",
+                                    "field": "rev"
+                                }
+                            }
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
diff --git a/.dvc/plots/scatter.json b/.dvc/plots/scatter.json
new file mode 100644
index 0000000..9af9304
--- /dev/null
+++ b/.dvc/plots/scatter.json
@@ -0,0 +1,104 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "width": 300,
+    "height": 300,
+    "layer": [
+        {
+            "encoding": {
+                "x": {
+                    "field": "<DVC_METRIC_X>",
+                    "type": "quantitative",
+                    "title": "<DVC_METRIC_X_LABEL>"
+                },
+                "y": {
+                    "field": "<DVC_METRIC_Y>",
+                    "type": "quantitative",
+                    "title": "<DVC_METRIC_Y_LABEL>",
+                    "scale": {
+                        "zero": false
+                    }
+                },
+                "color": {
+                    "field": "rev",
+                    "type": "nominal"
+                }
+            },
+            "layer": [
+                {
+                    "mark": "point"
+                },
+                {
+                    "selection": {
+                        "label": {
+                            "type": "single",
+                            "nearest": true,
+                            "on": "mouseover",
+                            "encodings": [
+                                "x"
+                            ],
+                            "empty": "none",
+                            "clear": "mouseout"
+                        }
+                    },
+                    "mark": "point",
+                    "encoding": {
+                        "opacity": {
+                            "condition": {
+                                "selection": "label",
+                                "value": 1
+                            },
+                            "value": 0
+                        }
+                    }
+                }
+            ]
+        },
+        {
+            "transform": [
+                {
+                    "filter": {
+                        "selection": "label"
+                    }
+                }
+            ],
+            "layer": [
+                {
+                    "encoding": {
+                        "text": {
+                            "type": "quantitative",
+                            "field": "<DVC_METRIC_Y>"
+                        },
+                        "x": {
+                            "field": "<DVC_METRIC_X>",
+                            "type": "quantitative"
+                        },
+                        "y": {
+                            "field": "<DVC_METRIC_Y>",
+                            "type": "quantitative"
+                        }
+                    },
+                    "layer": [
+                        {
+                            "mark": {
+                                "type": "text",
+                                "align": "left",
+                                "dx": 5,
+                                "dy": -5
+                            },
+                            "encoding": {
+                                "color": {
+                                    "type": "nominal",
+                                    "field": "rev"
+                                }
+                            }
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
diff --git a/.dvc/plots/smooth.json b/.dvc/plots/smooth.json
new file mode 100644
index 0000000..d497ce7
--- /dev/null
+++ b/.dvc/plots/smooth.json
@@ -0,0 +1,39 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "mark": {
+        "type": "line"
+    },
+    "encoding": {
+        "x": {
+            "field": "<DVC_METRIC_X>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_X_LABEL>"
+        },
+        "y": {
+            "field": "<DVC_METRIC_Y>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_Y_LABEL>",
+            "scale": {
+                "zero": false
+            }
+        },
+        "color": {
+            "field": "rev",
+            "type": "nominal"
+        }
+    },
+    "transform": [
+        {
+            "loess": "<DVC_METRIC_Y>",
+            "on": "<DVC_METRIC_X>",
+            "groupby": [
+                "rev"
+            ],
+            "bandwidth": 0.3
+        }
+    ]
+}
diff --git a/.dvcignore b/.dvcignore
new file mode 100644
index 0000000..5197305
--- /dev/null
+++ b/.dvcignore
@@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore
diff --git a/.gitignore b/.gitignore
index 007a515..060439b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@
 /config.test.ini
 /wandb
 __pycache__
-/notebook.ipynb
\ No newline at end of file
+/notebook.ipynb
+/en
diff --git a/Dockerfile.worker b/Dockerfile
similarity index 100%
rename from Dockerfile.worker
rename to Dockerfile
diff --git a/README.md b/README.md
index 4e5e9af..28c0174 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # Punctuator
-A service that automatically adds punctuation to raw word-stream (eg. from speech2text) for polish, russian and english language. 
+A service that automatically adds punctuation and casing to a raw word stream (e.g. from speech2text) for the Polish, Russian and English languages.
 
 **Example input**:
 > według webometrycznego rankingu uniwersytetów świata ze stycznia 2019 pokazującego zaangażowanie instytucji akademickich w internecie uczelnia zajmuje 5 miejsce w polsce wśród uczelni technicznych a na świecie 964 wśród wszystkich typów uczelni w rankingu szkół wyższych perspektyw politechnika wrocławska zajęła w 2019 roku 3 miejsce wśród uczelni technicznych oraz 6 miejsce spośród wszystkich uczelni akademickich w polsce
@@ -11,8 +11,8 @@ A service that automatically adds punctuation to raw word-stream (eg. from speec
 ```ini
 [deployment]
 model_path_pl = /home/worker/model/punctuator_pl ; Path where the polish model is located
-model_path_en = /home/worker/model/punctuator_en ; Path where the polish model is located
-model_path_ru = /home/worker/model/punctuator_ru ; Path where the polish model is located
+model_path_en = /home/worker/model/punctuator_en ; Path where the english model is located
+model_path_ru = /home/worker/model/punctuator_ru ; Path where the russian model is located
 languagetool_path = /home/worker/model/languagetool ; Path where languagetool server will be placed
 max_context_size = 256 ; Number of tokens that will be oonsidered in prediciton at once. Must be between in range 2*overlap+1 to 512
 overlap = 20 ; The number of tokens from the environment that will be taken at inference for a text fragment
@@ -22,8 +22,28 @@ device = cpu ; Device on which inference will be made (eg. cpu, cuda:0 etc)
 ## LPMN
 Punctuator have one optional argument `language` with options: `pl` `ru` `en` (defaults to pl):
 ```
-filedir(/users/michal.pogoda)|any2txt|punctuator({"language":"en"})
+filedir(/users/michal.pogoda)|any2txt|punctuator({"language":"pl"})
 ```
 
 ## Mountpoints
-Directory where the model will be downloaded (~500Mb) needs to be mounted at `/home/worker/model/punctuator`. Mount `/home/worker/model` into directory if you want to make it persitent 
+If you want to cache the models, mount a directory at `/home/worker/models/`. Also, a samba share needs to be mounted at `/samba`
+
+## Pushing models into production
+Production models should be placed in the `models` directory. Then, to push them into production, all you need to do is run the `sync_to_s3.sh` script
+```bash
+./sync_to_s3.sh
+```
+
+## Testing
+There are example texts in the `example_texts` directory. You can run the punctuator to predict on those texts and see how well it works for all available languages.
+```bash
+docker build . -t punctuator
+```
+
+```bash
+docker run -it \
+    -e PUNCTUATOR_TEST=TRUE \
+    -v $(pwd)/example_texts:/samba \
+    -v $(pwd)/models:/home/worker/models/punctuator \
+    punctuator
+```
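+The resulting files can then be compared with the reference outputs in `example_texts/expected`.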
\ No newline at end of file
diff --git a/config.ini b/config.ini
index ee8967b..d8eb44b 100644
--- a/config.ini
+++ b/config.ini
@@ -13,10 +13,10 @@ port = 9981
 local_log_level = INFO
 
 [deployment]
-model_path_pl = /home/worker/model/punctuator_pl
-model_path_en = /home/worker/model/punctuator_en
-model_path_ru = /home/worker/model/punctuator_ru
-languagetool_path = /home/worker/model/languagetool
+model_path_pl = /home/worker/models/punctuator/pl
+model_path_en = /home/worker/models/punctuator/en
+model_path_ru = /home/worker/models/punctuator/ru
+languagetool_path = /home/worker/models/languagetool
 max_context_size = 256
 overlap = 20
 device = cpu
\ No newline at end of file
diff --git a/entrypoint.sh b/entrypoint.sh
index 7863788..91b2c18 100644
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -1,33 +1,10 @@
 #!/bin/bash
 
-if ! test -d "/home/worker/model/punctuator_pl"; then
-    mkdir -p /home/worker/model/punctuator_pl
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_pl/pytorch_model.bin -O /home/worker/model/punctuator_pl/pytorch_model.bin
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_pl/vocab.txt -O /home/worker/model/punctuator_pl/vocab.txt
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_pl/config.json -O /home/worker/model/punctuator_pl/config.json
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_pl/tokenizer_config.json -O /home/worker/model/punctuator_pl/tokenizer_config.json
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_pl/special_tokens_map.json -O /home/worker/model/punctuator_pl/special_tokens_map.json
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_pl/classes.json -O /home/worker/model/punctuator_pl/classes.json
-fi
-
-if ! test -d "/home/worker/model/punctuator_en"; then
-    mkdir -p /home/worker/model/punctuator_en
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_en/pytorch_model.bin -O /home/worker/model/punctuator_en/pytorch_model.bin
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_en/vocab.txt -O /home/worker/model/punctuator_en/vocab.txt
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_en/config.json -O /home/worker/model/punctuator_en/config.json
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_en/tokenizer_config.json -O /home/worker/model/punctuator_en/tokenizer_config.json
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_en/tokenizer.json -O /home/worker/model/punctuator_en/tokenizer.json
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_en/special_tokens_map.json -O /home/worker/model/punctuator_en/special_tokens_map.json
-fi
-
-if ! test -d "/home/worker/model/punctuator_ru"; then
-    mkdir -p /home/worker/model/punctuator_ru
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_ru/pytorch_model.bin -O /home/worker/model/punctuator_ru/pytorch_model.bin
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_ru/vocab.json -O /home/worker/model/punctuator_ru/vocab.json
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_ru/config.json -O /home/worker/model/punctuator_ru/config.json
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_ru/tokenizer.json -O /home/worker/model/punctuator_ru/tokenizer.json
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_ru/tokenizer_config.json -O /home/worker/model/punctuator_ru/tokenizer_config.json
-    wget https://minio.clarin-pl.eu/public/models/punctuator/punctuator_ru/special_tokens_map.json -O /home/worker/model/punctuator_ru/special_tokens_map.json
-fi
-
-python worker.py
\ No newline at end of file
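+# Pull the latest models from the public bucket; --no-sign-request makes the
+# request anonymous, so no AWS credentials are needed inside the container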
+aws --no-sign-request --endpoint-url "https://s3.clarin-pl.eu" s3 sync s3://workers/punctuator/models /home/worker/models/punctuator
+
+if [[ "$PUNCTUATOR_TEST" == "TRUE" ]]
+then
+    python worker.py --test
+else
+    python worker.py
+fi
\ No newline at end of file
diff --git a/example_texts/expected/en.txt b/example_texts/expected/en.txt
new file mode 100644
index 0000000..b95aa97
--- /dev/null
+++ b/example_texts/expected/en.txt
@@ -0,0 +1 @@
+Water plays an important role in the world economy. Approximately 70% of the freshwater used by humans goes to agriculture. Fishing in salt and fresh water bodies is a major source of food for many parts of the world. Much of the long-distance trade of commodities (such as oil, natural gas, and manufactured products) is transported by boats through seas, rivers, lakes, and canals. Large quantities of water, ice, and steam are used for cooling and heating, in industry and homes. Water is an excellent solvent for a wide variety of substances both mineral and organic; as such it is widely used in industrial processes, and in cooking and washing. Water, ice and snow are also central to many sports and other forms of entertainment, such as swimming, pleasure boating, boat racing, surfing, sport fishing, diving, ice skating and skiing.
\ No newline at end of file
diff --git a/example_texts/expected/pl.txt b/example_texts/expected/pl.txt
new file mode 100644
index 0000000..8150ec1
--- /dev/null
+++ b/example_texts/expected/pl.txt
@@ -0,0 +1 @@
+Woda odgrywa ważną rolę w gospodarce światowej. Około 70% słodkiej wody zużywanej przez człowieka jest przeznaczane na rolnictwo. Połowy w słonych i słodkich zbiornikach wodnych są głównym źródłem pożywienia w wielu częściach świata. Duża część długodystansowego handlu towarami (takimi jak ropa naftowa, gaz ziemny i produkty przemysłowe) jest transportowana łodziami przez morza, rzeki, jeziora i kanały. Duże ilości wody, lodu i pary są wykorzystywane do chłodzenia i ogrzewania w przemyśle i domach. Woda jest doskonałym rozpuszczalnikiem dla szerokiej gamy substancji zarówno mineralnych, jak i organicznych; jako taka jest szeroko stosowana w procesach przemysłowych, a także w gotowaniu i praniu. Woda, lód i śnieg są również podstawą wielu sportów i innych form rozrywki, takich jak pływanie, żeglarstwo, wyścigi łodzi, surfing, wędkarstwo sportowe, nurkowanie, łyżwiarstwo i narciarstwo.
\ No newline at end of file
diff --git a/example_texts/expected/ru.txt b/example_texts/expected/ru.txt
new file mode 100644
index 0000000..05ca819
--- /dev/null
+++ b/example_texts/expected/ru.txt
@@ -0,0 +1 @@
+Вода играет важную роль в мировой экономике. Примерно 70% пресной воды, используемой человеком, идет на нужды сельского хозяйства. Рыболовство в соленых и пресных водоемах является основным источником пищи для многих регионов мира. Большая часть торговли товарами (такими как нефть, природный газ и промышленные товары) на дальние расстояния осуществляется с помощью судов по морям, рекам, озерам и каналам. Большое количество воды, льда и пара используется для охлаждения и отопления в промышленности и домах. Вода является прекрасным растворителем для широкого спектра веществ, как минеральных, так и органических; поэтому она широко используется в промышленных процессах, а также при приготовлении пищи и стирке. Вода, лед и снег также играют центральную роль во многих видах спорта и других формах развлечений, таких как плавание, прогулочные суда, гонки на лодках, серфинг, спортивная рыбалка, дайвинг, катание на коньках и лыжах.
\ No newline at end of file
diff --git a/example_texts/input/en.txt b/example_texts/input/en.txt
new file mode 100644
index 0000000..1e5a95c
--- /dev/null
+++ b/example_texts/input/en.txt
@@ -0,0 +1 @@
+water plays an important role in the world economy approximately 70 of the freshwater used by humans goes to agriculture fishing in salt and fresh water bodies is a major source of food for many parts of the world much of the longdistance trade of commodities such as oil natural gas and manufactured products is transported by boats through seas rivers lakes and canals large quantities of water ice and steam are used for cooling and heating in industry and homes water is an excellent solvent for a wide variety of substances both mineral and organic as such it is widely used in industrial processes and in cooking and washing water ice and snow are also central to many sports and other forms of entertainment such as swimming pleasure boating boat racing surfing sport fishing diving ice skating and skiing
\ No newline at end of file
diff --git a/example_texts/input/pl.txt b/example_texts/input/pl.txt
new file mode 100644
index 0000000..88956e6
--- /dev/null
+++ b/example_texts/input/pl.txt
@@ -0,0 +1 @@
+woda odgrywa ważną rolę w gospodarce światowej około 70 słodkiej wody zużywanej przez człowieka jest przeznaczane na rolnictwo połowy w słonych i słodkich zbiornikach wodnych są głównym źródłem pożywienia w wielu częściach świata duża część długodystansowego handlu towarami takimi jak ropa naftowa gaz ziemny i produkty przemysłowe jest transportowana łodziami przez morza rzeki jeziora i kanały duże ilości wody lodu i pary są wykorzystywane do chłodzenia i ogrzewania w przemyśle i domach woda jest doskonałym rozpuszczalnikiem dla szerokiej gamy substancji zarówno mineralnych jak i organicznych jako taka jest szeroko stosowana w procesach przemysłowych a także w gotowaniu i praniu woda lód i śnieg są również podstawą wielu sportów i innych form rozrywki takich jak pływanie żeglarstwo wyścigi łodzi surfing wędkarstwo sportowe nurkowanie łyżwiarstwo i narciarstwo
\ No newline at end of file
diff --git a/example_texts/input/ru.txt b/example_texts/input/ru.txt
new file mode 100644
index 0000000..9492060
--- /dev/null
+++ b/example_texts/input/ru.txt
@@ -0,0 +1 @@
+вода играет важную роль в мировой экономике примерно 70 пресной воды используемой человеком идет на нужды сельского хозяйства рыболовство в соленых и пресных водоемах является основным источником пищи для многих регионов мира большая часть торговли товарами такими как нефть природный газ и промышленные товары на дальние расстояния осуществляется с помощью судов по морям рекам озерам и каналам большое количество воды льда и пара используется для охлаждения и отопления в промышленности и домах вода является прекрасным растворителем для широкого спектра веществ как минеральных так и органических поэтому она широко используется в промышленных процессах а также при приготовлении пищи и стирке вода лед и снег также играют центральную роль во многих видах спорта и других формах развлечений таких как плавание прогулочные суда гонки на лодках серфинг спортивная рыбалка дайвинг катание на коньках и лыжах
\ No newline at end of file
diff --git a/example_texts/output/.gitignore b/example_texts/output/.gitignore
new file mode 100644
index 0000000..4b7c4ed
--- /dev/null
+++ b/example_texts/output/.gitignore
@@ -0,0 +1 @@
+/*.txt
diff --git a/models/.gitignore b/models/.gitignore
new file mode 100644
index 0000000..bba1fb9
--- /dev/null
+++ b/models/.gitignore
@@ -0,0 +1,3 @@
+/en
+/ru
+/pl
diff --git a/models/en.dvc b/models/en.dvc
new file mode 100644
index 0000000..ecb5bd4
--- /dev/null
+++ b/models/en.dvc
@@ -0,0 +1,12 @@
+md5: 73f04ea7e7335101e783317fbe5189c7
+frozen: true
+deps:
+- path: data/models/en
+  repo:
+    url: git@gitlab.clarin-pl.eu:grupa-wieszcz/punctuator/models.git
+    rev_lock: 7d36e8cb5d372008ff7c99158eb91184203ec7f6
+outs:
+- md5: e407614911d85095f0d3286b890b7b3a.dir
+  size: 1990793672
+  nfiles: 24
+  path: en
diff --git a/models/pl.dvc b/models/pl.dvc
new file mode 100644
index 0000000..83533de
--- /dev/null
+++ b/models/pl.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: da34f3b71f6e56a526ae1f0191300be5.dir
+  size: 527171457
+  nfiles: 6
+  path: pl
diff --git a/models/ru.dvc b/models/ru.dvc
new file mode 100644
index 0000000..a720271
--- /dev/null
+++ b/models/ru.dvc
@@ -0,0 +1,12 @@
+md5: 1f8156c987aaaa71f777fc23bce52d35
+frozen: true
+deps:
+- path: data/models/ru
+  repo:
+    url: git@gitlab.clarin-pl.eu:grupa-wieszcz/punctuator/models.git
+    rev_lock: 7d36e8cb5d372008ff7c99158eb91184203ec7f6
+outs:
+- md5: db59bd054b718b0925e20fe16a12b473.dir
+  size: 2845417138
+  nfiles: 22
+  path: ru
diff --git a/punctuator/punctuator.py b/punctuator/punctuator.py
index 72041e0..bc4d444 100644
--- a/punctuator/punctuator.py
+++ b/punctuator/punctuator.py
@@ -1,190 +1,141 @@
-from typing import List, Tuple
-import numpy as np
+"""Implementation of punctuator service"""
 
+import configparser
+import json
+import string
+import os
 
-def decode_labels(results, labels_map) -> List[str]:
-    """Converts labes from ids to text representations
+import nlp_ws
+from transformers import AutoModelForTokenClassification, AutoTokenizer
+import language_tool_python
 
-    Args:
-        results (List[int]): List of ids of labels
-        labels_map (List[str]): List of classnames in order matching list of
-        ids
 
-    Returns:
-        List[str]: List of classnames
-    """
-    labels_decoded = list(map(lambda x: labels_map[x], results))
+from punctuator.utils import (
+    combine_masks,
+    decode,
+    decode_labels,
+    inference_masks,
+)
 
-    return labels_decoded
 
+def _preprocess_input(text: str):
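+    # Strip all (ASCII) punctuation and lowercase the text so the model sees
+    # the raw word-stream format it was trained to re-punctuate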
+    text = text.translate(str.maketrans("", "", string.punctuation))
+    text = text.lower()
 
-def decode(tokens, labels_decoded, tokenizer, bpe=False):
-    """Applies predictions to text in order to get punctuated text representation
+    return text
 
-    Args:
-        tokens (List[int]): List of token-ids
-        labels_decoded (List[str]): Per-token classnames
-        tokenizer: Huggingface tokenizer
 
-    Returns:
-        str: Text with punctuation & casing applied
-    """
-    text_recovered = []
-    word = []
-    word_end = ""
-    for label, token in zip(labels_decoded, tokens):
-        if bpe:
-            token_str = tokenizer.decode(token)
-            if token_str.startswith(" "):
-                token_str = token_str[1:]
-        else:
-            token_str = tokenizer.convert_ids_to_tokens([token])[0]
-        if token_str == "[PAD]":
-            break
-        if token_str.startswith("##"):
-            word.append(token_str.replace("##", ""))
-        else:
-            if len(word) > 0:
-                word.append(word_end)
-                text_recovered.append("".join(word))
-                word = []
-            if label.startswith("__ALL_UPPER__"):
-                # TODO: Make all uppercase
-                word.append(token_str[0].upper() + token_str[1:])
-            elif label.startswith("__UPPER__"):
-                word.append(token_str[0].upper() + token_str[1:])
-            else:
-                word.append(token_str)
-
-            label = label.replace("__UPPER__", "")
-            label = label.replace("__ALL_UPPER__", "")
-            word_end = label
-    text_recovered.append("".join(word))
-    if word_end != '':
-        text_recovered += word_end
-    return "".join(text_recovered)
-
-
-def inference_masks(
-    num_tokens: int, max_len: int, overlap: int
-) -> Tuple[List[List[bool]], List[List[bool]]]:
-    """Splits text that is to long for predicting. The function provide list
-       of masks for each prediction chunk
-
-    Args:
-        num_tokens (int): Number of tokens, including CLS & SEP
-        max_len (int): Prediction window (must be less than 512)
-        overlap (int): Ammout of overlapping between chunking windows
-
-    Returns:
-        Tuple[List[List[bool]], List[List[bool]]]: Masks for tokens provided
-            for inference & for result of inference
-    """
-    if max_len >= num_tokens:
-        return (
-            [[True] * num_tokens],
-            [[False] + [True] * (num_tokens - 2) + [False]],
-        )
-
-    # Account for CLS & SEP tokens
-    real_max_len = max_len - 2
-    real_num_tokens = num_tokens - 2
-
-    step_size = real_max_len - 2 * overlap
-
-    masks = []
-    entries = []
-    for start_id in range(0, real_num_tokens, step_size):
-        stop = False
-        if start_id == 0:
-            entry = (
-                [True]
-                + [True] * real_max_len
-                + [False] * (real_num_tokens - real_max_len)
-                + [True]
-            )
-            mask = (
-                [False]
-                + [True] * (real_max_len - overlap)
-                + [False] * (overlap + 1)
-            )
-        elif start_id + real_max_len >= real_num_tokens:
-            offset_start = real_num_tokens - real_max_len
-            entry = (
-                [True]
-                + [False] * (offset_start)
-                + [True] * real_max_len
-                + [True]
-            )
-            mask = (
-                [False] * (overlap + 1 + (start_id - offset_start))
-                + [True] * (real_max_len - overlap - (start_id - offset_start))
-                + [False]
-            )
-            stop = True
-        else:
-            entry = (
-                [True]
-                + [False] * start_id
-                + [True] * real_max_len
-                + [False] * (real_num_tokens - (start_id + real_max_len))
-                + [True]
-            )
-            mask = (
-                [False] * (overlap + 1)
-                + [True] * (real_max_len - 2 * overlap)
-                + [False] * (overlap + 1)
-            )
+def is_punctuation_rule(rule):
+    return rule.category != 'PUNCTUATION' and len(rule.replacements)
+
+
+def _post_process(text: str, tool):
+    matches = tool.check(text)
+    matches = [rule for rule in matches if not is_punctuation_rule(rule)]
+    return language_tool_python.utils.correct(text, matches)
+
 
-        masks.append(mask)
-        entries.append(entry)
+class Punctuator:
+    def __init__(self, config):
+        self.config = config
+        self.languagetool_map = {'model_path_pl': 'pl-PL',
+                                 'model_path_ru': 'ru', 'model_path_en': 'en-US'}
+        self.max_context_size = int(self.config.get("max_context_size", 256))
+        self.overlap = int(self.config.get("overlap", 20))
 
-        if stop:
-            break
+        self.device = self.config.get("device", "cpu")
 
-    return entries, masks
+        self.languagetool_path = self.config.get("languagetool_path", "/home/worker/models/languagetool")
+        os.environ["LTP_PATH"] = self.languagetool_path
+        self.model_path_pl = self.config.get("model_path_pl", "/home/worker/models/punctuator/pl")
+        self.model_path_ru = self.config.get("model_path_ru", "/home/worker/models/punctuator/ru")
+        self.model_path_en = self.config.get("model_path_en", "/home/worker/models/punctuator/en")
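+        # Only the default Polish model is placed on the configured device up
+        # front; the English and Russian models stay on CPU until they are
+        # moved over by _pass_device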
+        self.tool_pl, self.model_pl, self.tokenizer_pl, self.mapping_pl \
+            = self._initialize_model('pl-PL', self.model_path_pl, self.device)
+        self.tool_en, self.model_en, self.tokenizer_en, self.mapping_en \
+            = self._initialize_model('en-US', self.model_path_en, 'cpu')
+        self.tool_ru, self.model_ru, self.tokenizer_ru, self.mapping_ru \
+            = self._initialize_model('ru', self.model_path_ru, 'cpu')
+        self.current_model = 'pl'
 
+    def process(
+        self, input_path: str, task_options: dict, output_path: str
+    ) -> None:
+        language = task_options.get("language", "pl")
 
-def combine_masks(
-    num_tokens: int, max_len: int, overlap: int
-) -> List[List[bool]]:
-    """Provides mask which tokens to take for each prediction. It makes sure
-       that each token is only taken once & scored by best chunk.
+        if language == 'en':
+            bpe = True
+        else:
+            bpe = False
+
+        tool, model, tokenizer, mapping = self._get_setup_for_language(
+            language)
+        # Make sure the selected model sits on the configured device; models
+        # for the other languages are parked on CPU (see _pass_device)
+        self._pass_device(language)
 
-    Args:
-        num_tokens (int): Number of tokens, including CLS & SEP
-        max_len (int): Prediction window (must be less than 512)
-        overlap (int): Ammout of overlapping between chunking windows
+        with open(input_path, "r") as f:
+            text = f.read()
 
-    Returns:
-        List[List[bool]]: Token mask
-    """
-    if max_len >= num_tokens:
-        return np.array([[False] + [True] * (num_tokens - 2) + [False]])
+        # Make sure that the text is lowercase & punctuationless
+        text = _preprocess_input(text)
 
-    step_size = max_len - 2 - overlap
+        tokenized = tokenizer(text, return_tensors="pt")
 
-    entries = []
-    for start in range(0, num_tokens - 2, step_size):
-        stop = False
+        num_tokens = len(tokenized["input_ids"][0])
 
-        if start + max_len - 2 - overlap < num_tokens - 2:
-            entry = (
-                [False] + [False] * (start) + [True] * (max_len - 2 - overlap)
+        # TODO: Consider adding batching support
+        results = []
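+        # Predict window by window: inference_mask selects the tokens fed to
+        # the model, mask_mask selects which of the returned labels are kept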
+        for inference_mask, mask_mask in zip(
+            *inference_masks(num_tokens, self.max_context_size, self.overlap)
+        ):
+            result = model(
+                input_ids=tokenized["input_ids"][:, inference_mask].to(
+                    self.device
+                ),
+                attention_mask=tokenized["attention_mask"][:, inference_mask]
+                .to(self.device),
             )
-            entry += [False] * (
-                num_tokens - 2 - (start + max_len - 2 - overlap)
+            labels_ids = (
+                result.logits.detach()
+                .cpu()
+                .argmax(dim=-1)
+                .squeeze()
+                .numpy()[mask_mask]
             )
-            entry += [False]
+            results.append(decode_labels(labels_ids, mapping))
+        labels = sum(results, [])
+
+        tokens = []
+        for combine_mask in combine_masks(
+            num_tokens, self.max_context_size, self.overlap
+        ):
+            tokens += tokenized["input_ids"][0, combine_mask].numpy().tolist()
+
+        text_out = decode(tokens, labels, tokenizer, bpe)
+        text_out = _post_process(text_out, tool)
+        with open(output_path, "w") as f:
+            f.write(text_out)
+
+    def _initialize_model(self, lang, model_path: str, device: str):
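+        # Bundle a LanguageTool checker with the model, tokenizer and label
+        # mapping for one language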
+        tool = language_tool_python.LanguageTool(lang)
+        model = AutoModelForTokenClassification.from_pretrained(
+            model_path
+        ).to(device)
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        mapping = {}
+        with open(f"{self.model_path_pl}/classes.json", "r") as f:
+            mapping = json.load(f)
+            mapping = list(mapping.keys())
+        return tool, model, tokenizer, mapping
+
+    def _get_setup_for_language(self, language):
+        if language == 'ru':
+            return self.tool_ru, self.model_ru, self.tokenizer_ru, self.mapping_ru
+        elif language == 'en':
+            return self.tool_en, self.model_en, self.tokenizer_en, self.mapping_en
         else:
-            entry = [False] + [False] * (start)
-            entry += [True] * (num_tokens - 2 - start)
-            entry += [False]
-            stop = True
-
-        entries.append(entry)
-
-        if stop:
-            break
+            return self.tool_pl, self.model_pl, self.tokenizer_pl, self.mapping_pl
 
-    return entries
+    def _pass_device(self, new_language):
+        _, old_model, _, _ = self._get_setup_for_language(self.current_model)
+        old_model.to('cpu')
+        self.current_model = new_language
+        _, new_model, _, _ = self._get_setup_for_language(new_language)
+        new_model.to(self.device)
diff --git a/punctuator/utils.py b/punctuator/utils.py
new file mode 100644
index 0000000..72041e0
--- /dev/null
+++ b/punctuator/utils.py
@@ -0,0 +1,190 @@
+from typing import List, Tuple
+import numpy as np
+
+
+def decode_labels(results, labels_map) -> List[str]:
+    """Converts labes from ids to text representations
+
+    Args:
+        results (List[int]): List of ids of labels
+        labels_map (List[str]): List of classnames in order matching list of
+        ids
+
+    Returns:
+        List[str]: List of classnames
+    """
+    labels_decoded = list(map(lambda x: labels_map[x], results))
+
+    return labels_decoded
+
+
+def decode(tokens, labels_decoded, tokenizer, bpe=False):
+    """Applies predictions to text in order to get punctuated text representation
+
+    Args:
+        tokens (List[int]): List of token-ids
+        labels_decoded (List[str]): Per-token classnames
+        tokenizer: Huggingface tokenizer
+        bpe (bool): Whether the tokenizer is BPE-based (True for the English
+            model) rather than WordPiece
+
+    Returns:
+        str: Text with punctuation & casing applied
+    """
+    text_recovered = []
+    word = []
+    word_end = ""
+    for label, token in zip(labels_decoded, tokens):
+        if bpe:
+            token_str = tokenizer.decode(token)
+            if token_str.startswith(" "):
+                token_str = token_str[1:]
+        else:
+            token_str = tokenizer.convert_ids_to_tokens([token])[0]
+        if token_str == "[PAD]":
+            break
+        if token_str.startswith("##"):
+            word.append(token_str.replace("##", ""))
+        else:
+            if len(word) > 0:
+                word.append(word_end)
+                text_recovered.append("".join(word))
+                word = []
+            if label.startswith("__ALL_UPPER__"):
+                # TODO: Make all uppercase
+                word.append(token_str[0].upper() + token_str[1:])
+            elif label.startswith("__UPPER__"):
+                word.append(token_str[0].upper() + token_str[1:])
+            else:
+                word.append(token_str)
+
+            label = label.replace("__UPPER__", "")
+            label = label.replace("__ALL_UPPER__", "")
+            word_end = label
+    text_recovered.append("".join(word))
+    if word_end != '':
+        text_recovered += word_end
+    return "".join(text_recovered)
+
+
+def inference_masks(
+    num_tokens: int, max_len: int, overlap: int
+) -> Tuple[List[List[bool]], List[List[bool]]]:
+    """Splits text that is to long for predicting. The function provide list
+       of masks for each prediction chunk
+
+    Args:
+        num_tokens (int): Number of tokens, including CLS & SEP
+        max_len (int): Prediction window (must be less than 512)
+        overlap (int): Amount of overlap between chunking windows
+
+    Returns:
+        Tuple[List[List[bool]], List[List[bool]]]: Masks for tokens provided
+            for inference & for result of inference
+    """
+    if max_len >= num_tokens:
+        return (
+            [[True] * num_tokens],
+            [[False] + [True] * (num_tokens - 2) + [False]],
+        )
+
+    # Account for CLS & SEP tokens
+    real_max_len = max_len - 2
+    real_num_tokens = num_tokens - 2
+
+    step_size = real_max_len - 2 * overlap
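+    # Windows advance by the usable width minus two overlaps, so each chunk
+    # keeps `overlap` tokens of context on both sides that are predicted but
+    # later discarded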
+
+    masks = []
+    entries = []
+    for start_id in range(0, real_num_tokens, step_size):
+        stop = False
+        if start_id == 0:
+            entry = (
+                [True]
+                + [True] * real_max_len
+                + [False] * (real_num_tokens - real_max_len)
+                + [True]
+            )
+            mask = (
+                [False]
+                + [True] * (real_max_len - overlap)
+                + [False] * (overlap + 1)
+            )
+        elif start_id + real_max_len >= real_num_tokens:
+            offset_start = real_num_tokens - real_max_len
+            entry = (
+                [True]
+                + [False] * (offset_start)
+                + [True] * real_max_len
+                + [True]
+            )
+            mask = (
+                [False] * (overlap + 1 + (start_id - offset_start))
+                + [True] * (real_max_len - overlap - (start_id - offset_start))
+                + [False]
+            )
+            stop = True
+        else:
+            entry = (
+                [True]
+                + [False] * start_id
+                + [True] * real_max_len
+                + [False] * (real_num_tokens - (start_id + real_max_len))
+                + [True]
+            )
+            mask = (
+                [False] * (overlap + 1)
+                + [True] * (real_max_len - 2 * overlap)
+                + [False] * (overlap + 1)
+            )
+
+        masks.append(mask)
+        entries.append(entry)
+
+        if stop:
+            break
+
+    return entries, masks
+
+
+def combine_masks(
+    num_tokens: int, max_len: int, overlap: int
+) -> List[List[bool]]:
+    """Provides mask which tokens to take for each prediction. It makes sure
+       that each token is only taken once & scored by best chunk.
+
+    Args:
+        num_tokens (int): Number of tokens, including CLS & SEP
+        max_len (int): Prediction window (must be less than 512)
+        overlap (int): Amount of overlap between chunking windows
+
+    Returns:
+        List[List[bool]]: Token mask
+    """
+    if max_len >= num_tokens:
+        return np.array([[False] + [True] * (num_tokens - 2) + [False]])
+
+    step_size = max_len - 2 - overlap
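+    # Each window contributes only the tokens it predicts best, so stitching
+    # the masked chunks together yields every token exactly once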
+
+    entries = []
+    for start in range(0, num_tokens - 2, step_size):
+        stop = False
+
+        if start + max_len - 2 - overlap < num_tokens - 2:
+            entry = (
+                [False] + [False] * (start) + [True] * (max_len - 2 - overlap)
+            )
+            entry += [False] * (
+                num_tokens - 2 - (start + max_len - 2 - overlap)
+            )
+            entry += [False]
+        else:
+            entry = [False] + [False] * (start)
+            entry += [True] * (num_tokens - 2 - start)
+            entry += [False]
+            stop = True
+
+        entries.append(entry)
+
+        if stop:
+            break
+
+    return entries
diff --git a/requirements.txt b/requirements.txt
index 9df4a0c..3e07ff7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 numpy==1.19.4
 transformers==4.3.2
 torch==1.7.1
-language-tool-python==2.5.4
\ No newline at end of file
+language-tool-python==2.5.4
+awscli==1.20.11
\ No newline at end of file
diff --git a/sync_to_s3.sh b/sync_to_s3.sh
new file mode 100755
index 0000000..666b79d
--- /dev/null
+++ b/sync_to_s3.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
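+# Upload the DVC-tracked model directories to the production bucket; unlike
+# the worker's download, this requires S3 write credentials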
+aws --endpoint-url "https://s3.clarin-pl.eu" s3 sync models/pl s3://workers/punctuator/models/pl
+aws --endpoint-url "https://s3.clarin-pl.eu" s3 sync models/en s3://workers/punctuator/models/en
+aws --endpoint-url "https://s3.clarin-pl.eu" s3 sync models/ru s3://workers/punctuator/models/ru
\ No newline at end of file
diff --git a/worker.py b/worker.py
index 253ce48..6cda4a2 100644
--- a/worker.py
+++ b/worker.py
@@ -8,139 +8,41 @@ import os
 import nlp_ws
 from transformers import AutoModelForTokenClassification, AutoTokenizer
 import language_tool_python
+import argparse
 
 
-from punctuator.punctuator import (
-    combine_masks,
-    decode,
-    decode_labels,
-    inference_masks,
-)
-
-
-def _preprocess_input(text: str):
-    text = text.translate(str.maketrans("", "", string.punctuation))
-    text = text.lower()
-
-    return text
-
-
-def is_punctuation_rule(rule):
-    lambda rule: rule.category != 'PUNCTUATION' and len(rule.replacements)
-
-
-def _post_process(text: str, tool):
-    matches = tool.check(text)
-    matches = [rule for rule in matches if not is_punctuation_rule(rule)]
-    return language_tool_python.utils.correct(text, matches)
-
+from punctuator.punctuator import Punctuator
 
 class Worker(nlp_ws.NLPWorker):
     def init(self):
-        self.config = configparser.ConfigParser()
-        self.config.read("config.ini")
-        self.config = self.config["deployment"]
-        self.languagetool_map = {'model_path_pl': 'pl-PL', 'model_path_ru':
-                                 'ru', 'model_path_en': 'en-US'}
-        self.max_context_size = int(self.config["max_context_size"])
-        self.overlap = int(self.config["overlap"])
-
-        self.device = self.config["device"]
-
-        self.languagetool_path = self.config["languagetool_path"]
-        os.environ["LTP_PATH"] = self.languagetool_path
-        self.model_path_pl = self.config["model_path_pl"]
-        self.model_path_ru = self.config["model_path_ru"]
-        self.model_path_en = self.config["model_path_en"]
-        self.tool_pl, self.model_pl, self.tokenizer_pl, self.mapping_pl \
-            = self.initialize_model('pl-PL', self.model_path_pl, self.device)
-        self.tool_en, self.model_en, self.tokenizer_en, self.mapping_en \
-            = self.initialize_model('en-US', self.model_path_en, 'cpu')
-        self.tool_ru, self.model_ru, self.tokenizer_ru, self.mapping_ru \
-            = self.initialize_model('ru', self.model_path_ru, 'cpu')
-        self.current_model = self.model_path_pl
+        config = configparser.ConfigParser()
+        config.read("config.ini")
+        config = config["deployment"]
+        
+        self.punctuator = Punctuator(config)
 
     def process(
         self, input_path: str, task_options: dict, output_path: str
     ) -> None:
-        language = task_options.get("language", "pl")
-
-        if language == 'en':
-            bpe = True
-        else:
-            bpe = False
-        tool, model, tokenizer, mapping = self.get_setup_for_language(
-            language)
-
-        with open(input_path, "r") as f:
-            text = f.read()
-
-        # Make sure that the text is lowercase & punctuationless
-        text = _preprocess_input(text)
-
-        tokenized = tokenizer(text, return_tensors="pt")
-
-        num_tokens = len(tokenized["input_ids"][0])
-
-        # TODO: Consider adding batching support
-        results = []
-        for inference_mask, mask_mask in zip(
-            *inference_masks(num_tokens, self.max_context_size, self.overlap)
-        ):
-            result = model(
-                input_ids=tokenized["input_ids"][:, inference_mask].to(
-                    self.device
-                ),
-                attention_mask=tokenized["attention_mask"][:, inference_mask]
-                .to(self.device),
-            )
-            labels_ids = (
-                result.logits.detach()
-                .cpu()
-                .argmax(dim=-1)
-                .squeeze()
-                .numpy()[mask_mask]
-            )
-            results.append(decode_labels(labels_ids, mapping))
-        labels = sum(results, [])
-
-        tokens = []
-        for combine_mask in combine_masks(
-            num_tokens, self.max_context_size, self.overlap
-        ):
-            tokens += tokenized["input_ids"][0, combine_mask].numpy().tolist()
-
-        text_out = decode(tokens, labels, tokenizer, bpe)
-        text_out = _post_process(text_out, tool)
-        with open(output_path, "w") as f:
-            f.write(text_out)
-
-    def initialize_model(self, lang, model_path: str, device: str):
-        tool = language_tool_python.LanguageTool(lang)
-        model = AutoModelForTokenClassification.from_pretrained(
-            model_path
-        ).to(device)
-        tokenizer = AutoTokenizer.from_pretrained(model_path)
-        mapping = {}
-        with open("model/punctuator_pl/classes.json", "r") as f:
-            mapping = json.load(f)
-            mapping = list(mapping.keys())
-        return tool, model, tokenizer, mapping
-
-    def get_setup_for_language(self, language):
-        if language == 'ru':
-            return self.tool_ru, self.model_ru, self.tokenizer_ru, self.mapping_ru
-        elif language == 'en':
-            return self.tool_en, self.model_en, self.tokenizer_en, self.mapping_en
-        else:
-            return self.tool_pl, self.model_pl, self.tokenizer_pl, self.mapping_pl
-
-    def pass_device(self, new_language):
-        _, current_model, _, _ = self.get_setup_for_language(self.current_model)
-        current_model.to('cpu')
-        _, current_model, _, _ = self.get_setup_for_language(new_language)
-        current_model.to(self.device)
-
+        self.punctuator.process(input_path, task_options, output_path)
+        
+def perform_test():
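+    """Run the punctuator over fixed sample files for each language."""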
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+    config = config["deployment"]
+    
+    punctuator = Punctuator(config)
+    punctuator.process("/samba/input/pl.txt", {"language": "pl"}, "/samba/output/pl.txt")
+    punctuator.process("/samba/input/en.txt", {"language": "en"}, "/samba/output/en.txt")
+    punctuator.process("/samba/input/ru.txt", {"language": "ru"}, "/samba/output/ru.txt")
 
 if __name__ == "__main__":
-    nlp_ws.NLPService.main(Worker)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--test", action="store_true", help="run smoke test")
+    args = parser.parse_args()
+    
+    if args.test:
+        perform_test()
+    else:
+        nlp_ws.NLPService.main(Worker)
+
-- 
GitLab


From effc2d1a483b12d39f8bf3e11dce94c702dab207 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl>
Date: Wed, 13 Oct 2021 17:24:05 +0200
Subject: [PATCH 4/6] Added automatic deployment
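
- The deploy stage renders config.ini and deployment.yaml with envsubst and
  applies them to the nlpworkers namespace on every master build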

---
 .gitlab-ci.yml  | 18 ++++++++++++++++--
 config.ini      |  9 +++++----
 deployment.yaml | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 6 deletions(-)
 create mode 100644 deployment.yaml

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f4c8b92..a674b42 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,4 +1,4 @@
-image: python:3.8.5
+image: "clarinpl/python:3.8"
 
 cache:
   paths:
@@ -8,6 +8,7 @@ stages:
   - check_style
   - testing
   - build
+  - deploy
 
 before_script:
   - pip install tox==3.18.1
@@ -34,7 +35,7 @@ build_image:
   before_script:
     - ''
   script:
-    - docker build -t $DOCKERHUB_NAME -f Dockerfile.worker .
+    - docker build -t $DOCKERHUB_NAME .
     - echo $DOCKER_PASSWORD > pass.txt
     - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
     - rm pass.txt
@@ -44,3 +45,16 @@ build_image:
     - docker image tag $DOCKERHUB_NAME $CI_REGISTRY_IMAGE:latest
     - docker push $CI_REGISTRY_IMAGE
 
+deploy:
+  stage: deploy
+  before_script:
+    - ""
+  image:
+    name: clarinpl/kubectl
+  only:
+    - master
+  script:
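+    # envsubst substitutes CI variables into the templates; piping the
+    # dry-run configmap manifest into `kubectl apply` creates or updates it.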
+    - envsubst < config.ini > config.temp.ini
+    - envsubst < deployment.yaml > deployment.temp.yaml
+    - kubectl --kubeconfig=$KUBECONFIG --namespace=nlpworkers create configmap punctuator-config-ini --from-file=config.ini=config.temp.ini --dry-run -o yaml | kubectl apply -f -
+    - kubectl --kubeconfig=$KUBECONFIG --namespace=nlpworkers apply -f deployment.temp.yaml
diff --git a/config.ini b/config.ini
index d8eb44b..afe9243 100644
--- a/config.ini
+++ b/config.ini
@@ -1,9 +1,10 @@
 [service]
-tool = punctuator_test
+tool = punctuator
 root = /samba/requests/
-rabbit_host = test
-rabbit_user = test
-rabbit_password = test
+rabbit_host = $RABBIT_HOST
+rabbit_user = $RABBIT_USER
+rabbit_password = $RABBIT_PASSWORD
+queue_prefix = nlp_
 
 [tool]
 workers_number = 1
diff --git a/deployment.yaml b/deployment.yaml
new file mode 100644
index 0000000..2fe7a7b
--- /dev/null
+++ b/deployment.yaml
@@ -0,0 +1,42 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: punctuator
+  labels:
+    app: punctuator
+  namespace: nlpworkers
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: punctuator
+  template:
+    metadata:
+      labels:
+        app: punctuator
+    spec:
+      containers:
+      - name: punctuator
+        image: clarinpl/$CI_PROJECT_NAME:latest
+        imagePullPolicy: Always
+        volumeMounts:
+        - name: config
+          mountPath: /home/worker/config.ini
+          subPath: config.ini
+        - name: samba
+          mountPath: /samba
+        - name: models
+          mountPath: /home/worker/models/
+      volumes:
+      - name: config
+        configMap:
+          name: punctuator-config-ini
+      - name: samba
+        hostPath:
+          path: /samba
+          type: ""
+      - name: models
+        hostPath:
+          path: /tmp/punctuator-models
+          type: DirectoryOrCreate
+        
\ No newline at end of file
-- 
GitLab


From a590b5ff7cfdd209104090006acc00e81f5dd42d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl>
Date: Wed, 13 Oct 2021 17:27:41 +0200
Subject: [PATCH 5/6] PEP8 formatting

---
 punctuator/punctuator.py | 79 +++++++++++++++++++++++-----------------
 punctuator/utils.py      | 27 +++-----------
 tests/test_chunking.py   | 44 ++++++++++++++++++----
 worker.py                | 28 ++++++++------
 4 files changed, 105 insertions(+), 73 deletions(-)

diff --git a/punctuator/punctuator.py b/punctuator/punctuator.py
index bc4d444..4fbb2dc 100644
--- a/punctuator/punctuator.py
+++ b/punctuator/punctuator.py
@@ -26,7 +26,7 @@ def _preprocess_input(text: str):
 
 
 def is_punctuation_rule(rule):
-    lambda rule: rule.category != 'PUNCTUATION' and len(rule.replacements)
+    return rule.category != "PUNCTUATION" and len(rule.replacements) > 0
 
 
 def _post_process(text: str, tool):
@@ -38,38 +38,58 @@ def _post_process(text: str, tool):
 class Punctuator:
     def __init__(self, config):
         self.config = config
-        self.languagetool_map = {'model_path_pl': 'pl-PL', 'model_path_ru':
-                                 'ru', 'model_path_en': 'en-US'}
+        self.languagetool_map = {
+            "model_path_pl": "pl-PL",
+            "model_path_ru": "ru",
+            "model_path_en": "en-US",
+        }
         self.max_context_size = int(self.config.get("max_context_size", 256))
         self.overlap = int(self.config.get("overlap", 20))
 
         self.device = self.config.get("device", "cpu")
 
-        self.languagetool_path = self.config.get("languagetool_path", "/home/worker/models/languagetool")
+        self.languagetool_path = self.config.get(
+            "languagetool_path", "/home/worker/models/languagetool"
+        )
         os.environ["LTP_PATH"] = self.languagetool_path
-        self.model_path_pl = self.config.get("model_path_pl", "/home/worker/models/punctuator/pl")
-        self.model_path_ru = self.config.get("model_path_ru", "/home/worker/models/punctuator/en")
-        self.model_path_en = self.config.get("model_path_en", "/home/worker/models/punctuator/ru")
-        self.tool_pl, self.model_pl, self.tokenizer_pl, self.mapping_pl \
-            = self._initialize_model('pl-PL', self.model_path_pl, self.device)
-        self.tool_en, self.model_en, self.tokenizer_en, self.mapping_en \
-            = self._initialize_model('en-US', self.model_path_en, 'cpu')
-        self.tool_ru, self.model_ru, self.tokenizer_ru, self.mapping_ru \
-            = self._initialize_model('ru', self.model_path_ru, 'cpu')
+        self.model_path_pl = self.config.get(
+            "model_path_pl", "/home/worker/models/punctuator/pl"
+        )
+        self.model_path_ru = self.config.get(
+            "model_path_ru", "/home/worker/models/punctuator/en"
+        )
+        self.model_path_en = self.config.get(
+            "model_path_en", "/home/worker/models/punctuator/ru"
+        )
+        (
+            self.tool_pl,
+            self.model_pl,
+            self.tokenizer_pl,
+            self.mapping_pl,
+        ) = self._initialize_model("pl-PL", self.model_path_pl, self.device)
+        (
+            self.tool_en,
+            self.model_en,
+            self.tokenizer_en,
+            self.mapping_en,
+        ) = self._initialize_model("en-US", self.model_path_en, "cpu")
+        (
+            self.tool_ru,
+            self.model_ru,
+            self.tokenizer_ru,
+            self.mapping_ru,
+        ) = self._initialize_model("ru", self.model_path_ru, "cpu")
         self.current_model = self.model_path_pl
 
-    def process(
-        self, input_path: str, task_options: dict, output_path: str
-    ) -> None:
+    def process(self, input_path: str, task_options: dict, output_path: str) -> None:
         language = task_options.get("language", "pl")
 
-        if language == 'en':
+        if language == "en":
             bpe = True
         else:
             bpe = False
 
-        tool, model, tokenizer, mapping = self._get_setup_for_language(
-            language)
+        tool, model, tokenizer, mapping = self._get_setup_for_language(language)
 
         with open(input_path, "r") as f:
             text = f.read()
@@ -87,18 +107,13 @@ class Punctuator:
             *inference_masks(num_tokens, self.max_context_size, self.overlap)
         ):
             result = model(
-                input_ids=tokenized["input_ids"][:, inference_mask].to(
+                input_ids=tokenized["input_ids"][:, inference_mask].to(self.device),
+                attention_mask=tokenized["attention_mask"][:, inference_mask].to(
                     self.device
                 ),
-                attention_mask=tokenized["attention_mask"][:, inference_mask]
-                .to(self.device),
             )
             labels_ids = (
-                result.logits.detach()
-                .cpu()
-                .argmax(dim=-1)
-                .squeeze()
-                .numpy()[mask_mask]
+                result.logits.detach().cpu().argmax(dim=-1).squeeze().numpy()[mask_mask]
             )
             results.append(decode_labels(labels_ids, mapping))
         labels = sum(results, [])
@@ -116,9 +131,7 @@ class Punctuator:
 
     def _initialize_model(self, lang, model_path: str, device: str):
         tool = language_tool_python.LanguageTool(lang)
-        model = AutoModelForTokenClassification.from_pretrained(
-            model_path
-        ).to(device)
+        model = AutoModelForTokenClassification.from_pretrained(model_path).to(device)
         tokenizer = AutoTokenizer.from_pretrained(model_path)
         mapping = {}
         with open(f"{self.model_path_pl}/classes.json", "r") as f:
@@ -127,15 +140,15 @@ class Punctuator:
         return tool, model, tokenizer, mapping
 
     def _get_setup_for_language(self, language):
-        if language == 'ru':
+        if language == "ru":
             return self.tool_ru, self.model_ru, self.tokenizer_ru, self.mapping_ru
-        elif language == 'en':
+        elif language == "en":
             return self.tool_en, self.model_en, self.tokenizer_en, self.mapping_en
         else:
             return self.tool_pl, self.model_pl, self.tokenizer_pl, self.mapping_pl
 
     def _pass_device(self, new_language):
         _, current_model, _, _ = self._get_setup_for_language(self.current_model)
-        current_model.to('cpu')
+        current_model.to("cpu")
         _, current_model, _, _ = self._get_setup_for_language(new_language)
         current_model.to(self.device)
diff --git a/punctuator/utils.py b/punctuator/utils.py
index 72041e0..3d39316 100644
--- a/punctuator/utils.py
+++ b/punctuator/utils.py
@@ -60,7 +60,7 @@ def decode(tokens, labels_decoded, tokenizer, bpe=False):
             label = label.replace("__ALL_UPPER__", "")
             word_end = label
     text_recovered.append("".join(word))
-    if word_end != '':
+    if word_end != "":
         text_recovered += word_end
     return "".join(text_recovered)
 
@@ -103,19 +103,10 @@ def inference_masks(
                 + [False] * (real_num_tokens - real_max_len)
                 + [True]
             )
-            mask = (
-                [False]
-                + [True] * (real_max_len - overlap)
-                + [False] * (overlap + 1)
-            )
+            mask = [False] + [True] * (real_max_len - overlap) + [False] * (overlap + 1)
         elif start_id + real_max_len >= real_num_tokens:
             offset_start = real_num_tokens - real_max_len
-            entry = (
-                [True]
-                + [False] * (offset_start)
-                + [True] * real_max_len
-                + [True]
-            )
+            entry = [True] + [False] * (offset_start) + [True] * real_max_len + [True]
             mask = (
                 [False] * (overlap + 1 + (start_id - offset_start))
                 + [True] * (real_max_len - overlap - (start_id - offset_start))
@@ -145,9 +136,7 @@ def inference_masks(
     return entries, masks
 
 
-def combine_masks(
-    num_tokens: int, max_len: int, overlap: int
-) -> List[List[bool]]:
+def combine_masks(num_tokens: int, max_len: int, overlap: int) -> List[List[bool]]:
     """Provides mask which tokens to take for each prediction. It makes sure
        that each token is only taken once & scored by best chunk.
 
@@ -169,12 +158,8 @@ def combine_masks(
         stop = False
 
         if start + max_len - 2 - overlap < num_tokens - 2:
-            entry = (
-                [False] + [False] * (start) + [True] * (max_len - 2 - overlap)
-            )
-            entry += [False] * (
-                num_tokens - 2 - (start + max_len - 2 - overlap)
-            )
+            entry = [False] + [False] * (start) + [True] * (max_len - 2 - overlap)
+            entry += [False] * (num_tokens - 2 - (start + max_len - 2 - overlap))
             entry += [False]
         else:
             entry = [False] + [False] * (start)
diff --git a/tests/test_chunking.py b/tests/test_chunking.py
index 17cb854..7517223 100644
--- a/tests/test_chunking.py
+++ b/tests/test_chunking.py
@@ -33,27 +33,45 @@ def test_inference_mask():
     assert np.all(
         result
         == np.array(
-            [[T, T, T, T, T, T, T, F, F, T], [T, F, F, T, T, T, T, T, T, T], ]
+            [
+                [T, T, T, T, T, T, T, F, F, T],
+                [T, F, F, T, T, T, T, T, T, T],
+            ]
         )
     )
     assert np.all(
-        mask == np.array([[F, T, T, T, T, F, F, F], [F, F, F, T, T, T, T, F], ])
+        mask
+        == np.array(
+            [
+                [F, T, T, T, T, F, F, F],
+                [F, F, F, T, T, T, T, F],
+            ]
+        )
     )
 
     result, mask = inference_masks(5, 8, 2)
-    assert np.all(result == np.array([[T, T, T, T, T], ]))
+    assert np.all(
+        result
+        == np.array(
+            [
+                [T, T, T, T, T],
+            ]
+        )
+    )
     assert np.all(mask == np.array([[F, T, T, T, F]]))
 
     result, mask = inference_masks(10, 9, 3)
     assert np.all(
         result
         == np.array(
-            [[T, T, T, T, T, T, T, T, F, T], [T, F, T, T, T, T, T, T, T, T], ]
+            [
+                [T, T, T, T, T, T, T, T, F, T],
+                [T, F, T, T, T, T, T, T, T, T],
+            ]
         )
     )
     assert np.all(
-        mask
-        == np.array([[F, T, T, T, T, F, F, F, F], [F, F, F, F, T, T, T, T, F]])
+        mask == np.array([[F, T, T, T, T, F, F, F, F], [F, F, F, F, T, T, T, T, F]])
     )
 
 
@@ -77,9 +95,19 @@ def test_combine_mask():
     assert np.all(
         result
         == np.array(
-            [[F, T, T, T, T, F, F, F, F, F], [F, F, F, F, F, T, T, T, T, F], ]
+            [
+                [F, T, T, T, T, F, F, F, F, F],
+                [F, F, F, F, F, T, T, T, T, F],
+            ]
         )
     )
 
     result = combine_masks(5, 8, 2)
-    assert np.all(result == np.array([[F, T, T, T, F], ]))
+    assert np.all(
+        result
+        == np.array(
+            [
+                [F, T, T, T, F],
+            ]
+        )
+    )
diff --git a/worker.py b/worker.py
index 6cda4a2..0e491d3 100644
--- a/worker.py
+++ b/worker.py
@@ -13,36 +13,42 @@ import argparse
 
 from punctuator.punctuator import Punctuator
 
+
 class Worker(nlp_ws.NLPWorker):
     def init(self):
         config = configparser.ConfigParser()
         config.read("config.ini")
         config = config["deployment"]
-        
+
         self.punctuator = Punctuator(config)
 
-    def process(
-        self, input_path: str, task_options: dict, output_path: str
-    ) -> None:
+    def process(self, input_path: str, task_options: dict, output_path: str) -> None:
         self.punctuator.process(input_path, task_options, output_path)
-        
+
+
 def perform_test():
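     """Run the punctuator over fixed sample files for each language."""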
     config = configparser.ConfigParser()
     config.read("config.ini")
     config = config["deployment"]
-    
+
     punctuator = Punctuator(config)
-    punctuator.process("/samba/input/pl.txt", {"language": "pl"}, "/samba/output/pl.txt")
-    punctuator.process("/samba/input/en.txt", {"language": "en"}, "/samba/output/en.txt")
-    punctuator.process("/samba/input/ru.txt", {"language": "ru"}, "/samba/output/ru.txt")
+    punctuator.process(
+        "/samba/input/pl.txt", {"language": "pl"}, "/samba/output/pl.txt"
+    )
+    punctuator.process(
+        "/samba/input/en.txt", {"language": "en"}, "/samba/output/en.txt"
+    )
+    punctuator.process(
+        "/samba/input/ru.txt", {"language": "ru"}, "/samba/output/ru.txt"
+    )
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--test", action="store_true", help="run smoke test")
     args = parser.parse_args()
-    
+
     if args.test:
         perform_test()
     else:
         nlp_ws.NLPService.main(Worker)
-
-- 
GitLab


From a05497817dd437cf584782f3fe87b576f07f6763 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl>
Date: Wed, 13 Oct 2021 17:29:41 +0200
Subject: [PATCH 6/6] Pep8 fixes

---
 .dockerignore            |  3 ++-
 punctuator/punctuator.py | 15 ++++-----------
 worker.py                |  8 +-------
 3 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/.dockerignore b/.dockerignore
index 73cf2f1..36d93c8 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,2 +1,3 @@
 models
-example_texts
\ No newline at end of file
+example_texts
+.tox
\ No newline at end of file
diff --git a/punctuator/punctuator.py b/punctuator/punctuator.py
index 4fbb2dc..a5fb967 100644
--- a/punctuator/punctuator.py
+++ b/punctuator/punctuator.py
@@ -1,21 +1,14 @@
 """Implementation of punctuator service"""
 
-import configparser
 import json
-import string
 import os
+import string
 
-import nlp_ws
-from transformers import AutoModelForTokenClassification, AutoTokenizer
 import language_tool_python
+from transformers import AutoModelForTokenClassification, AutoTokenizer
 
-
-from punctuator.utils import (
-    combine_masks,
-    decode,
-    decode_labels,
-    inference_masks,
-)
+from punctuator.utils import (combine_masks, decode, decode_labels,
+                              inference_masks)
 
 
 def _preprocess_input(text: str):
diff --git a/worker.py b/worker.py
index 0e491d3..6bb8c0c 100644
--- a/worker.py
+++ b/worker.py
@@ -1,15 +1,9 @@
 """Implementation of punctuator service"""
 
+import argparse
 import configparser
-import json
-import string
-import os
 
 import nlp_ws
-from transformers import AutoModelForTokenClassification, AutoTokenizer
-import language_tool_python
-import argparse
-
 
 from punctuator.punctuator import Punctuator
 
-- 
GitLab