Commit e4c02c2e authored by Mateusz Gniewkowski's avatar Mateusz Gniewkowski

Merge branch 'punctuator_v2' into 'master'

Punctuator v2

See merge request !12
parents 092f2326 f6bb7893
Pipeline #2507 passed with stages in 16 minutes and 26 seconds
data
__pycache__
.devcontainer
.dvc
.idea
.metals
.pytest_cache
.tox
.vscode
checkpoints
dask-worker-space
data
generated
notebooks
tests
deploy
/config.local
/tmp
/cache
[core]
remote = newremote
['remote "newremote"']
url = s3://punctuation/action_based
endpointurl = https://minio.clarin-pl.eu
profile = clarinpl
{
    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
    "data": {
        "values": "<DVC_METRIC_DATA>"
    },
    "title": "<DVC_METRIC_TITLE>",
    "mark": "rect",
    "encoding": {
        "x": {
            "field": "<DVC_METRIC_X>",
            "type": "nominal",
            "sort": "ascending",
            "title": "<DVC_METRIC_X_LABEL>"
        },
        "y": {
            "field": "<DVC_METRIC_Y>",
            "type": "nominal",
            "sort": "ascending",
            "title": "<DVC_METRIC_Y_LABEL>"
        },
        "color": {
            "aggregate": "count",
            "type": "quantitative"
        },
        "facet": {
            "field": "rev",
            "type": "nominal"
        }
    }
}
{
    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
    "data": {
        "values": "<DVC_METRIC_DATA>"
    },
    "title": "<DVC_METRIC_TITLE>",
    "mark": {
        "type": "line"
    },
    "encoding": {
        "x": {
            "field": "<DVC_METRIC_X>",
            "type": "quantitative",
            "title": "<DVC_METRIC_X_LABEL>"
        },
        "y": {
            "field": "<DVC_METRIC_Y>",
            "type": "quantitative",
            "title": "<DVC_METRIC_Y_LABEL>",
            "scale": {
                "zero": false
            }
        },
        "color": {
            "field": "rev",
            "type": "nominal"
        }
    }
}
{
    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
    "data": {
        "values": "<DVC_METRIC_DATA>"
    },
    "title": "<DVC_METRIC_TITLE>",
    "mark": "point",
    "encoding": {
        "x": {
            "field": "<DVC_METRIC_X>",
            "type": "quantitative",
            "title": "<DVC_METRIC_X_LABEL>"
        },
        "y": {
            "field": "<DVC_METRIC_Y>",
            "type": "quantitative",
            "title": "<DVC_METRIC_Y_LABEL>",
            "scale": {
                "zero": false
            }
        },
        "color": {
            "field": "rev",
            "type": "nominal"
        }
    }
}
{
    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
    "data": {
        "values": "<DVC_METRIC_DATA>"
    },
    "title": "<DVC_METRIC_TITLE>",
    "mark": {
        "type": "line"
    },
    "encoding": {
        "x": {
            "field": "<DVC_METRIC_X>",
            "type": "quantitative",
            "title": "<DVC_METRIC_X_LABEL>"
        },
        "y": {
            "field": "<DVC_METRIC_Y>",
            "type": "quantitative",
            "title": "<DVC_METRIC_Y_LABEL>",
            "scale": {
                "zero": false
            }
        },
        "color": {
            "field": "rev",
            "type": "nominal"
        }
    },
    "transform": [
        {
            "loess": "<DVC_METRIC_Y>",
            "on": "<DVC_METRIC_X>",
            "groupby": [
                "rev"
            ],
            "bandwidth": 0.3
        }
    ]
}
dane/**
dataset_simple
dataset_actions
**/dask-worker-space
.vscode
.devcontainer
.idea
.metals
/data
/samba
/.pytest_cache
/.tox
/.vscode
/.env
/model
/config.test.ini
/wandb
__pycache__
.pytest_cache
/checkpoints
.dvc
.tox
notebooks
dvc.lock
dask-worker-space
test_data
.env
deploy
service.log
/notebook.ipynb
@@ -32,8 +32,8 @@ build_image:
   before_script:
     - ''
   script:
-    - docker build -t clarinpl/punctuator .
+    - docker build -t clarinpl/punctuator -f Dockerfile.worker .
     - echo $DOCKER_PASSWORD > pass.txt
     - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
     - rm pass.txt
     - docker push clarinpl/punctuator
[settings]
profile=hug
src_paths=src,test
 FROM clarinpl/cuda-python:3.7
 RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y gcc python3-dev
 RUN mkdir /punctuator
 WORKDIR /punctuator
 COPY requirements.txt requirements.txt
-RUN pip3 install -r requirements.txt && rm requirements.txt
+RUN pip install -r requirements.txt && rm requirements.txt
-COPY src ./src
-COPY config.ini .
-COPY worker.py .
-COPY entrypoint.sh .
-RUN mkdir /workspace
-WORKDIR /workspace
-ENTRYPOINT [ "./entrypoint.sh" ]
+RUN pip3 install --index-url https://pypi.clarin-pl.eu/simple/ nlp_ws==0.6
+WORKDIR /home/worker
+COPY punctuator punctuator
+COPY entrypoint.sh entrypoint.sh
+COPY worker.py worker.py
+COPY config.ini config.ini
+ENTRYPOINT ["bash", "entrypoint.sh"]
@@ -7,62 +7,19 @@ A service that automatically adds punctuation to raw word-stream (e.g. from speech recognition)
**Output**:
> Według webometrycznego rankingu uniwersytetów świata ze stycznia 2019, pokazującego zaangażowanie instytucji akademickich w Internecie, uczelnia zajmuje 5. miejsce w Polsce wśród uczelni technicznych, a na świecie 964. Wśród wszystkich typów uczelni w rankingu szkół wyższych perspektyw Politechnika Wrocławska zajęła w 2019 roku 3. miejsce wśród uczelni technicznych oraz 6. miejsce spośród wszystkich uczelni akademickich w Polsce
## Models
### Action-Based
1. actions_base: A simple model, architecturally based on BERT. It is trained to predict an "action" for each token in the sentence, where an action is either uppercasing the token or adding a punctuation sign at the end of it (a sketch of this encoding follows the list).
2. actions_restricted: A model nearly identical to actions_base; however, it predicts punctuation as a categorical distribution, so that punctuation marks are mutually exclusive at training time. The idea is to better differentiate between the punctuation marks.
3. actions_mixed: A model based on the full transformer (encoder + decoder) architecture. It is much less performant, as it predicts actions for only one word at a time. However, it can model action probabilities conditioned on both the input and the output predicted so far, which makes it much less prone to failing to uppercase the first word of a new sentence or to placing multiple punctuation signs in close proximity.
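
The per-token action encoding can be illustrated with a minimal sketch (hypothetical names and class layout; the repository's actual implementation under `src/pipelines/actions_based` may differ):

```python
from dataclasses import dataclass

@dataclass
class TokenAction:
    """One predicted action per input token (illustrative only)."""
    upper_case: bool  # capitalize the first letter of this token
    punctuation: str  # "" or a mark such as ".", ",", "?" appended to the token

def apply_actions(tokens: list, actions: list) -> str:
    """Rebuild punctuated text from raw tokens and their predicted actions."""
    words = []
    for token, action in zip(tokens, actions):
        word = token.capitalize() if action.upper_case else token
        words.append(word + action.punctuation)
    return " ".join(words)

# apply_actions(["według", "rankingu"], [TokenAction(True, ""), TokenAction(False, ".")])
# -> "Według rankingu."
```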
### Translation
1. translation (deprecated): A full encoder-decoder stack that takes the input (unpunctuated text) and the output produced so far to predict the next token. The main difference from the action-based models is that it is a full text2text model with no restriction on tokens. In theory it can therefore represent more cases (e.g. all-caps words, partial uppercasing, dashes, ellipses, etc.), as opposed to only a few explicitly defined actions. However, the lack of constraints makes it much harder to train, both in compute and in the amount of data required.
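
For intuition, a greedy decoding loop for such a model could look like the sketch below (the `model(input_ids, output_ids)` call signature is an assumption for illustration, not the repository's API):

```python
import torch

def greedy_decode(model, input_ids, bos_id, eos_id, max_len=512):
    """Predict output tokens one at a time, conditioning each step on the
    full input and on everything generated so far."""
    output = [bos_id]
    for _ in range(max_len):
        logits = model(input_ids, torch.tensor([output]))  # (1, len(output), vocab)
        next_id = int(logits[0, -1].argmax())
        if next_id == eos_id:
            break
        output.append(next_id)
    return output[1:]
```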
## Usage
To test the model locally, you can use the `punctuate.py` script.
```bash
punctuate.py [-h] -a {base,restricted,mixed} -d DIRECTORY -i INPUT [-m MODEL] [-l {upper_case,dot,colon,question_mark,none}] [-dv DEVICE]
Evaluate actions model
optional arguments:
-h, --help show this help message and exit
-a {base,restricted,mixed}, --architecture {base,restricted,mixed}
Model architecture
-d DIRECTORY, --directory DIRECTORY
Directory where trained model is located, relative to project root
-i INPUT, --input INPUT
Input text file
-m MODEL, --model MODEL
Pretrained model name
-l {upper_case,dot,colon,question_mark,none}, --highlight {upper_case,dot,colon,question_mark,none}
Highlight prediction confidence of selected action per-word
-dv DEVICE, --device DEVICE
Device on which inference will be made
```
E.g. if you place your model named "production" at `punctuator/checkpoints/actions_base/` and an example unpunctuated text at `punctuator/test_data/test.txt`, you can call
```bash
python3 punctuate.py -a base -d checkpoints/actions_base -i test_data/test.txt -m production -dv cuda:0
```
## Config
```ini
[deployment]
device = cpu ; Device on which inference will be made (e.g. cpu, cuda:0, etc.)
models_dir = deploy ; Relative path to the directory where models will be placed
models_enabled = actions_base,actions_mixed,actions_restricted ; Which models are available
model_path = /model/punctuator ; Path where the model will be placed
max_context_size = 256 ; Number of tokens considered in a single prediction. Must be in the range 2*overlap+1 to 512
overlap = 20 ; Number of tokens of surrounding context used at inference for each text fragment
```
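
A rough sketch of how `max_context_size` and `overlap` can interact at inference time (only the two parameter names come from the config above; the chunking helper itself is hypothetical):

```python
def chunk_indices(num_tokens, max_context_size=256, overlap=20):
    """Yield (start, end) token windows so that consecutive windows share
    `overlap` tokens on each side, giving border tokens surrounding context."""
    assert 2 * overlap + 1 <= max_context_size <= 512
    step = max_context_size - 2 * overlap  # tokens predicted per window
    start = 0
    while start < num_tokens:
        yield max(0, start - overlap), min(num_tokens, start + step + overlap)
        start += step

# list(chunk_indices(600)) -> [(0, 236), (196, 452), (412, 600)]
```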
## LPMN
```
filedir(/users/michal.pogoda)|any2txt|punctuator_test
```
or
```
filedir(/users/michal.pogoda)|any2txt|punctuator_test({"model":"model_name"})
filedir(/users/michal.pogoda)|any2txt|punctuator
```
where `model_name` is one of the models specified in `models_enabled`. If no model is provided or the requested model is unavailable, `actions_base` will be used.
## Mountpoints
The directory where the model will be downloaded (~500 MB) needs to be mounted at `/home/worker/model/punctuator`. Mount `/home/worker/model` to a host directory if you want to make it persistent.
@@ -13,6 +13,7 @@ port = 9981
 local_log_level = INFO
 [deployment]
-device = cpu
-models_dir = deploy
-models_enabled = actions_base,actions_mixed,actions_restricted
+model_path = /home/worker/model/punctuator
 max_context_size = 256
 overlap = 20
+device = cpu
outs:
- md5: 1fa175e752af1638dc896838e82a9d7d.dir
  path: data
FROM clarinpl/cuda-python:3.7
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y gcc python3-dev
RUN mkdir /punctuator
WORKDIR /punctuator
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt && rm requirements.txt
ARG USERNAME=clarin
ARG USER_UID=1000
ARG USER_GID=1000
# Create the user
RUN groupadd --gid $USER_GID $USERNAME \
    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \
    && apt-get update \
    && apt-get install -y sudo \
    && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
    && chmod 0440 /etc/sudoers.d/$USERNAME
ENV PATH="/home/${USERNAME}/.local/bin:${PATH}"
USER ${USERNAME}
../../requirements.txt
#!/bin/bash
wget http://manage.legis.nlp.ipipan.waw.pl/download/ppc-nanno.tar.gz
tar -xvf ppc-nanno.tar.gz
rm ppc-nanno.tar.gz
stages:
  ######################
  #       Action       #
  ######################
  actions_extraction:
    cmd: python3 -m src.pipelines.actions_based.stage1_extraction
    deps:
      - data
      - src/pipelines/actions_based/stage1_extraction.py
    params:
      - actions.extraction.num_partitions
    outs:
      - generated/actions/stage1_extraction
  actions_tokenization:
    cmd: python3 -m src.pipelines.actions_based.stage2_tokenization
    deps:
      - generated/actions/stage1_extraction
      - src
    params:
      - actions.tokenization.max_tokens
      - actions.tokenization.min_tokens
      - global.base_model
    outs:
      - generated/actions/stage2_tokenization
  actions_exploding:
    cmd: python3 -m src.pipelines.actions_based.stage3_exploding
    deps:
      - generated/actions/stage2_tokenization
      - src
    outs:
      - generated/actions/stage3_exploding
  actions_reindexing:
    cmd: python3 -m src.pipelines.actions_based.stage4_reindexing
    deps:
      - generated/actions/stage3_exploding
      - src
    outs:
      - generated/actions/stage4_reindexing
  actions_stats:
    cmd: python3 -m src.pipelines.actions_based.stage5_stats
    deps:
      - generated/actions/stage4_reindexing
      - src
    outs:
      - generated/actions/stage5_stats
  # Base
  actions_base_training:
    cmd: python3 -m src.pipelines.actions_based.train_base
    deps:
      - generated/actions/stage4_reindexing
      - generated/actions/stage5_stats
      - src
    params:
      - global.base_model
      - global.random_seed
      - actions.training_base.max_training_time
      - actions.training_base.learning_rate
      - actions.training_base.num_epochs
      - actions.training_base.batch_size
      - actions.training_base.save_step
    outs:
      - checkpoints/actions_base
  actions_base_testing:
    cmd: python3 -m src.pipelines.actions_based.test -a base -d checkpoints/actions_base/ -m "final" -ds generated/actions/stage4_reindexing/ -o generated/actions/test_results_base -s testing_base
    deps:
      - checkpoints/actions_base
      - generated/actions/stage4_reindexing
      - src
    params:
      - actions.testing_base.limit
    outs:
      - generated/actions/test_results_base
  # Restricted
  actions_restricted_training:
    cmd: python3 -m src.pipelines.actions_based.train_restricted
    deps:
      - generated/actions/stage4_reindexing
      - generated/actions/stage5_stats
      - src
    params:
      - global.base_model
      - global.random_seed
      - actions.training_restricted.max_training_time
      - actions.training_restricted.learning_rate
      - actions.training_restricted.num_epochs
      - actions.training_restricted.batch_size
      - actions.training_restricted.save_step
    outs:
      - checkpoints/actions_restricted
  actions_restricted_testing:
    cmd: python3 -m src.pipelines.actions_based.test -a restricted -d checkpoints/actions_restricted/ -m "final" -ds generated/actions/stage4_reindexing/ -o generated/actions/test_results_restricted -s testing_restricted
    deps:
      - checkpoints/actions_restricted
      - generated/actions/stage4_reindexing
      - src
    params:
      - actions.testing_restricted.limit
    outs:
      - generated/actions/test_results_restricted
  # Mixed
  actions_mixed_training:
    cmd: python3 -m src.pipelines.actions_based.train_mixed
    deps:
      - generated/actions/stage4_reindexing
      - generated/actions/stage5_stats
      - src
    params:
      - global.base_model
      - global.random_seed
      - actions.training_mixed.embedding_size
      - actions.training_mixed.num_heads
      - actions.training_mixed.num_layers
      - actions.training_mixed.dropout
      - actions.training_mixed.feedforward_neurons
      - actions.training_mixed.max_training_time
      - actions.training_mixed.learning_rate
      - actions.training_mixed.num_epochs
      - actions.training_mixed.batch_size
      - actions.training_mixed.save_step
    outs:
      - checkpoints/actions_mixed
  actions_mixed_testing:
    cmd: python3 -m src.pipelines.actions_based.test -a mixed -d checkpoints/actions_mixed/ -m "final" -ds generated/actions/stage4_reindexing/ -o generated/actions/test_results_mixed -s testing_mixed
    deps:
      - checkpoints/actions_mixed
      - generated/actions/stage4_reindexing
      - src
    params:
      - actions.testing_mixed.limit
    outs:
      - generated/actions/test_results_mixed
  ######################
  #    Translation     #
  ######################
  translations_extraction:
    cmd: python3 -m src.pipelines.translation_based.stage1_extraction
    deps:
      - data
    params:
      - translations.extraction.num_partitions
    outs:
      - generated/translations/stage1_extraction
  translations_create_batches:
    cmd: python3 -m src.pipelines.translation_based.stage2_create_batches
    deps:
      - generated/translations/stage1_extraction
    params:
      - global.base_model
    outs:
      - generated/translations/stage2_create_batches
  translations_exploding:
    cmd: python3 -m src.pipelines.translation_based.stage3_exploding
    deps:
      - generated/translations/stage2_create_batches
    outs:
      - generated/translations/stage3_exploding
  translations_reindexing:
    cmd: python3 -m src.pipelines.translation_based.stage4_reindexing
    deps:
      - generated/translations/stage3_exploding
    outs:
      - generated/translations/stage4_reindexing
  translations_training:
    cmd: python3 -m src.pipelines.translation_based.train
    deps:
      - generated/translations/stage4_reindexing
      - src/pipelines/translation_based/train.py
    params:
      - global.base_model
      - global.random_seed
      - translations.training.max_training_time
      - translations.training.learning_rate
      - translations.training.num_epochs
      - translations.training.batch_size
      - translations.training.save_step
    outs:
      - checkpoints/translations
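The `params:` entries above refer to keys in `params.yaml` (listed further below). A minimal sketch of how a pipeline stage might read them, assuming PyYAML (the repository's actual loader may differ):

```python
import yaml

with open("params.yaml") as f:
    params = yaml.safe_load(f)

base_model = params["global"]["base_model"]  # "dkleczek/bert-base-polish-cased-v1"
num_partitions = params["actions"]["extraction"]["num_partitions"]
```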
 #!/bin/bash
-if ! test -d "./deploy/actions_base"; then
-    mkdir -p ./deploy/actions_base
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_base/production.model -O deploy/actions_base/production.model
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_base/production.config -O deploy/actions_base/production.config
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_base/production.runtime.yaml -O deploy/actions_base/production.runtime.yaml
+if ! test -d "/home/worker/model/punctuator"; then
+    mkdir -p /home/worker/model/punctuator
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/pytorch_model.bin -O /home/worker/model/punctuator/pytorch_model.bin
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/vocab.txt -O /home/worker/model/punctuator/vocab.txt
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/config.json -O /home/worker/model/punctuator/config.json
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/tokenizer_config.json -O /home/worker/model/punctuator/tokenizer_config.json
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/special_tokens_map.json -O /home/worker/model/punctuator/special_tokens_map.json
+    wget https://minio.clarin-pl.eu/public/models/punctuator/model/classes.json -O /home/worker/model/punctuator/classes.json
 fi
-if ! test -d "./deploy/actions_mixed"; then
-    mkdir -p ./deploy/actions_mixed
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_mixed/production.model -O deploy/actions_mixed/production.model
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_mixed/production.config -O deploy/actions_mixed/production.config
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_mixed/production.runtime.yaml -O deploy/actions_mixed/production.runtime.yaml
-fi
-if ! test -d "./deploy/actions_restricted"; then
-    mkdir -p ./deploy/actions_restricted
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_restricted/production.model -O deploy/actions_restricted/production.model
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_restricted/production.config -O deploy/actions_restricted/production.config
-    wget https://minio.clarin-pl.eu/public/models/punctuation/actions_restricted/production.runtime.yaml -O deploy/actions_restricted/production.runtime.yaml
-fi
 python worker.py
*
!.gitignore
/stage1_extraction
/stage2_tokenization
/stage3_exploding
/stage4_reindexing
/stage5_stats
/stage1_extraction
/stage2_create_batches
/stage3_exploding
/stage4_reindexing
global:
  dashboard_port: 8787
  base_model: "dkleczek/bert-base-polish-cased-v1"
  random_seed: 44
actions:
  extraction:
    num_partitions: 2_000
    num_workers: 24
    worker_memory_limit: "2GB"
  tokenization:
    min_tokens: 10