From 7f9305d54e3c1643eac89c056650b637c694a8bc Mon Sep 17 00:00:00 2001
From: Michal Pogoda <michalpogoda@hotmail.com>
Date: Thu, 11 Mar 2021 12:21:06 +0100
Subject: [PATCH] Update readme & refactor train.sh

---
 README.md | 39 ++++++++++++++++++++++++++++++++++++++
 train.sh  | 56 ++++++++++++++++++++-----------------------------------
 2 files changed, 59 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 3b17747..234cb25 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,49 @@
 # Bert Document Classifier
+
+## Quickstart
+For convenience, dummy datasets are provided in the `example_dataset` folder. To start, we will use the `example_dataset/text/split` dataset.
+
+#### Training
+Go to the project root directory and execute:
+```bash
+./train.sh --dataset_dir example_dataset/text/split --output_dir trained_model
+```
 
 ## Training
 ```bash
 ./train.sh --dataset_dir <dataset dir> --output_dir <dir where final model will be stored> [Training Options]
 ```
+
+## Training parameters
+### Required
+- `dataset_dir` - Location of the training dataset
+- `output_dir` - Location where the trained model and other outputs will be stored
+
+### Optional
+- `base_model` - Name of the base Bert model (see https://huggingface.co/models). Defaults to `dkleczek/bert-base-polish-cased-v1`
+- `truncation` - Which side to cut off when a text is too long to fit within 512 tokens. Can be one of ['front', 'end']. Defaults to 'end'
+- `valid_frac` - Fraction of the dataset used for validation. Defaults to 0.25
+- `test_frac` - Fraction of the dataset used for final testing. Defaults to 0.1. Used only when `dataset_splitted` is set to False
+- `project_name` - Name of the project. Used for wandb logging
+- `experiment_name` - String that uniquely identifies the experiment. Used for wandb logging
+- `learning_rate` - Learning rate. Defaults to 1e-5
+- `classificator_size` - Hidden dimension of the final MLP layer. Defaults to 782
+- `dropout` - Dropout fraction. Defaults to 0.1 (deprecated)
+- `freeze_embeddings` - Whether to freeze the pretrained token -> vector embeddings. Defaults to True
+- `max_epochs` - Maximum number of epochs to train. Defaults to 20
+- `accum_steps` - Number of gradient accumulation steps. Effectively multiplies `batch_size`. Defaults to 1 (no accumulation)
+- `batch_size` - Batch size. Defaults to 10
+- `layers_frozen` - Number of layers frozen during fine-tuning. E.g. a value of 4 means the first 4 layers of Bert are frozen and only the final 8 are fine-tuned. Defaults to 0
+- `early_stopping` - Whether to stop training when the validation F1 score does not improve for several consecutive epochs. Defaults to True
+- `pooling` - Type of pooling applied between Bert and the MLP. Can be one of ['cls', 'max', 'mean']. Defaults to 'cls'
+- `checkpointing` - Whether to save checkpoints during training. Defaults to True
+- `num_workers` - Number of CPU workers feeding data during training. Defaults to 4
+- `weighted_sampling` - Whether to upsample examples based on class occurrence counts. Defaults to True
+- `fast_dev_run` - Train on only 1 batch. Used only for debugging. Defaults to False
+- `cache_dir` - Directory where pretrained models will be cached. Optional
+- `wandb` - Whether to use wandb for logging
+- `dataset_type` - Format of the dataset. Can be one of ['sqlite3', 'text', 'text_raw'] (see the dataset format section). Defaults to 'text'
+- `dataset_splitted` - Whether the dataset is already split into train/test (see the dataset format section). Defaults to False
+- `num_gpus` - Number of GPUs used during training. Defaults to 0
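+
+For example, an illustrative run combining several of the options above might look like the following (the project and experiment names here are only placeholders):
+```bash
+./train.sh --dataset_dir example_dataset/text/split --output_dir trained_model \
+    --batch_size 10 --accum_steps 4 \
+    --pooling mean --layers_frozen 4 \
+    --wandb True --project_name doc_classification --experiment_name bert_baseline_001
+```
+Note that `--wandb True` only forwards the `WANDB_API_KEY` variable from the host environment into the training container (see `train.sh`), so export it before training.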
 
 ## Dataset format
 The Bert trainer accepts datasets in SQLite and Text formats. Both can optionally be pre-divided into train/test pairs.
diff --git a/train.sh b/train.sh
index 5c4d97c..361dc9e 100755
--- a/train.sh
+++ b/train.sh
@@ -1,32 +1,47 @@
 #!/bin/bash
 # Usage: train.sh --dataset_dir <path_to_dataset> --output_dir <output_path> --gpu <gpu_id> [additional options]
+command=(docker run)
+docker_params=(
+    -v $(pwd)/bert_document_classifier:/home/clarin/workspace/bert_document_classifier:ro
+    --rm
+)
 script_params=()
+
 while test $# -gt 0; do
     case "$1" in
     --dataset_dir)
         shift
-        dataset_dir=$1
+        docker_params+=(-v $(realpath $1):/home/clarin/workspace/dataset:ro)
+        script_params+=(--dataset_dir /home/clarin/workspace/dataset)
         shift
         ;;
     --output_dir)
         shift
-        output_dir=$1
+        mkdir -p $1
+        docker_params+=(-v $(realpath $1):/home/clarin/workspace/output)
+        script_params+=(--output_dir /home/clarin/workspace/output)
         shift
         ;;
     --cache_dir)
         shift
-        cache_dir=$1
+        docker_params+=(-v $(realpath $1):/home/clarin/workspace/cache/)
+        script_params+=(--cache_dir /home/clarin/workspace/cache/)
+        mkdir -p $1
         shift
         ;;
     --wandb)
         shift
-        use_wandb=$1
+        if [ "$1" = "True" ]; then
+            docker_params+=(-e WANDB_API_KEY)
+            script_params+=(--wandb True)
+        fi
         shift
         ;;
     --gpu)
         shift
-        gpu=$1
+        docker_params+=(--gpus device=$1)
+        script_params+=(--num_gpus 1)
         shift
         ;;
     *)
@@ -36,37 +51,6 @@ while test $# -gt 0; do
     esac
 done
 
-docker_params=(
-    -v $(pwd)/bert_document_classifier:/home/clarin/workspace/bert_document_classifier:ro
-    -v $(realpath $dataset_dir):/home/clarin/workspace/dataset:ro
-    -v $(realpath $output_dir):/home/clarin/workspace/output
-    --rm
-)
-
-script_params+=(
-    --dataset_dir /home/clarin/workspace/dataset
-    --output_dir /home/clarin/workspace/output
-)
-
-command=(docker run)
-
-if [ ! -z "$cache_dir" ]; then
-    docker_params+=(-v $(realpath $cache_dir):/home/clarin/workspace/cache/)
-    script_params+=(--cache_dir /home/clarin/workspace/cache/)
-    mkdir -p $cache_dir
-fi
-
-if [ "$use_wandb" = "True" ]; then
-    docker_params+=(-e WANDB_API_KEY)
-    script_params+=(--wandb True)
-fi
-
-if [ ! -z "$gpu" ]; then
-    docker_params+=(--gpus device=$gpu)
-    script_params+=(--num_gpus 1)
-fi
-
-mkdir -p $output_dir
 echo "Building image..."
 ${command[@]} -it \
-- 
GitLab
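For reference, a rough sketch of what the refactored `train.sh` assembles for a typical call (the host paths below are hypothetical, and the image build/run line itself sits outside this hunk):
```bash
# Hypothetical host paths; --dataset_dir, --output_dir, --cache_dir, --wandb and --gpu
# are the options that train.sh itself intercepts.
./train.sh --dataset_dir /data/corpus --output_dir /runs/exp1 --gpu 0 --wandb True

# This roughly expands to:
#   docker_params: -v $(pwd)/bert_document_classifier:/home/clarin/workspace/bert_document_classifier:ro --rm
#                  -v /data/corpus:/home/clarin/workspace/dataset:ro
#                  -v /runs/exp1:/home/clarin/workspace/output
#                  -e WANDB_API_KEY --gpus device=0
#   script_params: --dataset_dir /home/clarin/workspace/dataset --output_dir /home/clarin/workspace/output
#                  --wandb True --num_gpus 1
```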