From 7f9305d54e3c1643eac89c056650b637c694a8bc Mon Sep 17 00:00:00 2001
From: Michal Pogoda <michalpogoda@hotmail.com>
Date: Thu, 11 Mar 2021 12:21:06 +0100
Subject: [PATCH] Update readme & refactor train.sh

---
 README.md | 39 ++++++++++++++++++++++++++++++++++++++
 train.sh  | 56 ++++++++++++++++++++-----------------------------------
 2 files changed, 59 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 3b17747..234cb25 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,49 @@
 # Bert Document Classifier
+
+## Quickstart
+For convenience, dummy datasets are provided in the `example_dataset` folder. To start, we will use the `example_dataset/text/split` dataset.
+
+#### Training
+Go to the project root directory and execute:
+```bash
+./train.sh --dataset_dir example_dataset/text/split --output_dir trained_model
+```
 
 ## Training
 ```bash
 ./train.sh --dataset_dir <dataset dir> --output_dir <dir where final model will be stored> [Training Options]
 ```
+
+## Training parameters
+### Required
+- `dataset_dir` - Location of the training dataset
+- `output_dir` - Location where the trained model and other outputs will be stored
+
+### Optional
+- `base_model` - Name of the base Bert model (see https://huggingface.co/models). Defaults to `dkleczek/bert-base-polish-cased-v1`
+- `truncation` - Which side to cut off when a text is too long to fit within 512 tokens. Can be one of ['front', 'end']. Defaults to 'end'
+- `valid_frac` - Fraction of the dataset used for validation. Defaults to 0.25
+- `test_frac` - Fraction of the dataset used for final testing. Defaults to 0.1. Used only when `dataset_splitted` is set to False
+- `project_name` - Name of the project. Used for wandb logging
+- `experiment_name` - String that uniquely identifies the experiment. Used for wandb logging
+- `learning_rate` - Learning rate. Defaults to 1e-5
+- `classificator_size` - Hidden dimension of the final MLP layer. Defaults to 782
+- `dropout` - Dropout fraction. Defaults to 0.1 (deprecated)
+- `freeze_embeddings` - Whether to freeze the pretrained token -> vector embeddings. Defaults to True
+- `max_epochs` - Maximum number of epochs to train. Defaults to 20
+- `accum_steps` - Number of gradient accumulation steps. Effectively multiplies `batch_size`. Defaults to 1 (no accumulation)
+- `batch_size` - Batch size. Defaults to 10
+- `layers_frozen` - Number of layers frozen during fine-tuning. E.g. a value of 4 means the first 4 layers of Bert are frozen and only the final 8 are fine-tuned. Defaults to 0
+- `early_stopping` - Whether to stop training when the validation F1 score does not improve for several consecutive epochs. Defaults to True
+- `pooling` - Type of pooling applied between Bert and the MLP. Can be one of ['cls', 'max', 'mean']. Defaults to 'cls'
+- `checkpointing` - Whether to save checkpoints during training. Defaults to True
+- `num_workers` - Number of CPU workers feeding data during training. Defaults to 4
+- `weighted_sampling` - Whether to upsample examples based on class occurrence counts. Defaults to True
+- `fast_dev_run` - Train on only 1 batch. Used only for debugging. Defaults to False
+- `cache_dir` - Directory where pretrained models will be cached. Optional
+- `wandb` - Whether to use wandb for logging
+- `dataset_type` - Format of the dataset. Can be one of ['sqlite3', 'text', 'text_raw'] (see the dataset format section). Defaults to 'text'
+- `dataset_splitted` - Whether the dataset is already split into train/test (see the dataset format section). Defaults to False
+- `num_gpus` - Number of GPUs used during training. Defaults to 0
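+
+For example, an illustrative run combining several of the options above might look like the following (the project and experiment names here are only placeholders):
+```bash
+./train.sh --dataset_dir example_dataset/text/split --output_dir trained_model \
+    --batch_size 10 --accum_steps 4 \
+    --pooling mean --layers_frozen 4 \
+    --wandb True --project_name doc_classification --experiment_name bert_baseline_001
+```
+Note that `--wandb True` only forwards the `WANDB_API_KEY` variable from the host environment into the training container (see `train.sh`), so export it before training.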
 
 ## Dataset format
 The Bert trainer accepts datasets in SQLite and Text formats. Both can optionally be pre-divided into train/test pairs.
diff --git a/train.sh b/train.sh
index 5c4d97c..361dc9e 100755
--- a/train.sh
+++ b/train.sh
@@ -1,32 +1,47 @@
 #!/bin/bash
 # Usage: train.sh --dataset_dir <path_to_dataset> --output_dir <output_path> --gpu <gpu_id> [additional options]
+command=(docker run)
+docker_params=(
+    -v $(pwd)/bert_document_classifier:/home/clarin/workspace/bert_document_classifier:ro
+    --rm
+)
 script_params=()
+
 while test $# -gt 0; do
     case "$1" in
     --dataset_dir)
         shift
-        dataset_dir=$1
+        docker_params+=(-v $(realpath $1):/home/clarin/workspace/dataset:ro)
+        script_params+=(--dataset_dir /home/clarin/workspace/dataset)
         shift
         ;;
     --output_dir)
         shift
-        output_dir=$1
+        mkdir -p $1
+        docker_params+=(-v $(realpath $1):/home/clarin/workspace/output)
+        script_params+=(--output_dir /home/clarin/workspace/output)
         shift
         ;;
     --cache_dir)
         shift
-        cache_dir=$1
+        docker_params+=(-v $(realpath $1):/home/clarin/workspace/cache/)
+        script_params+=(--cache_dir /home/clarin/workspace/cache/)
+        mkdir -p $1
         shift
         ;;
     --wandb)
         shift
-        use_wandb=$1
+        if [ "$1" = "True" ]; then
+            docker_params+=(-e WANDB_API_KEY)
+            script_params+=(--wandb True)
+        fi
         shift
         ;;
     --gpu)
         shift
-        gpu=$1
+        docker_params+=(--gpus device=$1)
+        script_params+=(--num_gpus 1)
         shift
         ;;
     *)
@@ -36,37 +51,6 @@ while test $# -gt 0; do
     esac
 done
 
-docker_params=(
-    -v $(pwd)/bert_document_classifier:/home/clarin/workspace/bert_document_classifier:ro
-    -v $(realpath $dataset_dir):/home/clarin/workspace/dataset:ro
-    -v $(realpath $output_dir):/home/clarin/workspace/output
-    --rm
-)
-
-script_params+=(
-    --dataset_dir /home/clarin/workspace/dataset
-    --output_dir /home/clarin/workspace/output
-)
-
-command=(docker run)
-
-if [ ! -z "$cache_dir" ]; then
-    docker_params+=(-v $(realpath $cache_dir):/home/clarin/workspace/cache/)
-    script_params+=(--cache_dir /home/clarin/workspace/cache/)
-    mkdir -p $cache_dir
-fi
-
-if [ "$use_wandb" = "True" ]; then
-    docker_params+=(-e WANDB_API_KEY)
-    script_params+=(--wandb True)
-fi
-
-if [ ! -z "$gpu" ]; then
-    docker_params+=(--gpus device=$gpu)
-    script_params+=(--num_gpus 1)
-fi
-
-mkdir -p $output_dir
 echo "Building image..."
 ${command[@]} -it \
-- 
GitLab
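For reference, a rough sketch of what the refactored `train.sh` assembles for a typical call (the host paths below are hypothetical, and the image build/run line itself sits outside this hunk):
```bash
# Hypothetical host paths; --dataset_dir, --output_dir, --cache_dir, --wandb and --gpu
# are the options that train.sh itself intercepts.
./train.sh --dataset_dir /data/corpus --output_dir /runs/exp1 --gpu 0 --wandb True

# This roughly expands to:
#   docker_params: -v $(pwd)/bert_document_classifier:/home/clarin/workspace/bert_document_classifier:ro --rm
#                  -v /data/corpus:/home/clarin/workspace/dataset:ro
#                  -v /runs/exp1:/home/clarin/workspace/output
#                  -e WANDB_API_KEY --gpus device=0
#   script_params: --dataset_dir /home/clarin/workspace/dataset --output_dir /home/clarin/workspace/output
#                  --wandb True --num_gpus 1
```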