# NER model training

Specify model directory. There will be 4 files saved there: 
- config.json - json config file specifying model architecture
- char_to_id.json - mapping between characters and ids
- label_to_id.json - mapping between ner tag and ids
- best_model.ckpt - model weights

To demonstrate how to do it, we will use a very small subset of a NER dataset for Polish.

In [1]:
from pathlib import Path

# Target directory for everything this notebook saves (config, vocabularies,
# Lightning logs and checkpoints); it is created later, right before writing.
serialization_directory: Path = Path("./models/notebook_example")

In [2]:
from combo.ner_modules.utils.utils import fix_common_warnings
# Project helper; going by its name it deals with frequently seen warnings.
# NOTE(review): confirm what exactly it filters before relying on it.
fix_common_warnings()

 from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch

# Full experiment configuration. It is handed to the construct_*_from_config
# helpers and to NerModel below, and is also dumped to config.json.
config = {
    "data": {
        # Directory with the example corpus (train.txt is read from it below).
        # Forward slashes keep the path valid on both Windows and POSIX;
        # a backslash path would be a single odd file name on Linux/macOS.
        "path_data": "./example_data",
        "use_char_level_embeddings": True,
        "use_start_end_token": True,
        "tokenize_entities": True,
        "batch_size": 32,
        "encoding": "utf-8",
        "num_workers": 1,
    },
    "model": {
        "bert_embedder": {
            "pretrained_model_name": "allegro/herbert-base-cased",
            "pretrained_model_type": "AutoModel",
            "projection_dimension": None,
            "freeze_bert": True,
            "token_pooling": True,
            "pooling_strategy": "max",
        },
        "char_embedder": {"type": "combo", "char_embedding_dim": 64},
        "classifier": {"type": "vanilla", "to_tag_space": "linear"},
        "dropout": 0,
    },
    "loss": "ce",
    "learning_rate": 0.0007585775750,
    "callbacks": {"FixedProgressBar": True},
    # The entries of this section are forwarded as keyword arguments to pl.Trainer.
    "trainer": {
        # Use GPU index 0 when CUDA is present; otherwise fall back to a single
        # CPU process. The CPU accelerator only accepts a positive integer for
        # `devices`, so the index list [0] must not be combined with "cpu".
        "devices": [0] if torch.cuda.is_available() else 1,
        "max_epochs": 2,
        "accelerator": "cuda" if torch.cuda.is_available() else "cpu",
        "log_every_n_steps": 10,
    },
}

# Training using config file

## create vocabularies

In [4]:
from combo.ner_modules.data.utils import create_tag2id, create_char2id
from pathlib import Path

In [5]:
# Both vocabularies are derived from the same training file, so resolve its
# location once from the data section of the config.
training_data_path = Path(config["data"]["path_data"]) / "train.txt"

# Character vocabulary (character -> id).
char_to_id = create_char2id(file_path=training_data_path)

# Tag vocabulary (NER tag -> id); encoding and the start/end-token switch are
# taken from the config.
label_to_id = create_tag2id(
    file_path=training_data_path,
    encoding=config["data"]["encoding"],
    include_special_tokens=config["data"]["use_start_end_token"],
)

## create tokenizer

In [6]:
from combo.ner_modules.utils.constructors import construct_tokenizer_from_config

In [7]:
# Build the tokenizer from the config together with the character and label
# vocabularies created above.
tokenizer = construct_tokenizer_from_config(config=config,
 char_to_id_map=char_to_id,
 label_to_id_map=label_to_id)

Using model LAMBO-UD_Polish-PDB


## create pytorch lightning datamodule

In [8]:
from combo.ner_modules.utils.constructors import construct_data_module_from_config
# Create the Lightning datamodule from the config; it receives the tokenizer
# built in the previous step and is later passed to trainer.fit / trainer.test.
data_module = construct_data_module_from_config(config=config,
 tokenizer=tokenizer)

## create loss

In [9]:
from combo.ner_modules.utils.constructors import construct_loss_from_config

In [10]:
# Build the loss selected by the config (this notebook sets "loss": "ce");
# the label vocabulary is passed along to the constructor as well.
loss = construct_loss_from_config(config=config,
 label_to_id=label_to_id)

## saving data to serialization directory

In [11]:
# Create the output directory, including missing parents; exist_ok=True makes
# re-running this cell a no-op when the directory is already there.
serialization_directory.mkdir(parents=True, exist_ok=True)

In [12]:
import json

# Everything that has to be stored in the serialization directory, keyed by
# the file name it is written to.
artifacts = {
    "char_to_id.json": char_to_id,
    "label_to_id.json": label_to_id,
    "config.json": config,
}

# Write the files one after another, in the order listed above.
for file_name, payload in artifacts.items():
    with open(serialization_directory / file_name, "w+") as f:
        json.dump(payload, f)

## creating model instance

In [13]:
from combo.ner_modules.NerModel import NerModel 

In [14]:
# Instantiate the model with the loss, both vocabularies and the full config
# defined earlier in this notebook.
model = NerModel(loss_fn=loss,
 char_to_id_map=char_to_id,
 label_to_id_map=label_to_id,
 config=config)

## training

In [15]:
import pytorch_lightning as pl
from combo.ner_modules.utils.constructors import construct_callbacks_from_config

In [16]:
# Work on a copy of the trainer section. The entries added below (a Path and,
# presumably, callback objects) are not JSON-serialisable, so writing them into
# `config` itself would break a later re-run of the cell that dumps config.json
# and would also leak into the config object already handed to the model.
params = dict(config["trainer"])
params["callbacks"] = construct_callbacks_from_config(config.get("callbacks", {}))
# Lightning logs and checkpoints go into the serialization directory.
params["default_root_dir"] = serialization_directory
trainer = pl.Trainer(**params)

# start training
trainer.fit(model, datamodule=data_module)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

 | Name | Type | Params
----------------------------------------------------
0 | bert_embedder | BertEmbedder | 124 M 
1 | char_embedder | ComboCharEmbedder | 546 K 
2 | classifier | VanillaClassifier | 65.8 K
3 | dropout | Dropout | 0 
4 | loss_fn | CrossEntropyLoss | 0 
----------------------------------------------------
612 K Trainable params
124 M Non-trainable params
125 M Total params
500.223 Total estimated model params size (MB)


Epoch 0: 100%|█████████████████████████████████████████████| 20/20 [00:19<00:00, 1.03it/s, v_num=11, train_loss=1.200]
Epoch 1: 100%|█| 20/20 [00:19<00:00, 1.03it/s, v_num=11, train_loss=0.630, validation_loss=0.480, validation_precision
Epoch 1: 100%|█| 20/20 [00:23<00:00, 1.17s/it, v_num=11, train_loss=0.630, validation_loss=0.251, validation_precision

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|█| 20/20 [00:28<00:00, 1.43s/it, v_num=11, train_loss=0.630, validation_loss=0.251, validation_precision


## Evaluate on test data

In [18]:
# Evaluate on the test split of the datamodule. ckpt_path='best' makes
# Lightning restore a saved checkpoint first (see the "Restoring states from
# the checkpoint path" message in the output) instead of using the in-memory weights.
results = trainer.test(verbose=True,
 ckpt_path='best',
 datamodule=data_module)

Restoring states from the checkpoint path at models\notebook_example\lightning_logs\version_11\checkpoints\epoch=2-step=40.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loaded model weights from the checkpoint at models\notebook_example\lightning_logs\version_11\checkpoints\epoch=2-step=40.ckpt


Testing: 0it [00:00, ?it/s] precision recall f1-score support

 nam_adj_country 1.00 0.00 0.00 3
 nam_liv_person 1.00 0.00 0.00 3
 nam_loc_gpe_city 1.00 0.00 0.00 1
 nam_loc_gpe_country 1.00 0.00 0.00 8
 nam_org_company 1.00 0.00 0.00 10
 nam_org_nation 1.00 0.00 0.00 3
nam_org_organization 1.00 0.00 0.00 3
 nam_oth_currency 1.00 0.00 0.00 2
 nam_oth_tech 1.00 0.00 0.00 5
 nam_pro_brand 1.00 0.00 0.00 2
 nam_pro_software 1.00 0.00 0.00 37

 micro avg 1.00 0.00 0.00 77
 macro avg 1.00 0.00 0.00 77
 weighted avg 1.00 0.00 0.00 77



# Training with as little configuration as possible

## create vocabularies

In [19]:
from combo.ner_modules.data.utils import create_tag2id, create_char2id
from pathlib import Path

In [20]:
# Forward slashes keep the path portable: the backslash form only resolves on
# Windows, while on POSIX it would be a single file name containing "\".
training_data_path = Path("./example_data/train.txt")

# Character vocabulary (character -> id) built from the training file.
char_to_id = create_char2id(file_path=training_data_path)

# Tag vocabulary (NER tag -> id); special tokens are included because the
# tokenizer below is created with use_start_end_token=True.
label_to_id = create_tag2id(file_path=training_data_path,
                            encoding="utf-8",
                            include_special_tokens=True)

## create tokenizer

In [21]:
from combo.ner_modules.data.NerTokenizer import NerTokenizer

In [22]:
# Construct the tokenizer directly instead of going through the config helper;
# the values mirror the "data" and "bert_embedder" settings of the config-driven
# section above.
tokenizer = NerTokenizer(pretrained_model_type="AutoModel",
 pretrained_model_name="allegro/herbert-base-cased",
 char_to_id_map=char_to_id,
 label_to_id_map=label_to_id,
 use_char_level_embeddings=True,
 use_start_end_token=True,
 tokenize_entities=True)

Using model LAMBO-UD_Polish-PDB


## create pytorch lightning datamodule

In [23]:
from combo.ner_modules.NerDataModule import NerDataModule

In [24]:
# Directory with the example corpus. A forward-slash path works on Windows and
# POSIX alike, whereas the backslash form only resolves on Windows.
data_path = Path("./example_data")

# Same batch size, encoding and worker count as in the config-driven section.
data_module = NerDataModule(path_data=data_path,
                            tokenizer=tokenizer,
                            batch_size=32,
                            encoding="utf-8",
                            num_workers=1)

## create loss function

In [25]:
import torch

In [26]:
# Plain cross-entropy loss created with PyTorch's default arguments (nothing is
# passed to the constructor).
loss = torch.nn.CrossEntropyLoss()

## create model instance

In [27]:
from combo.ner_modules.NerModel import NerModel

A minimal config should contain information about the model architecture, the learning rate, whether to use start and end tokens, and whether to use character-level embeddings.

In [28]:
# Minimal configuration for NerModel, assembled section by section.
config = {}

# Input-representation switches; they match the arguments the tokenizer was
# created with above.
config["data"] = {
    "use_char_level_embeddings": True,
    "use_start_end_token": True,
}

# Model architecture: BERT embedder, character embedder, classifier, dropout.
config["model"] = {
    "bert_embedder": {
        "pretrained_model_name": "allegro/herbert-base-cased",
        "pretrained_model_type": "AutoModel",
        "projection_dimension": None,
        "freeze_bert": True,
        "token_pooling": True,
        "pooling_strategy": "max",
    },
    "char_embedder": {
        "type": "combo",
        "char_embedding_dim": 64,
    },
    "classifier": {
        "type": "vanilla",
        "to_tag_space": "linear",
    },
    "dropout": 0,
}

# Optimiser step size.
config["learning_rate"] = 0.0007585775750

In [29]:
# Instantiate the model from the hand-built pieces: the plain cross-entropy
# loss, both vocabularies and the minimal config defined above.
model = NerModel(loss_fn=loss,
 char_to_id_map=char_to_id,
 label_to_id_map=label_to_id,
 config=config)

## train

In [30]:
from combo.ner_modules.callbacks.FixedProgressBar import FixedProgressBar
import pytorch_lightning as pl

In [31]:
callbacks = [FixedProgressBar()]

# Pick the hardware the same way the config-driven section does: GPU index 0
# when CUDA is available, otherwise a single CPU process. Hard-coding "cuda"
# makes the cell fail on machines without a GPU, and the CPU accelerator only
# accepts a positive integer for `devices`.
trainer = pl.Trainer(devices=[0] if torch.cuda.is_available() else 1,
                     accelerator="cuda" if torch.cuda.is_available() else "cpu",
                     max_epochs=2,
                     callbacks=callbacks)

# start training
trainer.fit(model, datamodule=data_module)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

 | Name | Type | Params
----------------------------------------------------
0 | bert_embedder | BertEmbedder | 124 M 
1 | char_embedder | ComboCharEmbedder | 546 K 
2 | classifier | VanillaClassifier | 65.8 K
3 | dropout | Dropout | 0 
4 | loss_fn | CrossEntropyLoss | 0 
----------------------------------------------------
612 K Trainable params
124 M Non-trainable params
125 M Total params
500.223 Total estimated model params size (MB)


 

 rank_zero_warn(


Epoch 0: 100%|██████████████████████████████████████████████| 20/20 [00:19<00:00, 1.05it/s, v_num=0, train_loss=1.020]
Epoch 1: 100%|█| 20/20 [00:19<00:00, 1.03it/s, v_num=0, train_loss=0.128, validation_loss=0.448, validation_precision=
Epoch 1: 100%|█| 20/20 [00:23<00:00, 1.17s/it, v_num=0, train_loss=0.128, validation_loss=0.256, validation_precision=

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|█| 20/20 [00:28<00:00, 1.42s/it, v_num=0, train_loss=0.128, validation_loss=0.256, validation_precision=


## evaluate on test data

In [32]:
# Evaluate on the test split. ckpt_path='best' makes Lightning restore a saved
# checkpoint before testing (see the "Restoring states" message in the output).
# The variable is named `results`, matching the config-driven section
# (the original spelling `reults` was a typo).
results = trainer.test(verbose=True,
                       ckpt_path='best',
                       datamodule=data_module)

Restoring states from the checkpoint path at L:\combo-lightning\docs\ner_docs\lightning_logs\version_0\checkpoints\epoch=2-step=40.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loaded model weights from the checkpoint at L:\combo-lightning\docs\ner_docs\lightning_logs\version_0\checkpoints\epoch=2-step=40.ckpt


Testing: 0it [00:00, ?it/s] precision recall f1-score support

 nam_adj_country 1.00 0.00 0.00 3
 nam_liv_person 1.00 0.00 0.00 3
 nam_loc_gpe_city 1.00 0.00 0.00 1
 nam_loc_gpe_country 1.00 0.00 0.00 8
 nam_org_company 1.00 0.00 0.00 10
 nam_org_nation 1.00 0.00 0.00 3
nam_org_organization 1.00 0.00 0.00 3
 nam_oth_currency 1.00 0.00 0.00 2
 nam_oth_tech 1.00 0.00 0.00 5
 nam_pro_brand 1.00 0.00 0.00 2
 nam_pro_software 1.00 0.00 0.00 37

 micro avg 1.00 0.00 0.00 77
 macro avg 1.00 0.00 0.00 77
 weighted avg 1.00 0.00 0.00 77

