Skip to content
Snippets Groups Projects
Commit 2067c441 authored by Wiktor Walentynowicz's avatar Wiktor Walentynowicz :construction_worker_tone1:
Browse files

Merge branch '14-base-scripts' into 'develop-biz'

Resolve "Base scripts"

See merge request !34
parents 391df10f bd54df3a
No related branches found
No related tags found
3 merge requests!39Version 0.7.0,!38Version 0.7,!34Resolve "Base scripts"
Pipeline #4732 passed
[model]
device = cpu
gpu_num = 0
path = /mnt/sda/pdn2scripts/nkjp_base
pretrained_path = /mnt/sda/pdn2scripts/roberta_base
[predict]
device = cpu
save_to_file = true
path = /mnt/sda/pdn2scripts/roberta_base
max_seq_len = 100
path_to_save = predict_res.txt
[evaluate]
device = cpu
gpu_num = 0
path = E:/ClarinProjects/nkjp_base
pretrained_path = ./roberta_base
squeeze = false
max_seq_len = 100
hidden_size = 32
dropout = 0.05
[data]
tag_column_index = 3
eval_path = data/coNLL-2003/test.txt
pred_path = tests/resources/text_krakow.txt
[train]
adam_epsilon = 0.1
data_test = data/coNLL-2003/test.txt
data_train = data/coNLL-2003/train.txt
data_tune = data/coNLL-2003/valid.txt
device = cuda
dropout = 0.05
epoch_save_model = True
eval_batch_size = 16
fp16 = false
fp16_opt_level = ''
freeze_model = True
gradient_accumulation_steps = 5
hidden_size = 32
learning_rate = 0.001
max_grad_norm = 5
max_seq_length = 32
num_train_epochs = 100
output_dir = test_res
pretrained_path = /mnt/sda/pdn2scripts/roberta_base
seed = 42
squeeze = true
train_batch_size = 16
training_mix = False
transfer = None
warmup_proportion = 0.3
weight_decay = 0.1
"""Script for evaluating models on a pre-defined set of data."""
import configparser
import os
import time
from poldeepner2.utils.data_utils import NerProcessor, create_dataset, \
convert_examples_to_features
from poldeepner2.utils.train_utils import evaluate_model
def main():
    """Evaluate a NER model on the dataset configured in ``config.cfg``.

    Reads the [evaluate]/[data]/[model] sections, builds the label set from
    the evaluation file, instantiates the encoder selected by the
    ``pretrained_path`` prefix (``hf:``, ``mt5:`` or a local XLM-R
    directory) and prints the F1 score, the classification report and the
    wall-clock evaluation time.
    """
    config = configparser.ConfigParser()
    config.read("config.cfg")

    pretrained_model = config['evaluate']['pretrained_path']
    device = config['evaluate']['device']
    squeeze = config.getboolean('evaluate', 'squeeze')
    tag_column_index = config.getint('data', 'tag_column_index')

    processor = NerProcessor()
    data_path = config['data']['eval_path']
    datasets = [data_path]
    labels_list = processor.get_labels(datasets, tag_column_index)
    # Index 0 is reserved for the IGNORE/padding label.
    num_labels = len(labels_list) + 1

    hidden_size = config.getint('evaluate', 'hidden_size')
    # BUGFIX: dropout was read from [train]; this script evaluates, and the
    # [evaluate] section defines its own dropout value (same 0.05 default).
    dropout = config.getfloat('evaluate', 'dropout')
    # Known encoder variants imply a fixed hidden size; otherwise keep the
    # configured value.
    hidden_size = 1024 if 'large' in pretrained_model \
        else (768 if 'base' in pretrained_model else hidden_size)

    # NOTE(review): the encoder is loaded from [model]pretrained_path while
    # the hidden-size heuristic above inspects [evaluate]pretrained_path —
    # confirm both keys point at the same model.
    pretrained_path = config['model']['pretrained_path']
    if pretrained_path.startswith("hf:"):
        from poldeepner2.model.hf_for_token_calssification \
            import HfModelForTokenClassification
        pretrained_dir = pretrained_path.split(':')[1]
        model = HfModelForTokenClassification(
            pretrained_path=pretrained_dir, n_labels=num_labels,
            hidden_size=hidden_size, dropout_p=dropout, device=device)
    elif pretrained_path.startswith("mt5:"):
        from poldeepner2.model.mt5_for_token_calssification \
            import Mt5ModelForTokenClassification
        variant = pretrained_path.split(':')[1]
        model = Mt5ModelForTokenClassification(
            variant=variant, n_labels=num_labels,
            hidden_size=hidden_size, dropout_p=dropout, device=device)
    else:
        from poldeepner2.model.xlmr_for_token_classification \
            import XLMRForTokenClassification
        pretrained_dir = pretrained_path
        if ":" in pretrained_dir:
            pretrained_dir = pretrained_dir.split(':')[1]
        if not os.path.exists(pretrained_dir):
            raise ValueError("RoBERTa language model not found on path '%s'"
                             % pretrained_dir)
        model = XLMRForTokenClassification(
            pretrained_path=pretrained_dir, n_labels=num_labels,
            hidden_size=hidden_size, dropout_p=dropout, device=device)

    max_seq_len = config.getint('evaluate', 'max_seq_len')
    eval_examples = processor.get_examples(datasets[0], tag_column_index,
                                           'eval')
    eval_features = convert_examples_to_features(
        eval_examples, labels_list, max_seq_len, model.encode_word,
        squeeze=squeeze)
    eval_data = create_dataset(eval_features)

    time_start = time.time()
    # NOTE(review): batch size 16 is hard-coded here; consider reading it
    # from the [evaluate] section.
    f1, report = evaluate_model(model, eval_data, labels_list, 16, device)
    time_end = time.time()
    print(f' f1: {f1}')
    print(f' report {report}')
    print(f'time {time_end - time_start}')


if __name__ == "__main__":
    main()
...@@ -100,7 +100,7 @@ class PolDeepNer2: ...@@ -100,7 +100,7 @@ class PolDeepNer2:
"""A message of shame -- documentation must be completed.""" """A message of shame -- documentation must be completed."""
def __init__(self, model_path: str, def __init__(self, model_path: str,
pretrained_path: str = None, pretrained_path: str,
device="cpu", device="cpu",
squeeze=False, squeeze=False,
max_seq_length=256, max_seq_length=256,
......
...@@ -171,7 +171,7 @@ class NerProcessor: ...@@ -171,7 +171,7 @@ class NerProcessor:
""" """
label_set = set([]) label_set = set([])
for path in paths: for path in paths:
examples = self._create_examples(self._read_file(path), "data") examples = self._create_examples(self._read_iob(path, tag_column_index), "data")
label_set.update(NerProcessor._get_labels(examples)) label_set.update(NerProcessor._get_labels(examples))
return sorted(list(label_set)) return sorted(list(label_set))
...@@ -208,6 +208,34 @@ class NerProcessor: ...@@ -208,6 +208,34 @@ class NerProcessor:
data.append((sentence, label)) data.append((sentence, label))
return data return data
def _read_iob(self, filename, column_index):
    """Read sentences and tags from an IOB/CoNLL-style column file.

    Sentences are separated by blank lines; ``-DOCSTART`` lines are
    skipped. The first whitespace-separated column is the token and
    ``column_index`` selects the tag column.

    Args:
        filename: path to the UTF-8 encoded IOB file.
        column_index: zero-based index of the tag column.

    Returns:
        A list of ``(tokens, tags)`` tuples, one per sentence, where both
        elements are parallel lists of strings.
    """
    data = []
    sentence = []
    label = []
    with open(filename, encoding='utf-8') as f:
        for i, line in enumerate(f, 1):
            line = line.strip('\n')
            # Sentence boundary: document marker or empty line.
            if line.startswith('-DOCSTART') or len(line) == 0:
                if len(sentence) > 0:
                    data.append((sentence, label))
                    sentence = []
                    label = []
                continue
            splits = line.split()
            # BUGFIX: the original only required 2 columns, but the tag is
            # read from splits[column_index] (e.g. index 3), which could
            # raise IndexError on short lines. Guard the actual access.
            assert len(splits) > column_index, \
                "error on line {}. Found {} splits".format(i, len(splits))
            word, tag = splits[0], splits[column_index]
            sentence.append(word)
            label.append(tag)
    # Flush the final sentence when the file does not end with a blank line.
    if len(sentence) > 0:
        data.append((sentence, label))
    return data
def _create_examples(self, lines, set_type): def _create_examples(self, lines, set_type):
"""A message of shame -- documentation must be completed. """A message of shame -- documentation must be completed.
......
...@@ -96,8 +96,77 @@ def add_xlmr_args(parser): ...@@ -96,8 +96,77 @@ def add_xlmr_args(parser):
def evaluate_model(model, eval_dataset, label_list, batch_size, device, def evaluate_model(model, eval_dataset, label_list, batch_size, device,
model_name='Roberta'): model_name='Roberta'):
"""Evaluates an NER model on the eval_dataset provided. """
Evaluates an NER model on the eval_dataset provided.
Returns:
F1_score: Macro-average f1_score on the evaluation dataset.
Report: detailed classification report
"""
# Run prediction for full data
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(
eval_dataset, sampler=eval_sampler, batch_size=batch_size)
model.eval() # turn of dropout
y_true = []
y_pred = []
label_map = {i: label for i, label in enumerate(label_list, 1)}
#print(f'label_list: {label_list}')
#print(f'label_map: {label_map}')
for input_ids, label_ids, l_mask, valid_ids in eval_dataloader:
# print(f'eval_sampler {eval_sampler}')
# print(f'eval_dataset {eval_dataset}')
# print(f'input_ids {input_ids}')
# print(f'label_ids {label_ids}')
input_ids = input_ids.to(device)
label_ids = label_ids.to(device)
valid_ids = valid_ids.to(device)
#print(f'valid_ids {label_ids}')
l_mask = l_mask.to(device)
with torch.no_grad():
if model_name == 'Roberta':
logits = model(input_ids, labels=None, labels_mask=None,
valid_mask=valid_ids)
else:
logits = model(input_ids, return_dict=True).logits
#print(f'logits1: {logits}')
logits = torch.argmax(logits, dim=2)
#print(f'logits11: {logits}')
logits = logits.detach().cpu().numpy()
label_ids = label_ids.cpu().numpy()
for i, cur_label in enumerate(label_ids):
#print(f'i {i}, cur_label:{cur_label}')
temp_1 = []
temp_2 = []
for j, m in enumerate(cur_label):
if valid_ids[i][j]: #and logits[i][j]: # if it's a valid label
temp_1.append(label_map[m])
if logits[i][j]:
index = label_map[logits[i][j]] # for debug
temp_2.append(index)
#assert len(temp_1) == len(temp_2)
y_true.append(temp_1)
y_pred.append(temp_2)
report = classification_report(y_true, y_pred, digits=4)
f1 = f1_score(y_true, y_pred, average='Macro')
return f1, report
def predict_model(model, eval_dataset, label_list, batch_size, device,
report=True):
"""
Evaluates an NER model on the eval_dataset provided.
Returns: Returns:
F1_score: Macro-average f1_score on the evaluation dataset. F1_score: Macro-average f1_score on the evaluation dataset.
Report: detailed classification report Report: detailed classification report
......
"""Script for tagging raw data."""
import configparser
from poldeepner2.models import PolDeepNer2
def main():
    """Tag raw text from the configured input file with PolDeepNer2.

    Reads ``config.cfg``, loads the model named in the [model] section and
    processes every non-empty line of [data]pred_path. Depending on
    [predict]save_to_file, the ``token, label`` pairs are either printed to
    stdout or written to [predict]path_to_save.
    """
    # NOTE(review): original comment (Polish) suggested serializing the
    # config together with the model, as JSON.
    config = configparser.ConfigParser()
    config.read("config.cfg")

    model = config['model']['path']
    pretrained_model = config['model']['pretrained_path']
    ner = PolDeepNer2.load(model=model, pretrained_path=pretrained_model)

    data_path = config['data']['pred_path']
    # BUGFIX: open with an explicit encoding instead of the platform default.
    with open(data_path, encoding='utf-8') as f:
        data = f.readlines()

    if not config.getboolean('predict', 'save_to_file'):
        for sentence in data:
            if sentence != '\n':
                print(sentence)
                for pred in ner.process_text(sentence):
                    print(f'{pred.text}, {pred.label}')
    else:
        # 'w' suffices: the result file is only written, never read back.
        with open(config['predict']['path_to_save'], 'w',
                  encoding='utf-8') as f_res:
            for sentence in data:
                if sentence != '\n':
                    for pred in ner.process_text(sentence):
                        f_res.write(f'{pred.text}, {pred.label}\n')
                else:
                    # Preserve blank lines to keep sentence boundaries.
                    f_res.write('\n')


if __name__ == "__main__":
    main()
[model]
path =
cpu_or_gpu = cpu
gpu_num = 0
[predict]
data_path =
save_to_file = yes
[train]
adam_epsilon =
data_test =
data_train =
data_tune =
device = cuda
dropout = 0.05
epoch_save_model = true
eval_batch_size = 16
fp16 = false
fp16_opt_level =
freeze_model =
gradient_accumulation_steps =
hidden_size = 32
learning_rate = 0.001
max_grad_norm =
max_seq_length = 32
num_train_epochs = 100
output_dir =
pretrained_path =
seed = 42
squeeze =
train_batch_size = 16
training_mix = false
transfer =
warmup_proportion =
weight_decay = 0.1
"""Script for evaluating models on a pre-defined set of data."""
import configparser
from sklearn.metrics import accuracy_score
from poldeepner2.poldeepner2.models import PolDeepNer2
from poldeepner2.poldeepner2.utils.data_utils import NerProcessor
def main():
    """Evaluate a saved PolDeepNer2 model with sentence-level accuracy.

    Loads the model path from ``config.cfg``, tags every example from the
    evaluation data and compares predicted labels against gold labels using
    sklearn's ``accuracy_score``.
    """
    # NOTE(review): original comment (Polish) suggested serializing the
    # config together with the model, as JSON.
    config_file = "config.cfg"
    config = configparser.ConfigParser()
    config.read(config_file)
    model = config['model']['path']
    ner = PolDeepNer2.load(model=model)
    # NOTE(review): other configs in this project define eval_path/pred_path
    # under [data], not 'path' — confirm this key exists in config.cfg.
    data_path = config['data']['path']
    processor = NerProcessor()
    # Prediction
    # NOTE(review): other call sites invoke
    # get_examples(path, tag_column_index, set_type); the single-argument
    # call here looks inconsistent — confirm against NerProcessor.
    data = processor.get_examples(data_path)
    prediction_labels = []
    for sentence in data:
        print(sentence)
        prediction = ner.process_text(sentence)
        print(prediction)
        # predicted label
        # NOTE(review): prediction[2][2] assumes a fixed result shape from
        # process_text — confirm against PolDeepNer2.process_text.
        predict_label = prediction[2][2]
        prediction_labels.append(predict_label)
    # Comparing
    # NOTE(review): elsewhere get_labels(datasets, column) returns the
    # sorted label *set*, not per-example gold labels; comparing it
    # element-wise with predictions via accuracy_score looks wrong — verify.
    true_labels = processor.get_labels(data_path)
    eval_res = accuracy_score(true_labels, prediction_labels)
    print(eval_res)


if __name__ == "__main__":
    try:
        main()
    except ValueError as er:
        print("[ERROR] %s" % er)
"""Script to teach new models compatible with the library."""
import configparser
import logging
import os
import random
import sys
import time
from pathlib import Path
import numpy as np
import torch
from pytorch_transformers import AdamW, WarmupLinearSchedule
from torch.utils.data import DataLoader, RandomSampler
from tqdm import tqdm
from poldeepner2.utils.data_utils import NerProcessor
from poldeepner2.utils.data_utils import create_dataset, \
convert_examples_to_features, save_params
from poldeepner2.utils.train_utils import evaluate_model
def main():
    """Train a token-classification (NER) model according to ``config.cfg``.

    Reads all hyperparameters from the [train] section, builds the label set
    from the train/tune/test files, instantiates the encoder selected by the
    ``pretrained_path`` prefix (``hf:``, ``mt5:`` or a local XLM-R
    directory), then runs the training loop with optional per-epoch
    validation, testing and checkpointing into ``output_dir``.

    Raises:
        ValueError: if ``output_dir`` already exists and is non-empty, if
            ``gradient_accumulation_steps`` < 1, or if a local XLM-R model
            path does not exist.
        ImportError: if fp16 training is requested but apex is missing.
    """
    config = configparser.ConfigParser()
    config.read("config.cfg")

    # ---- hyperparameters -------------------------------------------------
    adam_epsilon = config.getfloat('train', 'adam_epsilon')
    data_test = config['train']['data_test']
    data_train = config['train']['data_train']
    data_tune = config['train']['data_tune']
    device = config['train']['device']
    dropout = config.getfloat('train', 'dropout')
    epoch_save_model = config.getboolean('train', 'epoch_save_model')
    eval_batch_size = config.getint('train', 'eval_batch_size')
    fp16 = config.getboolean('train', 'fp16')
    fp16_opt_level = config['train']['fp16_opt_level']
    freeze_model = config.getboolean('train', 'freeze_model')
    gradient_accumulation_steps = \
        config.getint('train', 'gradient_accumulation_steps')
    hidden_size = config.getint('train', 'hidden_size')
    learning_rate = config.getfloat('train', 'learning_rate')
    max_grad_norm = config.getfloat('train', 'max_grad_norm')
    max_seq_length = config.getint('train', 'max_seq_length')
    num_train_epochs = config.getint('train', 'num_train_epochs')
    output_dir = config['train']['output_dir']
    pretrained_path = config['train']['pretrained_path']
    seed = config.getint('train', 'seed')
    squeeze = config.getboolean('train', 'squeeze')
    train_batch_size = config.getint('train', 'train_batch_size')
    training_mix = config.getboolean('train', 'training_mix')
    # Transfer learning is enabled only when the key is present and not the
    # literal string 'None'.
    use_transfer = 'transfer' in config['train'] and \
        config['train']['transfer'] != 'None'
    transfer = config['train']['transfer'] if use_transfer else None
    warmup_proportion = config.getfloat('train', 'warmup_proportion')
    weight_decay = config.getfloat('train', 'weight_decay')

    # Refuse to clobber an existing experiment directory.
    if os.path.exists(output_dir) and os.listdir(output_dir):
        raise ValueError(
            "Output directory (%s) already exists and is not empty."
            % output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Log both to a file in output_dir and to stdout.
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO,
        filename=Path(output_dir) / "log.txt")
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logger = logging.getLogger(__name__)
    for item in sorted(config.items()):
        logger.info(item)

    if gradient_accumulation_steps < 1:
        # BUGFIX: the original combined a {}-style template with the %
        # operator, which raised TypeError instead of the intended message.
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, "
            "should be >= 1".format(gradient_accumulation_steps))
    # The effective per-step batch size shrinks with accumulation.
    train_batch_size = train_batch_size // gradient_accumulation_steps

    # Seed every RNG in play for reproducibility.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # ---- label set -------------------------------------------------------
    processor = NerProcessor()
    datasets = [data_train]
    if data_tune:
        datasets.append(data_tune)
    if data_test:
        datasets.append(data_test)
    tag_column_index = config.getint('data', 'tag_column_index')
    label_list = processor.get_labels(datasets, tag_column_index)
    logger.info(f"Labels: {label_list}")
    num_labels = len(label_list) + 1  # add one for IGNORE label
    logger.info(f"Number of labels: {num_labels}")

    # ---- training data ---------------------------------------------------
    logger.info("Loading training data...")
    t0 = time.time()
    train_examples = processor.get_examples(data_train, tag_column_index,
                                            "train")
    logger.info(f"Training data was loaded in {time.time() - t0} second(s)")

    # Known encoder variants imply a fixed hidden size; otherwise keep the
    # configured value.
    hidden_size = 1024 if 'large' in pretrained_path \
        else (768 if 'base' in pretrained_path else hidden_size)

    # ---- model -----------------------------------------------------------
    logger.info("Loading pretrained model...")
    t0 = time.time()
    if pretrained_path.startswith("hf:"):
        from poldeepner2.model.hf_for_token_calssification import \
            HfModelForTokenClassification
        pretrained_dir = pretrained_path.split(':')[1]
        model = HfModelForTokenClassification(
            pretrained_path=pretrained_dir, n_labels=num_labels,
            hidden_size=hidden_size, dropout_p=dropout, device=device)
    elif pretrained_path.startswith("mt5:"):
        from poldeepner2.model.mt5_for_token_calssification import \
            Mt5ModelForTokenClassification
        variant = pretrained_path.split(':')[1]
        model = Mt5ModelForTokenClassification(
            variant=variant, n_labels=num_labels, hidden_size=hidden_size,
            dropout_p=dropout, device=device)
    else:
        from poldeepner2.model.xlmr_for_token_classification \
            import XLMRForTokenClassification
        pretrained_dir = pretrained_path
        if ":" in pretrained_dir:
            pretrained_dir = pretrained_dir.split(':')[1]
        if not os.path.exists(pretrained_dir):
            raise ValueError(
                "RoBERTa language model not found on path '%s'"
                % pretrained_dir)
        model = XLMRForTokenClassification(
            pretrained_path=pretrained_dir, n_labels=num_labels,
            hidden_size=hidden_size, dropout_p=dropout, device=device)
    logger.info(f"Pretrained model was loaded in {time.time() - t0} second(s)")

    if use_transfer:
        # Warm-start from a previously trained checkpoint.
        # BUGFIX: close the checkpoint file after loading (the original
        # passed an open() result directly and leaked the handle).
        map_location = 'cpu' if device == "cpu" else None
        with open(os.path.join(transfer, 'model.pt'), 'rb') as checkpoint:
            state_dict = torch.load(checkpoint, map_location=map_location)
        model.load_state_dict(state_dict)

    model.to(device)

    train_features = convert_examples_to_features(
        train_examples, label_list, max_seq_length, model.encode_word,
        squeeze)
    if training_mix:
        # Augment training data with the complementary squeeze setting.
        train_features.extend(convert_examples_to_features(
            train_examples, label_list, max_seq_length, model.encode_word,
            not squeeze))

    num_train_optimization_steps = int(
        len(train_features) / train_batch_size / gradient_accumulation_steps) \
        * num_train_epochs

    # Weight decay is skipped for biases and final-layer-norm weights.
    no_decay = ['bias', 'final_layer_norm.weight']
    params = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in params if not any(
            nd in n for nd in no_decay)], 'weight_decay': weight_decay},
        {'params': [p for n, p in params if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    warmup_steps = int(warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate,
                      eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

    # Freeze encoder weights if requested; only the classification head
    # trains in that case.
    if freeze_model:
        logger.info("Freezing XLM-R model...")
        for n, p in model.named_parameters():
            if 'xlmr' in n and p.requires_grad:
                logging.info("Parameter %s - freezed" % n)
                p.requires_grad = False
            else:
                logging.info("Parameter %s - unchanged" % n)

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex "
                "to use fp16 training.")
        model, optimizer = amp.initialize(
            model, optimizer, opt_level=fp16_opt_level)

    # ---- training loop ---------------------------------------------------
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    train_data = create_dataset(train_features)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=train_batch_size)

    # Prepare validation / test sets once, outside the epoch loop.
    best_val_f1 = 0.0
    if data_tune:
        val_examples = processor.get_examples(data_tune, tag_column_index,
                                              "tune")
        val_features = convert_examples_to_features(
            val_examples, label_list, max_seq_length, model.encode_word,
            squeeze)
        val_data = create_dataset(val_features)
    if data_test:
        eval_examples = processor.get_examples(data_test, tag_column_index,
                                               "test")
        eval_features = convert_examples_to_features(
            eval_examples, label_list, max_seq_length, model.encode_word,
            squeeze)
        eval_data = create_dataset(eval_features)

    for epoch_no in range(1, num_train_epochs + 1):
        epoch_stats = {"epoch": epoch_no}
        logger.info("Epoch %d" % epoch_no)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        model.train()
        steps = len(train_dataloader)
        time_start = time.time()
        for step, batch in tqdm(enumerate(train_dataloader), total=steps):
            batch = tuple(t.to(device) for t in batch)
            input_ids, label_ids, l_mask, valid_ids = batch
            loss = model(input_ids, label_ids, l_mask, valid_ids)
            # Scale the loss so accumulated gradients average correctly.
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_grad_norm)
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            # BUGFIX: store the scalar, not the tensor — keeping the tensor
            # would retain the autograd graph of every step in memory.
            epoch_stats["loss"] = loss.item()
            # NOTE(review): WarmupLinearSchedule comes from
            # pytorch_transformers; confirm it exposes get_last_lr() in the
            # pinned version.
            epoch_stats["learning_rate"] = scheduler.get_last_lr()[0]
            # Step optimizer/scheduler only at accumulation boundaries.
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
            epoch_stats["step"] = step

        if data_tune:
            logger.info("\nTesting on validation set...")
            time_start = time.time()
            f1, report = evaluate_model(model, val_data, label_list,
                                        eval_batch_size, device)
            time_end = time.time()
            epoch_stats["validation_F1"] = f1
            epoch_stats["epoch_validation_time"] = time_end - time_start
            if f1 > best_val_f1:
                # New best model — checkpoint it into output_dir.
                best_val_f1 = f1
                logger.info(
                    "\nFound better f1=%.4f on validation set. Saving model\n"
                    % f1)
                logger.info("%s\n" % report)
                # BUGFIX: pass the path directly so no file handle is leaked.
                torch.save(model.state_dict(),
                           os.path.join(output_dir, 'model.pt'))
                save_params(output_dir, dropout, num_labels, label_list)

        if data_test:
            logger.info("\nTesting on test set...")
            time_start = time.time()
            print(f'len label_list: {len(label_list)}')
            print(f'label_list: {label_list}')
            # Renamed from `f1_score` to avoid shadowing the well-known
            # metric function name.
            test_f1, report = evaluate_model(model, eval_data, label_list,
                                             eval_batch_size, device)
            time_end = time.time()
            epoch_stats["test_F1"] = test_f1
            epoch_stats["epoch_testing_time"] = time_end - time_start
            logger.info("%s\n" % report)

        if epoch_save_model:
            # Per-epoch snapshot under output_dir/eNNN.
            epoch_output_dir = os.path.join(output_dir, "e%03d" % epoch_no)
            os.makedirs(epoch_output_dir)
            torch.save(model.state_dict(),
                       os.path.join(epoch_output_dir, 'model.pt'))
            save_params(epoch_output_dir, dropout, num_labels, label_list)

    # ---- final evaluation ------------------------------------------------
    model.to(device)
    if data_test:
        eval_data = create_dataset(eval_features)
        test_f1, report = evaluate_model(model, eval_data, label_list,
                                         eval_batch_size, device)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(output_dir, "test_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Writing results to file *****")
            writer.write(report)
    logger.info("Done.")


if __name__ == "__main__":
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment