Commit 9be427d2 authored by Łukasz Kopociński's avatar Łukasz Kopociński

Cleanup model package

parent 54f1183a
......@@ -2,7 +2,7 @@ tracking_uri: 'http://10.17.50.132:8080'
experiment_name: 'LREC_3.0'
dataset:
dir: './data/vectors'
dir: './semrel/data/data/vectors'
keys: 'elmo.rel.keys'
net_params:
......@@ -13,3 +13,4 @@ learn_params:
runs: 'default'
batch_size: 100
epochs: 50
learning_rate: 0.001
......@@ -9,12 +9,12 @@ import torch.nn as nn
from torch.optim import Adagrad, Optimizer
from torch.utils.data import DataLoader
from semrel.model.runs import RUNS
from semrel.model.scripts.utils import get_loaders
from semrel.model.scripts.utils import Metrics
from semrel.model import runs
from semrel.model.scripts import RelNet
from semrel.model.scripts.utils import parse_config, get_device, \
is_better_loss, ignored
from semrel.model.scripts import RelNet
from semrel.model.scripts.utils.data_loader import get_loaders
from semrel.model.scripts.utils.metrics import Metrics
@click.command()
......@@ -26,21 +26,21 @@ def main(config):
config = parse_config(Path(config))
runs_name = config['learn_params']['runs']
runs = RUNS[runs_name]
train_runs = runs.RUNS[runs_name]
model_name = f'{runs_name}.pt'
mlflow.set_tracking_uri(config['tracking_uri'])
mlflow.set_experiment(config['experiment_name'])
for index, params in runs.items():
for index, params in train_runs.items():
with ignored(Exception):
with mlflow.start_run():
print(f'\nRUN: {index} WITH: {params}')
in_domain = params.get('in_domain')
out_domain = params.get('out_domain')
lexical_split = params.get('lexical_split', False)
methods = params.get('methods', [])
in_domain = params.get(runs.IN_DOMAIN_KEY)
out_domain = params.get(runs.OUT_DOMAIN_KEY)
lexical_split = params.get(runs.LEXICAL_SPLIT_KEY, False)
methods = params.get(runs.METHODS_KEY, [])
mlflow.set_tags({
'in_domain': in_domain,
......@@ -49,13 +49,6 @@ def main(config):
'methods': ', '.join(methods),
})
mlflow.log_params({
'in_domain': in_domain,
'out_domain': out_domain,
'lexical_split': lexical_split,
'methods': ', '.join(methods),
})
loaders = get_loaders(
data_dir=config['dataset']['dir'],
keys_file=config['dataset']['keys'],
......@@ -69,19 +62,23 @@ def main(config):
out_domain=out_domain
)
train_loader, valid_loader, test_loader, vector_size = loaders
network = RelNet(in_dim=vector_size, **config['net_params'])
network = RelNet(
in_dim=loaders.vector_size,
**config['net_params']
)
network = network.to(device)
optimizer = Adagrad(network.parameters(), lr=0.001)
optimizer = Adagrad(
network.parameters(),
lr=config['net_params']['learning_rate']
)
loss_func = nn.CrossEntropyLoss()
# Log learning params
mlflow.log_params({
'train size': len(train_loader.sampler),
'valid size': len(valid_loader.sampler),
'test size': len(test_loader.sampler),
'vector size': vector_size,
'train size': len(loaders.train.sampler),
'valid size': len(loaders.valid.sampler),
'test size': len(loaders.test.sampler),
'vector size': loaders.vector_size,
'optimizer': optimizer.__class__.__name__,
'loss function': loss_func.__class__.__name__,
**config['learn_params'],
......@@ -96,13 +93,13 @@ def main(config):
# Train
train_metrics = train(
network, optimizer, train_loader, loss_func, device
network, optimizer, loaders.train, loss_func, device
)
log_metrics(train_metrics, 'train', epoch)
# Validate
valid_metrics, _ = evaluate(
network, valid_loader, loss_func, device
network, loaders.valid, loss_func, device
)
log_metrics(valid_metrics, 'valid', epoch)
......@@ -113,17 +110,16 @@ def main(config):
mlflow.log_artifact(f'./{model_name}')
# Test
test_network = RelNet(in_dim=vector_size,
**config['net_params'])
test_metrics, test_ner_metrics = test(
test_network, model_name, test_loader, loss_func, device
test_network = RelNet(
in_dim=loaders.vector_size,
**config['net_params']
)
test_metrics = test(
test_network, model_name, loaders.test, loss_func, device
)
print(f'\n\nTest: {test_metrics}')
# print(f'\n\nTest ner: {test_ner_metrics}')
log_metrics(test_metrics, 'test')
# log_metrics(test_ner_metrics, 'test_ner')
def log_metrics(metrics, prefix: str, step: int = 0):
......@@ -143,12 +139,17 @@ def log_metrics(metrics, prefix: str, step: int = 0):
}, step=step)
def train(network: RelNet, optimizer: Optimizer, batches: DataLoader,
loss_function, device: torch.device):
def train(
network: RelNet,
optimizer: Optimizer,
batches: DataLoader,
loss_function,
device: torch.device
) -> Metrics:
metrics = Metrics()
network.train()
for data, labels, _, _ in batches:
for data, labels in batches:
optimizer.zero_grad()
data = data.to(device)
......@@ -158,27 +159,36 @@ def train(network: RelNet, optimizer: Optimizer, batches: DataLoader,
try:
loss = loss_function(output, target)
except IndexError:
print("Output: ", output)
print("Output: ", target)
print(
f'\nOutput: {output}'
f'\nTarget: {target}'
)
continue
loss.backward()
optimizer.step()
metrics.update(output.cpu(), target.cpu(), loss.item(), len(batches))
metrics.update(
predicted=output.cpu(),
targets=target.cpu(),
loss=loss.item(),
batches=len(batches)
)
return metrics
def evaluate(network: RelNet, batches: DataLoader, loss_function,
device: torch.device) -> Metrics:
def evaluate(
network: RelNet,
batches: DataLoader,
loss_function,
device: torch.device
) -> Metrics:
metrics = Metrics()
# ner_metrics = NerMetrics()
ner_metrics = None
network.eval()
with torch.no_grad():
for data, labels, ner_from, ner_to in batches:
for data, labels in batches:
data = data.to(device)
target = labels.to(device)
......@@ -186,19 +196,29 @@ def evaluate(network: RelNet, batches: DataLoader, loss_function,
try:
loss = loss_function(output, target)
except IndexError:
print("Output: ", output)
print("Output: ", target)
print(
f'\nOutput: {output}'
f'\nTarget: {target}'
)
continue
metrics.update(output.cpu(), target.cpu(), loss.item(),
len(batches))
# ner_metrics.append(output.cpu(), target.cpu(), ner_from, ner_to)
metrics.update(
predicted=output.cpu(),
targets=target.cpu(),
loss=loss.item(),
batches=len(batches)
)
return metrics, ner_metrics
return metrics
def test(
        network: RelNet,
        model_path: str,
        batches: DataLoader,
        loss_function,
        device: torch.device
) -> Metrics:
    """Restore saved weights into *network* and evaluate it on *batches*.

    :param network: freshly constructed RelNet to receive the checkpoint
    :param model_path: path to the model state saved during training
    :param batches: DataLoader over the test split
    :param loss_function: criterion forwarded to ``evaluate``
    :param device: device the evaluation runs on
    :return: Metrics accumulated over the test set by ``evaluate``
    """
    network.load(model_path)
    network.to(device)
    return evaluate(network, batches, loss_function, device)
......
import random
from collections import defaultdict
from pathlib import Path
from typing import List, Dict, Set, Tuple
from typing import List, Dict, Set, Tuple, NamedTuple
import torch
from torch.utils import data
from semrel.data.scripts import constant
CHANNELS = ('BRAND_NAME', 'PRODUCT_NAME')
class BrandProductDataset(data.Dataset):
label2digit = {
'no_relation': 0,
'in_relation': 1,
constant.NO_RELATION_LABEL: 0,
constant.IN_RELATION_LABEL: 1,
}
def __init__(self, keys_file: str, vectors_files: List[str]):
......@@ -20,11 +22,13 @@ class BrandProductDataset(data.Dataset):
self.vectors = [torch.load(file) for file in vectors_files]
self.vectors = torch.cat(self.vectors, dim=1)
@classmethod
def _load_keys(cls, path: Path) -> Dict[int, Tuple[str, ...]]:
    """Read the keys file: one tab-separated record per line.

    Note: values are tuples of fields, so the return annotation is
    ``Dict[int, Tuple[str, ...]]`` (the previous ``Dict[int, str]`` was wrong).

    :param path: path to the UTF-8 keys file
    :return: mapping from zero-based line index to the tuple of fields
    """
    with path.open('r', encoding='utf-8') as file:
        return {
            index: tuple(line.strip().split('\t'))
            for index, line in enumerate(file)
        }
@property
def vector_size(self) -> int:
......@@ -36,12 +40,10 @@ class BrandProductDataset(data.Dataset):
def __getitem__(self, index: int):
    """Return the (vector, label-id) pair for sample *index*.

    The label is the first field of the key record and is mapped to an
    integer class id via ``label2digit``.
    """
    key = self.keys[index]
    label = key[0]
    x = self.vectors[index]
    y = self.label2digit[label]
    return x, y
class DatasetGenerator:
......@@ -51,10 +53,14 @@ class DatasetGenerator:
random.seed(random_seed)
def _filter_indices_by_channels(self, indices: Set[int], channels) -> Set:
    """Return the subset of *indices* whose key record mentions one of
    *channels* in either member slot.

    Fields 4 and 9 of a key record hold the channel names of the two
    relation members — assumed from the key layout; TODO confirm against
    the keys-file writer.
    """
    return {
        index
        for index in indices
        if (
            self.dataset_keys[index][4] in channels
            or self.dataset_keys[index][9] in channels
        )
    }
def _split(self, indices) -> Tuple[List, List, List]:
random.shuffle(indices)
......@@ -79,10 +85,16 @@ class DatasetGenerator:
return self._split(indices)
# ok, lets try to balance the data (positives vs negatives)
# 2 cases to cover: i) B-N, P-N, and ii) N-N
positives = {index for index in indices if
self.dataset_keys[index][0] == 'in_relation'}
negatives = {index for index in indices if
self.dataset_keys[index][0] == 'no_relation'}
positives = {
index
for index in indices
if self.dataset_keys[index][0] == constant.IN_RELATION_LABEL
}
negatives = {
index
for index in indices
if self.dataset_keys[index][0] == constant.NO_RELATION_LABEL
}
# take the negatives connected with Bs or Ps
negatives_bps = self._filter_indices_by_channels(negatives, CHANNELS)
......@@ -113,17 +125,19 @@ class DatasetGenerator:
brands_indices = defaultdict(list)
for index in sorted(positives | negatives):
brand = None
if self.dataset_keys[index][4] == 'BRAND_NAME':
if self.dataset_keys[index][4] == constant.BRAND_NAME_KEY:
brand = self.dataset_keys[index][6]
elif self.dataset_keys[index][8] == 'BRAND_NAME':
brand = self.dataset_keys[index][10]
elif self.dataset_keys[index][9] == constant.BRAND_NAME_KEY:
brand = self.dataset_keys[index][11]
else:
nns_and_nps_indices.append(index)
if brand:
brands_indices[brand].append(index)
n_brand_indices = sum(
len(indices) for _, indices in brands_indices.items())
len(indices)
for _, indices in brands_indices.items()
)
# split equally starting from the least frequent brands
counter = 0
......@@ -142,14 +156,16 @@ class DatasetGenerator:
# use held_out indices of type N-N and N-P and split them
# to make our data sets more like 3:1:1
train_indices, valid_indices, test_indices = self._split(
nns_and_nps_indices)
splits = self._split(nns_and_nps_indices)
train_indices, valid_indices, test_indices = splits
train.extend(train_indices)
valid.extend(valid_indices)
test.extend(test_indices)
return train, valid, test
def generate_datasets(
def generate(
self,
balanced: bool,
lexical_split: bool,
......@@ -157,8 +173,11 @@ class DatasetGenerator:
out_domain: str = None
) -> Tuple[List, List, List]:
if in_domain:
indices = [index for index, descriptor in self.dataset_keys.items()
if descriptor[1] == in_domain]
indices = [
index
for index, descriptor in self.dataset_keys.items()
if descriptor[1] == in_domain
]
elif out_domain:
raise NotImplementedError(
f'Out domain dataset split not implemented.')
......@@ -186,26 +205,34 @@ class BaseSampler(data.Sampler):
return len(self.indices)
def get_loaders(data_dir: str,
keys_file: str,
vectors_files: List[str],
batch_size: int,
balanced: bool = False,
lexical_split: bool = False,
in_domain: str = None,
out_domain: str = None,
random_seed: int = 42,
num_workers: int = 0,
pin_memory: bool = False):
class Loaders(NamedTuple):
    """Bundle of the three split DataLoaders plus the dimensionality of the
    input vectors they yield."""
    # DataLoader over the training split
    train: data.DataLoader
    # DataLoader over the validation split
    valid: data.DataLoader
    # DataLoader over the test split
    test: data.DataLoader
    # size of a single input embedding vector
    vector_size: int
def get_loaders(
data_dir: str,
keys_file: str,
vectors_files: List[str],
batch_size: int,
balanced: bool = False,
lexical_split: bool = False,
in_domain: str = None,
out_domain: str = None,
random_seed: int = 42,
num_workers: int = 0,
pin_memory: bool = False
) -> [data.DataLoader, data.DataLoader, data.DataLoader, Dict]:
dataset = BrandProductDataset(
keys_file=f'{data_dir}/{keys_file}',
vectors_files=[f'{data_dir}/{file}' for file in vectors_files],
)
ds_generator = DatasetGenerator(dataset.keys, random_seed)
train_indices, valid_indices, test_indices = ds_generator.generate_datasets(
balanced, lexical_split, in_domain
)
dataset_generator = DatasetGenerator(dataset.keys, random_seed)
indices = dataset_generator.generate(balanced, lexical_split, in_domain)
train_indices, valid_indices, test_indices = indices
train_loader = data.DataLoader(
dataset=dataset,
......@@ -230,4 +257,9 @@ def get_loaders(data_dir: str,
)
return train_loader, valid_loader, test_loader, dataset.vector_size
return Loaders(
train=train_loader,
valid=valid_loader,
test=test_loader,
vector_size=dataset.vector_size
)
......@@ -2,8 +2,11 @@ from typing import List
import numpy as np
import torch
from sklearn.metrics import accuracy_score, precision_score, \
recall_score, f1_score
from sklearn.metrics import \
accuracy_score, \
precision_score, \
recall_score, \
f1_score
class Metrics:
......@@ -15,7 +18,13 @@ class Metrics:
self._predicted = np.array([])
self._targets = np.array([])
def update(self, predicted, targets, loss, batches: int):
def update(
self,
predicted: torch.Tensor,
targets: torch.Tensor,
loss,
batches: int
) -> None:
_, predicted = torch.max(predicted, dim=1)
predicted = predicted.data.numpy()
targets = targets.data.numpy()
......@@ -52,54 +61,3 @@ class Metrics:
f'\n\tPrecision: {self.precision}' \
f'\n\tRecall: {self.recall}' \
f'\n\tFscore: {self.fscore}'
class NerMetrics:
    """Accumulates predictions and targets across batches, masking each
    prediction by named-entity flags: a model prediction is kept only when
    both members of the pair are flagged as named entities; otherwise the
    prediction is forced to class 0.
    """

    def __init__(self):
        # Flat accumulators extended on every `append` call.
        self._predicted = []
        self._targets = []
        self._ner_from = []
        self._ner_to = []
        # Predictions after applying the both-members-are-NER mask.
        self._ner_predicted = []

    def append(self, predicted, targets, ner_from, ner_to):
        # `predicted` carries per-class scores; reduce to the argmax class id.
        _, predicted = torch.max(predicted, dim=1)
        predicted = predicted.data.numpy()
        targets = targets.data.numpy()
        self._predicted = np.append(self._predicted, predicted)
        self._targets = np.append(self._targets, targets)
        self._ner_from.extend(ner_from)
        self._ner_to.extend(ner_to)
        self.predict_ner(predicted, ner_from, ner_to)

    def predict_ner(self, predicted, ner_from, ner_to):
        for ner_from, ner_to, prediction in zip(ner_from, ner_to, predicted):
            # HACK(review): eval() parses what is presumably a 'True'/'False'
            # string flag — confirm and replace with an explicit comparison;
            # eval on data read from files is a security hazard.
            both_ner = eval(ner_from) and eval(ner_to)
            if both_ner:
                self._ner_predicted.append(prediction)
            else:
                # Pair is not NER-NER: force the negative class.
                self._ner_predicted.append(0)

    @property
    def accuracy(self) -> float:
        return accuracy_score(self._targets, self._ner_predicted)

    @property
    def precision(self) -> List[float]:
        # average=None yields one score per class.
        return precision_score(self._targets, self._ner_predicted, average=None)

    @property
    def recall(self) -> List[float]:
        return recall_score(self._targets, self._ner_predicted, average=None)

    @property
    def fscore(self) -> List[float]:
        return f1_score(self._targets, self._ner_predicted, average=None)

    def __str__(self):
        return f'\tAccuracy: {self.accuracy}' \
               f'\n\tPrecision: {self.precision}' \
               f'\n\tRecall: {self.recall}' \
               f'\n\tFscore: {self.fscore}'
......@@ -9,11 +9,11 @@ import yaml
logger = logging.getLogger(__name__)
def is_better_fscore(fscore: List[float], best_fscore: List[float]) -> bool:
    """Return True iff *fscore* strictly beats *best_fscore* on both
    tracked classes (indices 0 and 1) simultaneously.

    Annotation fixed from ``List[int]``: f-scores are floats in [0, 1].
    """
    return fscore[0] > best_fscore[0] and fscore[1] > best_fscore[1]
def is_better_loss(loss: float, best_loss: float) -> bool:
    """Return True iff *loss* improves on *best_loss*.

    A missing best (``None``) always counts as improvable. The previous
    truthiness check (``if best_loss``) wrongly treated a legitimate best
    loss of 0.0 as "no best yet"; test explicitly against None instead.
    """
    if best_loss is None:
        return True
    return loss < best_loss
......@@ -26,9 +26,7 @@ def parse_config(path: Path):
def get_device() -> torch.device:
    """Return the first CUDA device when available, otherwise the CPU."""
    return torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
@contextmanager
......
......@@ -20,7 +20,7 @@ def get_indices(
) -> Tuple[List, List, List, Dict[int, str]]:
keys = BrandProductDataset._load_keys(keys_file)
ds_generator = DatasetGenerator(keys, random_seed)
train, valid, test = ds_generator.generate_datasets(
train, valid, test = ds_generator.generate(
balanced, lexical_split, in_domain
)
return train, valid, test, keys
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment