Commit c7327132 authored by Maja Jablonska

Add CLI parameters

parent 47655bce
1 merge request: !46 Merge COMBO 3.0 into master
@@ -23,6 +23,7 @@ from combo.nn import utils
from combo.nn.utils import get_text_field_mask
from combo.predictors import Predictor
from combo.utils import metrics
from utils import ConfigurationError
@Registry.register("semantic_multitask")

@@ -165,7 +166,10 @@ class ComboModel(Model, FromParameters):
        if self.morphological_feat:
            mapped_gold_labels = []
            for _, cat_indices in self.morphological_feat.slices.items():
                try:
                    mapped_gold_labels.append(feats[:, :, cat_indices].argmax(dim=-1))
                except TypeError:
                    raise ConfigurationError('Feats is None - if no feats are provided, the morphological_feat property should be set to None.')
            feats = torch.stack(mapped_gold_labels, dim=-1)
@@ -184,11 +188,11 @@ class ComboModel(Model, FromParameters):
        relations_loss, head_loss = parser_output["loss"]
        enhanced_relations_loss, enhanced_head_loss = enhanced_parser_output["loss"]
        losses = {
-            "upostag_loss": upos_output["loss"],
+            "upostag_loss": upos_output.get("loss"),
-            "xpostag_loss": xpos_output["loss"],
+            "xpostag_loss": xpos_output.get("loss"),
-            "semrel_loss": semrel_output["loss"],
+            "semrel_loss": semrel_output.get("loss"),
-            "feats_loss": morpho_output["loss"],
+            "feats_loss": morpho_output.get("loss"),
-            "lemma_loss": lemma_output["loss"],
+            "lemma_loss": lemma_output.get("loss"),
            "head_loss": head_loss,
            "deprel_loss": relations_loss,
            "enhanced_head_loss": enhanced_head_loss,
...
-from .from_parameters import FromParameters, resolve
+from .from_parameters import FromParameters, override_parameters, resolve
from .registry import Registry

@@ -19,7 +19,6 @@ def get_matching_arguments(args: Dict[str, Any], func: Callable) -> Dict[str, An
def _resolve(values: typing.Union[Dict[str, Any], str], pass_down_parameters: Dict[str, Any] = None) -> Any:
    if isinstance(values, Params):
        values = Params.as_dict()

@@ -166,3 +165,59 @@ def resolve(parameters: Dict[str, Any], pass_down_parameters: Dict[str, Any] = N
    pass_down_parameters = pass_down_parameters or {}
    clz, clz_init = Registry.resolve(parameters['type'])
    return clz.from_parameters(parameters['parameters'], clz_init, pass_down_parameters)

def flatten_dictionary(d, parent_key='', sep='/'):
    """
    Flatten a nested dictionary.

    Parameters:
    d (dict): The input dictionary.
    parent_key (str): The parent key to use for recursion (default is an empty string).
    sep (str): The separator to use when concatenating keys (default is '/').

    Returns:
    dict: A flattened dictionary.
    """
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dictionary(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


def unflatten_dictionary(flat_dict, sep='/'):
    """
    Unflatten a flattened dictionary.

    Parameters:
    flat_dict (dict): The flattened dictionary.
    sep (str): The separator used in the flattened keys (default is '/').

    Returns:
    dict: The unflattened dictionary.
    """
    unflattened_dict = {}
    for key, value in flat_dict.items():
        keys = key.split(sep)
        current_level = unflattened_dict
        for k in keys[:-1]:
            current_level = current_level.setdefault(k, {})
        current_level[keys[-1]] = value
    return unflattened_dict


def override_parameters(parameters: Dict[str, Any], override_values: Dict[str, Any]) -> Dict[str, Any]:
    overriden_parameters = flatten_dictionary(parameters)
    override_values = flatten_dictionary(override_values)
    for ko, vo in override_values.items():
        if ko in overriden_parameters:
            overriden_parameters[ko] = vo
    return unflatten_dictionary(overriden_parameters)
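
For illustration (this example is not part of the commit), a minimal sketch of how the three helpers compose: `override_parameters` flattens both dictionaries, replaces only the keys that already exist in the original parameters, and unflattens the result. The import path follows the test file further below; the dictionaries are invented.

``` python
from combo.config.from_parameters import override_parameters

params = {
    'type': 'base_vocabulary',
    'parameters': {
        'counter': {'counter': {'test': 0}},
        'max_vocab_size': 10
    }
}
overrides = {'parameters': {'max_vocab_size': 15}}

# Only 'parameters/max_vocab_size' exists in the flattened params, so only it is replaced;
# keys absent from the original configuration are ignored.
print(override_parameters(params, overrides))
# {'type': 'base_vocabulary', 'parameters': {'counter': {'counter': {'test': 0}}, 'max_vocab_size': 15}}
```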
@@ -18,20 +18,28 @@ from combo.default_model import default_ud_dataset_reader, default_data_loader
from combo.modules.archival import load_archive, archive
from combo.predict import COMBO
from combo.data import api
-from combo.data import DatasetReader
+from config import override_parameters
from utils import ConfigurationError

logging.setLoggerClass(ComboLogger)
logger = logging.getLogger(__name__)

_FEATURES = ["token", "char", "upostag", "xpostag", "lemma", "feats"]
_TARGETS = ["deprel", "feats", "head", "lemma", "upostag", "xpostag", "semrel", "sent", "deps"]

def handle_error(error: Exception):
    msg = getattr(error, 'message', str(error))
    logger.error(msg)
    print(f'Error: {msg}')

FLAGS = flags.FLAGS
flags.DEFINE_enum(name="mode", default=None, enum_values=["train", "predict"],
                  help="Specify COMBO mode: train or predict")

# Common flags
-flags.DEFINE_integer(name="cuda_device", default=-1,
-                     help="Cuda device idx (default -1 cpu)")
+flags.DEFINE_integer(name="n_cuda_devices", default=-1,
+                     help="Number of devices to train on (default -1 auto mode - train on as many as possible)")
flags.DEFINE_string(name="output_file", default="output.log",
                    help="Predictions result file.")
@@ -42,8 +50,8 @@ flags.DEFINE_string(name="validation_data_path", default="", help="Validation da
flags.DEFINE_alias(name="validation_data", original_name="validation_data_path")
flags.DEFINE_string(name="pretrained_tokens", default="",
                    help="Pretrained tokens embeddings path")
-flags.DEFINE_integer(name="embedding_dim", default=300,
-                     help="Embeddings dim")
+flags.DEFINE_integer(name="lemmatizer_embedding_dim", default=300,
+                     help="Lemmatizer embeddings dim")
flags.DEFINE_integer(name="num_epochs", default=400,
                     help="Epochs num")
flags.DEFINE_integer(name="word_batch_size", default=2500,
@@ -72,10 +80,8 @@ flags.DEFINE_string(name="finetuning_validation_data_path", default="",
flags.DEFINE_string(name="test_data_path", default=None,
                    help="Test path file.")
flags.DEFINE_alias(name="test_data", original_name="test_data_path")
-# Experimental
flags.DEFINE_boolean(name="use_pure_config", default=False,
-                     help="Ignore ext flags (experimental).")
+                     help="Ignore ext flags.")

# Prediction flags
flags.DEFINE_string(name="model_path", default=None,
@@ -99,37 +105,58 @@ def run(_):
    if not FLAGS.finetuning:
        prefix = 'Training'
        logger.info('Setting up the model for training', prefix=prefix)
        try:
            checks.file_exists(FLAGS.config_path)
        except ConfigurationError as e:
            handle_error(e)
            return
        logger.info(f'Reading parameters from configuration path {FLAGS.config_path}', prefix=prefix)
        with open(FLAGS.config_path, 'r') as f:
            params = json.load(f)
-        params = {**params, **_get_ext_vars()}
+        params = override_parameters(params, _get_ext_vars(True))
        if 'feats' not in FLAGS.features:
            del params['model']['parameters']['morphological_feat']
        serialization_dir = tempfile.mkdtemp(prefix='combo', dir=FLAGS.serialization_dir)
        params['vocabulary']['parameters']['directory'] = os.path.join('/'.join(FLAGS.config_path.split('/')[:-1]),
                                                                       params['vocabulary']['parameters']['directory'])
        try:
            vocabulary = resolve(params['vocabulary'])
-        except KeyError:
+        except Exception as e:
-            logger.error('No vocabulary in config.json!')
+            handle_error(e)
            return
        try:
            model = resolve(override_parameters(params['model'], _get_ext_vars(False)),
                            pass_down_parameters={'vocabulary': vocabulary})
        except Exception as e:
            handle_error(e)
            return
-        model = resolve(params['model'], pass_down_parameters={'vocabulary': vocabulary})
        dataset_reader = None
        if 'data_loader' in params:
            logger.info(f'Resolving the training data loader from parameters', prefix=prefix)
            try:
                train_data_loader = resolve(params['data_loader'])
            except Exception as e:
                handle_error(e)
                return
        else:
            checks.file_exists(FLAGS.training_data_path)
            logger.info(f'Using a default UD data loader with training data path {FLAGS.training_data_path}',
                        prefix=prefix)
            try:
                train_data_loader = default_data_loader(default_ud_dataset_reader(),
                                                        FLAGS.training_data_path)
            except Exception as e:
                handle_error(e)
                return

        logger.info('Indexing training data loader')
        train_data_loader.index_with(model.vocab)
@@ -180,10 +207,18 @@ def run(_):
    nlp = TrainableCombo(model, torch.optim.Adam,
                         optimizer_kwargs={'betas': [0.9, 0.9], 'lr': 0.002},
                         validation_metrics=['EM'])
    n_cuda_devices = "auto" if FLAGS.n_cuda_devices == -1 else FLAGS.n_cuda_devices
    trainer = pl.Trainer(max_epochs=FLAGS.num_epochs,
                         default_root_dir=serialization_dir,
                         gradient_clip_val=5,
                         devices=n_cuda_devices)
    try:
        trainer.fit(model=nlp, train_dataloaders=train_data_loader, val_dataloaders=validation_data_loader)
    except Exception as e:
        handle_error(e)
        return

    logger.info(f'Archiving the model in {serialization_dir}', prefix=prefix)
    archive(model, serialization_dir, train_data_loader, validation_data_loader, dataset_reader)
@@ -192,7 +227,8 @@ def run(_):
    if FLAGS.test_data_path and FLAGS.output_file:
        checks.file_exists(FLAGS.test_data_path)
        if not dataset_reader:
            logger.info("No dataset reader in the configuration or archive file - using a default UD dataset reader",
                        prefix=prefix)
            dataset_reader = default_ud_dataset_reader()
        logger.info("Predicting test examples", prefix=prefix)
@@ -242,23 +278,86 @@ def run(_):

def _get_ext_vars(finetuning: bool = False) -> Dict:
    if FLAGS.use_pure_config:
        return {}
-    return {
-        "training_data_path": (
-            ",".join(FLAGS.training_data_path if not finetuning else FLAGS.finetuning_training_data_path)),
-        "validation_data_path": (
-            ",".join(FLAGS.validation_data_path if not finetuning else FLAGS.finetuning_validation_data_path)),
    to_override = {
        "model": {
            "parameters": {
                "lemmatizer": {
                    "parameters": {
                        "embedding_dim": FLAGS.lemmatizer_embedding_dim
                    }
                },
                "text_field_embedder": {
                    "parameters": {
                        "token_embedders": {
                            "parameters": {
                                "token": {
                                    "parameters": {
                                        "model_name": FLAGS.pretrained_transformer_name
                                    }
                                }
                            }
                        }
                    }
                },
                "serialization_dir": FLAGS.serialization_dir
            }
        },
        "data_loader": {
            "data_path": (",".join(FLAGS.training_data_path if not finetuning else FLAGS.finetuning_training_data_path)),
            "parameters": {
                "reader": {
                    "parameters": {
                        "features": FLAGS.features,
                        "targets": FLAGS.targets,
                        "token_indexers": {
                            "token": {
                                "parameters": {
                                    "model_name": FLAGS.pretrained_transformer_name
                                }
                            }
                        }
                    }
                }
            }
        },
        "validation_data_loader": {
            "data_path": (",".join(FLAGS.validation_data_path if not finetuning else FLAGS.finetuning_validation_data_path)),
            "parameters": {
                "reader": {
                    "parameters": {
                        "features": FLAGS.features,
                        "targets": FLAGS.targets,
                        "token_indexers": {
                            "token": {
                                "parameters": {
                                    "model_name": FLAGS.pretrained_transformer_name
                                }
                            }
                        }
                    }
                }
            }
        },
        "dataset_reader": {
            "parameters": {
                "features": FLAGS.features,
                "targets": FLAGS.targets,
                "token_indexers": {
                    "token": {
                        "parameters": {
                            "model_name": FLAGS.pretrained_transformer_name
                        }
                    }
                }
            }
        },
        "pretrained_tokens": FLAGS.pretrained_tokens,
-        "pretrained_transformer_name": FLAGS.pretrained_transformer_name,
-        "features": " ".join(FLAGS.features),
-        "targets": " ".join(FLAGS.targets),
-        "type": "finetuning" if finetuning else "default",
-        "embedding_dim": int(FLAGS.embedding_dim),
-        "cuda_device": int(FLAGS.cuda_device),
-        "num_epochs": int(FLAGS.num_epochs),
        "word_batch_size": int(FLAGS.word_batch_size),
-        "use_tensorboard": int(FLAGS.tensorboard),
    }
    return to_override

def main():
    """Parse flags."""
...
%% Cell type:code id:b28c7d8bacb08d02 tags:

``` python
# The path where the training and validation datasets are stored
TRAINING_DATA_PATH: str = '/Users/majajablonska/Documents/PDBUD/train.conllu'
VALIDATION_DATA_PATH: str = '/Users/majajablonska/Documents/PDBUD/val.conllu'
# The path where the model can be saved to
SERIALIZATION_DIR: str = "/Users/majajablonska/Documents/Workspace/combotest"
```
%% Cell type:code id:initial_id tags:

``` python
from combo.predict import COMBO
from combo.combo_model import ComboModel
from combo.data.vocabulary import Vocabulary
from combo.models.encoder import ComboEncoder, ComboStackedBidirectionalLSTM
from combo.modules.text_field_embedders import BasicTextFieldEmbedder
from combo.nn.base import Linear
from combo.modules.token_embedders import CharacterBasedWordEmbedder, TransformersWordEmbedder
from combo.modules import FeedForwardPredictor
from combo.nn.activations import ReLUActivation, TanhActivation, LinearActivation
from combo.models.dilated_cnn import DilatedCnnEncoder
from combo.data.tokenizers import LamboTokenizer, CharacterTokenizer
from combo.data.token_indexers import PretrainedTransformerIndexer, TokenConstPaddingCharactersIndexer, TokenFeatsIndexer, SingleIdTokenIndexer, PretrainedTransformerFixedMismatchedIndexer
from combo.data.dataset_readers import UniversalDependenciesDatasetReader
import torch
from combo.data.dataset_loaders import SimpleDataLoader
from combo.modules.parser import DependencyRelationModel, HeadPredictionModel
from combo.modules.lemma import LemmatizerModel
from combo.modules.morpho import MorphologicalFeatures
from combo.nn.regularizers.regularizers import L2Regularizer
import pytorch_lightning as pl
from combo.training.trainable_combo import TrainableCombo
from itertools import chain
```
%% Cell type:code id:d74957f422f0b05b tags:

``` python
def default_const_character_indexer(namespace=None):
    if namespace:
        return TokenConstPaddingCharactersIndexer(
            tokenizer=CharacterTokenizer(end_tokens=["__END__"],
                                         start_tokens=["__START__"]),
            min_padding_length=32,
            namespace=namespace
        )
    else:
        return TokenConstPaddingCharactersIndexer(
            tokenizer=CharacterTokenizer(end_tokens=["__END__"],
                                         start_tokens=["__START__"]),
            min_padding_length=32
        )

dataset_reader = UniversalDependenciesDatasetReader(
    features=["token", "char"],
    lemma_indexers={
        "char": default_const_character_indexer("lemma_characters")
    },
    targets=["deprel", "head", "upostag", "lemma", "feats", "xpostag"],
    token_indexers={
        "char": default_const_character_indexer(),
        "feats": TokenFeatsIndexer(),
        "lemma": default_const_character_indexer(),
        "token": PretrainedTransformerFixedMismatchedIndexer("bert-base-cased"),
        "upostag": SingleIdTokenIndexer(
            feature_name="pos_",
            namespace="upostag"
        ),
        "xpostag": SingleIdTokenIndexer(
            feature_name="tag_",
            namespace="xpostag"
        )
    },
    use_sem=False
)

data_loader = SimpleDataLoader.from_dataset_reader(dataset_reader,
                                                   data_path=TRAINING_DATA_PATH,
                                                   batch_size=16,
                                                   batches_per_epoch=4,
                                                   shuffle=True)
val_data_loader = SimpleDataLoader.from_dataset_reader(dataset_reader,
                                                       data_path=VALIDATION_DATA_PATH,
                                                       batch_size=16,
                                                       batches_per_epoch=4,
                                                       shuffle=True)
vocabulary = Vocabulary.from_instances_extended(
    chain(data_loader.iter_instances(), val_data_loader.iter_instances()),
    non_padded_namespaces=['head_labels'],
    only_include_pretrained_words=False,
    oov_token='_',
    padding_token='__PAD__'
)
```
%% Output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
%% Cell type:code id:fa724d362fd6bd23 tags:

``` python
seq_encoder = ComboEncoder(layer_dropout_probability=0.33,
                           stacked_bilstm=ComboStackedBidirectionalLSTM(
                               hidden_size=512,
                               input_size=164,
                               layer_dropout_probability=0.33,
                               num_layers=2,
                               recurrent_dropout_probability=0.33
                           ))
```
%% Cell type:code id:f8a10f9892005fca tags:

``` python
char_words_embedder = CharacterBasedWordEmbedder(
    dilated_cnn_encoder=DilatedCnnEncoder(
        input_dim=64,
        kernel_size=[3, 3, 3],
        padding=[1, 2, 4],
        stride=[1, 1, 1],
        filters=[512, 256, 64],
        dilation=[1, 2, 4],
        activations=[ReLUActivation(), ReLUActivation(), LinearActivation()]
    ),
    embedding_dim=64,
    vocabulary=vocabulary
)

tokenizer = LamboTokenizer()
indexer = PretrainedTransformerIndexer('bert-base-cased')
data_loader.iter_instances()
```
%% Output

Using model LAMBO-UD_English-EWT
<generator object SimpleDataLoader.iter_instances at 0x7fb512dc4f20>
%% Cell type:code id:14413692656b68ac tags:

``` python
vocabulary.save_to_files('/Users/majajablonska/PycharmProjects/combo-lightning/tests/fixtures/train_vocabulary')
```

%% Output

Directory /Users/majajablonska/PycharmProjects/combo-lightning/tests/fixtures/train_vocabulary is not empty
%% Cell type:code id:437d12054baaffa1 tags:

``` python
from nn import RegularizerApplicator

model = ComboModel(
    vocabulary=vocabulary,
    dependency_relation=DependencyRelationModel(
        vocabulary=vocabulary,
        dependency_projection_layer=Linear(
            activation=TanhActivation(),
            dropout_rate=0.25,
            in_features=1024,
            out_features=128
        ),
        head_predictor=HeadPredictionModel(
            cycle_loss_n=0,
            dependency_projection_layer=Linear(
                activation=TanhActivation(),
                in_features=1024,
                out_features=512
            ),
            head_projection_layer=Linear(
                activation=TanhActivation(),
                in_features=1024,
                out_features=512
            )
        ),
        head_projection_layer=Linear(
            activation=TanhActivation(),
            dropout_rate=0.25,
            in_features=1024,
            out_features=128
        ),
        vocab_namespace="deprel_labels"
    ),
    lemmatizer=LemmatizerModel(
        vocabulary=vocabulary,
        activations=[ReLUActivation(), ReLUActivation(), ReLUActivation(), LinearActivation()],
        char_vocab_namespace="token_characters",
        dilation=[1, 2, 4, 1],
        embedding_dim=256,
        filters=[256, 256, 256],
        input_projection_layer=Linear(
            activation=TanhActivation(),
            dropout_rate=0.25,
            in_features=1024,
            out_features=32
        ),
        kernel_size=[3, 3, 3, 1],
        lemma_vocab_namespace="lemma_characters",
        padding=[1, 2, 4, 0],
        stride=[1, 1, 1, 1]
    ),
    loss_weights={
        "deprel": 0.8,
        "feats": 0.2,
        "head": 0.2,
        "lemma": 0.05,
        "semrel": 0.05,
        "upostag": 0.05,
        "xpostag": 0.05
    },
    morphological_feat=MorphologicalFeatures(
        vocabulary=vocabulary,
        activations=[TanhActivation(), LinearActivation()],
        dropout=[0.25, 0.],
        hidden_dims=[128],
        input_dim=1024,
        num_layers=2,
        vocab_namespace="feats_labels"
    ),
    regularizer=RegularizerApplicator([
        (".*conv1d.*", L2Regularizer(1e-6)),
        (".*forward.*", L2Regularizer(1e-6)),
        (".*backward.*", L2Regularizer(1e-6)),
        (".*char_embed.*", L2Regularizer(1e-5))
    ]),
    seq_encoder=ComboEncoder(
        layer_dropout_probability=0.33,
        stacked_bilstm=ComboStackedBidirectionalLSTM(
            hidden_size=512,
            input_size=164,
            layer_dropout_probability=0.33,
            num_layers=2,
            recurrent_dropout_probability=0.33
        )
    ),
    text_field_embedder=BasicTextFieldEmbedder(
        token_embedders={
            "char": CharacterBasedWordEmbedder(
                vocabulary=vocabulary,
                dilated_cnn_encoder=DilatedCnnEncoder(
                    activations=[ReLUActivation(), ReLUActivation(), LinearActivation()],
                    dilation=[1, 2, 4],
                    filters=[512, 256, 64],
                    input_dim=64,
                    kernel_size=[3, 3, 3],
                    padding=[1, 2, 4],
                    stride=[1, 1, 1],
                ),
                embedding_dim=64
            ),
            "token": TransformersWordEmbedder("allegro/herbert-base-cased", projection_dim=100)
        }
    ),
    upos_tagger=FeedForwardPredictor.from_vocab(
        vocabulary=vocabulary,
        activations=[TanhActivation(), LinearActivation()],
        dropout=[0.25, 0.],
        hidden_dims=[64],
        input_dim=1024,
        num_layers=2,
        vocab_namespace="upostag_labels"
    ),
    xpos_tagger=FeedForwardPredictor.from_vocab(
        vocabulary=vocabulary,
        activations=[TanhActivation(), LinearActivation()],
        dropout=[0.25, 0.],
        hidden_dims=[64],
        input_dim=1024,
        num_layers=2,
        vocab_namespace="xpostag_labels"
    ),
    serialization_dir=SERIALIZATION_DIR
)
```
%% Output

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.sso.sso_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
%% Cell type:code id:e131e0ec75dc6927 tags:

``` python
data_loader.index_with(vocabulary)
a = 0
for i in data_loader:
    break
```

%% Cell type:code id:195c71fcf8170ff tags:

``` python
val_data_loader.index_with(vocabulary)
```

%% Cell type:code id:cefc5173154d1605 tags:

``` python
nlp = TrainableCombo(model, torch.optim.Adam,
                     optimizer_kwargs={'betas': [0.9, 0.9], 'lr': 0.002},
                     validation_metrics=['EM'])
trainer = pl.Trainer(max_epochs=1,
                     default_root_dir=SERIALIZATION_DIR,
                     gradient_clip_val=5)
```
%% Output

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/majajablonska/miniconda/envs/combo/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: UserWarning: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
  warning_cache.warn(
%% Cell type:code id:e5af131bae4b1a33 tags:

``` python
trainer.fit(model=nlp, train_dataloaders=data_loader, val_dataloaders=val_data_loader)
```
%% Output

  | Name  | Type       | Params
-------------------------------------
0 | model | ComboModel | 136 M
-------------------------------------
12.1 M    Trainable params
124 M     Non-trainable params
136 M     Total params
546.106   Total estimated model params size (MB)
/Users/majajablonska/miniconda/envs/combo/lib/python3.9/site-packages/pytorch_lightning/utilities/data.py:76: UserWarning: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 16. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
  warning_cache.warn(
/Users/majajablonska/miniconda/envs/combo/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:280: PossibleUserWarning: The number of training batches (4) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
  rank_zero_warn(
`Trainer.fit` stopped: `max_epochs=1` reached.
%% Cell type:code id:3e23413c86063183 tags:

``` python
predictor = COMBO(model, dataset_reader)
```

%% Cell type:code id:d555d7f0223a624b tags:

``` python
a = predictor("Cześć, jestem psem.")
```

%% Cell type:code id:a68cd3861e1ceb67 tags:

``` python
print("{:15} {:15} {:10} {:10} {:10}".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))
for token in a.tokens:
    print("{:15} {:15} {:10} {:10} {:10}".format(token.text, token.lemma, token.upostag, token.head, token.deprel))
```

%% Output

TOKEN           LEMMA           UPOS       HEAD       DEPREL
Cześć,          ??????          NOUN                0 root
jestem          ?????a          NOUN                1 punct
psem.           ?????           NOUN                1 punct
%% Cell type:code id:d0f43f4493218b5 tags:

``` python
from modules.archival import archive
```

%% Cell type:code id:ec92aa5bb5bb3605 tags:

``` python
archive(model, '/Users/majajablonska/Documents/combo', data_loader, val_data_loader, dataset_reader)
```

%% Output

'/Users/majajablonska/Documents/combo'

%% Cell type:code id:5ad8a827586f65e3 tags:

``` python
```
...
# Training
Basic command:
```bash
combo --mode train \
--training_data_path your_training_path \
--validation_data_path your_validation_path
```
Options:
```bash
combo --helpfull
```
## Examples
For clarity, the training and validation data paths are omitted.
Train on multiple accelerators (the default `-1` trains on all available devices):
```bash
combo --mode train \
      --n_cuda_devices 8
```
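
The new flags can also override individual values from the configuration file; they are merged into it via `override_parameters`. A purely illustrative invocation (paths omitted as above, values shown are the flag defaults or examples from this repository):
```bash
combo --mode train \
      --lemmatizer_embedding_dim 300 \
      --pretrained_transformer_name allegro/herbert-base-cased \
      --num_epochs 400 \
      --word_batch_size 2500
```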
@@ -2,6 +2,7 @@ import unittest
import os

from combo.config import Registry
from combo.config.from_parameters import override_parameters
from combo.data import WhitespaceTokenizer, UniversalDependenciesDatasetReader, Vocabulary
from combo.data.token_indexers.token_characters_indexer import TokenCharactersIndexer

@@ -79,3 +80,61 @@ class ConfigurationTest(unittest.TestCase):
        self.assertEqual(type(reconstructed_vocab), Vocabulary)
        self.assertEqual(reconstructed_vocab.constructed_from, 'from_files')
        self.assertSetEqual(reconstructed_vocab.get_namespaces(), {'animals'})

    def test_override_parameters(self):
        parameters = {
            'type': 'base_vocabulary',
            'parameters': {
                'counter': {'counter': {'test': 0}},
                'max_vocab_size': 10
            }
        }
        to_override = {'parameters': {'max_vocab_size': 15}}
        self.assertDictEqual({
            'type': 'base_vocabulary',
            'parameters': {
                'counter': {'counter': {'test': 0}},
                'max_vocab_size': 15
            }
        }, override_parameters(parameters, to_override))

    def test_override_nested_parameters(self):
        parameters = {
            'type': 'base_vocabulary',
            'parameters': {
                'counter': {'counter': {'test': 0}, 'another_property': 0},
                'another_counter': {'counter': {'test': 0}, 'another_property': 0}
            }
        }
        to_override = {'parameters': {'another_counter': {'counter': {'test': 1}}}}
        self.assertDictEqual({
            'type': 'base_vocabulary',
            'parameters': {
                'counter': {'counter': {'test': 0}, 'another_property': 0},
                'another_counter': {'counter': {'test': 1}, 'another_property': 0}
            }
        }, override_parameters(parameters, to_override))

    def test_override_parameters_no_change(self):
        parameters = {
            'type': 'base_vocabulary',
            'parameters': {
                'counter': {'counter': {'test': 0}},
                'max_vocab_size': 10
            }
        }
        to_override = {}
        self.assertDictEqual({
            'type': 'base_vocabulary',
            'parameters': {
                'counter': {'counter': {'test': 0}},
                'max_vocab_size': 10
            }
        }, override_parameters(parameters, to_override))