Commit bf863906 authored by Maja Jablonska

Remove unnecessary dilated_cnn copy

parent e1bfa0c7
1 merge request: !46 Merge COMBO 3.0 into master
@@ -12,7 +12,7 @@ from combo.data.tokenizers import CharacterTokenizer
 from combo.data.vocabulary import Vocabulary
 from combo.combo_model import ComboModel
 from combo.models.encoder import ComboEncoder, ComboStackedBidirectionalLSTM
-from combo.modules.dilated_cnn import DilatedCnnEncoder
+from combo.models.dilated_cnn import DilatedCnnEncoder
 from combo.modules.lemma import LemmatizerModel
 from combo.modules.morpho import MorphologicalFeatures
 from combo.modules.parser import DependencyRelationModel, HeadPredictionModel
@@ -18,6 +18,7 @@ from combo.default_model import default_ud_dataset_reader, default_data_loader
 from combo.modules.archival import load_archive, archive
 from combo.predict import COMBO
 from combo.data import api
+from combo.data import DatasetReader

 logging.setLoggerClass(ComboLogger)
 logger = logging.getLogger(__name__)
@@ -93,14 +94,6 @@ flags.DEFINE_enum(name="predictor_name", default="combo-lambo",
                   enum_values=["combo", "combo-spacy", "combo-lambo"],
                   help="Use predictor with whitespace, spacy or lambo (recommended) tokenizer.")

-def get_predictor() -> COMBO:
-    checks.file_exists(FLAGS.model_path)
-    arch = load_archive(FLAGS.model_path)
-    dataset_reader = default_ud_dataset_reader()
-    return COMBO(arch.model, dataset_reader)

 def run(_):
     if FLAGS.mode == 'train':
         if not FLAGS.finetuning:
@@ -211,13 +204,39 @@ def run(_):
                                                 keep_semrel=dataset_reader.use_sem).serialize())
     elif FLAGS.mode == 'predict':
-        predictor = get_predictor()
-        sentence = input("Sentence:")
-        prediction = predictor(sentence)
-        print("{:15} {:15} {:10} {:10} {:10}".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))
-        for token in prediction.tokens:
-            print("{:15} {:15} {:10} {:10} {:10}".format(token.text, token.lemma, token.upostag, token.head,
-                                                         token.deprel))
+        prefix = 'Predicting'
+        logger.info('Loading the model', prefix=prefix)
+        model, _, _, _, dataset_reader = load_archive(FLAGS.model_path)
+        if not dataset_reader:
+            logger.info("No dataset reader in the configuration or archive file - using a default UD dataset reader",
+                        prefix=prefix)
+            dataset_reader = default_ud_dataset_reader()
+        predictor = COMBO(model, dataset_reader)
+        if FLAGS.input_file == '-':
+            print("Interactive mode.")
+            sentence = input("Sentence: ")
+            prediction = predictor(sentence)
+            print("{:15} {:15} {:10} {:10} {:10}".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))
+            for token in prediction.tokens:
+                print("{:15} {:15} {:10} {:10} {:10}".format(token.text, token.lemma, token.upostag, token.head,
+                                                             token.deprel))
+        elif FLAGS.output_file:
+            checks.file_exists(FLAGS.input_file)
+            logger.info("Predicting examples from file", prefix=prefix)
+            test_trees = dataset_reader.read(FLAGS.input_file)
+            predictor = COMBO(model, dataset_reader)
+            with open(FLAGS.output_file, "w") as file:
+                for tree in tqdm(test_trees):
+                    file.writelines(api.sentence2conllu(predictor.predict_instance(tree),
+                                                        keep_semrel=dataset_reader.use_sem).serialize())
+        else:
+            msg = 'No output file for input file {input_file} specified.'.format(input_file=FLAGS.input_file)
+            logger.info(msg, prefix=prefix)
+            print(msg)

 def _get_ext_vars(finetuning: bool = False) -> Dict:
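The rewritten predict branch loads the model and, when present, the dataset reader straight from the archive, falling back to the default UD reader; the removed `get_predictor()` helper is no longer needed. A minimal sketch of the same flow used programmatically (the archive path is a placeholder, not a file from this repository):

``` python
# Sketch of the new predict-mode flow outside the CLI.
# "model.tar.gz" is a placeholder archive path.
from combo.default_model import default_ud_dataset_reader
from combo.modules.archival import load_archive
from combo.predict import COMBO

model, _, _, _, dataset_reader = load_archive("model.tar.gz")
if not dataset_reader:
    # Same fallback as in run(): the archive stored no dataset reader.
    dataset_reader = default_ud_dataset_reader()

predictor = COMBO(model, dataset_reader)
prediction = predictor("The quick brown fox jumps over the lazy dog.")
for token in prediction.tokens:
    print(token.text, token.lemma, token.upostag, token.head, token.deprel)
```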
"""
Adapted from COMBO 1.0
Author: Mateusz Klimaszewski
"""
from typing import List
import torch
from combo.config import FromParameters, Registry
from combo.config.from_parameters import register_arguments
from combo.nn.activations import Activation
@Registry.register('dilated_cnn')
class DilatedCnnEncoder(torch.nn.Module, FromParameters):
@register_arguments
def __init__(self,
input_dim: int,
filters: List[int],
kernel_size: List[int],
stride: List[int],
padding: List[int],
dilation: List[int],
activations: List[Activation]):
super().__init__()
conv1d_layers = []
input_dims = [input_dim] + filters[:-1]
output_dims = filters
for idx in range(len(activations)):
conv1d_layers.append(torch.nn.Conv1d(
in_channels=input_dims[idx],
out_channels=output_dims[idx],
kernel_size=(kernel_size[idx],),
stride=(stride[idx],),
padding=padding[idx],
dilation=(dilation[idx],)))
self.conv1d_layers = torch.nn.ModuleList(conv1d_layers)
self.activations = activations
assert len(self.activations) == len(self.conv1d_layers)
def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
for layer, activation in zip(self.conv1d_layers, self.activations):
x = activation(layer(x))
return x
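A quick shape check for the encoder above (a sketch; the hyperparameters mirror the notebook cell further down). With kernel size 3 and matching padding/dilation pairs (1, 1), (2, 2), (4, 4), each Conv1d preserves the sequence length, so only the channel count changes:

``` python
# Shape check for DilatedCnnEncoder; hyperparameters copied from the notebook below.
import torch
from combo.models.dilated_cnn import DilatedCnnEncoder
from combo.nn.activations import ReLUActivation, LinearActivation

encoder = DilatedCnnEncoder(
    input_dim=64,
    filters=[512, 256, 64],
    kernel_size=[3, 3, 3],
    stride=[1, 1, 1],
    padding=[1, 2, 4],
    dilation=[1, 2, 4],
    activations=[ReLUActivation(), ReLUActivation(), LinearActivation()],
)
x = torch.zeros(8, 64, 20)  # (batch, channels=input_dim, sequence length)
print(encoder(x).shape)     # torch.Size([8, 64, 20]): 64 output channels, length preserved
```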
@@ -7,7 +7,7 @@ from overrides import overrides
 from combo import data
 from combo.config import Registry
 from combo.config.from_parameters import register_arguments
-from combo.modules import dilated_cnn
+from combo.models import dilated_cnn
 from combo.nn import base
 from combo.nn.activations import Activation
 from combo.nn.utils import masked_cross_entropy
@@ -8,7 +8,7 @@ from overrides import overrides
 from combo.config import Registry
 from combo.config.from_parameters import register_arguments
 from combo.data import Vocabulary
-from combo.modules.dilated_cnn import DilatedCnnEncoder
+from combo.models.dilated_cnn import DilatedCnnEncoder
 from combo.modules.token_embedders import TokenEmbedder
 from typing import Optional
%% Cell type:code id:b28c7d8bacb08d02 tags:
``` python
# The path where the training and validation datasets are stored
TRAINING_DATA_PATH: str = '/Users/majajablonska/Documents/PDB/PDBUD_train.conllu'
VALIDATION_DATA_PATH: str = '/Users/majajablonska/Documents/PDB/PDBUD_val.conllu'
# The path where the model can be saved to
SERIALIZATION_DIR: str = "/Users/majajablonska/Documents/Workspace/combotest"
```
%% Cell type:code id:initial_id tags:
``` python
from combo.predict import COMBO
from combo.combo_model import ComboModel
from combo.data.vocabulary import Vocabulary
from combo.models.encoder import ComboEncoder, ComboStackedBidirectionalLSTM
from combo.modules.text_field_embedders import BasicTextFieldEmbedder
from combo.nn.base import Linear
from combo.modules.token_embedders import CharacterBasedWordEmbedder, TransformersWordEmbedder
from combo.modules import FeedForwardPredictor
from combo.nn.activations import ReLUActivation, TanhActivation, LinearActivation
from combo.models.dilated_cnn import DilatedCnnEncoder
from combo.data.tokenizers import LamboTokenizer, CharacterTokenizer
from combo.data.token_indexers import PretrainedTransformerIndexer, TokenConstPaddingCharactersIndexer, TokenFeatsIndexer, SingleIdTokenIndexer, PretrainedTransformerFixedMismatchedIndexer
from combo.data.dataset_readers import UniversalDependenciesDatasetReader
import torch
from combo.data.dataset_loaders import SimpleDataLoader
from combo.modules.parser import DependencyRelationModel, HeadPredictionModel
from combo.modules.lemma import LemmatizerModel
from combo.modules.morpho import MorphologicalFeatures
from combo.nn.regularizers.regularizers import L2Regularizer
import pytorch_lightning as pl
from combo.training.trainable_combo import TrainableCombo
from itertools import chain
```
%% Cell type:code id:d74957f422f0b05b tags:
``` python
def default_const_character_indexer(namespace=None):
    if namespace:
        return TokenConstPaddingCharactersIndexer(
            tokenizer=CharacterTokenizer(end_tokens=["__END__"],
                                         start_tokens=["__START__"]),
            min_padding_length=32,
            namespace=namespace
        )
    else:
        return TokenConstPaddingCharactersIndexer(
            tokenizer=CharacterTokenizer(end_tokens=["__END__"],
                                         start_tokens=["__START__"]),
            min_padding_length=32
        )

dataset_reader = UniversalDependenciesDatasetReader(
    features=["token", "char"],
    lemma_indexers={
        "char": default_const_character_indexer("lemma_characters")
    },
    targets=["deprel", "head", "upostag", "lemma", "feats", "xpostag"],
    token_indexers={
        "char": default_const_character_indexer(),
        "feats": TokenFeatsIndexer(),
        "lemma": default_const_character_indexer(),
        "token": PretrainedTransformerFixedMismatchedIndexer("bert-base-cased"),
        "upostag": SingleIdTokenIndexer(
            feature_name="pos_",
            namespace="upostag"
        ),
        "xpostag": SingleIdTokenIndexer(
            feature_name="tag_",
            namespace="xpostag"
        )
    },
    use_sem=False
)

data_loader = SimpleDataLoader.from_dataset_reader(dataset_reader,
                                                   data_path=TRAINING_DATA_PATH,
                                                   batch_size=16,
                                                   batches_per_epoch=4,
                                                   shuffle=True)
val_data_loader = SimpleDataLoader.from_dataset_reader(dataset_reader,
                                                       data_path=VALIDATION_DATA_PATH,
                                                       batch_size=16,
                                                       batches_per_epoch=4,
                                                       shuffle=True)

vocabulary = Vocabulary.from_instances_extended(
    chain(data_loader.iter_instances(), val_data_loader.iter_instances()),
    non_padded_namespaces=['head_labels'],
    only_include_pretrained_words=False,
    oov_token='_',
    padding_token='__PAD__'
)
```
%% Output
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
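The vocabulary is built from the training and validation instances together, so every namespace (characters, lemmas, tags, labels) is populated before the model is constructed. A small inspection sketch; `get_vocab_size` is assumed to follow the AllenNLP-style Vocabulary API that combo adapts:

``` python
# Inspection sketch (assumed AllenNLP-style API): report a few namespace sizes.
for ns in ["token_characters", "lemma_characters", "upostag", "deprel_labels"]:
    print(ns, vocabulary.get_vocab_size(ns))
```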
%% Cell type:code id:fa724d362fd6bd23 tags:
``` python
seq_encoder = ComboEncoder(layer_dropout_probability=0.33,
                           stacked_bilstm=ComboStackedBidirectionalLSTM(
                               hidden_size=512,
                               input_size=164,
                               layer_dropout_probability=0.33,
                               num_layers=2,
                               recurrent_dropout_probability=0.33
                           ))
```
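The BiLSTM dimensions are not arbitrary: input_size=164 is the concatenation of the two token embedders configured later (64-dim character CNN + 100-dim projected transformer), and the bidirectional hidden_size=512 yields 1024-dim token representations, which is why the downstream projection layers all use in_features=1024. The arithmetic, as a check:

``` python
# Dimension bookkeeping (plain arithmetic, no combo APIs involved).
char_dim, transformer_dim = 64, 100          # embedding_dim / projection_dim below
assert char_dim + transformer_dim == 164     # input_size of ComboStackedBidirectionalLSTM
hidden_size, num_directions = 512, 2
assert hidden_size * num_directions == 1024  # in_features of the projection layers
```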
%% Cell type:code id:f8a10f9892005fca tags:
``` python
char_words_embedder = CharacterBasedWordEmbedder(
    dilated_cnn_encoder=DilatedCnnEncoder(
        input_dim=64,
        kernel_size=[3, 3, 3],
        padding=[1, 2, 4],
        stride=[1, 1, 1],
        filters=[512, 256, 64],
        dilation=[1, 2, 4],
        activations=[ReLUActivation(), ReLUActivation(), LinearActivation()]
    ),
    embedding_dim=64,
    vocabulary=vocabulary
)
tokenizer = LamboTokenizer()
indexer = PretrainedTransformerIndexer('bert-base-cased')
data_loader.iter_instances()
```
%% Output
Using model LAMBO-UD_English-EWT
<generator object SimpleDataLoader.iter_instances at 0x7fdd1e0a0c80>
%% Cell type:code id:14413692656b68ac tags:
``` python
vocabulary.save_to_files('/Users/majajablonska/PycharmProjects/combo-lightning/tests/fixtures/train_vocabulary')
```
%% Output
Directory /Users/majajablonska/PycharmProjects/combo-lightning/tests/fixtures/train_vocabulary is not empty
%% Cell type:code id:437d12054baaffa1 tags:
``` python
from combo.nn import RegularizerApplicator
model = ComboModel(
    vocabulary=vocabulary,
    dependency_relation=DependencyRelationModel(
        vocabulary=vocabulary,
        dependency_projection_layer=Linear(
            activation=TanhActivation(),
            dropout_rate=0.25,
            in_features=1024,
            out_features=128
        ),
        head_predictor=HeadPredictionModel(
            cycle_loss_n=0,
            dependency_projection_layer=Linear(
                activation=TanhActivation(),
                in_features=1024,
                out_features=512
            ),
            head_projection_layer=Linear(
                activation=TanhActivation(),
                in_features=1024,
                out_features=512
            )
        ),
        head_projection_layer=Linear(
            activation=TanhActivation(),
            dropout_rate=0.25,
            in_features=1024,
            out_features=128
        ),
        vocab_namespace="deprel_labels"
    ),
    lemmatizer=LemmatizerModel(
        vocabulary=vocabulary,
        activations=[ReLUActivation(), ReLUActivation(), ReLUActivation(), LinearActivation()],
        char_vocab_namespace="token_characters",
        dilation=[1, 2, 4, 1],
        embedding_dim=256,
        filters=[256, 256, 256],
        input_projection_layer=Linear(
            activation=TanhActivation(),
            dropout_rate=0.25,
            in_features=1024,
            out_features=32
        ),
        kernel_size=[3, 3, 3, 1],
        lemma_vocab_namespace="lemma_characters",
        padding=[1, 2, 4, 0],
        stride=[1, 1, 1, 1]
    ),
    loss_weights={
        "deprel": 0.8,
        "feats": 0.2,
        "head": 0.2,
        "lemma": 0.05,
        "semrel": 0.05,
        "upostag": 0.05,
        "xpostag": 0.05
    },
    morphological_feat=MorphologicalFeatures(
        vocabulary=vocabulary,
        activations=[TanhActivation(), LinearActivation()],
        dropout=[0.25, 0.],
        hidden_dims=[128],
        input_dim=1024,
        num_layers=2,
        vocab_namespace="feats_labels"
    ),
    regularizer=RegularizerApplicator([
        (".*conv1d.*", L2Regularizer(1e-6)),
        (".*forward.*", L2Regularizer(1e-6)),
        (".*backward.*", L2Regularizer(1e-6)),
        (".*char_embed.*", L2Regularizer(1e-5))
    ]),
    seq_encoder=ComboEncoder(
        layer_dropout_probability=0.33,
        stacked_bilstm=ComboStackedBidirectionalLSTM(
            hidden_size=512,
            input_size=164,
            layer_dropout_probability=0.33,
            num_layers=2,
            recurrent_dropout_probability=0.33
        )
    ),
    text_field_embedder=BasicTextFieldEmbedder(
        token_embedders={
            "char": CharacterBasedWordEmbedder(
                vocabulary=vocabulary,
                dilated_cnn_encoder=DilatedCnnEncoder(
                    activations=[ReLUActivation(), ReLUActivation(), LinearActivation()],
                    dilation=[1, 2, 4],
                    filters=[512, 256, 64],
                    input_dim=64,
                    kernel_size=[3, 3, 3],
                    padding=[1, 2, 4],
                    stride=[1, 1, 1],
                ),
                embedding_dim=64
            ),
            "token": TransformersWordEmbedder("allegro/herbert-base-cased", projection_dim=100)
        }
    ),
    upos_tagger=FeedForwardPredictor.from_vocab(
        vocabulary=vocabulary,
        activations=[TanhActivation(), LinearActivation()],
        dropout=[0.25, 0.],
        hidden_dims=[64],
        input_dim=1024,
        num_layers=2,
        vocab_namespace="upostag_labels"
    ),
    xpos_tagger=FeedForwardPredictor.from_vocab(
        vocabulary=vocabulary,
        activations=[TanhActivation(), LinearActivation()],
        dropout=[0.25, 0.],
        hidden_dims=[64],
        input_dim=1024,
        num_layers=2,
        vocab_namespace="xpostag_labels"
    ),
    serialization_dir=SERIALIZATION_DIR
)
```
%% Output
Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
%% Cell type:code id:e131e0ec75dc6927 tags:
``` python
data_loader.index_with(vocabulary)
# Fetch a single batch to check that the instances index correctly.
batch = next(iter(data_loader))
```
%% Cell type:code id:195c71fcf8170ff tags:
``` python
val_data_loader.index_with(vocabulary)
```
%% Cell type:code id:cefc5173154d1605 tags:
``` python
nlp = TrainableCombo(model, torch.optim.Adam,
                     optimizer_kwargs={'betas': [0.9, 0.9], 'lr': 0.002},
                     validation_metrics=['EM'])
trainer = pl.Trainer(max_epochs=1,
                     default_root_dir=SERIALIZATION_DIR,
                     gradient_clip_val=5)
```
%% Output
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/majajablonska/miniconda/envs/combo/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: UserWarning: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
warning_cache.warn(
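Before fitting, the wiring can be smoke-tested with a validation-only pass; `Trainer.validate` is the standard pytorch_lightning API:

``` python
# Optional smoke test: one validation pass without any training.
trainer.validate(nlp, dataloaders=val_data_loader)
```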
%% Cell type:code id:e5af131bae4b1a33 tags:
``` python
trainer.fit(model=nlp, train_dataloaders=data_loader, val_dataloaders=val_data_loader)
```
%% Output
| Name | Type | Params
-------------------------------------
0 | model | ComboModel | 136 M
-------------------------------------
12.1 M Trainable params
124 M Non-trainable params
136 M Total params
546.115 Total estimated model params size (MB)
/Users/majajablonska/miniconda/envs/combo/lib/python3.9/site-packages/pytorch_lightning/utilities/data.py:76: UserWarning: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 16. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
warning_cache.warn(
/Users/majajablonska/miniconda/envs/combo/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:280: PossibleUserWarning: The number of training batches (4) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
rank_zero_warn(
`Trainer.fit` stopped: `max_epochs=1` reached.
%% Cell type:code id:3e23413c86063183 tags:
``` python
predictor = COMBO(model, dataset_reader)
```
%% Cell type:code id:d555d7f0223a624b tags:
``` python
a = predictor("Cześć, jestem psem.")
```
%% Cell type:code id:a68cd3861e1ceb67 tags:
``` python
print("{:15} {:15} {:10} {:10} {:10}".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))
for token in a.tokens:
print("{:15} {:15} {:10} {:10} {:10}".format(token.text, token.lemma, token.upostag, token.head, token.deprel))
```
%% Output
TOKEN           LEMMA           UPOS       HEAD       DEPREL
Cześć,          ??????          NOUN       0          root
jestem          ??????          NOUN       1          punct
psem.           ?????           NOUN       1          punct
%% Cell type:code id:d0f43f4493218b5 tags:
``` python
from combo.modules.archival import archive
```
%% Cell type:code id:ec92aa5bb5bb3605 tags:
``` python
archive(model, '/Users/majajablonska/Documents/combo', data_loader, val_data_loader, dataset_reader)
```
%% Output
'/Users/majajablonska/Documents/combo'
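To confirm the archive round-trips, it can be reloaded; this sketch assumes `load_archive` accepts the directory written above and returns the same five-tuple as in main.py:

``` python
# Round-trip sketch: reload the archived model and rebuild a predictor.
from combo.modules.archival import load_archive

model2, _, _, _, reader = load_archive('/Users/majajablonska/Documents/combo')
predictor2 = COMBO(model2, reader)
```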
%% Cell type:code id:5ad8a827586f65e3 tags:
``` python
```