diff --git a/combo/commands/__init__.py b/combo/commands/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..83ff4f4b48489406a73f3a902f0ab28eb62572b1 100644 --- a/combo/commands/__init__.py +++ b/combo/commands/__init__.py @@ -0,0 +1 @@ +from .train import FinetuningTrainModel \ No newline at end of file diff --git a/combo/commands/train.py b/combo/commands/train.py new file mode 100644 index 0000000000000000000000000000000000000000..f4bb80729a4875e48574ac94d43b36660559fe39 --- /dev/null +++ b/combo/commands/train.py @@ -0,0 +1,10 @@ +from pytorch_lightning import Trainer + + +class FinetuningTrainModel(Trainer): + """ + Class made only for finetuning, + the only difference is saving vocab from concatenated + (archive and current) datasets + """ + pass \ No newline at end of file diff --git a/combo/data/dataset.py b/combo/data/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..3b16c149b7abd9e2aba6ac50a4b0b71740d4f5d7 --- /dev/null +++ b/combo/data/dataset.py @@ -0,0 +1,273 @@ +import copy +import logging +import pathlib +from dataclasses import dataclass +from typing import List, Any, Dict, Iterable, Optional, Tuple + +import conllu +import torch +from overrides import overrides + +from combo import data +from combo.data import Vocabulary, fields, Instance, Token, TokenizerToken +from combo.data.dataset_readers.dataset_reader import DatasetReader +from combo.data.fields import Field +from combo.data.fields.adjacency_field import AdjacencyField +from combo.data.fields.metadata_field import MetadataField +from combo.data.fields.sequence_label_field import SequenceLabelField +from combo.data.fields.text_field import TextField +from combo.data.token_indexers import TokenIndexer +from combo.models import parser +from combo.utils import checks, pad_sequence_to_length + +logger = logging.getLogger(__name__) + + +@dataclass(init=False, repr=False) +class _Token(TokenizerToken): + __slots__ = TokenizerToken.__slots__ + ['feats_'] + + feats_: Optional[str] + + def __init__(self, text: str = None, idx: int = None, idx_end: int = None, lemma_: str = None, pos_: str = None, + tag_: str = None, dep_: str = None, ent_type_: str = None, text_id: int = None, type_id: int = None, + feats_: str = None) -> None: + super().__init__(text, idx, idx_end, lemma_, pos_, tag_, dep_, ent_type_, text_id, type_id) + self.feats_ = feats_ + + +class UniversalDependenciesDatasetReader(DatasetReader): + def __init__( + self, + token_indexers: Dict[str, TokenIndexer] = None, + lemma_indexers: Dict[str, TokenIndexer] = None, + features: List[str] = None, + targets: List[str] = None, + use_sem: bool = False, + **kwargs, + ) -> None: + super().__init__(**kwargs) + if features is None: + features = ["token", "char"] + if targets is None: + targets = ["head", "deprel", "upostag", "xpostag", "lemma", "feats"] + + if "token" not in features and "char" not in features: + raise checks.ConfigurationError("There must be at least one ('char' or 'token') text-based feature!") + + if "deps" in targets and not ("head" in targets and "deprel" in targets): + raise checks.ConfigurationError("Add 'head' and 'deprel' to targets when using 'deps'!") + + intersection = set(features).intersection(set(targets)) + if len(intersection) != 0: + raise checks.ConfigurationError( + "Features and targets cannot share elements! 
" + "Remove {} from either features or targets.".format(intersection) + ) + self.use_sem = use_sem + + # *.conllu readers configuration + fields = list(parser.DEFAULT_FIELDS) + fields[1] = "token" # use 'token' instead of 'form' + field_parsers = parser.DEFAULT_FIELD_PARSERS + # Do not make it nullable + field_parsers.pop("xpostag", None) + # Ignore parsing misc + field_parsers.pop("misc", None) + if self.use_sem: + fields = list(fields) + fields.append("semrel") + field_parsers["semrel"] = lambda line, i: line[i] + self.field_parsers = field_parsers + self.fields = tuple(fields) + + self._token_indexers = token_indexers + self._lemma_indexers = lemma_indexers + self._targets = targets + self._features = features + self.generate_labels = True + # Filter out not required token indexers to avoid + # Mismatched token keys ConfigurationError + for indexer_name in list(self._token_indexers.keys()): + if indexer_name not in self._features: + del self._token_indexers[indexer_name] + + @overrides + def _read(self, file_path: str) -> Iterable[Instance]: + file_path = [file_path] if len(file_path.split(",")) == 0 else file_path.split(",") + + for conllu_file in file_path: + file = pathlib.Path(conllu_file) + assert conllu_file and file.exists(), f"File with path '{conllu_file}' does not exists!" + with file.open("r", encoding="utf-8") as f: + for annotation in conllu.parse_incr(f, fields=self.fields, field_parsers=self.field_parsers): + yield self.text_to_instance(annotation) + + # why is there an error? TypeError: UniversalDependenciesDatasetReader.text_to_instance: `inputs` must be present + #@overrides + def text_to_instance(self, tree: conllu.TokenList) -> Instance: + fields_: Dict[str, Field] = {} + tree_tokens = [t for t in tree if isinstance(t["id"], int)] + tokens = [_Token(t["token"], + pos_=t.get("upostag"), + tag_=t.get("xpostag"), + lemma_=t.get("lemma"), + feats_=t.get("feats")) + for t in tree_tokens] + + # features + text_field = TextField(tokens, self._token_indexers) + fields_["sentence"] = text_field + + # targets + if self.generate_labels: + for target_name in self._targets: + if target_name != "sent": + target_values = [t[target_name] for t in tree_tokens] + if target_name == "lemma": + target_values = [TokenizerToken(v) for v in target_values] + fields_[target_name] = TextField(target_values, self._lemma_indexers) + elif target_name == "feats": + target_values = self._feat_values(tree_tokens) + fields_[target_name] = fields.SequenceMultiLabelField(target_values, + self._feats_indexer, + self._feats_as_tensor_wrapper, + text_field, + label_namespace="feats_labels") + elif target_name == "head": + target_values = [0 if v == "_" else int(v) for v in target_values] + fields_[target_name] = SequenceLabelField(target_values, text_field, + label_namespace=target_name + "_labels") + elif target_name == "deps": + # Graphs require adding ROOT (AdjacencyField uses sequence length from TextField). + text_field_deps = TextField([_Token("ROOT")] + copy.deepcopy(tokens), self._token_indexers) + enhanced_heads: List[Tuple[int, int]] = [] + enhanced_deprels: List[str] = [] + for idx, t in enumerate(tree_tokens): + t_deps = t["deps"] + if t_deps and t_deps != "_": + for rel, head in t_deps: + # EmoryNLP skips the first edge, if there are two edges between the same + # nodes. Thanks to that one is in a tree and another in a graph. + # This snippet follows that approach. 
+ if enhanced_heads and enhanced_heads[-1] == (idx, head): + enhanced_heads.pop() + enhanced_deprels.pop() + enhanced_heads.append((idx, head)) + enhanced_deprels.append(rel) + fields_["enhanced_heads"] = AdjacencyField( + indices=enhanced_heads, + sequence_field=text_field_deps, + label_namespace="enhanced_heads_labels", + padding_value=0, + ) + fields_["enhanced_deprels"] = AdjacencyField( + indices=enhanced_heads, + sequence_field=text_field_deps, + labels=enhanced_deprels, + # Label namespace matches regular tree parsing. + label_namespace="enhanced_deprel_labels", + padding_value=0, + ) + else: + fields_[target_name] = SequenceLabelField(target_values, text_field, + label_namespace=target_name + "_labels") + + # Restore feats fields to string representation + # parser.serialize_field doesn't handle key without value + for token in tree.tokens: + if "feats" in token: + feats = token["feats"] + if feats: + feats_values = [] + for k, v in feats.items(): + feats_values.append('='.join((k, v)) if v else k) + field = "|".join(feats_values) + else: + field = "_" + token["feats"] = field + + # metadata + fields_["metadata"] = MetadataField({"input": tree, + "field_names": self.fields, + "tokens": tokens}) + + return Instance(fields_) + + @staticmethod + def _feat_values(tree: List[Dict[str, Any]]): + features = [] + for token in tree: + token_features = [] + if token["feats"] is not None: + for feat, value in token["feats"].items(): + if feat in ["_", "__ROOT__"]: + pass + else: + # Handle case where feature is binary (doesn't have associated value) + if value: + token_features.append(feat + "=" + value) + else: + token_features.append(feat) + features.append(token_features) + return features + + @staticmethod + def _feats_as_tensor_wrapper(field: fields.SequenceMultiLabelField): + def as_tensor(padding_lengths): + desired_num_tokens = padding_lengths["num_tokens"] + assert len(field._indexed_multi_labels) > 0 + classes_count = len(field._indexed_multi_labels[0]) + default_value = [0.0] * classes_count + padded_tags = pad_sequence_to_length(field._indexed_multi_labels, desired_num_tokens, + lambda: default_value) + tensor = torch.tensor(padded_tags, dtype=torch.long) + return tensor + + return as_tensor + + @staticmethod + def _feats_indexer(vocab: Vocabulary): + label_namespace = "feats_labels" + vocab_size = vocab.get_vocab_size(label_namespace) + slices = get_slices_if_not_provided(vocab) + + def _m_from_n_ones_encoding(multi_label: List[str], sentence_length: int) -> List[int]: + one_hot_encoding = [0] * vocab_size + for cat, cat_indices in slices.items(): + if cat not in ["__PAD__", "_"]: + label_from_cat = [label for label in multi_label if cat == label.split("=")[0]] + if label_from_cat: + label_from_cat = label_from_cat[0] + index = vocab.get_token_index(label_from_cat, label_namespace) + else: + # Get Cat=None index + index = vocab.get_token_index(cat + "=None", label_namespace) + one_hot_encoding[index] = 1 + return one_hot_encoding + + return _m_from_n_ones_encoding + + +def get_slices_if_not_provided(vocab: data.Vocabulary): + if hasattr(vocab, "slices"): + return vocab.slices + + if "feats_labels" in vocab.get_namespaces(): + idx2token = vocab.get_index_to_token_vocabulary("feats_labels") + for _, v in dict(idx2token).items(): + if v not in ["_", "__PAD__"]: + empty_value = v.split("=")[0] + "=None" + vocab.add_token_to_namespace(empty_value, "feats_labels") + + slices = {} + for idx, name in vocab.get_index_to_token_vocabulary("feats_labels").items(): + # There are 2 types 
features: with (Case=Acc) or without assigment (None). + # Here we group their indices by name (before assigment sign). + name = name.split("=")[0] + if name in slices: + slices[name].append(idx) + else: + slices[name] = [idx] + vocab.slices = slices + return vocab.slices diff --git a/combo/models/base.py b/combo/models/base.py new file mode 100644 index 0000000000000000000000000000000000000000..45eae041affc66f1bd34e3224b893e22285d0028 --- /dev/null +++ b/combo/models/base.py @@ -0,0 +1,274 @@ +from typing import Dict, Optional, List, Union, Tuple + +import torch +import torch.nn as nn +from overrides import overrides + +from combo.models.combo_nn import Activation +import combo.utils.checks as checks +from combo.data.vocabulary import Vocabulary +from combo.models.utils import masked_cross_entropy +from combo.predictors.predictor import Predictor + + +class Linear(nn.Linear): + def __init__(self, + in_features: int, + out_features: int, + activation: Optional[Activation] = None, + dropout_rate: Optional[float] = 0.0): + super().__init__(in_features, out_features) + self.activation = activation if activation else self.identity + self.dropout = nn.Dropout(p=dropout_rate) if dropout_rate else self.identity + + def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + x = super().forward(x) + x = self.activation(x) + return self.dropout(x) + + def get_output_dim(self) -> int: + return self.out_features + + @staticmethod + def identity(x): + return x + + +class FeedForward(torch.nn.Module): + """ + Modified copy of allennlp.modules.feedforward.FeedForward + + This `Module` is a feed-forward neural network, just a sequence of `Linear` layers with + activation functions in between. + + # Parameters + + input_dim : `int`, required + The dimensionality of the input. We assume the input has shape `(batch_size, input_dim)`. + num_layers : `int`, required + The number of `Linear` layers to apply to the input. + hidden_dims : `Union[int, List[int]]`, required + The output dimension of each of the `Linear` layers. If this is a single `int`, we use + it for all `Linear` layers. If it is a `List[int]`, `len(hidden_dims)` must be + `num_layers`. + activations : `Union[Activation, List[Activation]]`, required + The activation function to use after each `Linear` layer. If this is a single function, + we use it after all `Linear` layers. If it is a `List[Activation]`, + `len(activations)` must be `num_layers`. Activation must have torch.nn.Module type. + dropout : `Union[float, List[float]]`, optional (default = `0.0`) + If given, we will apply this amount of dropout after each layer. Semantics of `float` + versus `List[float]` is the same as with other parameters. 
+ + # Examples + + ```python + FeedForward(124, 2, [64, 32], torch.nn.ReLU(), 0.2) + #> FeedForward( + #> (_activations): ModuleList( + #> (0): ReLU() + #> (1): ReLU() + #> ) + #> (_linear_layers): ModuleList( + #> (0): Linear(in_features=124, out_features=64, bias=True) + #> (1): Linear(in_features=64, out_features=32, bias=True) + #> ) + #> (_dropout): ModuleList( + #> (0): Dropout(p=0.2, inplace=False) + #> (1): Dropout(p=0.2, inplace=False) + #> ) + #> ) + ``` + """ + + def __init__( + self, + input_dim: int, + num_layers: int, + hidden_dims: Union[int, List[int]], + activations: Union[Activation, List[Activation]], + dropout: Union[float, List[float]] = 0.0, + ) -> None: + + super().__init__() + if not isinstance(hidden_dims, list): + hidden_dims = [hidden_dims] * num_layers # type: ignore + if not isinstance(activations, list): + activations = [activations] * num_layers # type: ignore + if not isinstance(dropout, list): + dropout = [dropout] * num_layers # type: ignore + if len(hidden_dims) != num_layers: + raise checks.ConfigurationError( + "len(hidden_dims) (%d) != num_layers (%d)" % (len(hidden_dims), num_layers) + ) + if len(activations) != num_layers: + raise checks.ConfigurationError( + "len(activations) (%d) != num_layers (%d)" % (len(activations), num_layers) + ) + if len(dropout) != num_layers: + raise checks.ConfigurationError( + "len(dropout) (%d) != num_layers (%d)" % (len(dropout), num_layers) + ) + self._activations = torch.nn.ModuleList(activations) + input_dims = [input_dim] + hidden_dims[:-1] + linear_layers = [] + for layer_input_dim, layer_output_dim in zip(input_dims, hidden_dims): + linear_layers.append(torch.nn.Linear(layer_input_dim, layer_output_dim)) + self._linear_layers = torch.nn.ModuleList(linear_layers) + dropout_layers = [torch.nn.Dropout(p=value) for value in dropout] + self._dropout = torch.nn.ModuleList(dropout_layers) + self._output_dim = hidden_dims[-1] + self.input_dim = input_dim + + def get_output_dim(self): + return self._output_dim + + def get_input_dim(self): + return self.input_dim + + def forward(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]: + + output = inputs + feature_maps = [] + for layer, activation, dropout in zip( + self._linear_layers, self._activations, self._dropout + ): + feature_maps.append(output) + output = dropout(activation(layer(output))) + return output, feature_maps + + +class FeedForwardPredictor(Predictor): + """Feedforward predictor. 
Should be used on top of Seq2Seq encoder.""" + + def __init__(self, feedforward_network: "FeedForward"): + super().__init__() + self.feedforward_network = feedforward_network + + def forward(self, + x: Union[torch.Tensor, List[torch.Tensor]], + mask: Optional[torch.BoolTensor] = None, + labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, + sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]: + if mask is None: + mask = x.new_ones(x.size()[:-1]) + + x, feature_maps = self.feedforward_network(x) + output = { + "prediction": x.argmax(-1), + "probability": x, + "embedding": feature_maps[-1], + } + + if labels is not None: + if sample_weights is None: + sample_weights = labels.new_ones([mask.size(0)]) + output["loss"] = self._loss(x, labels, mask, sample_weights) + + return output + + def _loss(self, + pred: torch.Tensor, + true: torch.Tensor, + mask: torch.BoolTensor, + sample_weights: torch.Tensor) -> torch.Tensor: + BATCH_SIZE, _, CLASSES = pred.size() + valid_positions = mask.sum() + pred = pred.reshape(-1, CLASSES) + true = true.reshape(-1) + mask = mask.reshape(-1) + loss = masked_cross_entropy(pred, true, mask) + loss = loss.reshape(BATCH_SIZE, -1) * sample_weights.unsqueeze(-1) + return loss.sum() / valid_positions + + @classmethod + def from_vocab(cls, + vocab: Vocabulary, + vocab_namespace: str, + input_dim: int, + num_layers: int, + hidden_dims: List[int], + activations: Union[Activation, List[Activation]], + dropout: Union[float, List[float]] = 0.0, + ): + if len(hidden_dims) + 1 != num_layers: + raise checks.ConfigurationError( + f"len(hidden_dims) ({len(hidden_dims):d}) + 1 != num_layers ({num_layers:d})" + ) + + assert vocab_namespace in vocab.get_namespaces(), \ + f"There is not {vocab_namespace} in created vocabs, check if this field has any values to predict!" + hidden_dims = hidden_dims + [vocab.get_vocab_size(vocab_namespace)] + + return cls(FeedForward( + input_dim=input_dim, + num_layers=num_layers, + hidden_dims=hidden_dims, + activations=activations, + dropout=dropout)) + + +""" +Adapted from AllenNLP +""" + + +class TimeDistributed(torch.nn.Module): + """ + Given an input shaped like `(batch_size, time_steps, [rest])` and a `Module` that takes + inputs like `(batch_size, [rest])`, `TimeDistributed` reshapes the input to be + `(batch_size * time_steps, [rest])`, applies the contained `Module`, then reshapes it back. + + Note that while the above gives shapes with `batch_size` first, this `Module` also works if + `batch_size` is second - we always just combine the first two dimensions, then split them. + + It also reshapes keyword arguments unless they are not tensors or their name is specified in + the optional `pass_through` iterable. + """ + + def __init__(self, module): + super().__init__() + self._module = module + + @overrides + def forward(self, *inputs, pass_through: List[str] = None, **kwargs): + + pass_through = pass_through or [] + + reshaped_inputs = [self._reshape_tensor(input_tensor) for input_tensor in inputs] + + # Need some input to then get the batch_size and time_steps. 
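+        # Its first two dimensions (batch_size, time_steps) are reused further down to reshape
+        # the module output from (batch_size * time_steps, ...) back to (batch_size, time_steps, ...).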
+ some_input = None + if inputs: + some_input = inputs[-1] + + reshaped_kwargs = {} + for key, value in kwargs.items(): + if isinstance(value, torch.Tensor) and key not in pass_through: + if some_input is None: + some_input = value + + value = self._reshape_tensor(value) + + reshaped_kwargs[key] = value + + reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs) + + if some_input is None: + raise RuntimeError("No input tensor to time-distribute") + + # Now get the output back into the right shape. + # (batch_size, time_steps, **output_size) + new_size = some_input.size()[:2] + reshaped_outputs.size()[1:] + outputs = reshaped_outputs.contiguous().view(new_size) + + return outputs + + @staticmethod + def _reshape_tensor(input_tensor): + input_size = input_tensor.size() + if len(input_size) <= 2: + raise RuntimeError(f"No dimension to distribute: {input_size}") + # Squash batch_size and time_steps into a single axis; result has shape + # (batch_size * time_steps, **input_size). + squashed_shape = [-1] + list(input_size[2:]) + return input_tensor.contiguous().view(*squashed_shape) diff --git a/combo/models/combo_nn.py b/combo/models/combo_nn.py new file mode 100644 index 0000000000000000000000000000000000000000..822c1cd665e7aba7b8b94ce05ead907412ced553 --- /dev/null +++ b/combo/models/combo_nn.py @@ -0,0 +1,14 @@ +import torch +import torch.nn as nn +from overrides import overrides + + +class Activation(nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + +class LinearActivation(Activation): + @overrides + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x diff --git a/combo/models/dilated_cnn.py b/combo/models/dilated_cnn.py new file mode 100644 index 0000000000000000000000000000000000000000..79ca6d9a952e1da104150b896696eed75aec4901 --- /dev/null +++ b/combo/models/dilated_cnn.py @@ -0,0 +1,43 @@ +""" +Adapted from COMBO 1.0 +Author: Mateusz Klimaszewski +""" + +from typing import List + +import torch +import torch.nn as nn + +from combo.models.combo_nn import Activation + + +class DilatedCnnEncoder(nn.Module): + + def __init__(self, + input_dim: int, + filters: List[int], + kernel_size: List[int], + stride: List[int], + padding: List[int], + dilation: List[int], + activations: List[Activation]): + super().__init__() + conv1d_layers = [] + input_dims = [input_dim] + filters[:-1] + output_dims = filters + for idx in range(len(activations)): + conv1d_layers.append(nn.Conv1d( + in_channels=input_dims[idx], + out_channels=output_dims[idx], + kernel_size=(kernel_size[idx],), + stride=(stride[idx],), + padding=padding[idx], + dilation=(dilation[idx],))) + self.conv1d_layers = nn.ModuleList(conv1d_layers) + self.activations = activations + assert len(self.activations) == len(self.conv1d_layers) + + def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + for layer, activation in zip(self.conv1d_layers, self.activations): + x = activation(layer(x)) + return x diff --git a/combo/models/embeddings.py b/combo/models/embeddings.py new file mode 100644 index 0000000000000000000000000000000000000000..35b732ae5c4497893a28e67ecff36cbb383b3d79 --- /dev/null +++ b/combo/models/embeddings.py @@ -0,0 +1,221 @@ +from typing import Optional, List + +import torch +from overrides import overrides +from torch import nn +from torchtext.vocab import Vectors, GloVe, FastText, CharNGram + +from combo.data import Vocabulary +from combo.models.base import TimeDistributed +from combo.models.dilated_cnn import DilatedCnnEncoder +from 
combo.models.utils import tiny_value_of_dtype +from combo.utils import ConfigurationError + + +class TokenEmbedder(nn.Module): + def __init__(self): + super(TokenEmbedder, self).__init__() + + @property + def output_dim(self) -> int: + raise NotImplementedError() + + def forward(self, + x: torch.Tensor, + char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor: + raise NotImplementedError() + + +class _TorchEmbedder(TokenEmbedder): + def __init__(self, + num_embeddings: int, + embedding_dim: int, + padding_idx: Optional[int] = None, + max_norm: Optional[float] = None, + norm_type: float = 2., + scale_grad_by_freq: bool = False, + sparse: bool = False, + vocab_namespace: str = "tokens", + vocab: Vocabulary = None, + weight: Optional[torch.Tensor] = None, + trainable: bool = True, + projection_dim: Optional[int] = None): + super(_TorchEmbedder, self).__init__() + self._embedding_dim = embedding_dim + self._embedding = nn.Embedding(num_embeddings=num_embeddings, + embedding_dim=embedding_dim, + padding_idx=padding_idx, + max_norm=max_norm, + norm_type=norm_type, + scale_grad_by_freq=scale_grad_by_freq, + sparse=sparse) + self.__vocab_namespace = vocab_namespace + self.__vocab = vocab + + if weight is not None: + if weight.shape() != (num_embeddings, embedding_dim): + raise ConfigurationError( + "Weight matrix must be of shape (num_embeddings, embedding_dim)." + + f"Got: ({weight.shape()})" + ) + + self.__weight = torch.nn.Parameter(weight, requires_grad=trainable) + else: + self.__weight = torch.nn.Parameter(torch.FloatTensor(num_embeddings, embedding_dim), + requires_grad=trainable) + torch.nn.init.xavier_uniform_(self.__weight) + + if padding_idx is not None: + self.__weight.data[padding_idx].fill_(0) + + if projection_dim: + self._projection = torch.nn.Linear(embedding_dim, projection_dim) + self._output_dim = projection_dim + else: + self._projection = None + self._output_dim = embedding_dim + + @overrides + def output_dim(self) -> int: + return self._output_dim + + @overrides + def forward(self, + x: torch.Tensor, + char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor: + embedded = self._embedding(x) + if self._projection: + projection = self._projection + for p in range(embedded.dim()-2): + projection = TimeDistributed(p) + embedded = projection(embedded) + return embedded + + +class _TorchtextVectorsEmbedder(TokenEmbedder): + """ + Torchtext Vectors object wrapper + """ + + def __init__(self, + torchtext_embedder: Vectors, + lower_case_backup: bool = False): + """ + :param torchtext_embedder: Torchtext Vectors object + :param lower_case_backup: whether to look up the token in the + lower case. Default: False. 
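+
+        Example (a rough usage sketch; assumes the GloVe "6B" vectors are available to
+        torchtext, e.g. already downloaded to its cache):
+
+            embedder = GloVe6BEmbedder(dim=50)
+            vectors = embedder(["the", "cat"])  # FloatTensor of shape (2, 50)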
+ """ + super(_TorchtextVectorsEmbedder, self).__init__() + self.__torchtext_embedder = torchtext_embedder + self.__lower_case_backup = lower_case_backup + + @overrides + def output_dim(self) -> int: + return len(self.__torchtext_embedder) + + @overrides + def forward(self, + x: torch.Tensor, + char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor: + return self.__torchtext_embedder.get_vecs_by_tokens(x, self.__lower_case_backup) + + +class GloVe42BEmbedder(_TorchtextVectorsEmbedder): + def __init__(self, dim: int = 300): + super(GloVe42BEmbedder, self).__init__(GloVe("42B", dim)) + + +class GloVe840BEmbedder(_TorchtextVectorsEmbedder): + def __init__(self, dim: int = 300): + super(GloVe840BEmbedder, self).__init__(GloVe("840B", dim)) + + +class GloVeTwitter27BEmbedder(_TorchtextVectorsEmbedder): + def __init__(self, dim: int = 300): + super(GloVeTwitter27BEmbedder, self).__init__(GloVe("twitter.27B", dim)) + + +class GloVe6BEmbedder(_TorchtextVectorsEmbedder): + def __init__(self, dim: int = 300): + super(GloVe6BEmbedder, self).__init__(GloVe("6B", dim)) + + +class FastTextEmbedder(_TorchtextVectorsEmbedder): + def __init__(self, language: str = "en"): + super(FastTextEmbedder, self).__init__(FastText(language)) + + +class CharNGramEmbedder(_TorchtextVectorsEmbedder): + def __init__(self): + super(CharNGramEmbedder, self).__init__(CharNGram()) + + +class CharacterBasedWordEmbedder(TokenEmbedder): + def __init__(self, + num_embeddings: int, + embedding_dim: int, + dilated_cnn_encoder: DilatedCnnEncoder): + super(CharacterBasedWordEmbedder, self).__init__() + self.__embedding_dim = embedding_dim + self.__dilated_cnn_encoder = dilated_cnn_encoder + self.char_embed = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim) + + @overrides + def output_dim(self) -> int: + return self.__embedding_dim + + @overrides + def forward(self, + x: torch.Tensor, + char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor: + if char_mask is None: + char_mask = x.new_ones(x.size()) + + x = self.char_embed(x) + x = x * char_mask.unsqueeze(-1).float() + x = self.__dilated_cnn_encoder(x.transpose(2, 3)) + return torch.max(x, dim=-1)[0] + + +class PretrainedTransformerMismatchedEmbedder(TokenEmbedder): + pass + + +class TransformersWordEmbedder(PretrainedTransformerMismatchedEmbedder): + pass + + +class FeatsTokenEmbedder(_TorchEmbedder): + def __init__(self, + num_embeddings: int, + embedding_dim: int, + padding_idx: Optional[int] = None, + max_norm: Optional[float] = None, + norm_type: float = 2., + scale_grad_by_freq: bool = False, + sparse: bool = False, + vocab_namespace: str = "feats", + vocab: Vocabulary = None, + weight: Optional[torch.Tensor] = None, + trainable: bool = True): + super(FeatsTokenEmbedder, self).__init__(num_embeddings, + embedding_dim, + padding_idx, + max_norm, + norm_type, + scale_grad_by_freq, + sparse, + vocab_namespace, + vocab, + weight, + trainable) + + @overrides + def forward(self, + x: torch.Tensor, + char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor: + mask = x.gt(0) + x = super().forward(x) + return x.sum(dim=-2)/( + (mask.sum(dim=-1)+tiny_value_of_dtype(torch.float)).unsqueeze(dim=-1) + ) diff --git a/combo/models/graph_parser.py b/combo/models/graph_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..6dffef52a943e4671aea7ac911b5c25981c3051f --- /dev/null +++ b/combo/models/graph_parser.py @@ -0,0 +1,190 @@ +""" +Adapted from COMBO. 
+Author: Mateusz Klimaszewski +""" + +from typing import List, Optional, Union, Tuple, Dict + +from combo import data +from combo.models import base +from combo.models.base import Predictor + +import torch +import torch.nn.functional as F + + +class GraphHeadPredictionModel(Predictor): + """Head prediction model.""" + + def __init__(self, + head_projection_layer: base.Linear, + dependency_projection_layer: base.Linear, + cycle_loss_n: int = 0, + graph_weighting: float = 0.2): + super().__init__() + self.head_projection_layer = head_projection_layer + self.dependency_projection_layer = dependency_projection_layer + self.cycle_loss_n = cycle_loss_n + self.graph_weighting = graph_weighting + + def forward(self, + x: Union[torch.Tensor, List[torch.Tensor]], + labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, + mask: Optional[torch.BoolTensor] = None, + sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]: + if mask is None: + mask = x.new_ones(x.size()[-1]) + heads_labels = None + if labels is not None and labels[0] is not None: + heads_labels = labels + + head_arc_emb = self.head_projection_layer(x) + dep_arc_emb = self.dependency_projection_layer(x) + x = dep_arc_emb.bmm(head_arc_emb.transpose(2, 1)) + pred = x.sigmoid() > 0.5 + + output = { + "prediction": pred, + "probability": x + } + + if heads_labels is not None: + if sample_weights is None: + sample_weights = heads_labels.new_ones([mask.size(0)]) + output["loss"], output["cycle_loss"] = self._loss(x, heads_labels, mask, sample_weights) + + return output + + def _cycle_loss(self, pred: torch.Tensor): + BATCH_SIZE, _, _ = pred.size() + loss = pred.new_zeros(BATCH_SIZE) + # Index from 1: as using non __ROOT__ tokens + pred = pred.softmax(-1)[:, 1:, 1:] + x = pred + for i in range(self.cycle_loss_n): + loss += self._batch_trace(x) + + # Don't multiple on last iteration + if i < self.cycle_loss_n - 1: + x = x.bmm(pred) + + return loss + + @staticmethod + def _batch_trace(x: torch.Tensor) -> torch.Tensor: + assert len(x.size()) == 3 + BATCH_SIZE, N, M = x.size() + assert N == M + identity = x.new_tensor(torch.eye(N)) + identity = identity.reshape((1, N, N)) + batch_identity = identity.repeat(BATCH_SIZE, 1, 1) + return (x * batch_identity).sum((-1, -2)) + + def _loss(self, pred: torch.Tensor, labels: torch.Tensor, mask: torch.BoolTensor, + sample_weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + BATCH_SIZE, N, M = pred.size() + assert N == M + SENTENCE_LENGTH = N + + valid_positions = mask.sum() + + result = [] + true = labels + # Ignore first pred dimension as it is ROOT token prediction + for i in range(SENTENCE_LENGTH - 1): + pred_i = pred[:, i + 1, 1:].reshape(-1) + true_i = true[:, i + 1, 1:].reshape(-1) + mask_i = mask[:, i] + bce_loss = F.binary_cross_entropy_with_logits(pred_i, true_i, reduction="none").mean(-1) * mask_i + result.append(bce_loss) + cycle_loss = self._cycle_loss(pred) + loss = torch.stack(result).transpose(1, 0) * sample_weights.unsqueeze(-1) + return loss.sum() / valid_positions + cycle_loss.mean(), cycle_loss.mean() + + +class GraphDependencyRelationModel(Predictor): + """Dependency relation parsing model.""" + + def __init__(self, + head_predictor: GraphHeadPredictionModel, + head_projection_layer: base.Linear, + dependency_projection_layer: base.Linear, + relation_prediction_layer: base.Linear): + super().__init__() + self.head_predictor = head_predictor + self.head_projection_layer = head_projection_layer + 
self.dependency_projection_layer = dependency_projection_layer + self.relation_prediction_layer = relation_prediction_layer + + def forward(self, + x: Union[torch.Tensor, List[torch.Tensor]], + mask: Optional[torch.BoolTensor] = None, + labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, + sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]: + relations_labels, head_labels, enhanced_heads_labels, enhanced_deprels_labels = None, None, None, None + if labels is not None and labels[0] is not None: + relations_labels, head_labels, enhanced_heads_labels = labels + + head_output = self.head_predictor(x, enhanced_heads_labels, mask, sample_weights) + head_pred = head_output["probability"] + BATCH_SIZE, LENGTH, _ = head_pred.size() + + head_rel_emb = self.head_projection_layer(x) + + dep_rel_emb = self.dependency_projection_layer(x) + + # All possible edges combinations for each batch + # Repeat interleave to have [emb1, emb1 ... (length times) ... emb1, emb2 ... ] + head_rel_pred = head_rel_emb.repeat_interleave(LENGTH, -2) + # Regular repeat to have all combinations [deprel1, deprel2, ... deprelL, deprel1 ...] + dep_rel_pred = dep_rel_emb.repeat(1, LENGTH, 1) + + # All possible edges combinations for each batch + dep_rel_pred = torch.cat((head_rel_pred, dep_rel_pred), dim=-1) + + relation_prediction = self.relation_prediction_layer(dep_rel_pred).reshape(BATCH_SIZE, LENGTH, LENGTH, -1) + output = head_output + + output["prediction"] = (relation_prediction.argmax(-1), head_output["prediction"]) + output["rel_probability"] = relation_prediction + + if labels is not None and labels[0] is not None: + if sample_weights is None: + sample_weights = labels.new_ones([mask.size(0)]) + loss = self._loss(relation_prediction, relations_labels, enhanced_heads_labels, mask, sample_weights) + output["loss"] = (loss, head_output["loss"]) + + return output + + @staticmethod + def _loss(pred: torch.Tensor, + true: torch.Tensor, + heads_true: torch.Tensor, + mask: torch.BoolTensor, + sample_weights: torch.Tensor) -> torch.Tensor: + correct_heads_mask = heads_true.long() == 1 + true = true[correct_heads_mask] + pred = pred[correct_heads_mask] + loss = F.cross_entropy(pred, true.long()) + return loss.sum() / pred.size(0) + + @classmethod + def from_vocab(cls, + vocab: data.Vocabulary, + vocab_namespace: str, + head_predictor: GraphHeadPredictionModel, + head_projection_layer: base.Linear, + dependency_projection_layer: base.Linear + ): + """Creates parser combining model configuration and vocabulary data.""" + assert vocab_namespace in vocab.get_namespaces() + relation_prediction_layer = base.Linear( + in_features=head_projection_layer.get_output_dim() + dependency_projection_layer.get_output_dim(), + out_features=vocab.get_vocab_size(vocab_namespace) + ) + return cls( + head_predictor=head_predictor, + head_projection_layer=head_projection_layer, + dependency_projection_layer=dependency_projection_layer, + relation_prediction_layer=relation_prediction_layer + ) diff --git a/combo/models/lemma.py b/combo/models/lemma.py new file mode 100644 index 0000000000000000000000000000000000000000..d724a1ecb9c22610fc6ac56493929178d7a6cd5a --- /dev/null +++ b/combo/models/lemma.py @@ -0,0 +1,107 @@ +from typing import Optional, Dict, List, Union + +import torch +import torch.nn as nn + +from combo import data +from combo.models import dilated_cnn, base, utils +from combo.models.base import Predictor, TimeDistributed +from combo.models.combo_nn import Activation +from 
combo.utils import ConfigurationError + + +class LemmatizerModel(Predictor): + """Lemmatizer model.""" + + def __init__(self, + num_embeddings: int, + embedding_dim: int, + dilated_cnn_encoder: dilated_cnn.DilatedCnnEncoder, + input_projection_layer: base.Linear): + super().__init__() + self.char_embed = nn.Embedding( + num_embeddings=num_embeddings, + embedding_dim=embedding_dim, + ) + self.dilated_cnn_encoder = TimeDistributed(dilated_cnn_encoder) + self.input_projection_layer = input_projection_layer + + def forward(self, + x: Union[torch.Tensor, List[torch.Tensor]], + mask: Optional[torch.BoolTensor] = None, + labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, + sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]: + encoder_emb, chars = x + + encoder_emb = self.input_projection_layer(encoder_emb) + char_embeddings = self.char_embed(chars) + + BATCH_SIZE, _, MAX_WORD_LENGTH, CHAR_EMB = char_embeddings.size() + encoder_emb = encoder_emb.unsqueeze(2).repeat(1, 1, MAX_WORD_LENGTH, 1) + + x = torch.cat((char_embeddings, encoder_emb), dim=-1).transpose(2, 3) + x = self.dilated_cnn_encoder(x).transpose(2, 3) + output = { + "prediction": x.argmax(-1), + "probability": x + } + + if labels is not None: + if mask is None: + mask = encoder_emb.new_ones(encoder_emb.size()[:-2]) + if sample_weights is None: + sample_weights = labels.new_ones(BATCH_SIZE) + mask = mask.unsqueeze(2).repeat(1, 1, MAX_WORD_LENGTH).bool() + output["loss"] = self._loss(x, labels, mask, sample_weights) + + return output + + @staticmethod + def _loss(pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor, + sample_weights: torch.Tensor) -> torch.Tensor: + BATCH_SIZE, SENTENCE_LENGTH, MAX_WORD_LENGTH, CHAR_CLASSES = pred.size() + pred = pred.reshape(-1, CHAR_CLASSES) + + true = true.reshape(-1) + mask = true.gt(0) + loss = utils.masked_cross_entropy(pred, true, mask) + loss = loss.reshape(BATCH_SIZE, -1) * sample_weights.unsqueeze(-1) + valid_positions = mask.sum() + return loss.sum() / valid_positions + + @classmethod + def from_vocab(cls, + vocab: data.Vocabulary, + char_vocab_namespace: str, + lemma_vocab_namespace: str, + embedding_dim: int, + input_projection_layer: base.Linear, + filters: List[int], + kernel_size: List[int], + stride: List[int], + padding: List[int], + dilation: List[int], + activations: List[Activation], + ): + assert char_vocab_namespace in vocab.get_namespaces() + assert lemma_vocab_namespace in vocab.get_namespaces() + + if len(filters) + 1 != len(kernel_size): + raise ConfigurationError( + f"len(filters) ({len(filters):d}) + 1 != kernel_size ({len(kernel_size):d})" + ) + filters = filters + [vocab.get_vocab_size(lemma_vocab_namespace)] + + dilated_cnn_encoder = dilated_cnn.DilatedCnnEncoder( + input_dim=embedding_dim + input_projection_layer.get_output_dim(), + filters=filters, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + activations=activations, + ) + return cls(num_embeddings=vocab.get_vocab_size(char_vocab_namespace), + embedding_dim=embedding_dim, + dilated_cnn_encoder=dilated_cnn_encoder, + input_projection_layer=input_projection_layer) diff --git a/combo/models/model.py b/combo/models/model.py new file mode 100644 index 0000000000000000000000000000000000000000..9482ea546e69803d1c95815a955e259523bfd80f --- /dev/null +++ b/combo/models/model.py @@ -0,0 +1,464 @@ +""" +Adapted from AllenNLP +https://github.com/allenai/allennlp/blob/main/allennlp/models/model.py +""" + +import logging 
+import os +from os import PathLike +import re +from typing import Dict, List, Set, Type, Optional, Union + +import numpy +import torch + +from combo.common.params import remove_keys_from_params, Params +from combo.data import Vocabulary, Instance +from combo.data.batch import Batch +from combo.nn import util, RegularizerApplicator +from combo.utils import ConfigurationError + +logger = logging.getLogger(__name__) + +# When training a model, many sets of weights are saved. By default we want to +# save/load this set of weights. +_DEFAULT_WEIGHTS = "best.th" + + +class Model(torch.nn.Module): + """ + This abstract class represents a model to be trained. Rather than relying completely + on the Pytorch Module, we modify the output spec of `forward` to be a dictionary. + + Models built using this API are still compatible with other pytorch models and can + be used naturally as modules within other models - outputs are dictionaries, which + can be unpacked and passed into other layers. One caveat to this is that if you + wish to use an AllenNLP model inside a Container (such as nn.Sequential), you must + interleave the models with a wrapper module which unpacks the dictionary into + a list of tensors. + + In order for your model to be trained using the [`Trainer`](../training/trainer.md) + api, the output dictionary of your Model must include a "loss" key, which will be + optimised during the training process. + + Finally, you can optionally implement :func:`Model.get_metrics` in order to make use + of early stopping and best-model serialization based on a validation metric in + `Trainer`. Metrics that begin with "_" will not be logged + to the progress bar by `Trainer`. + + The `from_archive` method on this class is registered as a `Model` with name "from_archive". + So, if you are using a configuration file, you can specify a model as `{"type": "from_archive", + "archive_file": "/path/to/archive.tar.gz"}`, which will pull out the model from the given + location and return it. + + # Parameters + + vocab: `Vocabulary` + There are two typical use-cases for the `Vocabulary` in a `Model`: getting vocabulary sizes + when constructing embedding matrices or output classifiers (as the vocabulary holds the + number of classes in your output, also), and translating model output into human-readable + form. + + In a typical AllenNLP configuration file, this parameter does not get an entry under the + "model", it gets specified as a top-level parameter, then is passed in to the model + separately. + regularizer: `RegularizerApplicator`, optional + If given, the `Trainer` will use this to regularize model parameters. + serialization_dir: `str`, optional + The directory in which the training output is saved to, or the directory the model is loaded from. + """ + + _warn_for_unseparable_batches: Set[str] = set() + default_predictor: Optional[str] = None + + def __init__( + self, + vocab: Vocabulary, + regularizer: RegularizerApplicator = None, + serialization_dir: Optional[str] = None, + ) -> None: + super().__init__() + self.vocab = vocab + self._regularizer = regularizer + self.serialization_dir = serialization_dir + + def get_regularization_penalty(self) -> Optional[torch.Tensor]: + """ + Computes the regularization penalty for the model. + Returns None if the model was not configured to use regularization. 
+ """ + if self._regularizer is None: + regularization_penalty = None + else: + try: + regularization_penalty = self._regularizer(self) + if isinstance(regularization_penalty, float): + assert regularization_penalty == 0.0 + regularization_penalty = torch.tensor(regularization_penalty) + except AssertionError: + raise RuntimeError("The regularizer cannot be a non-zero float.") + return regularization_penalty + + def get_parameters_for_histogram_tensorboard_logging(self) -> List[str]: + """ + Returns the name of model parameters used for logging histograms to tensorboard. + """ + return [name for name, _ in self.named_parameters()] + + def forward(self, *inputs) -> Dict[str, torch.Tensor]: + """ + Defines the forward pass of the model. In addition, to facilitate easy training, + this method is designed to compute a loss function defined by a user. + + The input is comprised of everything required to perform a + training update, `including` labels - you define the signature here! + It is down to the user to ensure that inference can be performed + without the presence of these labels. Hence, any inputs not available at + inference time should only be used inside a conditional block. + + The intended sketch of this method is as follows:: + + def forward(self, input1, input2, targets=None): + .... + .... + output1 = self.layer1(input1) + output2 = self.layer2(input2) + output_dict = {"output1": output1, "output2": output2} + if targets is not None: + # Function returning a scalar torch.Tensor, defined by the user. + loss = self._compute_loss(output1, output2, targets) + output_dict["loss"] = loss + return output_dict + + # Parameters + + *inputs : `Any` + Tensors comprising everything needed to perform a training update, `including` labels, + which should be optional (i.e have a default value of `None`). At inference time, + simply pass the relevant inputs, not including the labels. + + # Returns + + output_dict : `Dict[str, torch.Tensor]` + The outputs from the model. In order to train a model using the + `Trainer` api, you must provide a "loss" key pointing to a + scalar `torch.Tensor` representing the loss to be optimized. + """ + raise NotImplementedError + + def forward_on_instance(self, instance: Instance) -> Dict[str, numpy.ndarray]: + """ + Takes an [`Instance`](../data/instance.md), which typically has raw text in it, converts + that text into arrays using this model's [`Vocabulary`](../data/vocabulary.md), passes those + arrays through `self.forward()` and `self.make_output_human_readable()` (which by default + does nothing) and returns the result. Before returning the result, we convert any + `torch.Tensors` into numpy arrays and remove the batch dimension. + """ + return self.forward_on_instances([instance])[0] + + def forward_on_instances(self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]: + """ + Takes a list of `Instances`, converts that text into arrays using this model's `Vocabulary`, + passes those arrays through `self.forward()` and `self.make_output_human_readable()` (which + by default does nothing) and returns the result. Before returning the result, we convert + any `torch.Tensors` into numpy arrays and separate the batched output into a list of + individual dicts per instance. Note that typically this will be faster on a GPU (and + conditionally, on a CPU) than repeated calls to `forward_on_instance`. + + # Parameters + + instances : `List[Instance]`, required + The instances to run the model on. 
+ + # Returns + + A list of the models output for each instance. + """ + batch_size = len(instances) + with torch.no_grad(): + cuda_device = self._get_prediction_device() + dataset = Batch(instances) + dataset.index_instances(self.vocab) + model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device) + outputs = self.make_output_human_readable(self(**model_input)) + + instance_separated_output: List[Dict[str, numpy.ndarray]] = [ + {} for _ in dataset.instances + ] + for name, output in list(outputs.items()): + if isinstance(output, torch.Tensor): + # NOTE(markn): This is a hack because 0-dim pytorch tensors are not iterable. + # This occurs with batch size 1, because we still want to include the loss in that case. + if output.dim() == 0: + output = output.unsqueeze(0) + + if output.size(0) != batch_size: + self._maybe_warn_for_unseparable_batches(name) + continue + output = output.detach().cpu().numpy() + elif len(output) != batch_size: + self._maybe_warn_for_unseparable_batches(name) + continue + for instance_output, batch_element in zip(instance_separated_output, output): + instance_output[name] = batch_element + return instance_separated_output + + def make_output_human_readable( + self, output_dict: Dict[str, torch.Tensor] + ) -> Dict[str, torch.Tensor]: + """ + Takes the result of `forward` and makes it human readable. Most of the time, the only thing + this method does is convert tokens / predicted labels from tensors to strings that humans + might actually understand. Somtimes you'll also do an argmax or something in here, too, but + that most often happens in `Model.forward`, before you compute your metrics. + + This method `modifies` the input dictionary, and also `returns` the same dictionary. + + By default in the base class we do nothing. + """ + + return output_dict + + def get_metrics(self, reset: bool = False) -> Dict[str, float]: + """ + Returns a dictionary of metrics. This method will be called by + `allennlp.training.Trainer` in order to compute and use model metrics for early + stopping and model serialization. We return an empty dictionary here rather than raising + as it is not required to implement metrics for a new model. A boolean `reset` parameter is + passed, as frequently a metric accumulator will have some state which should be reset + between epochs. This is also compatible with [`Metric`s](../training/metrics/metric.md). Metrics + should be populated during the call to `forward`, with the `Metric` handling the accumulation of + the metric until this method is called. + """ + + return {} + + def _get_prediction_device(self) -> int: + """ + This method checks the device of the model parameters to determine the cuda_device + this model should be run on for predictions. If there are no parameters, it returns -1. + + # Returns + + The cuda device this model should run on for predictions. + """ + devices = {util.get_device_of(param) for param in self.parameters()} + + if len(devices) > 1: + devices_string = ", ".join(str(x) for x in devices) + raise ConfigurationError(f"Parameters have mismatching cuda_devices: {devices_string}") + elif len(devices) == 1: + return devices.pop() + else: + return -1 + + def _maybe_warn_for_unseparable_batches(self, output_key: str): + """ + This method warns once if a user implements a model which returns a dictionary with + values which we are unable to split back up into elements of the batch. This is controlled + by a class attribute `_warn_for_unseperable_batches` because it would be extremely verbose + otherwise. 
+ """ + if output_key not in self._warn_for_unseparable_batches: + logger.warning( + f"Encountered the {output_key} key in the model's return dictionary which " + "couldn't be split by the batch size. Key will be ignored." + ) + # We only want to warn once for this key, + # so we set this to false so we don't warn again. + self._warn_for_unseparable_batches.add(output_key) + + @classmethod + def _load( + cls, + config: Params, + serialization_dir: Union[str, PathLike], + weights_file: Optional[Union[str, PathLike]] = None, + cuda_device: int = -1, + ) -> "Model": + """ + Instantiates an already-trained model, based on the experiment + configuration and some optional overrides. + """ + weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS) + + # Load vocabulary from file + vocab_dir = os.path.join(serialization_dir, "vocabulary") + # If the config specifies a vocabulary subclass, we need to use it. + vocab_params = config.get("vocabulary", Params({})) + vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True) + vocab_class, _ = Vocabulary.resolve_class_name(vocab_choice) + vocab = vocab_class.from_files( + vocab_dir, vocab_params.get("padding_token"), vocab_params.get("oov_token") + ) + + model_params = config.get("model") + + # The experiment config tells us how to _train_ a model, including where to get pre-trained + # embeddings/weights from. We're now _loading_ the model, so those weights will already be + # stored in our model. We don't need any pretrained weight file or initializers anymore, + # and we don't want the code to look for it, so we remove it from the parameters here. + remove_keys_from_params(model_params) + model = Model.from_params( + vocab=vocab, params=model_params, serialization_dir=serialization_dir + ) + + # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are + # in sync with the weights + if cuda_device >= 0: + model.cuda(cuda_device) + else: + model.cpu() + + # If vocab+embedding extension was done, the model initialized from from_params + # and one defined by state dict in weights_file might not have same embedding shapes. + # Eg. when model embedder module was transferred along with vocab extension, the + # initialized embedding weight shape would be smaller than one in the state_dict. + # So calling model embedding extension is required before load_state_dict. + # If vocab and model embeddings are in sync, following would be just a no-op. + model.extend_embedder_vocab() + + # Load state dict. We pass `strict=False` so PyTorch doesn't raise a RuntimeError + # if the state dict is missing keys because we handle this case below. + model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device)) + missing_keys, unexpected_keys = model.load_state_dict(model_state, strict=False) + + # Modules might define a class variable called `authorized_missing_keys`, + # a list of regex patterns, that tells us to ignore missing keys that match + # any of the patterns. + # We sometimes need this in order to load older models with newer versions of AllenNLP. 
+ + def filter_out_authorized_missing_keys(module, prefix=""): + nonlocal missing_keys + for pat in getattr(module.__class__, "authorized_missing_keys", None) or []: + missing_keys = [ + k + for k in missing_keys + if k.startswith(prefix) and re.search(pat[len(prefix) :], k) is None + ] + for name, child in module._modules.items(): + if child is not None: + filter_out_authorized_missing_keys(child, prefix + name + ".") + + filter_out_authorized_missing_keys(model) + + if unexpected_keys or missing_keys: + raise RuntimeError( + f"Error loading state dict for {model.__class__.__name__}\n\t" + f"Missing keys: {missing_keys}\n\t" + f"Unexpected keys: {unexpected_keys}" + ) + + return model + + @classmethod + def load( + cls, + config: Params, + serialization_dir: Union[str, PathLike], + weights_file: Optional[Union[str, PathLike]] = None, + cuda_device: int = -1, + ) -> "Model": + """ + Instantiates an already-trained model, based on the experiment + configuration and some optional overrides. + + # Parameters + + config : `Params` + The configuration that was used to train the model. It should definitely + have a `model` section, and should probably have a `trainer` section + as well. + serialization_dir: `str = None` + The directory containing the serialized weights, parameters, and vocabulary + of the model. + weights_file: `str = None` + By default we load the weights from `best.th` in the serialization + directory, but you can override that value here. + cuda_device: `int = -1` + By default we load the model on the CPU, but if you want to load it + for GPU usage you can specify the id of your GPU here + + # Returns + + model : `Model` + The model specified in the configuration, loaded with the serialized + vocabulary and the trained weights. + """ + + # Peak at the class of the model. + model_type = ( + config["model"] if isinstance(config["model"], str) else config["model"]["type"] + ) + + # Load using an overridable _load method. + # This allows subclasses of Model to override _load. + + model_class: Type[Model] = cls.by_name(model_type) # type: ignore + if not isinstance(model_class, type): + # If you're using from_archive to specify your model (e.g., for fine tuning), then you + # can't currently override the behavior of _load; we just use the default Model._load. + # If we really need to change this, we would need to implement a recursive + # get_model_class method, that recurses whenever it finds a from_archive model type. + model_class = Model + return model_class._load(config, serialization_dir, weights_file, cuda_device) + + def extend_embedder_vocab(self, embedding_sources_mapping: Dict[str, str] = None) -> None: + """ + Iterates through all embedding modules in the model and assures it can embed + with the extended vocab. This is required in fine-tuning or transfer learning + scenarios where model was trained with original vocabulary but during + fine-tuning/transfer-learning, it will have it work with extended vocabulary + (original + new-data vocabulary). + + # Parameters + + embedding_sources_mapping : `Dict[str, str]`, optional (default = `None`) + Mapping from model_path to pretrained-file path of the embedding + modules. If pretrained-file used at time of embedding initialization + isn't available now, user should pass this mapping. Model path is + path traversing the model attributes upto this embedding module. + Eg. "_text_field_embedder.token_embedder_tokens". 
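+            A hypothetical mapping could therefore look like
+            `{"_text_field_embedder.token_embedder_tokens": "/path/to/embeddings.txt"}`,
+            where the file path is purely illustrative.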
+ """ + # self.named_modules() gives all sub-modules (including nested children) + # The path nesting is already separated by ".": eg. parent_module_name.child_module_name + embedding_sources_mapping = embedding_sources_mapping or {} + for model_path, module in self.named_modules(): + if hasattr(module, "extend_vocab"): + pretrained_file = embedding_sources_mapping.get(model_path) + module.extend_vocab( + self.vocab, + extension_pretrained_file=pretrained_file, + model_path=model_path, + ) + + @classmethod + def from_archive(cls, archive_file: str, vocab: Vocabulary = None) -> "Model": + """ + Loads a model from an archive file. This basically just calls + `return archival.load_archive(archive_file).model`. It exists as a method here for + convenience, and so that we can register it for easy use for fine tuning an existing model + from a config file. + + If `vocab` is given, we will extend the loaded model's vocabulary using the passed vocab + object (including calling `extend_embedder_vocab`, which extends embedding layers). + """ + from combo.models.archival import load_archive # here to avoid circular imports + + model = load_archive(archive_file).model + if vocab: + model.vocab.extend_from_vocab(vocab) + model.extend_embedder_vocab() + return model + + +def remove_weights_related_keys_from_params( + params: Params, keys: List[str] = ["pretrained_file", "initializer"] +): + remove_keys_from_params(params, keys) + + +def remove_pretrained_embedding_params(params: Params): + """This function only exists for backwards compatibility. + Please use `remove_weights_related_keys_from_params()` instead.""" + remove_keys_from_params(params, ["pretrained_file"]) diff --git a/combo/models/morpho.py b/combo/models/morpho.py new file mode 100644 index 0000000000000000000000000000000000000000..5fb9545eeec5bc0049a51abb69ba479817410484 --- /dev/null +++ b/combo/models/morpho.py @@ -0,0 +1,103 @@ +""" +Adapted from COMBO +Author: Mateusz Klimaszewski +""" +from typing import Dict, List, Optional, Union +import torch + +from combo import data +from combo.data import dataset +from combo.models import base, utils +from combo.models.combo_nn import Activation +from combo.utils import ConfigurationError + + +class MorphologicalFeatures(base.Predictor): + """Morphological features predicting model.""" + + def __init__(self, feedforward_network: base.FeedForward, slices: Dict[str, List[int]]): + super().__init__() + self.feedforward_network = feedforward_network + self.slices = slices + + def forward(self, + x: Union[torch.Tensor, List[torch.Tensor]], + mask: Optional[torch.BoolTensor] = None, + labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, + sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]: + if mask is None: + mask = x.new_ones(x.size()[:-1]) + + x, feature_maps = self.feedforward_network(x) + + prediction = [] + for _, cat_indices in self.slices.items(): + prediction.append(x[:, :, cat_indices].argmax(dim=-1)) + + output = { + "prediction": torch.stack(prediction, dim=-1), + "probability": x, + "embedding": feature_maps[-1], + } + + if labels is not None: + if sample_weights is None: + sample_weights = labels.new_ones([mask.size(0)]) + output["loss"] = self._loss(x, labels, mask, sample_weights) + + return output + + def _loss(self, pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor, + sample_weights: torch.Tensor) -> torch.Tensor: + assert pred.size() == true.size() + BATCH_SIZE, _, MORPHOLOGICAL_FEATURES = pred.size() + + 
valid_positions = mask.sum() + + pred = pred.reshape(-1, MORPHOLOGICAL_FEATURES) + true = true.reshape(-1, MORPHOLOGICAL_FEATURES) + mask = mask.reshape(-1) + loss = None + loss_func = utils.masked_cross_entropy + for cat, cat_indices in self.slices.items(): + if cat not in ["__PAD__", "_"]: + if loss is None: + loss = loss_func(pred[:, cat_indices], + true[:, cat_indices].argmax(dim=1), + mask) + else: + loss += loss_func(pred[:, cat_indices], + true[:, cat_indices].argmax(dim=1), + mask) + loss = loss.reshape(BATCH_SIZE, -1) * sample_weights.unsqueeze(-1) + return loss.sum() / valid_positions + + @classmethod + def from_vocab(cls, + vocab: data.Vocabulary, + vocab_namespace: str, + input_dim: int, + num_layers: int, + hidden_dims: List[int], + activations: Union[Activation, List[Activation]], + dropout: Union[float, List[float]] = 0.0, + ): + if len(hidden_dims) + 1 != num_layers: + raise ConfigurationError( + f"len(hidden_dims) ({len(hidden_dims):d}) + 1 != num_layers ({num_layers:d})" + ) + + assert vocab_namespace in vocab.get_namespaces() + hidden_dims = hidden_dims + [vocab.get_vocab_size(vocab_namespace)] + + slices = dataset.get_slices_if_not_provided(vocab) + + return cls( + feedforward_network=base.FeedForward( + input_dim=input_dim, + num_layers=num_layers, + hidden_dims=hidden_dims, + activations=activations, + dropout=dropout), + slices=slices + ) diff --git a/combo/models/parser.py b/combo/models/parser.py new file mode 100644 index 0000000000000000000000000000000000000000..42d2efb3944a3e81188d6cf8f673dcb2cfb75002 --- /dev/null +++ b/combo/models/parser.py @@ -0,0 +1,223 @@ +""" +Adapted from COMBO +Author: Mateusz Klimaszewski +""" +from typing import Tuple, Dict, Optional, Union, List + +import numpy as np +import torch +import torch.nn.functional as F + +from combo import data +from combo.models import base, utils +from combo.nn import chu_liu_edmonds + + +class HeadPredictionModel(base.Predictor): + """Head prediction model.""" + + def __init__(self, + head_projection_layer: base.Linear, + dependency_projection_layer: base.Linear, + cycle_loss_n: int = 0): + super().__init__() + self.head_projection_layer = head_projection_layer + self.dependency_projection_layer = dependency_projection_layer + self.cycle_loss_n = cycle_loss_n + + def forward(self, + x: Union[torch.Tensor, List[torch.Tensor]], + mask: Optional[torch.BoolTensor] = None, + labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, + sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]: + if mask is None: + mask = x.new_ones(x.size()[-1]) + + head_arc_emb = self.head_projection_layer(x) + dep_arc_emb = self.dependency_projection_layer(x) + x = dep_arc_emb.bmm(head_arc_emb.transpose(2, 1)) + + if self.training: + pred = x.argmax(-1) + else: + pred = [] + # Adding non existing in mask ROOT to lengths + lengths = mask.data.sum(dim=1).long().cpu().numpy() + 1 + for idx, length in enumerate(lengths): + probs = x[idx, :].softmax(dim=-1).cpu().numpy() + + # We do not want any word to be parent of the root node (ROOT, 0). 
+ # Also setting it to -1 instead of 0 fixes edge case where softmax made all + # but ROOT prediction to EXACTLY 0.0 and it might cause in many ROOT -> word edges) + probs[:, 0] = -1 + heads, _ = chu_liu_edmonds.decode_mst(probs.T, length=length, has_labels=False) + heads[0] = 0 + pred.append(heads) + pred = torch.from_numpy(np.stack(pred)).to(x.device) + + output = { + "prediction": pred[:, 1:], + "probability": x + } + + if labels is not None: + if sample_weights is None: + sample_weights = labels.new_ones([mask.size(0)]) + output["loss"], output["cycle_loss"] = self._loss(x, labels, mask, sample_weights) + + return output + + def _cycle_loss(self, pred: torch.Tensor): + BATCH_SIZE, _, _ = pred.size() + loss = pred.new_zeros(BATCH_SIZE) + # Index from 1: as using non __ROOT__ tokens + pred = pred.softmax(-1)[:, 1:, 1:] + x = pred + for i in range(self.cycle_loss_n): + loss += self._batch_trace(x) + + # Don't multiple on last iteration + if i < self.cycle_loss_n - 1: + x = x.bmm(pred) + + return loss + + @staticmethod + def _batch_trace(x: torch.Tensor) -> torch.Tensor: + assert len(x.size()) == 3 + BATCH_SIZE, N, M = x.size() + assert N == M + identity = x.new_tensor(torch.eye(N)) + identity = identity.reshape((1, N, N)) + batch_identity = identity.repeat(BATCH_SIZE, 1, 1) + return (x * batch_identity).sum((-1, -2)) + + def _loss(self, pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor, + sample_weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + BATCH_SIZE, N, M = pred.size() + assert N == M + SENTENCE_LENGTH = N + + valid_positions = mask.sum() + + result = [] + # Ignore first pred dimension as it is ROOT token prediction + for i in range(SENTENCE_LENGTH - 1): + pred_i = pred[:, i + 1, :].reshape(BATCH_SIZE, SENTENCE_LENGTH) + true_i = true[:, i].reshape(-1) + mask_i = mask[:, i] + cross_entropy_loss = utils.masked_cross_entropy(pred_i, true_i, mask_i) + result.append(cross_entropy_loss) + cycle_loss = self._cycle_loss(pred) + loss = torch.stack(result).transpose(1, 0) * sample_weights.unsqueeze(-1) + return loss.sum() / valid_positions + cycle_loss.mean(), cycle_loss.mean() + + + +class DependencyRelationModel(base.Predictor): + """Dependency relation parsing model.""" + + def __init__(self, + root_idx: int, + head_predictor: HeadPredictionModel, + head_projection_layer: base.Linear, + dependency_projection_layer: base.Linear, + relation_prediction_layer: base.Linear): + super().__init__() + self.root_idx = root_idx + self.head_predictor = head_predictor + self.head_projection_layer = head_projection_layer + self.dependency_projection_layer = dependency_projection_layer + self.relation_prediction_layer = relation_prediction_layer + + def forward(self, + x: Union[torch.Tensor, List[torch.Tensor]], + mask: Optional[torch.BoolTensor] = None, + labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, + sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]: + device = x.device + if mask is not None: + mask = mask[:, 1:] + relations_labels, head_labels = None, None + if labels is not None and labels[0] is not None: + relations_labels, head_labels = labels + if mask is None: + mask = head_labels.new_ones(head_labels.size()) + + head_output = self.head_predictor(x, mask, head_labels, sample_weights) + head_pred = head_output["probability"] + head_pred_soft = F.softmax(head_pred, dim=-1) + + head_rel_emb = self.head_projection_layer(x) + + dep_rel_emb = self.dependency_projection_layer(x) + + dep_rel_pred = 
head_pred_soft.bmm(head_rel_emb) + dep_rel_pred = torch.cat((dep_rel_pred, dep_rel_emb), dim=-1) + relation_prediction = self.relation_prediction_layer(dep_rel_pred) + output = head_output + output["embedding"] = dep_rel_pred + + if self.training: + output["prediction"] = (relation_prediction.argmax(-1)[:, 1:], head_output["prediction"]) + else: + # Mask root label whenever head is not 0. + relation_prediction_output = relation_prediction[:, 1:].clone() + mask = (head_output["prediction"] == 0) + vocab_size = relation_prediction_output.size(-1) + root_idx = torch.tensor([self.root_idx], device=device) + relation_prediction_output[mask] = (relation_prediction_output + .masked_select(mask.unsqueeze(-1)) + .reshape(-1, vocab_size) + .index_fill(-1, root_idx, 10e10)) + relation_prediction_output[~mask] = (relation_prediction_output + .masked_select(~(mask.unsqueeze(-1))) + .reshape(-1, vocab_size) + .index_fill(-1, root_idx, -10e10)) + output["prediction"] = (relation_prediction_output.argmax(-1), head_output["prediction"]) + + if labels is not None and labels[0] is not None: + if sample_weights is None: + sample_weights = labels.new_ones([mask.size(0)]) + loss = self._loss(relation_prediction[:, 1:], relations_labels, mask, sample_weights) + output["loss"] = (loss, head_output["loss"]) + + return output + + @staticmethod + def _loss(pred: torch.Tensor, + true: torch.Tensor, + mask: torch.BoolTensor, + sample_weights: torch.Tensor) -> torch.Tensor: + + valid_positions = mask.sum() + + BATCH_SIZE, _, DEPENDENCY_RELATIONS = pred.size() + pred = pred.reshape(-1, DEPENDENCY_RELATIONS) + true = true.reshape(-1) + mask = mask.reshape(-1) + loss = utils.masked_cross_entropy(pred, true, mask) + loss = loss.reshape(BATCH_SIZE, -1) * sample_weights.unsqueeze(-1) + return loss.sum() / valid_positions + + @classmethod + def from_vocab(cls, + vocab: data.Vocabulary, + vocab_namespace: str, + head_predictor: HeadPredictionModel, + head_projection_layer: base.Linear, + dependency_projection_layer: base.Linear + ): + """Creates parser combining model configuration and vocabulary data.""" + assert vocab_namespace in vocab.get_namespaces() + relation_prediction_layer = base.Linear( + in_features=head_projection_layer.get_output_dim() + dependency_projection_layer.get_output_dim(), + out_features=vocab.get_vocab_size(vocab_namespace) + ) + return cls( + head_predictor=head_predictor, + head_projection_layer=head_projection_layer, + dependency_projection_layer=dependency_projection_layer, + relation_prediction_layer=relation_prediction_layer, + root_idx=vocab.get_token_index("root", vocab_namespace) + ) diff --git a/combo/models/utils.py b/combo/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8366a7b264064fd51473643eed05cb852285891a --- /dev/null +++ b/combo/models/utils.py @@ -0,0 +1,27 @@ +import torch +import torch.nn.functional as F + + +def masked_cross_entropy(pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: + pred = pred + (mask.float().unsqueeze(-1) + 1e-45).log() + return F.cross_entropy(pred, true, reduction="none") * mask + + +""" +Adapted from AllenNLP +""" +def tiny_value_of_dtype(dtype: torch.dtype): + """ + Returns a moderately tiny value for a given PyTorch data type that is used to avoid numerical + issues such as division by zero. + This is different from `info_value_of_dtype(dtype).tiny` because it causes some NaN bugs. + Only supports floating point dtypes. 
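+    For example, a division can be made safe as
+    `x / (x.sum() + tiny_value_of_dtype(x.dtype))` (an illustrative pattern, not a
+    call made elsewhere in this module); the returned constant is 1e-13 for
+    float and double and 1e-4 for half precision.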
+ """ + if not dtype.is_floating_point: + raise TypeError("Only supports floating point dtypes.") + if dtype == torch.float or dtype == torch.double: + return 1e-13 + elif dtype == torch.half: + return 1e-4 + else: + raise TypeError("Does not support dtype " + str(dtype)) \ No newline at end of file diff --git a/combo/modules/seq2seq_encoder.py b/combo/modules/seq2seq_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..71413f3c5f0539caf337e10e7c05c09e50dd5c19 --- /dev/null +++ b/combo/modules/seq2seq_encoder.py @@ -0,0 +1,33 @@ +class Seq2SeqEncoder: + """ + A `Seq2SeqEncoder` is a `Module` that takes as input a sequence of vectors and returns a + modified sequence of vectors. Input shape : `(batch_size, sequence_length, input_dim)`; output + shape : `(batch_size, sequence_length, output_dim)`. + + We add two methods to the basic `Module` API: `get_input_dim()` and `get_output_dim()`. + You might need this if you want to construct a `Linear` layer using the output of this encoder, + or to raise sensible errors for mis-matching input dimensions. + """ + + def get_input_dim(self) -> int: + """ + Returns the dimension of the vector input for each element in the sequence input + to a `Seq2SeqEncoder`. This is `not` the shape of the input tensor, but the + last element of that shape. + """ + raise NotImplementedError + + def get_output_dim(self) -> int: + """ + Returns the dimension of each vector in the sequence output by this `Seq2SeqEncoder`. + This is `not` the shape of the returned tensor, but the last element of that shape. + """ + raise NotImplementedError + + def is_bidirectional(self) -> bool: + """ + Returns `True` if this encoder is bidirectional. If so, we assume the forward direction + of the encoder is the first half of the final dimension, and the backward direction is the + second half. + """ + raise NotImplementedError diff --git a/combo/nn/regularizers/regularizer_applicator.py b/combo/nn/regularizers/regularizer_applicator.py new file mode 100644 index 0000000000000000000000000000000000000000..ae8d52d61a3615a6f45dab8c77166f7d4741b740 --- /dev/null +++ b/combo/nn/regularizers/regularizer_applicator.py @@ -0,0 +1,40 @@ +import re +from typing import List, Tuple + +import torch + +from combo.nn.regularizers import Regularizer + + +class RegularizerApplicator: + """ + Applies regularizers to the parameters of a Module based on regex matches. + """ + + def __init__(self, regexes: List[Tuple[str, Regularizer]] = None) -> None: + """ + # Parameters + regexes : `List[Tuple[str, Regularizer]]`, optional (default = `None`) + A sequence of pairs (regex, Regularizer), where each Regularizer + applies to the parameters its regex matches (and that haven't previously + been matched). + """ + self._regularizers = regexes or [] + + def __call__(self, module: torch.nn.Module) -> torch.Tensor: + """ + # Parameters + module : `torch.nn.Module`, required + The module to regularize. + """ + accumulator = 0.0 + for name, parameter in module.named_parameters(): + # We first check if the parameter needs gradient updates or not + if parameter.requires_grad: + # For each parameter find the first matching regex. 
+ for regex, regularizer in self._regularizers: + if re.search(regex, name): + penalty = regularizer(parameter) + accumulator = accumulator + penalty + break + return accumulator diff --git a/combo/nn/util.py b/combo/nn/util.py new file mode 100644 index 0000000000000000000000000000000000000000..69c8d017760606cfdbf4b09999d813e1932b71f7 --- /dev/null +++ b/combo/nn/util.py @@ -0,0 +1,259 @@ +""" +Adapted from AllenNLP +https://github.com/allenai/allennlp/blob/80fb6061e568cb9d6ab5d45b661e86eb61b92c82/allennlp/nn/util.py +""" +from typing import Union, Dict, Optional, List, Any + +import torch + +from combo.common.util import int_to_device +from combo.utils import ConfigurationError + + +def move_to_device(obj, device: Union[torch.device, int]): + """ + Given a structure (possibly) containing Tensors, + move all the Tensors to the specified device (or do nothing, if they are already on + the target device). + """ + device = int_to_device(device) + + if isinstance(obj, torch.Tensor): + # You may be wondering why we don't just always call `obj.to(device)` since that would + # be a no-op anyway if `obj` is already on `device`. Well that works fine except + # when PyTorch is not compiled with CUDA support, in which case even calling + # `obj.to(torch.device("cpu"))` would result in an error. + return obj if obj.device == device else obj.to(device=device) + elif isinstance(obj, dict): + for key, value in obj.items(): + obj[key] = move_to_device(value, device) + return obj + elif isinstance(obj, list): + for i, item in enumerate(obj): + obj[i] = move_to_device(item, device) + return obj + elif isinstance(obj, tuple) and hasattr(obj, "_fields"): + # This is the best way to detect a NamedTuple, it turns out. + return obj.__class__(*(move_to_device(item, device) for item in obj)) + elif isinstance(obj, tuple): + return tuple(move_to_device(item, device) for item in obj) + else: + return obj + + +def device_mapping(cuda_device: int): + """ + In order to `torch.load()` a GPU-trained model onto a CPU (or specific GPU), + you have to supply a `map_location` function. Call this with + the desired `cuda_device` to get the function that `torch.load()` needs. + """ + + def inner_device_mapping(storage: torch.Storage, location) -> torch.Storage: + if cuda_device >= 0: + return storage.cuda(cuda_device) + else: + return storage + + return inner_device_mapping + + +def get_lengths_from_binary_sequence_mask(mask: torch.BoolTensor) -> torch.LongTensor: + """ + Compute sequence lengths for each batch element in a tensor using a + binary mask. + # Parameters + mask : `torch.BoolTensor`, required. + A 2D binary mask of shape (batch_size, sequence_length) to + calculate the per-batch sequence lengths from. + # Returns + `torch.LongTensor` + A torch.LongTensor of shape (batch_size,) representing the lengths + of the sequences in the batch. + """ + return mask.sum(-1) + + +def sort_batch_by_length(tensor: torch.Tensor, sequence_lengths: torch.Tensor): + """ + Sort a batch first tensor by some specified lengths. + # Parameters + tensor : `torch.FloatTensor`, required. + A batch first Pytorch tensor. + sequence_lengths : `torch.LongTensor`, required. + A tensor representing the lengths of some dimension of the tensor which + we want to sort by. + # Returns + sorted_tensor : `torch.FloatTensor` + The original tensor sorted along the batch dimension with respect to sequence_lengths. + sorted_sequence_lengths : `torch.LongTensor` + The original sequence_lengths sorted by decreasing size. 
+ restoration_indices : `torch.LongTensor` + Indices into the sorted_tensor such that + `sorted_tensor.index_select(0, restoration_indices) == original_tensor` + permutation_index : `torch.LongTensor` + The indices used to sort the tensor. This is useful if you want to sort many + tensors using the same ordering. + """ + + if not isinstance(tensor, torch.Tensor) or not isinstance(sequence_lengths, torch.Tensor): + raise ConfigurationError("Both the tensor and sequence lengths must be torch.Tensors.") + + sorted_sequence_lengths, permutation_index = sequence_lengths.sort(0, descending=True) + sorted_tensor = tensor.index_select(0, permutation_index) + + index_range = torch.arange(0, len(sequence_lengths), device=sequence_lengths.device) + # This is the equivalent of zipping with index, sorting by the original + # sequence lengths and returning the now sorted indices. + _, reverse_mapping = permutation_index.sort(0, descending=False) + restoration_indices = index_range.index_select(0, reverse_mapping) + return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index + +def get_text_field_mask( + text_field_tensors: Dict[str, Dict[str, torch.Tensor]], + num_wrapping_dims: int = 0, + padding_id: int = 0, +) -> torch.BoolTensor: + """ + Takes the dictionary of tensors produced by a `TextField` and returns a mask + with 0 where the tokens are padding, and 1 otherwise. `padding_id` specifies the id of padding tokens. + We also handle `TextFields` wrapped by an arbitrary number of `ListFields`, where the number of wrapping + `ListFields` is given by `num_wrapping_dims`. + If `num_wrapping_dims == 0`, the returned mask has shape `(batch_size, num_tokens)`. + If `num_wrapping_dims > 0` then the returned mask has `num_wrapping_dims` extra + dimensions, so the shape will be `(batch_size, ..., num_tokens)`. + There could be several entries in the tensor dictionary with different shapes (e.g., one for + word ids, one for character ids). In order to get a token mask, we use the tensor in + the dictionary with the lowest number of dimensions. After subtracting `num_wrapping_dims`, + if this tensor has two dimensions we assume it has shape `(batch_size, ..., num_tokens)`, + and use it for the mask. If instead it has three dimensions, we assume it has shape + `(batch_size, ..., num_tokens, num_features)`, and sum over the last dimension to produce + the mask. Most frequently this will be a character id tensor, but it could also be a + featurized representation of each token, etc. + If the input `text_field_tensors` contains the "mask" key, this is returned instead of inferring the mask. + """ + masks = [] + for indexer_name, indexer_tensors in text_field_tensors.items(): + if "mask" in indexer_tensors: + masks.append(indexer_tensors["mask"].bool()) + if len(masks) == 1: + return masks[0] + elif len(masks) > 1: + # TODO(mattg): My guess is this will basically never happen, so I'm not writing logic to + # handle it. Should be straightforward to handle, though. If you see this error in + # practice, open an issue on github. 
+ raise ValueError("found two mask outputs; not sure which to use!") + + tensor_dims = [ + (tensor.dim(), tensor) + for indexer_output in text_field_tensors.values() + for tensor in indexer_output.values() + ] + tensor_dims.sort(key=lambda x: x[0]) + + smallest_dim = tensor_dims[0][0] - num_wrapping_dims + if smallest_dim == 2: + token_tensor = tensor_dims[0][1] + return token_tensor != padding_id + elif smallest_dim == 3: + character_tensor = tensor_dims[0][1] + return (character_tensor != padding_id).any(dim=-1) + else: + raise ValueError("Expected a tensor with dimension 2 or 3, found {}".format(smallest_dim)) + + +def get_dropout_mask(dropout_probability: float, tensor_for_masking: torch.Tensor): + """ + Computes and returns an element-wise dropout mask for a given tensor, where + each element in the mask is dropped out with probability dropout_probability. + Note that the mask is NOT applied to the tensor - the tensor is passed to retain + the correct CUDA tensor type for the mask. + # Parameters + dropout_probability : `float`, required. + Probability of dropping a dimension of the input. + tensor_for_masking : `torch.Tensor`, required. + # Returns + `torch.FloatTensor` + A torch.FloatTensor consisting of the binary mask scaled by 1/ (1 - dropout_probability). + This scaling ensures expected values and variances of the output of applying this mask + and the original tensor are the same. + """ + binary_mask = (torch.rand(tensor_for_masking.size()) > dropout_probability).to( + tensor_for_masking.device + ) + # Scale mask by 1/keep_prob to preserve output statistics. + dropout_mask = binary_mask.float().div(1.0 - dropout_probability) + return dropout_mask + + +def find_embedding_layer(model: torch.nn.Module) -> torch.nn.Module: + """ + Takes a model (typically an AllenNLP `Model`, but this works for any `torch.nn.Module`) and + makes a best guess about which module is the embedding layer. For typical AllenNLP models, + this often is the `TextFieldEmbedder`, but if you're using a pre-trained contextualizer, we + really want layer 0 of that contextualizer, not the output. So there are a bunch of hacks in + here for specific pre-trained contextualizers. + """ + # We'll look for a few special cases in a first pass, then fall back to just finding a + # TextFieldEmbedder in a second pass if we didn't find a special case. + from transformers.models.gpt2.modeling_gpt2 import GPT2Model + from transformers.models.bert.modeling_bert import BertEmbeddings + from transformers.models.albert.modeling_albert import AlbertEmbeddings + from transformers.models.roberta.modeling_roberta import RobertaEmbeddings + + for module in model.modules(): + if isinstance(module, BertEmbeddings): + return module.word_embeddings + if isinstance(module, RobertaEmbeddings): + return module.word_embeddings + if isinstance(module, AlbertEmbeddings): + return module.word_embeddings + if isinstance(module, GPT2Model): + return module.wte + + return None + + # for module in model.modules(): + # if isinstance(module, TextFieldEmbedder): + # + # if isinstance(module, BasicTextFieldEmbedder): + # # We'll have a check for single Embedding cases, because we can be more efficient + # # in cases like this. If this check fails, then for something like hotflip we need + # # to actually run the text field embedder and construct a vector for each token. 
+ # if len(module._token_embedders) == 1: + # embedder = list(module._token_embedders.values())[0] + # if isinstance(embedder, Embedding): + # if embedder._projection is None: + # # If there's a projection inside the Embedding, then we need to return + # # the whole TextFieldEmbedder, because there's more computation that + # # needs to be done than just multiply by an embedding matrix. + # return embedder + # return module + raise RuntimeError("No embedding module found!") + + + +def get_token_offsets_from_text_field_inputs( + text_field_inputs: List[Any], +) -> Optional[torch.Tensor]: + """ + Given a list of inputs to a TextFieldEmbedder, tries to find token offsets from those inputs, if + there are any. You will have token offsets if you are using a mismatched token embedder; if + you're not, the return value from this function should be None. This function is intended to be + called from a `forward_hook` attached to a `TextFieldEmbedder`, so the inputs are formatted just + as a list. + It's possible in theory that you could have multiple offsets as inputs to a single call to a + `TextFieldEmbedder`, but that's an extremely rare use case (I can't really imagine anyone + wanting to do that). In that case, we'll only return the first one. If you need different + behavior for your model, open an issue on github describing what you're doing. + """ + for input_index, text_field_input in enumerate(text_field_inputs): + if not isinstance(text_field_input, dict): + continue + for input_value in text_field_input.values(): + if not isinstance(input_value, dict): + continue + for embedder_arg_name, embedder_arg_value in input_value.items(): + if embedder_arg_name == "offsets": + return embedder_arg_value + return None + diff --git a/combo/training/trainer.py b/combo/training/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..ebb8ff0c3d33f593a8cf80cdd7720d41a8189cc1 --- /dev/null +++ b/combo/training/trainer.py @@ -0,0 +1,13 @@ +from pytorch_lightning import Trainer + + +class Callback: + pass + + +class TransferPatienceEpochCallback: + pass + + +class GradientDescentTrainer(Trainer): + pass diff --git a/main.py b/main.py new file mode 100644 index 0000000000000000000000000000000000000000..5596b44786f04e4810aefe9f8d712f08ed310f71 --- /dev/null +++ b/main.py @@ -0,0 +1,16 @@ +# This is a sample Python script. + +# Press Shift+F10 to execute it or replace it with your code. +# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings. + + +def print_hi(name): + # Use a breakpoint in the code line below to debug your script. + print(f'Hi, {name}') # Press Ctrl+F8 to toggle the breakpoint. + + +# Press the green button in the gutter to run the script. +if __name__ == '__main__': + print_hi('PyCharm') + +# See PyCharm help at https://www.jetbrains.com/help/pycharm/