Commit 74ff65e0 authored by Maja Jablonska

Minor fixes

parent c24d252e
1 merge request: !46 Merge COMBO 3.0 into master
from typing import Dict, Optional, List, Union, Tuple
import torch
import torch.nn as nn
from overrides import overrides
from combo.nn import Activation
import combo.utils.checks as checks
from combo.data.vocabulary import Vocabulary
from combo.models.utils import masked_cross_entropy
from combo.predictors.predictor import Predictor
class Linear(nn.Linear):
def __init__(self,
in_features: int,
out_features: int,
activation: Optional[Activation] = None,
dropout_rate: Optional[float] = 0.0):
super().__init__(in_features, out_features)
self.activation = activation if activation else self.identity
self.dropout = nn.Dropout(p=dropout_rate) if dropout_rate else self.identity
def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
x = super().forward(x)
x = self.activation(x)
return self.dropout(x)
def get_output_dim(self) -> int:
return self.out_features
@staticmethod
def identity(x):
return x
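# Example (sketch, assuming any torch.nn.Module activation is acceptable here,
# as the FeedForward docstring below suggests):
#
#     layer = Linear(10, 5, activation=torch.nn.ReLU(), dropout_rate=0.1)
#     y = layer(torch.randn(8, 10))   # -> (8, 5): affine map, ReLU, then dropout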
class FeedForward(torch.nn.Module):
"""
Modified copy of allennlp.modules.feedforward.FeedForward
This `Module` is a feed-forward neural network, just a sequence of `Linear` layers with
activation functions in between.
# Parameters
input_dim : `int`, required
The dimensionality of the input. We assume the input has shape `(batch_size, input_dim)`.
num_layers : `int`, required
The number of `Linear` layers to apply to the input.
hidden_dims : `Union[int, List[int]]`, required
The output dimension of each of the `Linear` layers. If this is a single `int`, we use
it for all `Linear` layers. If it is a `List[int]`, `len(hidden_dims)` must be
`num_layers`.
activations : `Union[Activation, List[Activation]]`, required
The activation function to use after each `Linear` layer. If this is a single function,
we use it after all `Linear` layers. If it is a `List[Activation]`,
        `len(activations)` must be `num_layers`. Each activation must be a `torch.nn.Module`.
dropout : `Union[float, List[float]]`, optional (default = `0.0`)
If given, we will apply this amount of dropout after each layer. Semantics of `float`
versus `List[float]` is the same as with other parameters.
# Examples
```python
FeedForward(124, 2, [64, 32], torch.nn.ReLU(), 0.2)
#> FeedForward(
#> (_activations): ModuleList(
#> (0): ReLU()
#> (1): ReLU()
#> )
#> (_linear_layers): ModuleList(
#> (0): Linear(in_features=124, out_features=64, bias=True)
#> (1): Linear(in_features=64, out_features=32, bias=True)
#> )
#> (_dropout): ModuleList(
#> (0): Dropout(p=0.2, inplace=False)
#> (1): Dropout(p=0.2, inplace=False)
#> )
#> )
```
"""
def __init__(
self,
input_dim: int,
num_layers: int,
hidden_dims: Union[int, List[int]],
activations: Union[Activation, List[Activation]],
dropout: Union[float, List[float]] = 0.0,
) -> None:
super().__init__()
if not isinstance(hidden_dims, list):
hidden_dims = [hidden_dims] * num_layers # type: ignore
if not isinstance(activations, list):
activations = [activations] * num_layers # type: ignore
if not isinstance(dropout, list):
dropout = [dropout] * num_layers # type: ignore
if len(hidden_dims) != num_layers:
raise checks.ConfigurationError(
"len(hidden_dims) (%d) != num_layers (%d)" % (len(hidden_dims), num_layers)
)
if len(activations) != num_layers:
raise checks.ConfigurationError(
"len(activations) (%d) != num_layers (%d)" % (len(activations), num_layers)
)
if len(dropout) != num_layers:
raise checks.ConfigurationError(
"len(dropout) (%d) != num_layers (%d)" % (len(dropout), num_layers)
)
self._activations = torch.nn.ModuleList(activations)
input_dims = [input_dim] + hidden_dims[:-1]
linear_layers = []
for layer_input_dim, layer_output_dim in zip(input_dims, hidden_dims):
linear_layers.append(torch.nn.Linear(layer_input_dim, layer_output_dim))
self._linear_layers = torch.nn.ModuleList(linear_layers)
dropout_layers = [torch.nn.Dropout(p=value) for value in dropout]
self._dropout = torch.nn.ModuleList(dropout_layers)
self._output_dim = hidden_dims[-1]
self.input_dim = input_dim
def get_output_dim(self):
return self._output_dim
def get_input_dim(self):
return self.input_dim
def forward(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
output = inputs
feature_maps = []
for layer, activation, dropout in zip(
self._linear_layers, self._activations, self._dropout
):
feature_maps.append(output)
output = dropout(activation(layer(output)))
return output, feature_maps
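# Example (sketch): unlike allennlp's FeedForward, forward() returns both the
# final output and the inputs seen by each layer ("feature maps"):
#
#     ff = FeedForward(input_dim=124, num_layers=2, hidden_dims=[64, 32],
#                      activations=torch.nn.ReLU(), dropout=0.2)
#     out, feature_maps = ff(torch.randn(8, 124))
#     # out: (8, 32); feature_maps[0]: (8, 124), feature_maps[1]: (8, 64)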
class FeedForwardPredictor(Predictor):
"""Feedforward predictor. Should be used on top of Seq2Seq encoder."""
def __init__(self, feedforward_network: "FeedForward"):
super().__init__()
self.feedforward_network = feedforward_network
def forward(self,
x: Union[torch.Tensor, List[torch.Tensor]],
mask: Optional[torch.BoolTensor] = None,
labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
if mask is None:
mask = x.new_ones(x.size()[:-1])
x, feature_maps = self.feedforward_network(x)
output = {
"prediction": x.argmax(-1),
"probability": x,
"embedding": feature_maps[-1],
}
if labels is not None:
if sample_weights is None:
sample_weights = labels.new_ones([mask.size(0)])
output["loss"] = self._loss(x, labels, mask, sample_weights)
return output
def _loss(self,
pred: torch.Tensor,
true: torch.Tensor,
mask: torch.BoolTensor,
sample_weights: torch.Tensor) -> torch.Tensor:
BATCH_SIZE, _, CLASSES = pred.size()
valid_positions = mask.sum()
pred = pred.reshape(-1, CLASSES)
true = true.reshape(-1)
mask = mask.reshape(-1)
loss = masked_cross_entropy(pred, true, mask)
loss = loss.reshape(BATCH_SIZE, -1) * sample_weights.unsqueeze(-1)
return loss.sum() / valid_positions
@classmethod
def from_vocab(cls,
vocab: Vocabulary,
vocab_namespace: str,
input_dim: int,
num_layers: int,
hidden_dims: List[int],
activations: Union[Activation, List[Activation]],
dropout: Union[float, List[float]] = 0.0,
):
if len(hidden_dims) + 1 != num_layers:
raise checks.ConfigurationError(
f"len(hidden_dims) ({len(hidden_dims):d}) + 1 != num_layers ({num_layers:d})"
)
        assert vocab_namespace in vocab.get_namespaces(), \
            f"There is no '{vocab_namespace}' namespace in the vocabulary; check whether this field has any values to predict!"
hidden_dims = hidden_dims + [vocab.get_vocab_size(vocab_namespace)]
return cls(FeedForward(
input_dim=input_dim,
num_layers=num_layers,
hidden_dims=hidden_dims,
activations=activations,
dropout=dropout))
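# Example (sketch, with a hypothetical vocabulary `vocab` containing an
# "upostag" namespace): from_vocab appends the namespace size as the final
# hidden dimension, so `hidden_dims` lists only the intermediate layers:
#
#     predictor = FeedForwardPredictor.from_vocab(
#         vocab, "upostag",
#         input_dim=768, num_layers=2, hidden_dims=[128],
#         activations=torch.nn.ReLU())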
"""
Adapted from AllenNLP
"""
class TimeDistributed(torch.nn.Module):
"""
Given an input shaped like `(batch_size, time_steps, [rest])` and a `Module` that takes
inputs like `(batch_size, [rest])`, `TimeDistributed` reshapes the input to be
`(batch_size * time_steps, [rest])`, applies the contained `Module`, then reshapes it back.
Note that while the above gives shapes with `batch_size` first, this `Module` also works if
`batch_size` is second - we always just combine the first two dimensions, then split them.
It also reshapes keyword arguments unless they are not tensors or their name is specified in
the optional `pass_through` iterable.
"""
def __init__(self, module):
super().__init__()
self._module = module
@overrides
    def forward(self, *inputs, pass_through: Optional[List[str]] = None, **kwargs):
pass_through = pass_through or []
reshaped_inputs = [self._reshape_tensor(input_tensor) for input_tensor in inputs]
# Need some input to then get the batch_size and time_steps.
some_input = None
if inputs:
some_input = inputs[-1]
reshaped_kwargs = {}
for key, value in kwargs.items():
if isinstance(value, torch.Tensor) and key not in pass_through:
if some_input is None:
some_input = value
value = self._reshape_tensor(value)
reshaped_kwargs[key] = value
reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs)
if some_input is None:
raise RuntimeError("No input tensor to time-distribute")
# Now get the output back into the right shape.
# (batch_size, time_steps, **output_size)
new_size = some_input.size()[:2] + reshaped_outputs.size()[1:]
outputs = reshaped_outputs.contiguous().view(new_size)
return outputs
@staticmethod
def _reshape_tensor(input_tensor):
input_size = input_tensor.size()
if len(input_size) <= 2:
raise RuntimeError(f"No dimension to distribute: {input_size}")
# Squash batch_size and time_steps into a single axis; result has shape
# (batch_size * time_steps, **input_size).
squashed_shape = [-1] + list(input_size[2:])
return input_tensor.contiguous().view(*squashed_shape)
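# Example (sketch): wrap a module that expects (batch_size, features) input so
# it can consume (batch_size, time_steps, features):
#
#     td = TimeDistributed(torch.nn.Linear(10, 5))
#     out = td(torch.randn(4, 7, 10))   # -> (4, 7, 5)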
from typing import Optional
import torch
from overrides import overrides
from torch import nn
from torchtext.vocab import Vectors, GloVe, FastText, CharNGram
from combo.data import Vocabulary
from combo.models.base import TimeDistributed
from combo.models.dilated_cnn import DilatedCnnEncoder
from combo.models.utils import tiny_value_of_dtype
from combo.utils import ConfigurationError
class TokenEmbedder(nn.Module):
def __init__(self):
super(TokenEmbedder, self).__init__()
@property
def output_dim(self) -> int:
raise NotImplementedError()
def forward(self,
x: torch.Tensor,
char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
raise NotImplementedError()
class _TorchEmbedder(TokenEmbedder):
def __init__(self,
num_embeddings: int,
embedding_dim: int,
padding_idx: Optional[int] = None,
max_norm: Optional[float] = None,
norm_type: float = 2.,
scale_grad_by_freq: bool = False,
sparse: bool = False,
vocab_namespace: str = "tokens",
vocab: Vocabulary = None,
weight: Optional[torch.Tensor] = None,
trainable: bool = True,
projection_dim: Optional[int] = None):
super(_TorchEmbedder, self).__init__()
self._embedding_dim = embedding_dim
self._embedding = nn.Embedding(num_embeddings=num_embeddings,
embedding_dim=embedding_dim,
padding_idx=padding_idx,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
sparse=sparse)
self.__vocab_namespace = vocab_namespace
self.__vocab = vocab
if weight is not None:
            if weight.shape != (num_embeddings, embedding_dim):
                raise ConfigurationError(
                    "Weight matrix must be of shape (num_embeddings, embedding_dim). " +
                    f"Got: {weight.shape}"
                )
self.__weight = torch.nn.Parameter(weight, requires_grad=trainable)
else:
self.__weight = torch.nn.Parameter(torch.FloatTensor(num_embeddings, embedding_dim),
requires_grad=trainable)
torch.nn.init.xavier_uniform_(self.__weight)
if padding_idx is not None:
self.__weight.data[padding_idx].fill_(0)
if projection_dim:
self._projection = torch.nn.Linear(embedding_dim, projection_dim)
self._output_dim = projection_dim
else:
self._projection = None
self._output_dim = embedding_dim
    @property
    @overrides
    def output_dim(self) -> int:
        return self._output_dim
@overrides
def forward(self,
x: torch.Tensor,
char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
embedded = self._embedding(x)
        if self._projection:
            projection = self._projection
            # Wrap the projection once per extra leading dimension so the linear
            # map is applied to the last axis only.
            for _ in range(embedded.dim() - 2):
                projection = TimeDistributed(projection)
            embedded = projection(embedded)
return embedded
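# Example (sketch, using this internal class directly for illustration): with a
# projection, lookups are mapped from embedding_dim down to projection_dim,
# time-distributed over the leading dimensions:
#
#     emb = _TorchEmbedder(num_embeddings=100, embedding_dim=16, projection_dim=8)
#     out = emb(torch.randint(0, 100, (4, 7)))   # -> (4, 7, 8)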
class _TorchtextVectorsEmbedder(TokenEmbedder):
"""
Torchtext Vectors object wrapper
"""
def __init__(self,
torchtext_embedder: Vectors,
lower_case_backup: bool = False):
"""
:param torchtext_embedder: Torchtext Vectors object
:param lower_case_backup: whether to look up the token in the
lower case. Default: False.
"""
super(_TorchtextVectorsEmbedder, self).__init__()
self.__torchtext_embedder = torchtext_embedder
self.__lower_case_backup = lower_case_backup
    @property
    @overrides
    def output_dim(self) -> int:
        # torchtext Vectors expose their dimensionality as `dim`; len() would
        # count cached tokens instead.
        return self.__torchtext_embedder.dim
@overrides
def forward(self,
x: torch.Tensor,
char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
return self.__torchtext_embedder.get_vecs_by_tokens(x, self.__lower_case_backup)
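# Note (assumption): torchtext's `get_vecs_by_tokens` does string lookups, so
# despite the `torch.Tensor` annotation above, `x` is expected to be a token
# string or a list of token strings at call time.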
class GloVe42BEmbedder(_TorchtextVectorsEmbedder):
def __init__(self, dim: int = 300):
super(GloVe42BEmbedder, self).__init__(GloVe("42B", dim))
class GloVe840BEmbedder(_TorchtextVectorsEmbedder):
def __init__(self, dim: int = 300):
super(GloVe840BEmbedder, self).__init__(GloVe("840B", dim))
class GloVeTwitter27BEmbedder(_TorchtextVectorsEmbedder):
    def __init__(self, dim: int = 200):
        # twitter.27B vectors ship in 25/50/100/200 dimensions (300 is unavailable)
        super(GloVeTwitter27BEmbedder, self).__init__(GloVe("twitter.27B", dim))
class GloVe6BEmbedder(_TorchtextVectorsEmbedder):
def __init__(self, dim: int = 300):
super(GloVe6BEmbedder, self).__init__(GloVe("6B", dim))
class FastTextEmbedder(_TorchtextVectorsEmbedder):
def __init__(self, language: str = "en"):
super(FastTextEmbedder, self).__init__(FastText(language))
class CharNGramEmbedder(_TorchtextVectorsEmbedder):
def __init__(self):
super(CharNGramEmbedder, self).__init__(CharNGram())
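# Example (sketch): the torchtext-backed wrappers download their pretrained
# vectors on first use, then serve per-token lookups:
#
#     embedder = GloVe6BEmbedder(dim=100)   # fetches glove.6B.100d if not cached
#     vectors = embedder(["the", "cat"])    # -> (2, 100) tensor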
class CharacterBasedWordEmbedder(TokenEmbedder):
def __init__(self,
num_embeddings: int,
embedding_dim: int,
dilated_cnn_encoder: DilatedCnnEncoder):
super(CharacterBasedWordEmbedder, self).__init__()
self.__embedding_dim = embedding_dim
self.__dilated_cnn_encoder = dilated_cnn_encoder
self.char_embed = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
    @property
    @overrides
    def output_dim(self) -> int:
        return self.__embedding_dim

    @overrides
    def forward(self,
                x: torch.Tensor,
                char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
        if char_mask is None:
            char_mask = x.new_ones(x.size())
        # (batch, words, chars) character ids -> (batch, words, chars, embedding_dim)
        x = self.char_embed(x)
        # Zero out the embeddings of padding characters.
        x = x * char_mask.unsqueeze(-1).float()
        # Convolve over the character axis and max-pool it away, yielding one
        # fixed-size vector per word.
        x = self.__dilated_cnn_encoder(x.transpose(2, 3))
        return torch.max(x, dim=-1)[0]
class PretrainedTransformerMismatchedEmbedder(TokenEmbedder):
pass
class TransformersWordEmbedder(PretrainedTransformerMismatchedEmbedder):
pass
class FeatsTokenEmbedder(_TorchEmbedder):
def __init__(self,
num_embeddings: int,
embedding_dim: int,
padding_idx: Optional[int] = None,
max_norm: Optional[float] = None,
norm_type: float = 2.,
scale_grad_by_freq: bool = False,
sparse: bool = False,
vocab_namespace: str = "feats",
vocab: Vocabulary = None,
weight: Optional[torch.Tensor] = None,
trainable: bool = True):
super(FeatsTokenEmbedder, self).__init__(num_embeddings,
embedding_dim,
padding_idx,
max_norm,
norm_type,
scale_grad_by_freq,
sparse,
vocab_namespace,
vocab,
weight,
trainable)
@overrides
    def forward(self,
                x: torch.Tensor,
                char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
        # Each token carries several feature ids; average their embeddings,
        # counting only the non-padding (id > 0) slots in the denominator.
        mask = x.gt(0)
        x = super().forward(x)
        return x.sum(dim=-2) / (
            (mask.sum(dim=-1) + tiny_value_of_dtype(torch.float)).unsqueeze(dim=-1)
        )
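# Example (sketch): each token row holds several morphological feature ids.
# Passing padding_idx=0 keeps the padding row at zero so it does not shift the sum:
#
#     embedder = FeatsTokenEmbedder(num_embeddings=50, embedding_dim=8, padding_idx=0)
#     feats = torch.tensor([[[3, 7, 0]]])   # (batch=1, tokens=1, feature slots=3)
#     out = embedder(feats)                 # -> (1, 1, 8), mean of the embeddings of ids 3 and 7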
@@ -6,19 +6,20 @@ Author: Mateusz Klimaszewski
 from typing import List, Optional, Union, Tuple, Dict
 from combo import data
-from combo.models import base
-from combo.models.base import Predictor
+from combo.predictors import Predictor
 import torch
 import torch.nn.functional as F
+from combo.nn.base import Linear

 class GraphHeadPredictionModel(Predictor):
     """Head prediction model."""

     def __init__(self,
-                 head_projection_layer: base.Linear,
-                 dependency_projection_layer: base.Linear,
+                 head_projection_layer: Linear,
+                 dependency_projection_layer: Linear,
                  cycle_loss_n: int = 0,
                  graph_weighting: float = 0.2):
         super().__init__()
@@ -107,9 +108,9 @@ class GraphDependencyRelationModel(Predictor):
     def __init__(self,
                  head_predictor: GraphHeadPredictionModel,
-                 head_projection_layer: base.Linear,
-                 dependency_projection_layer: base.Linear,
-                 relation_prediction_layer: base.Linear):
+                 head_projection_layer: Linear,
+                 dependency_projection_layer: Linear,
+                 relation_prediction_layer: Linear):
         super().__init__()
         self.head_predictor = head_predictor
         self.head_projection_layer = head_projection_layer
@@ -173,12 +174,12 @@ class GraphDependencyRelationModel(Predictor):
                    vocab: data.Vocabulary,
                    vocab_namespace: str,
                    head_predictor: GraphHeadPredictionModel,
-                   head_projection_layer: base.Linear,
-                   dependency_projection_layer: base.Linear
+                   head_projection_layer: Linear,
+                   dependency_projection_layer: Linear
                    ):
         """Creates parser combining model configuration and vocabulary data."""
         assert vocab_namespace in vocab.get_namespaces()
-        relation_prediction_layer = base.Linear(
+        relation_prediction_layer = Linear(
             in_features=head_projection_layer.get_output_dim() + dependency_projection_layer.get_output_dim(),
             out_features=vocab.get_vocab_size(vocab_namespace)
         )
...
from typing import Optional, Dict, List, Union
import torch
import torch.nn as nn
from combo import data
from combo.models import dilated_cnn, base, utils
from combo.models.base import Predictor, TimeDistributed
from combo.nn import Activation
from combo.utils import ConfigurationError
class LemmatizerModel(Predictor):
"""Lemmatizer model."""
def __init__(self,
num_embeddings: int,
embedding_dim: int,
dilated_cnn_encoder: dilated_cnn.DilatedCnnEncoder,
input_projection_layer: base.Linear):
super().__init__()
self.char_embed = nn.Embedding(
num_embeddings=num_embeddings,
embedding_dim=embedding_dim,
)
self.dilated_cnn_encoder = TimeDistributed(dilated_cnn_encoder)
self.input_projection_layer = input_projection_layer
def forward(self,
x: Union[torch.Tensor, List[torch.Tensor]],
mask: Optional[torch.BoolTensor] = None,
labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
        encoder_emb, chars = x
        # encoder_emb: (batch_size, sentence_length, encoder_dim) word context
        # chars:       (batch_size, sentence_length, max_word_length) character ids
        encoder_emb = self.input_projection_layer(encoder_emb)
        char_embeddings = self.char_embed(chars)
        BATCH_SIZE, _, MAX_WORD_LENGTH, CHAR_EMB = char_embeddings.size()
        # Broadcast each word's context vector to all of its character positions
        # and concatenate it with the character embeddings.
        encoder_emb = encoder_emb.unsqueeze(2).repeat(1, 1, MAX_WORD_LENGTH, 1)
        x = torch.cat((char_embeddings, encoder_emb), dim=-1).transpose(2, 3)
        x = self.dilated_cnn_encoder(x).transpose(2, 3)
output = {
"prediction": x.argmax(-1),
"probability": x
}
if labels is not None:
if mask is None:
mask = encoder_emb.new_ones(encoder_emb.size()[:-2])
if sample_weights is None:
sample_weights = labels.new_ones(BATCH_SIZE)
mask = mask.unsqueeze(2).repeat(1, 1, MAX_WORD_LENGTH).bool()
output["loss"] = self._loss(x, labels, mask, sample_weights)
return output
@staticmethod
def _loss(pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor,
sample_weights: torch.Tensor) -> torch.Tensor:
BATCH_SIZE, SENTENCE_LENGTH, MAX_WORD_LENGTH, CHAR_CLASSES = pred.size()
pred = pred.reshape(-1, CHAR_CLASSES)
true = true.reshape(-1)
        # Recompute the mask at character level: padding characters (id 0)
        # are excluded from the loss.
        mask = true.gt(0)
loss = utils.masked_cross_entropy(pred, true, mask)
loss = loss.reshape(BATCH_SIZE, -1) * sample_weights.unsqueeze(-1)
valid_positions = mask.sum()
return loss.sum() / valid_positions
@classmethod
def from_vocab(cls,
vocab: data.Vocabulary,
char_vocab_namespace: str,
lemma_vocab_namespace: str,
embedding_dim: int,
input_projection_layer: base.Linear,
filters: List[int],
kernel_size: List[int],
stride: List[int],
padding: List[int],
dilation: List[int],
activations: List[Activation],
):
assert char_vocab_namespace in vocab.get_namespaces()
assert lemma_vocab_namespace in vocab.get_namespaces()
        if len(filters) + 1 != len(kernel_size):
            raise ConfigurationError(
                f"len(filters) ({len(filters):d}) + 1 != len(kernel_size) ({len(kernel_size):d})"
            )
filters = filters + [vocab.get_vocab_size(lemma_vocab_namespace)]
dilated_cnn_encoder = dilated_cnn.DilatedCnnEncoder(
input_dim=embedding_dim + input_projection_layer.get_output_dim(),
filters=filters,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
activations=activations,
)
return cls(num_embeddings=vocab.get_vocab_size(char_vocab_namespace),
embedding_dim=embedding_dim,
dilated_cnn_encoder=dilated_cnn_encoder,
input_projection_layer=input_projection_layer)
"""
Adapted from COMBO
Author: Mateusz Klimaszewski
"""
from typing import Dict, List, Optional, Union
import torch
from combo import data
from combo.data import dataset
from combo.models import base, utils
from combo.nn import Activation
from combo.utils import ConfigurationError
class MorphologicalFeatures(base.Predictor):
"""Morphological features predicting model."""
def __init__(self, feedforward_network: base.FeedForward, slices: Dict[str, List[int]]):
super().__init__()
self.feedforward_network = feedforward_network
self.slices = slices
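    # `slices` maps each morphological category (e.g. "Case", "Number") to the
    # column indices of its block in the network output; predictions and losses
    # are computed independently per category.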
def forward(self,
x: Union[torch.Tensor, List[torch.Tensor]],
mask: Optional[torch.BoolTensor] = None,
labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
if mask is None:
mask = x.new_ones(x.size()[:-1])
x, feature_maps = self.feedforward_network(x)
prediction = []
for _, cat_indices in self.slices.items():
prediction.append(x[:, :, cat_indices].argmax(dim=-1))
output = {
"prediction": torch.stack(prediction, dim=-1),
"probability": x,
"embedding": feature_maps[-1],
}
if labels is not None:
if sample_weights is None:
sample_weights = labels.new_ones([mask.size(0)])
output["loss"] = self._loss(x, labels, mask, sample_weights)
return output
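    # _loss sums per-category cross-entropies: for each category block S_c it
    # computes CE(pred[:, S_c], argmax(true[:, S_c])) over unmasked positions,
    # applies per-sentence sample weights, and normalizes by the number of
    # valid token positions.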
def _loss(self, pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor,
sample_weights: torch.Tensor) -> torch.Tensor:
assert pred.size() == true.size()
BATCH_SIZE, _, MORPHOLOGICAL_FEATURES = pred.size()
valid_positions = mask.sum()
pred = pred.reshape(-1, MORPHOLOGICAL_FEATURES)
true = true.reshape(-1, MORPHOLOGICAL_FEATURES)
mask = mask.reshape(-1)
loss = None
loss_func = utils.masked_cross_entropy
for cat, cat_indices in self.slices.items():
if cat not in ["__PAD__", "_"]:
if loss is None:
loss = loss_func(pred[:, cat_indices],
true[:, cat_indices].argmax(dim=1),
mask)
else:
loss += loss_func(pred[:, cat_indices],
true[:, cat_indices].argmax(dim=1),
mask)
loss = loss.reshape(BATCH_SIZE, -1) * sample_weights.unsqueeze(-1)
return loss.sum() / valid_positions
@classmethod
def from_vocab(cls,
vocab: data.Vocabulary,
vocab_namespace: str,
input_dim: int,
num_layers: int,
hidden_dims: List[int],
activations: Union[Activation, List[Activation]],
dropout: Union[float, List[float]] = 0.0,
):
if len(hidden_dims) + 1 != num_layers:
raise ConfigurationError(
f"len(hidden_dims) ({len(hidden_dims):d}) + 1 != num_layers ({num_layers:d})"
)
assert vocab_namespace in vocab.get_namespaces()
hidden_dims = hidden_dims + [vocab.get_vocab_size(vocab_namespace)]
slices = dataset.get_slices_if_not_provided(vocab)
return cls(
feedforward_network=base.FeedForward(
input_dim=input_dim,
num_layers=num_layers,
hidden_dims=hidden_dims,
activations=activations,
dropout=dropout),
slices=slices
)
"""
Adapted from AllenNLP
"""
from typing import List, Optional
import torch
from overrides import overrides
from combo.config.registry import Registry
from combo.config.from_parameters import FromParameters, register_arguments
@Registry.register('time_distributed')
class TimeDistributed(torch.nn.Module, FromParameters):
"""
Given an input shaped like `(batch_size, time_steps, [rest])` and a `Module` that takes
inputs like `(batch_size, [rest])`, `TimeDistributed` reshapes the input to be
`(batch_size * time_steps, [rest])`, applies the contained `Module`, then reshapes it back.
Note that while the above gives shapes with `batch_size` first, this `Module` also works if
`batch_size` is second - we always just combine the first two dimensions, then split them.
It also reshapes keyword arguments unless they are not tensors or their name is specified in
the optional `pass_through` iterable.
"""
@register_arguments
def __init__(self, module):
super().__init__()
self._module = module
@overrides
    def forward(self, *inputs, pass_through: Optional[List[str]] = None, **kwargs):
pass_through = pass_through or []
reshaped_inputs = [self._reshape_tensor(input_tensor) for input_tensor in inputs]
# Need some input to then get the batch_size and time_steps.
some_input = None
if inputs:
some_input = inputs[-1]
reshaped_kwargs = {}
for key, value in kwargs.items():
if isinstance(value, torch.Tensor) and key not in pass_through:
if some_input is None:
some_input = value
value = self._reshape_tensor(value)
reshaped_kwargs[key] = value
reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs)
if some_input is None:
raise RuntimeError("No input tensor to time-distribute")
# Now get the output back into the right shape.
# (batch_size, time_steps, **output_size)
new_size = some_input.size()[:2] + reshaped_outputs.size()[1:]
outputs = reshaped_outputs.contiguous().view(new_size)
return outputs
@staticmethod
def _reshape_tensor(input_tensor):
input_size = input_tensor.size()
if len(input_size) <= 2:
raise RuntimeError(f"No dimension to distribute: {input_size}")
# Squash batch_size and time_steps into a single axis; result has shape
# (batch_size * time_steps, **input_size).
squashed_shape = [-1] + list(input_size[2:])
return input_tensor.contiguous().view(*squashed_shape)
@@ -12,7 +12,7 @@ from combo.nn import base
 from combo.nn.activations import Activation
 from combo.nn.utils import masked_cross_entropy
 from combo.utils import ConfigurationError
-from combo.models.base import TimeDistributed
+from combo.models.time_distributed import TimeDistributed
 from combo.predictors import Predictor
...
@@ -15,7 +15,7 @@ from combo.modules.text_field_embedders.text_field_embedder import TextFieldEmbe
 from combo.modules.token_embedders import EmptyEmbedder
 from combo.modules.token_embedders.token_embedder import TokenEmbedder
 from combo.utils import ConfigurationError
-from combo.models.base import TimeDistributed
+from combo.models.time_distributed import TimeDistributed

 @Registry.register("base_text_field_embedder")
...
@@ -12,7 +12,7 @@ from combo.data import Vocabulary
 from combo.nn.utils import tiny_value_of_dtype, uncombine_initial_dims, combine_initial_dims
 from combo.modules.module import Module
 from combo.utils import ConfigurationError
-from combo.models.base import TimeDistributed
+from models.time_distributed import TimeDistributed

 class TokenEmbedder(Module, FromParameters):
...
@@ -17,9 +17,3 @@ class SpacyTokenizerTest(unittest.TestCase):
         tokens = self.spacy_tokenizer.tokenize('')
         self.assertEqual(len(tokens), 0)
-
-    # def test_batch_tokenize_sentence(self):
-    #     tokens = self.spacy_tokenizer.batch_tokenize(['First sentence!', 'This is the second sentence.'])
-    #     self.assertListEqual([t.text for t in tokens[0]],
-    #                          ['First', 'sentence', '!'])
-    #     self.assertListEqual([t.text for t in tokens[1]],
-    #                          ['This', 'is', 'the', 'second', 'sentence', '.'])