diff --git a/combo/commands/__init__.py b/combo/commands/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..83ff4f4b48489406a73f3a902f0ab28eb62572b1 100644
--- a/combo/commands/__init__.py
+++ b/combo/commands/__init__.py
@@ -0,0 +1 @@
+from .train import FinetuningTrainModel
\ No newline at end of file
diff --git a/combo/commands/train.py b/combo/commands/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4bb80729a4875e48574ac94d43b36660559fe39
--- /dev/null
+++ b/combo/commands/train.py
@@ -0,0 +1,10 @@
+from pytorch_lightning import Trainer
+
+
+class FinetuningTrainModel(Trainer):
+    """
+    Class intended only for fine-tuning;
+    the only difference is that it saves the vocabulary built from the
+    concatenated (archive and current) datasets.
+    """
+    pass
\ No newline at end of file
diff --git a/combo/data/dataset.py b/combo/data/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b16c149b7abd9e2aba6ac50a4b0b71740d4f5d7
--- /dev/null
+++ b/combo/data/dataset.py
@@ -0,0 +1,273 @@
+import copy
+import logging
+import pathlib
+from dataclasses import dataclass
+from typing import List, Any, Dict, Iterable, Optional, Tuple
+
+import conllu
+import torch
+from overrides import overrides
+
+from combo import data
+from combo.data import Vocabulary, fields, Instance, Token, TokenizerToken
+from combo.data.dataset_readers.dataset_reader import DatasetReader
+from combo.data.fields import Field
+from combo.data.fields.adjacency_field import AdjacencyField
+from combo.data.fields.metadata_field import MetadataField
+from combo.data.fields.sequence_label_field import SequenceLabelField
+from combo.data.fields.text_field import TextField
+from combo.data.token_indexers import TokenIndexer
+from combo.models import parser
+from combo.utils import checks, pad_sequence_to_length
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(init=False, repr=False)
+class _Token(TokenizerToken):
+    __slots__ = TokenizerToken.__slots__ + ['feats_']
+
+    feats_: Optional[str]
+
+    def __init__(self, text: str = None, idx: int = None, idx_end: int = None, lemma_: str = None, pos_: str = None,
+                 tag_: str = None, dep_: str = None, ent_type_: str = None, text_id: int = None, type_id: int = None,
+                 feats_: str = None) -> None:
+        super().__init__(text, idx, idx_end, lemma_, pos_, tag_, dep_, ent_type_, text_id, type_id)
+        self.feats_ = feats_
+
+
+class UniversalDependenciesDatasetReader(DatasetReader):
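+    """Dataset reader for CoNLL-U files.
+
+    A usage sketch (the indexer configuration is assumed, not prescribed;
+    `read` also accepts several comma-separated paths):
+
+    ```python
+    reader = UniversalDependenciesDatasetReader(
+        token_indexers={"token": token_indexer, "char": char_indexer})
+    for instance in reader.read("train.conllu,extra.conllu"):
+        process(instance)
+    ```
+    """
+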
+    def __init__(
+            self,
+            token_indexers: Dict[str, TokenIndexer] = None,
+            lemma_indexers: Dict[str, TokenIndexer] = None,
+            features: List[str] = None,
+            targets: List[str] = None,
+            use_sem: bool = False,
+            **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        if features is None:
+            features = ["token", "char"]
+        if targets is None:
+            targets = ["head", "deprel", "upostag", "xpostag", "lemma", "feats"]
+
+        if "token" not in features and "char" not in features:
+            raise checks.ConfigurationError("There must be at least one ('char' or 'token') text-based feature!")
+
+        if "deps" in targets and not ("head" in targets and "deprel" in targets):
+            raise checks.ConfigurationError("Add 'head' and 'deprel' to targets when using 'deps'!")
+
+        intersection = set(features).intersection(set(targets))
+        if len(intersection) != 0:
+            raise checks.ConfigurationError(
+                "Features and targets cannot share elements! "
+                "Remove {} from either features or targets.".format(intersection)
+            )
+        self.use_sem = use_sem
+
+        # *.conllu readers configuration
+        field_names = list(parser.DEFAULT_FIELDS)
+        field_names[1] = "token"  # use 'token' instead of 'form'
+        # Copy the defaults so the shared module-level dict is not mutated.
+        field_parsers = dict(parser.DEFAULT_FIELD_PARSERS)
+        # Do not make xpostag nullable
+        field_parsers.pop("xpostag", None)
+        # Do not parse misc
+        field_parsers.pop("misc", None)
+        if self.use_sem:
+            field_names.append("semrel")
+            field_parsers["semrel"] = lambda line, i: line[i]
+        self.field_parsers = field_parsers
+        self.fields = tuple(field_names)
+
+        self._token_indexers = token_indexers or {}
+        self._lemma_indexers = lemma_indexers or {}
+        self._targets = targets
+        self._features = features
+        self.generate_labels = True
+        # Filter out token indexers that are not listed in features
+        # to avoid the "Mismatched token keys" ConfigurationError.
+        for indexer_name in list(self._token_indexers.keys()):
+            if indexer_name not in self._features:
+                del self._token_indexers[indexer_name]
+
+    @overrides
+    def _read(self, file_path: str) -> Iterable[Instance]:
+        file_path = [file_path] if len(file_path.split(",")) == 0 else file_path.split(",")
+
+        for conllu_file in file_path:
+            file = pathlib.Path(conllu_file)
+            assert conllu_file and file.exists(), f"File with path '{conllu_file}' does not exists!"
+            with file.open("r", encoding="utf-8") as f:
+                for annotation in conllu.parse_incr(f, fields=self.fields, field_parsers=self.field_parsers):
+                    yield self.text_to_instance(annotation)
+
+    # NOTE: @overrides is disabled: the signature differs from the base class, so the
+    # decorator raises "TypeError: ...text_to_instance: `inputs` must be present".
+    # @overrides
+    def text_to_instance(self, tree: conllu.TokenList) -> Instance:
+        fields_: Dict[str, Field] = {}
+        tree_tokens = [t for t in tree if isinstance(t["id"], int)]
+        tokens = [_Token(t["token"],
+                         pos_=t.get("upostag"),
+                         tag_=t.get("xpostag"),
+                         lemma_=t.get("lemma"),
+                         feats_=t.get("feats"))
+                  for t in tree_tokens]
+
+        # features
+        text_field = TextField(tokens, self._token_indexers)
+        fields_["sentence"] = text_field
+
+        # targets
+        if self.generate_labels:
+            for target_name in self._targets:
+                if target_name != "sent":
+                    target_values = [t[target_name] for t in tree_tokens]
+                    if target_name == "lemma":
+                        target_values = [TokenizerToken(v) for v in target_values]
+                        fields_[target_name] = TextField(target_values, self._lemma_indexers)
+                    elif target_name == "feats":
+                        target_values = self._feat_values(tree_tokens)
+                        fields_[target_name] = fields.SequenceMultiLabelField(target_values,
+                                                                              self._feats_indexer,
+                                                                              self._feats_as_tensor_wrapper,
+                                                                              text_field,
+                                                                              label_namespace="feats_labels")
+                    elif target_name == "head":
+                        target_values = [0 if v == "_" else int(v) for v in target_values]
+                        fields_[target_name] = SequenceLabelField(target_values, text_field,
+                                                                  label_namespace=target_name + "_labels")
+                    elif target_name == "deps":
+                        # Graphs require adding ROOT (AdjacencyField uses sequence length from TextField).
+                        text_field_deps = TextField([_Token("ROOT")] + copy.deepcopy(tokens), self._token_indexers)
+                        enhanced_heads: List[Tuple[int, int]] = []
+                        enhanced_deprels: List[str] = []
+                        for idx, t in enumerate(tree_tokens):
+                            t_deps = t["deps"]
+                            if t_deps and t_deps != "_":
+                                for rel, head in t_deps:
+                                    # EmoryNLP skips the first edge, if there are two edges between the same
+                                    # nodes. Thanks to that one is in a tree and another in a graph.
+                                    # This snippet follows that approach.
+                                    if enhanced_heads and enhanced_heads[-1] == (idx, head):
+                                        enhanced_heads.pop()
+                                        enhanced_deprels.pop()
+                                    enhanced_heads.append((idx, head))
+                                    enhanced_deprels.append(rel)
+                        fields_["enhanced_heads"] = AdjacencyField(
+                            indices=enhanced_heads,
+                            sequence_field=text_field_deps,
+                            label_namespace="enhanced_heads_labels",
+                            padding_value=0,
+                        )
+                        fields_["enhanced_deprels"] = AdjacencyField(
+                            indices=enhanced_heads,
+                            sequence_field=text_field_deps,
+                            labels=enhanced_deprels,
+                            # Enhanced deprels get their own label namespace.
+                            label_namespace="enhanced_deprel_labels",
+                            padding_value=0,
+                        )
+                    else:
+                        fields_[target_name] = SequenceLabelField(target_values, text_field,
+                                                                  label_namespace=target_name + "_labels")
+
+        # Restore feats fields to their string representation;
+        # parser.serialize_field doesn't handle keys without values.
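+        # e.g. {"Case": "Acc", "Abbr": None} is restored to "Case=Acc|Abbr".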
+        for token in tree.tokens:
+            if "feats" in token:
+                feats = token["feats"]
+                if feats:
+                    feats_values = []
+                    for k, v in feats.items():
+                        feats_values.append('='.join((k, v)) if v else k)
+                    field = "|".join(feats_values)
+                else:
+                    field = "_"
+                token["feats"] = field
+
+        # metadata
+        fields_["metadata"] = MetadataField({"input": tree,
+                                             "field_names": self.fields,
+                                             "tokens": tokens})
+
+        return Instance(fields_)
+
+    @staticmethod
+    def _feat_values(tree: List[Dict[str, Any]]):
+        features = []
+        for token in tree:
+            token_features = []
+            if token["feats"] is not None:
+                for feat, value in token["feats"].items():
+                    if feat in ["_", "__ROOT__"]:
+                        pass
+                    else:
+                        # Handle case where feature is binary (doesn't have associated value)
+                        if value:
+                            token_features.append(feat + "=" + value)
+                        else:
+                            token_features.append(feat)
+            features.append(token_features)
+        return features
+
+    @staticmethod
+    def _feats_as_tensor_wrapper(field: fields.SequenceMultiLabelField):
+        def as_tensor(padding_lengths):
+            desired_num_tokens = padding_lengths["num_tokens"]
+            assert len(field._indexed_multi_labels) > 0
+            classes_count = len(field._indexed_multi_labels[0])
+            default_value = [0.0] * classes_count
+            padded_tags = pad_sequence_to_length(field._indexed_multi_labels, desired_num_tokens,
+                                                 lambda: default_value)
+            tensor = torch.tensor(padded_tags, dtype=torch.long)
+            return tensor
+
+        return as_tensor
+
+    @staticmethod
+    def _feats_indexer(vocab: Vocabulary):
+        label_namespace = "feats_labels"
+        vocab_size = vocab.get_vocab_size(label_namespace)
+        slices = get_slices_if_not_provided(vocab)
+
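+        # A worked sketch: with categories {"Case", "Number"} and
+        # multi_label == ["Case=Acc"], the encoding puts a 1 at the index of
+        # "Case=Acc" and a 1 at the index of "Number=None" - one 1 per category.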
+        def _m_from_n_ones_encoding(multi_label: List[str], sentence_length: int) -> List[int]:
+            one_hot_encoding = [0] * vocab_size
+            for cat, cat_indices in slices.items():
+                if cat not in ["__PAD__", "_"]:
+                    label_from_cat = [label for label in multi_label if cat == label.split("=")[0]]
+                    if label_from_cat:
+                        label_from_cat = label_from_cat[0]
+                        index = vocab.get_token_index(label_from_cat, label_namespace)
+                    else:
+                        # Get Cat=None index
+                        index = vocab.get_token_index(cat + "=None", label_namespace)
+                    one_hot_encoding[index] = 1
+            return one_hot_encoding
+
+        return _m_from_n_ones_encoding
+
+
+def get_slices_if_not_provided(vocab: data.Vocabulary):
+    if hasattr(vocab, "slices"):
+        return vocab.slices
+
+    if "feats_labels" in vocab.get_namespaces():
+        idx2token = vocab.get_index_to_token_vocabulary("feats_labels")
+        for _, v in dict(idx2token).items():
+            if v not in ["_", "__PAD__"]:
+                empty_value = v.split("=")[0] + "=None"
+                vocab.add_token_to_namespace(empty_value, "feats_labels")
+
+        slices = {}
+        for idx, name in vocab.get_index_to_token_vocabulary("feats_labels").items():
+            # There are 2 types of features: with a value (Case=Acc) or without one (None).
+            # Here we group their indices by name (the part before the assignment sign).
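+            # e.g. slices == {"Case": [0, 5, 9], "Number": [1, 2]} (indices assumed)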
+            name = name.split("=")[0]
+            if name in slices:
+                slices[name].append(idx)
+            else:
+                slices[name] = [idx]
+        vocab.slices = slices
+        return vocab.slices
diff --git a/combo/models/base.py b/combo/models/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..45eae041affc66f1bd34e3224b893e22285d0028
--- /dev/null
+++ b/combo/models/base.py
@@ -0,0 +1,274 @@
+from typing import Dict, Optional, List, Union, Tuple
+
+import torch
+import torch.nn as nn
+from overrides import overrides
+
+from combo.models.combo_nn import Activation
+import combo.utils.checks as checks
+from combo.data.vocabulary import Vocabulary
+from combo.models.utils import masked_cross_entropy
+from combo.predictors.predictor import Predictor
+
+
+class Linear(nn.Linear):
+    def __init__(self,
+                 in_features: int,
+                 out_features: int,
+                 activation: Optional[Activation] = None,
+                 dropout_rate: Optional[float] = 0.0):
+        super().__init__(in_features, out_features)
+        self.activation = activation if activation else self.identity
+        self.dropout = nn.Dropout(p=dropout_rate) if dropout_rate else self.identity
+
+    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        x = super().forward(x)
+        x = self.activation(x)
+        return self.dropout(x)
+
+    def get_output_dim(self) -> int:
+        return self.out_features
+
+    @staticmethod
+    def identity(x):
+        return x
+
+
+class FeedForward(torch.nn.Module):
+    """
+    Modified copy of allennlp.modules.feedforward.FeedForward
+
+    This `Module` is a feed-forward neural network, just a sequence of `Linear` layers with
+    activation functions in between.
+
+    # Parameters
+
+    input_dim : `int`, required
+        The dimensionality of the input.  We assume the input has shape `(batch_size, input_dim)`.
+    num_layers : `int`, required
+        The number of `Linear` layers to apply to the input.
+    hidden_dims : `Union[int, List[int]]`, required
+        The output dimension of each of the `Linear` layers.  If this is a single `int`, we use
+        it for all `Linear` layers.  If it is a `List[int]`, `len(hidden_dims)` must be
+        `num_layers`.
+    activations : `Union[Activation, List[Activation]]`, required
+        The activation function to use after each `Linear` layer.  If this is a single function,
+        we use it after all `Linear` layers.  If it is a `List[Activation]`,
+        `len(activations)` must be `num_layers`. Activation must have torch.nn.Module type.
+    dropout : `Union[float, List[float]]`, optional (default = `0.0`)
+        If given, we will apply this amount of dropout after each layer.  Semantics of `float`
+        versus `List[float]` is the same as with other parameters.
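+
+    Unlike the AllenNLP original, `forward` returns a tuple of the final output
+    and the list of each layer's input (the "feature maps").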
+
+    # Examples
+
+    ```python
+    FeedForward(124, 2, [64, 32], torch.nn.ReLU(), 0.2)
+    #> FeedForward(
+    #>   (_activations): ModuleList(
+    #>     (0): ReLU()
+    #>     (1): ReLU()
+    #>   )
+    #>   (_linear_layers): ModuleList(
+    #>     (0): Linear(in_features=124, out_features=64, bias=True)
+    #>     (1): Linear(in_features=64, out_features=32, bias=True)
+    #>   )
+    #>   (_dropout): ModuleList(
+    #>     (0): Dropout(p=0.2, inplace=False)
+    #>     (1): Dropout(p=0.2, inplace=False)
+    #>   )
+    #> )
+    ```
+    """
+
+    def __init__(
+            self,
+            input_dim: int,
+            num_layers: int,
+            hidden_dims: Union[int, List[int]],
+            activations: Union[Activation, List[Activation]],
+            dropout: Union[float, List[float]] = 0.0,
+    ) -> None:
+
+        super().__init__()
+        if not isinstance(hidden_dims, list):
+            hidden_dims = [hidden_dims] * num_layers  # type: ignore
+        if not isinstance(activations, list):
+            activations = [activations] * num_layers  # type: ignore
+        if not isinstance(dropout, list):
+            dropout = [dropout] * num_layers  # type: ignore
+        if len(hidden_dims) != num_layers:
+            raise checks.ConfigurationError(
+                "len(hidden_dims) (%d) != num_layers (%d)" % (len(hidden_dims), num_layers)
+            )
+        if len(activations) != num_layers:
+            raise checks.ConfigurationError(
+                "len(activations) (%d) != num_layers (%d)" % (len(activations), num_layers)
+            )
+        if len(dropout) != num_layers:
+            raise checks.ConfigurationError(
+                "len(dropout) (%d) != num_layers (%d)" % (len(dropout), num_layers)
+            )
+        self._activations = torch.nn.ModuleList(activations)
+        input_dims = [input_dim] + hidden_dims[:-1]
+        linear_layers = []
+        for layer_input_dim, layer_output_dim in zip(input_dims, hidden_dims):
+            linear_layers.append(torch.nn.Linear(layer_input_dim, layer_output_dim))
+        self._linear_layers = torch.nn.ModuleList(linear_layers)
+        dropout_layers = [torch.nn.Dropout(p=value) for value in dropout]
+        self._dropout = torch.nn.ModuleList(dropout_layers)
+        self._output_dim = hidden_dims[-1]
+        self.input_dim = input_dim
+
+    def get_output_dim(self):
+        return self._output_dim
+
+    def get_input_dim(self):
+        return self.input_dim
+
+    def forward(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+
+        output = inputs
+        feature_maps = []
+        for layer, activation, dropout in zip(
+                self._linear_layers, self._activations, self._dropout
+        ):
+            feature_maps.append(output)
+            output = dropout(activation(layer(output)))
+        return output, feature_maps
+
+
+class FeedForwardPredictor(Predictor):
+    """Feedforward predictor. Should be used on top of Seq2Seq encoder."""
+
+    def __init__(self, feedforward_network: "FeedForward"):
+        super().__init__()
+        self.feedforward_network = feedforward_network
+
+    def forward(self,
+                x: Union[torch.Tensor, List[torch.Tensor]],
+                mask: Optional[torch.BoolTensor] = None,
+                labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+                sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
+        if mask is None:
+            mask = x.new_ones(x.size()[:-1])
+
+        x, feature_maps = self.feedforward_network(x)
+        output = {
+            "prediction": x.argmax(-1),
+            "probability": x,
+            "embedding": feature_maps[-1],
+        }
+
+        if labels is not None:
+            if sample_weights is None:
+                sample_weights = labels.new_ones([mask.size(0)])
+            output["loss"] = self._loss(x, labels, mask, sample_weights)
+
+        return output
+
+    def _loss(self,
+              pred: torch.Tensor,
+              true: torch.Tensor,
+              mask: torch.BoolTensor,
+              sample_weights: torch.Tensor) -> torch.Tensor:
+        BATCH_SIZE, _, CLASSES = pred.size()
+        valid_positions = mask.sum()
+        pred = pred.reshape(-1, CLASSES)
+        true = true.reshape(-1)
+        mask = mask.reshape(-1)
+        loss = masked_cross_entropy(pred, true, mask)
+        loss = loss.reshape(BATCH_SIZE, -1) * sample_weights.unsqueeze(-1)
+        return loss.sum() / valid_positions
+
+    @classmethod
+    def from_vocab(cls,
+                   vocab: Vocabulary,
+                   vocab_namespace: str,
+                   input_dim: int,
+                   num_layers: int,
+                   hidden_dims: List[int],
+                   activations: Union[Activation, List[Activation]],
+                   dropout: Union[float, List[float]] = 0.0,
+                   ):
+        if len(hidden_dims) + 1 != num_layers:
+            raise checks.ConfigurationError(
+                f"len(hidden_dims) ({len(hidden_dims):d}) + 1 != num_layers ({num_layers:d})"
+            )
+
+        assert vocab_namespace in vocab.get_namespaces(), \
+            f"There is not {vocab_namespace} in created vocabs, check if this field has any values to predict!"
+        hidden_dims = hidden_dims + [vocab.get_vocab_size(vocab_namespace)]
+
+        return cls(FeedForward(
+            input_dim=input_dim,
+            num_layers=num_layers,
+            hidden_dims=hidden_dims,
+            activations=activations,
+            dropout=dropout))
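+
+
+# A construction sketch (vocabulary, namespace and dimensions are assumed;
+# LinearActivation comes from combo.models.combo_nn):
+#
+#   predictor = FeedForwardPredictor.from_vocab(
+#       vocab, "upostag_labels", input_dim=164, num_layers=2, hidden_dims=[64],
+#       activations=[LinearActivation(), LinearActivation()])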
+
+
+"""
+Adapted from AllenNLP
+"""
+
+
+class TimeDistributed(torch.nn.Module):
+    """
+    Given an input shaped like `(batch_size, time_steps, [rest])` and a `Module` that takes
+    inputs like `(batch_size, [rest])`, `TimeDistributed` reshapes the input to be
+    `(batch_size * time_steps, [rest])`, applies the contained `Module`, then reshapes it back.
+
+    Note that while the above gives shapes with `batch_size` first, this `Module` also works if
+    `batch_size` is second - we always just combine the first two dimensions, then split them.
+
+    It also reshapes tensor keyword arguments, unless their name is listed in the
+    optional `pass_through` iterable.
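+
+    # Examples
+
+    ```python
+    # A sketch: apply a Linear layer independently at each of the 3 time steps.
+    distributed = TimeDistributed(torch.nn.Linear(4, 5))
+    output = distributed(torch.randn(2, 3, 4))  # shape: (2, 3, 5)
+    ```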
+    """
+
+    def __init__(self, module):
+        super().__init__()
+        self._module = module
+
+    @overrides
+    def forward(self, *inputs, pass_through: List[str] = None, **kwargs):
+
+        pass_through = pass_through or []
+
+        reshaped_inputs = [self._reshape_tensor(input_tensor) for input_tensor in inputs]
+
+        # Need some input to then get the batch_size and time_steps.
+        some_input = None
+        if inputs:
+            some_input = inputs[-1]
+
+        reshaped_kwargs = {}
+        for key, value in kwargs.items():
+            if isinstance(value, torch.Tensor) and key not in pass_through:
+                if some_input is None:
+                    some_input = value
+
+                value = self._reshape_tensor(value)
+
+            reshaped_kwargs[key] = value
+
+        reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs)
+
+        if some_input is None:
+            raise RuntimeError("No input tensor to time-distribute")
+
+        # Now get the output back into the right shape.
+        # (batch_size, time_steps, **output_size)
+        new_size = some_input.size()[:2] + reshaped_outputs.size()[1:]
+        outputs = reshaped_outputs.contiguous().view(new_size)
+
+        return outputs
+
+    @staticmethod
+    def _reshape_tensor(input_tensor):
+        input_size = input_tensor.size()
+        if len(input_size) <= 2:
+            raise RuntimeError(f"No dimension to distribute: {input_size}")
+        # Squash batch_size and time_steps into a single axis; result has shape
+        # (batch_size * time_steps, **input_size).
+        squashed_shape = [-1] + list(input_size[2:])
+        return input_tensor.contiguous().view(*squashed_shape)
diff --git a/combo/models/combo_nn.py b/combo/models/combo_nn.py
new file mode 100644
index 0000000000000000000000000000000000000000..822c1cd665e7aba7b8b94ce05ead907412ced553
--- /dev/null
+++ b/combo/models/combo_nn.py
@@ -0,0 +1,14 @@
+import torch
+import torch.nn as nn
+from overrides import overrides
+
+
+class Activation(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+
+
+class LinearActivation(Activation):
+    @overrides
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x
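+
+
+# A sketch of adding a new nonlinearity under the same interface (hypothetical
+# subclass, not part of this module):
+#
+#   class ReLUActivation(Activation):
+#       @overrides
+#       def forward(self, x: torch.Tensor) -> torch.Tensor:
+#           return x.relu()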
diff --git a/combo/models/dilated_cnn.py b/combo/models/dilated_cnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..79ca6d9a952e1da104150b896696eed75aec4901
--- /dev/null
+++ b/combo/models/dilated_cnn.py
@@ -0,0 +1,43 @@
+"""
+Adapted from COMBO 1.0
+Author: Mateusz Klimaszewski
+"""
+
+from typing import List
+
+import torch
+import torch.nn as nn
+
+from combo.models.combo_nn import Activation
+
+
+class DilatedCnnEncoder(nn.Module):
+
+    def __init__(self,
+                 input_dim: int,
+                 filters: List[int],
+                 kernel_size: List[int],
+                 stride: List[int],
+                 padding: List[int],
+                 dilation: List[int],
+                 activations: List[Activation]):
+        super().__init__()
+        conv1d_layers = []
+        input_dims = [input_dim] + filters[:-1]
+        output_dims = filters
+        for idx in range(len(activations)):
+            conv1d_layers.append(nn.Conv1d(
+                in_channels=input_dims[idx],
+                out_channels=output_dims[idx],
+                kernel_size=(kernel_size[idx],),
+                stride=(stride[idx],),
+                padding=padding[idx],
+                dilation=(dilation[idx],)))
+        self.conv1d_layers = nn.ModuleList(conv1d_layers)
+        self.activations = activations
+        assert len(self.activations) == len(self.conv1d_layers)
+
+    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        for layer, activation in zip(self.conv1d_layers, self.activations):
+            x = activation(layer(x))
+        return x
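+
+
+# A usage sketch (shapes follow Conv1d semantics: input is (batch, channels, length);
+# the activations below are assumed to come from combo.models.combo_nn):
+#
+#   encoder = DilatedCnnEncoder(input_dim=32, filters=[64, 64], kernel_size=[3, 3],
+#                               stride=[1, 1], padding=[1, 2], dilation=[1, 2],
+#                               activations=[LinearActivation(), LinearActivation()])
+#   out = encoder(torch.randn(8, 32, 20))  # -> (8, 64, 20)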
diff --git a/combo/models/embeddings.py b/combo/models/embeddings.py
new file mode 100644
index 0000000000000000000000000000000000000000..35b732ae5c4497893a28e67ecff36cbb383b3d79
--- /dev/null
+++ b/combo/models/embeddings.py
@@ -0,0 +1,221 @@
+from typing import Optional, List
+
+import torch
+from overrides import overrides
+from torch import nn
+from torchtext.vocab import Vectors, GloVe, FastText, CharNGram
+
+from combo.data import Vocabulary
+from combo.models.base import TimeDistributed
+from combo.models.dilated_cnn import DilatedCnnEncoder
+from combo.models.utils import tiny_value_of_dtype
+from combo.utils import ConfigurationError
+
+
+class TokenEmbedder(nn.Module):
+    def __init__(self):
+        super(TokenEmbedder, self).__init__()
+
+    @property
+    def output_dim(self) -> int:
+        raise NotImplementedError()
+
+    def forward(self,
+                x: torch.Tensor,
+                char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
+        raise NotImplementedError()
+
+
+class _TorchEmbedder(TokenEmbedder):
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 padding_idx: Optional[int] = None,
+                 max_norm: Optional[float] = None,
+                 norm_type: float = 2.,
+                 scale_grad_by_freq: bool = False,
+                 sparse: bool = False,
+                 vocab_namespace: str = "tokens",
+                 vocab: Vocabulary = None,
+                 weight: Optional[torch.Tensor] = None,
+                 trainable: bool = True,
+                 projection_dim: Optional[int] = None):
+        super(_TorchEmbedder, self).__init__()
+        self._embedding_dim = embedding_dim
+        self._embedding = nn.Embedding(num_embeddings=num_embeddings,
+                                       embedding_dim=embedding_dim,
+                                       padding_idx=padding_idx,
+                                       max_norm=max_norm,
+                                       norm_type=norm_type,
+                                       scale_grad_by_freq=scale_grad_by_freq,
+                                       sparse=sparse)
+        self.__vocab_namespace = vocab_namespace
+        self.__vocab = vocab
+
+        if weight is not None:
+            if weight.shape != (num_embeddings, embedding_dim):
+                raise ConfigurationError(
+                    "Weight matrix must be of shape (num_embeddings, embedding_dim). " +
+                    f"Got: {tuple(weight.shape)}"
+                )
+                )
+
+            self.__weight = torch.nn.Parameter(weight, requires_grad=trainable)
+        else:
+            self.__weight = torch.nn.Parameter(torch.FloatTensor(num_embeddings, embedding_dim),
+                                               requires_grad=trainable)
+            torch.nn.init.xavier_uniform_(self.__weight)
+
+        if padding_idx is not None:
+            self.__weight.data[padding_idx].fill_(0)
+
+        if projection_dim:
+            self._projection = torch.nn.Linear(embedding_dim, projection_dim)
+            self._output_dim = projection_dim
+        else:
+            self._projection = None
+            self._output_dim = embedding_dim
+
+    @property
+    def output_dim(self) -> int:
+        return self._output_dim
+
+    @overrides
+    def forward(self,
+                x: torch.Tensor,
+                char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
+        embedded = self._embedding(x)
+        if self._projection:
+            projection = self._projection
+            # Wrap once per extra leading dimension so the projection is applied
+            # over the last axis only.
+            for _ in range(embedded.dim() - 2):
+                projection = TimeDistributed(projection)
+            embedded = projection(embedded)
+        return embedded
+
+
+class _TorchtextVectorsEmbedder(TokenEmbedder):
+    """
+    Torchtext Vectors object wrapper
+    """
+
+    def __init__(self,
+                 torchtext_embedder: Vectors,
+                 lower_case_backup: bool = False):
+        """
+        :param torchtext_embedder: Torchtext Vectors object
+        :param lower_case_backup: whether to look up the token in the
+        lower case. Default: False.
+        """
+        super(_TorchtextVectorsEmbedder, self).__init__()
+        self.__torchtext_embedder = torchtext_embedder
+        self.__lower_case_backup = lower_case_backup
+
+    @property
+    def output_dim(self) -> int:
+        # torchtext Vectors expose the embedding dimensionality as `dim`.
+        return self.__torchtext_embedder.dim
+
+    @overrides
+    def forward(self,
+                x: torch.Tensor,
+                char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
+        return self.__torchtext_embedder.get_vecs_by_tokens(x, self.__lower_case_backup)
+
+
+class GloVe42BEmbedder(_TorchtextVectorsEmbedder):
+    def __init__(self, dim: int = 300):
+        super(GloVe42BEmbedder, self).__init__(GloVe("42B", dim))
+
+
+class GloVe840BEmbedder(_TorchtextVectorsEmbedder):
+    def __init__(self, dim: int = 300):
+        super(GloVe840BEmbedder, self).__init__(GloVe("840B", dim))
+
+
+class GloVeTwitter27BEmbedder(_TorchtextVectorsEmbedder):
+    # twitter.27B vectors are only distributed in dims 25/50/100/200
+    def __init__(self, dim: int = 200):
+        super(GloVeTwitter27BEmbedder, self).__init__(GloVe("twitter.27B", dim))
+
+
+class GloVe6BEmbedder(_TorchtextVectorsEmbedder):
+    def __init__(self, dim: int = 300):
+        super(GloVe6BEmbedder, self).__init__(GloVe("6B", dim))
+
+
+class FastTextEmbedder(_TorchtextVectorsEmbedder):
+    def __init__(self, language: str = "en"):
+        super(FastTextEmbedder, self).__init__(FastText(language))
+
+
+class CharNGramEmbedder(_TorchtextVectorsEmbedder):
+    def __init__(self):
+        super(CharNGramEmbedder, self).__init__(CharNGram())
+
+
+class CharacterBasedWordEmbedder(TokenEmbedder):
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 dilated_cnn_encoder: DilatedCnnEncoder):
+        super(CharacterBasedWordEmbedder, self).__init__()
+        self.__embedding_dim = embedding_dim
+        # Conv1d needs 3D input, so fold the sentence dimension into the batch
+        # while encoding each word's characters (cf. LemmatizerModel).
+        self.__dilated_cnn_encoder = TimeDistributed(dilated_cnn_encoder)
+        self.char_embed = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
+
+    @property
+    def output_dim(self) -> int:
+        return self.__embedding_dim
+
+    @overrides
+    def forward(self,
+                x: torch.Tensor,
+                char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
+        if char_mask is None:
+            char_mask = x.new_ones(x.size())
+
+        x = self.char_embed(x)
+        x = x * char_mask.unsqueeze(-1).float()
+        x = self.__dilated_cnn_encoder(x.transpose(2, 3))
+        return torch.max(x, dim=-1)[0]
+
+
+class PretrainedTransformerMismatchedEmbedder(TokenEmbedder):
+    pass
+
+
+class TransformersWordEmbedder(PretrainedTransformerMismatchedEmbedder):
+    pass
+
+
+class FeatsTokenEmbedder(_TorchEmbedder):
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 padding_idx: Optional[int] = None,
+                 max_norm: Optional[float] = None,
+                 norm_type: float = 2.,
+                 scale_grad_by_freq: bool = False,
+                 sparse: bool = False,
+                 vocab_namespace: str = "feats",
+                 vocab: Vocabulary = None,
+                 weight: Optional[torch.Tensor] = None,
+                 trainable: bool = True):
+        super(FeatsTokenEmbedder, self).__init__(num_embeddings,
+                                                 embedding_dim,
+                                                 padding_idx,
+                                                 max_norm,
+                                                 norm_type,
+                                                 scale_grad_by_freq,
+                                                 sparse,
+                                                 vocab_namespace,
+                                                 vocab,
+                                                 weight,
+                                                 trainable)
+
+    @overrides
+    def forward(self,
+                x: torch.Tensor,
+                char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
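+        # Sum the feat embeddings and divide by the number of active (non-zero
+        # id) feats - i.e. a mean over feats, assuming index 0 is padding.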
+        mask = x.gt(0)
+        x = super().forward(x)
+        return x.sum(dim=-2) / (
+            (mask.sum(dim=-1) + tiny_value_of_dtype(torch.float)).unsqueeze(dim=-1)
+        )
diff --git a/combo/models/graph_parser.py b/combo/models/graph_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dffef52a943e4671aea7ac911b5c25981c3051f
--- /dev/null
+++ b/combo/models/graph_parser.py
@@ -0,0 +1,190 @@
+"""
+Adapted from COMBO.
+Author: Mateusz Klimaszewski
+"""
+
+from typing import List, Optional, Union, Tuple, Dict
+
+from combo import data
+from combo.models import base
+from combo.models.base import Predictor
+
+import torch
+import torch.nn.functional as F
+
+
+class GraphHeadPredictionModel(Predictor):
+    """Head prediction model."""
+
+    def __init__(self,
+                 head_projection_layer: base.Linear,
+                 dependency_projection_layer: base.Linear,
+                 cycle_loss_n: int = 0,
+                 graph_weighting: float = 0.2):
+        super().__init__()
+        self.head_projection_layer = head_projection_layer
+        self.dependency_projection_layer = dependency_projection_layer
+        self.cycle_loss_n = cycle_loss_n
+        self.graph_weighting = graph_weighting
+
+    def forward(self,
+                x: Union[torch.Tensor, List[torch.Tensor]],
+                labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+                mask: Optional[torch.BoolTensor] = None,
+                sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
+        if mask is None:
+            mask = x.new_ones(x.size()[:-1])
+        heads_labels = None
+        if labels is not None and labels[0] is not None:
+            heads_labels = labels
+
+        head_arc_emb = self.head_projection_layer(x)
+        dep_arc_emb = self.dependency_projection_layer(x)
+        x = dep_arc_emb.bmm(head_arc_emb.transpose(2, 1))
+        pred = x.sigmoid() > 0.5
+
+        output = {
+            "prediction": pred,
+            "probability": x
+        }
+
+        if heads_labels is not None:
+            if sample_weights is None:
+                sample_weights = heads_labels.new_ones([mask.size(0)])
+            output["loss"], output["cycle_loss"] = self._loss(x, heads_labels, mask, sample_weights)
+
+        return output
+
+    def _cycle_loss(self, pred: torch.Tensor):
+        BATCH_SIZE, _, _ = pred.size()
+        loss = pred.new_zeros(BATCH_SIZE)
+        # Index from 1: skip the __ROOT__ token.
+        pred = pred.softmax(-1)[:, 1:, 1:]
+        x = pred
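+        # trace(A^k) accumulates the weight of all length-k cycles in the soft
+        # adjacency matrix, so summing traces of successive powers penalizes
+        # cyclic (non-tree) head assignments.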
+        for i in range(self.cycle_loss_n):
+            loss += self._batch_trace(x)
+
+            # Don't multiply on the last iteration
+            if i < self.cycle_loss_n - 1:
+                x = x.bmm(pred)
+
+        return loss
+
+    @staticmethod
+    def _batch_trace(x: torch.Tensor) -> torch.Tensor:
+        assert len(x.size()) == 3
+        BATCH_SIZE, N, M = x.size()
+        assert N == M
+        identity = x.new_tensor(torch.eye(N))
+        identity = identity.reshape((1, N, N))
+        batch_identity = identity.repeat(BATCH_SIZE, 1, 1)
+        return (x * batch_identity).sum((-1, -2))
+
+    def _loss(self, pred: torch.Tensor, labels: torch.Tensor, mask: torch.BoolTensor,
+              sample_weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        BATCH_SIZE, N, M = pred.size()
+        assert N == M
+        SENTENCE_LENGTH = N
+
+        valid_positions = mask.sum()
+
+        result = []
+        true = labels
+        # Ignore first pred dimension as it is ROOT token prediction
+        for i in range(SENTENCE_LENGTH - 1):
+            pred_i = pred[:, i + 1, 1:].reshape(-1)
+            true_i = true[:, i + 1, 1:].reshape(-1)
+            mask_i = mask[:, i]
+            bce_loss = F.binary_cross_entropy_with_logits(pred_i, true_i, reduction="none").mean(-1) * mask_i
+            result.append(bce_loss)
+        cycle_loss = self._cycle_loss(pred)
+        loss = torch.stack(result).transpose(1, 0) * sample_weights.unsqueeze(-1)
+        return loss.sum() / valid_positions + cycle_loss.mean(), cycle_loss.mean()
+
+
+class GraphDependencyRelationModel(Predictor):
+    """Dependency relation parsing model."""
+
+    def __init__(self,
+                 head_predictor: GraphHeadPredictionModel,
+                 head_projection_layer: base.Linear,
+                 dependency_projection_layer: base.Linear,
+                 relation_prediction_layer: base.Linear):
+        super().__init__()
+        self.head_predictor = head_predictor
+        self.head_projection_layer = head_projection_layer
+        self.dependency_projection_layer = dependency_projection_layer
+        self.relation_prediction_layer = relation_prediction_layer
+
+    def forward(self,
+                x: Union[torch.Tensor, List[torch.Tensor]],
+                mask: Optional[torch.BoolTensor] = None,
+                labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+                sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
+        relations_labels, head_labels, enhanced_heads_labels = None, None, None
+        if labels is not None and labels[0] is not None:
+            relations_labels, head_labels, enhanced_heads_labels = labels
+
+        head_output = self.head_predictor(x, enhanced_heads_labels, mask, sample_weights)
+        head_pred = head_output["probability"]
+        BATCH_SIZE, LENGTH, _ = head_pred.size()
+
+        head_rel_emb = self.head_projection_layer(x)
+
+        dep_rel_emb = self.dependency_projection_layer(x)
+
+        # All possible edges combinations for each batch
+        # Repeat interleave to have [emb1, emb1 ... (length times) ... emb1, emb2 ... ]
+        head_rel_pred = head_rel_emb.repeat_interleave(LENGTH, -2)
+        # Regular repeat to have all combinations [deprel1, deprel2, ... deprelL, deprel1 ...]
+        dep_rel_pred = dep_rel_emb.repeat(1, LENGTH, 1)
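+        # e.g. for LENGTH == 2: heads [h1, h1, h2, h2] pair with deps [d1, d2, d1, d2].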
+
+        # All possible edges combinations for each batch
+        dep_rel_pred = torch.cat((head_rel_pred, dep_rel_pred), dim=-1)
+
+        relation_prediction = self.relation_prediction_layer(dep_rel_pred).reshape(BATCH_SIZE, LENGTH, LENGTH, -1)
+        output = head_output
+
+        output["prediction"] = (relation_prediction.argmax(-1), head_output["prediction"])
+        output["rel_probability"] = relation_prediction
+
+        if labels is not None and labels[0] is not None:
+            if sample_weights is None:
+                sample_weights = labels[0].new_ones([mask.size(0)])
+            loss = self._loss(relation_prediction, relations_labels, enhanced_heads_labels, mask, sample_weights)
+            output["loss"] = (loss, head_output["loss"])
+
+        return output
+
+    @staticmethod
+    def _loss(pred: torch.Tensor,
+              true: torch.Tensor,
+              heads_true: torch.Tensor,
+              mask: torch.BoolTensor,
+              sample_weights: torch.Tensor) -> torch.Tensor:
+        correct_heads_mask = heads_true.long() == 1
+        true = true[correct_heads_mask]
+        pred = pred[correct_heads_mask]
+        loss = F.cross_entropy(pred, true.long())
+        return loss.sum() / pred.size(0)
+
+    @classmethod
+    def from_vocab(cls,
+                   vocab: data.Vocabulary,
+                   vocab_namespace: str,
+                   head_predictor: GraphHeadPredictionModel,
+                   head_projection_layer: base.Linear,
+                   dependency_projection_layer: base.Linear
+                   ):
+        """Creates parser combining model configuration and vocabulary data."""
+        assert vocab_namespace in vocab.get_namespaces()
+        relation_prediction_layer = base.Linear(
+            in_features=head_projection_layer.get_output_dim() + dependency_projection_layer.get_output_dim(),
+            out_features=vocab.get_vocab_size(vocab_namespace)
+        )
+        return cls(
+            head_predictor=head_predictor,
+            head_projection_layer=head_projection_layer,
+            dependency_projection_layer=dependency_projection_layer,
+            relation_prediction_layer=relation_prediction_layer
+        )
diff --git a/combo/models/lemma.py b/combo/models/lemma.py
new file mode 100644
index 0000000000000000000000000000000000000000..d724a1ecb9c22610fc6ac56493929178d7a6cd5a
--- /dev/null
+++ b/combo/models/lemma.py
@@ -0,0 +1,107 @@
+from typing import Optional, Dict, List, Union
+
+import torch
+import torch.nn as nn
+
+from combo import data
+from combo.models import dilated_cnn, base, utils
+from combo.models.base import Predictor, TimeDistributed
+from combo.models.combo_nn import Activation
+from combo.utils import ConfigurationError
+
+
+class LemmatizerModel(Predictor):
+    """Lemmatizer model."""
+
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 dilated_cnn_encoder: dilated_cnn.DilatedCnnEncoder,
+                 input_projection_layer: base.Linear):
+        super().__init__()
+        self.char_embed = nn.Embedding(
+            num_embeddings=num_embeddings,
+            embedding_dim=embedding_dim,
+        )
+        self.dilated_cnn_encoder = TimeDistributed(dilated_cnn_encoder)
+        self.input_projection_layer = input_projection_layer
+
+    def forward(self,
+                x: Union[torch.Tensor, List[torch.Tensor]],
+                mask: Optional[torch.BoolTensor] = None,
+                labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+                sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
+        encoder_emb, chars = x
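+        # encoder_emb: (batch, sentence_length, encoder_dim) word-level encodings;
+        # chars: (batch, sentence_length, word_length) character ids.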
+
+        encoder_emb = self.input_projection_layer(encoder_emb)
+        char_embeddings = self.char_embed(chars)
+
+        BATCH_SIZE, _, MAX_WORD_LENGTH, CHAR_EMB = char_embeddings.size()
+        encoder_emb = encoder_emb.unsqueeze(2).repeat(1, 1, MAX_WORD_LENGTH, 1)
+
+        x = torch.cat((char_embeddings, encoder_emb), dim=-1).transpose(2, 3)
+        x = self.dilated_cnn_encoder(x).transpose(2, 3)
+        output = {
+            "prediction": x.argmax(-1),
+            "probability": x
+        }
+
+        if labels is not None:
+            if mask is None:
+                mask = encoder_emb.new_ones(encoder_emb.size()[:-2])
+            if sample_weights is None:
+                sample_weights = labels.new_ones(BATCH_SIZE)
+            mask = mask.unsqueeze(2).repeat(1, 1, MAX_WORD_LENGTH).bool()
+            output["loss"] = self._loss(x, labels, mask, sample_weights)
+
+        return output
+
+    @staticmethod
+    def _loss(pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor,
+              sample_weights: torch.Tensor) -> torch.Tensor:
+        BATCH_SIZE, SENTENCE_LENGTH, MAX_WORD_LENGTH, CHAR_CLASSES = pred.size()
+        pred = pred.reshape(-1, CHAR_CLASSES)
+
+        true = true.reshape(-1)
+        mask = true.gt(0)
+        loss = utils.masked_cross_entropy(pred, true, mask)
+        loss = loss.reshape(BATCH_SIZE, -1) * sample_weights.unsqueeze(-1)
+        valid_positions = mask.sum()
+        return loss.sum() / valid_positions
+
+    @classmethod
+    def from_vocab(cls,
+                   vocab: data.Vocabulary,
+                   char_vocab_namespace: str,
+                   lemma_vocab_namespace: str,
+                   embedding_dim: int,
+                   input_projection_layer: base.Linear,
+                   filters: List[int],
+                   kernel_size: List[int],
+                   stride: List[int],
+                   padding: List[int],
+                   dilation: List[int],
+                   activations: List[Activation],
+                   ):
+        assert char_vocab_namespace in vocab.get_namespaces()
+        assert lemma_vocab_namespace in vocab.get_namespaces()
+
+        if len(filters) + 1 != len(kernel_size):
+            raise ConfigurationError(
+                f"len(filters) ({len(filters):d}) + 1 != len(kernel_size) ({len(kernel_size):d})"
+            )
+        filters = filters + [vocab.get_vocab_size(lemma_vocab_namespace)]
+
+        dilated_cnn_encoder = dilated_cnn.DilatedCnnEncoder(
+            input_dim=embedding_dim + input_projection_layer.get_output_dim(),
+            filters=filters,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            activations=activations,
+        )
+        return cls(num_embeddings=vocab.get_vocab_size(char_vocab_namespace),
+                   embedding_dim=embedding_dim,
+                   dilated_cnn_encoder=dilated_cnn_encoder,
+                   input_projection_layer=input_projection_layer)
diff --git a/combo/models/model.py b/combo/models/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..9482ea546e69803d1c95815a955e259523bfd80f
--- /dev/null
+++ b/combo/models/model.py
@@ -0,0 +1,464 @@
+"""
+Adapted from AllenNLP
+https://github.com/allenai/allennlp/blob/main/allennlp/models/model.py
+"""
+
+import logging
+import os
+from os import PathLike
+import re
+from typing import Dict, List, Set, Type, Optional, Union
+
+import numpy
+import torch
+
+from combo.common.params import remove_keys_from_params, Params
+from combo.data import Vocabulary, Instance
+from combo.data.batch import Batch
+from combo.nn import util, RegularizerApplicator
+from combo.utils import ConfigurationError
+
+logger = logging.getLogger(__name__)
+
+# When training a model, many sets of weights are saved. By default we want to
+# save/load this set of weights.
+_DEFAULT_WEIGHTS = "best.th"
+
+
+class Model(torch.nn.Module):
+    """
+    This abstract class represents a model to be trained. Rather than relying completely
+    on the Pytorch Module, we modify the output spec of `forward` to be a dictionary.
+
+    Models built using this API are still compatible with other pytorch models and can
+    be used naturally as modules within other models - outputs are dictionaries, which
+    can be unpacked and passed into other layers. One caveat to this is that if you
+    wish to use an AllenNLP model inside a Container (such as nn.Sequential), you must
+    interleave the models with a wrapper module which unpacks the dictionary into
+    a list of tensors.
+
+    In order for your model to be trained using the [`Trainer`](../training/trainer.md)
+    api, the output dictionary of your Model must include a "loss" key, which will be
+    optimised during the training process.
+
+    Finally, you can optionally implement :func:`Model.get_metrics` in order to make use
+    of early stopping and best-model serialization based on a validation metric in
+    `Trainer`. Metrics that begin with "_" will not be logged
+    to the progress bar by `Trainer`.
+
+    The `from_archive` method on this class is registered as a `Model` with name "from_archive".
+    So, if you are using a configuration file, you can specify a model as `{"type": "from_archive",
+    "archive_file": "/path/to/archive.tar.gz"}`, which will pull out the model from the given
+    location and return it.
+
+    # Parameters
+
+    vocab: `Vocabulary`
+        There are two typical use-cases for the `Vocabulary` in a `Model`: getting vocabulary sizes
+        when constructing embedding matrices or output classifiers (as the vocabulary holds the
+        number of classes in your output, also), and translating model output into human-readable
+        form.
+
+        In a typical AllenNLP configuration file, this parameter does not get an entry under the
+        "model", it gets specified as a top-level parameter, then is passed in to the model
+        separately.
+    regularizer: `RegularizerApplicator`, optional
+        If given, the `Trainer` will use this to regularize model parameters.
+    serialization_dir: `str`, optional
+        The directory in which the training output is saved to, or the directory the model is loaded from.
+    """
+
+    _warn_for_unseparable_batches: Set[str] = set()
+    default_predictor: Optional[str] = None
+
+    def __init__(
+        self,
+        vocab: Vocabulary,
+        regularizer: RegularizerApplicator = None,
+        serialization_dir: Optional[str] = None,
+    ) -> None:
+        super().__init__()
+        self.vocab = vocab
+        self._regularizer = regularizer
+        self.serialization_dir = serialization_dir
+
+    def get_regularization_penalty(self) -> Optional[torch.Tensor]:
+        """
+        Computes the regularization penalty for the model.
+        Returns None if the model was not configured to use regularization.
+        """
+        if self._regularizer is None:
+            regularization_penalty = None
+        else:
+            try:
+                regularization_penalty = self._regularizer(self)
+                if isinstance(regularization_penalty, float):
+                    assert regularization_penalty == 0.0
+                    regularization_penalty = torch.tensor(regularization_penalty)
+            except AssertionError:
+                raise RuntimeError("The regularizer cannot be a non-zero float.")
+        return regularization_penalty
+
+    def get_parameters_for_histogram_tensorboard_logging(self) -> List[str]:
+        """
+        Returns the name of model parameters used for logging histograms to tensorboard.
+        """
+        return [name for name, _ in self.named_parameters()]
+
+    def forward(self, *inputs) -> Dict[str, torch.Tensor]:
+        """
+        Defines the forward pass of the model. In addition, to facilitate easy training,
+        this method is designed to compute a loss function defined by a user.
+
+        The input is comprised of everything required to perform a
+        training update, `including` labels - you define the signature here!
+        It is down to the user to ensure that inference can be performed
+        without the presence of these labels. Hence, any inputs not available at
+        inference time should only be used inside a conditional block.
+
+        The intended sketch of this method is as follows::
+
+            def forward(self, input1, input2, targets=None):
+                ....
+                ....
+                output1 = self.layer1(input1)
+                output2 = self.layer2(input2)
+                output_dict = {"output1": output1, "output2": output2}
+                if targets is not None:
+                    # Function returning a scalar torch.Tensor, defined by the user.
+                    loss = self._compute_loss(output1, output2, targets)
+                    output_dict["loss"] = loss
+                return output_dict
+
+        # Parameters
+
+        *inputs : `Any`
+            Tensors comprising everything needed to perform a training update, `including` labels,
+            which should be optional (i.e have a default value of `None`).  At inference time,
+            simply pass the relevant inputs, not including the labels.
+
+        # Returns
+
+        output_dict : `Dict[str, torch.Tensor]`
+            The outputs from the model. In order to train a model using the
+            `Trainer` api, you must provide a "loss" key pointing to a
+            scalar `torch.Tensor` representing the loss to be optimized.
+        """
+        raise NotImplementedError
+
+    def forward_on_instance(self, instance: Instance) -> Dict[str, numpy.ndarray]:
+        """
+        Takes an [`Instance`](../data/instance.md), which typically has raw text in it, converts
+        that text into arrays using this model's [`Vocabulary`](../data/vocabulary.md), passes those
+        arrays through `self.forward()` and `self.make_output_human_readable()` (which by default
+        does nothing) and returns the result.  Before returning the result, we convert any
+        `torch.Tensors` into numpy arrays and remove the batch dimension.
+        """
+        return self.forward_on_instances([instance])[0]
+
+    def forward_on_instances(self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
+        """
+        Takes a list of `Instances`, converts that text into arrays using this model's `Vocabulary`,
+        passes those arrays through `self.forward()` and `self.make_output_human_readable()` (which
+        by default does nothing) and returns the result.  Before returning the result, we convert
+        any `torch.Tensors` into numpy arrays and separate the batched output into a list of
+        individual dicts per instance. Note that typically this will be faster on a GPU (and
+        conditionally, on a CPU) than repeated calls to `forward_on_instance`.
+
+        # Parameters
+
+        instances : `List[Instance]`, required
+            The instances to run the model on.
+
+        # Returns
+
+        A list of the model's outputs, one for each instance.
+        """
+        batch_size = len(instances)
+        with torch.no_grad():
+            cuda_device = self._get_prediction_device()
+            dataset = Batch(instances)
+            dataset.index_instances(self.vocab)
+            model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
+            outputs = self.make_output_human_readable(self(**model_input))
+
+            instance_separated_output: List[Dict[str, numpy.ndarray]] = [
+                {} for _ in dataset.instances
+            ]
+            for name, output in list(outputs.items()):
+                if isinstance(output, torch.Tensor):
+                    # NOTE(markn): This is a hack because 0-dim pytorch tensors are not iterable.
+                    # This occurs with batch size 1, because we still want to include the loss in that case.
+                    if output.dim() == 0:
+                        output = output.unsqueeze(0)
+
+                    if output.size(0) != batch_size:
+                        self._maybe_warn_for_unseparable_batches(name)
+                        continue
+                    output = output.detach().cpu().numpy()
+                elif len(output) != batch_size:
+                    self._maybe_warn_for_unseparable_batches(name)
+                    continue
+                for instance_output, batch_element in zip(instance_separated_output, output):
+                    instance_output[name] = batch_element
+            return instance_separated_output
+
+    def make_output_human_readable(
+        self, output_dict: Dict[str, torch.Tensor]
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Takes the result of `forward` and makes it human readable.  Most of the time, the only thing
+        this method does is convert tokens / predicted labels from tensors to strings that humans
+        might actually understand.  Sometimes you'll also do an argmax or something in here, too, but
+        that most often happens in `Model.forward`, before you compute your metrics.
+
+        This method `modifies` the input dictionary, and also `returns` the same dictionary.
+
+        By default in the base class we do nothing.
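+
+        A typical override might look like this (assuming an AllenNLP-style
+        `Vocabulary.get_token_from_index` and a hypothetical "labels" namespace)::
+
+            def make_output_human_readable(self, output_dict):
+                # Convert per-token label indices into strings.
+                indices = output_dict["prediction"].tolist()
+                output_dict["labels"] = [
+                    self.vocab.get_token_from_index(idx, "labels") for idx in indices
+                ]
+                return output_dict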
+        """
+
+        return output_dict
+
+    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
+        """
+        Returns a dictionary of metrics. This method will be called by
+        `allennlp.training.Trainer` in order to compute and use model metrics for early
+        stopping and model serialization.  We return an empty dictionary here rather than raising
+        as it is not required to implement metrics for a new model.  A boolean `reset` parameter is
+        passed, as frequently a metric accumulator will have some state which should be reset
+        between epochs. This is also compatible with [`Metric`s](../training/metrics/metric.md). Metrics
+        should be populated during the call to `forward`, with the `Metric` handling the accumulation of
+        the metric until this method is called.
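+
+        For example, a subclass tracking accuracy might implement::
+
+            def get_metrics(self, reset: bool = False) -> Dict[str, float]:
+                # `self._accuracy` would be a `Metric` updated inside `forward`.
+                return {"accuracy": self._accuracy.get_metric(reset)}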
+        """
+
+        return {}
+
+    def _get_prediction_device(self) -> int:
+        """
+        This method checks the device of the model parameters to determine the cuda_device
+        this model should be run on for predictions.  If there are no parameters, it returns -1.
+
+        # Returns
+
+        The cuda device this model should run on for predictions.
+        """
+        devices = {util.get_device_of(param) for param in self.parameters()}
+
+        if len(devices) > 1:
+            devices_string = ", ".join(str(x) for x in devices)
+            raise ConfigurationError(f"Parameters have mismatching cuda_devices: {devices_string}")
+        elif len(devices) == 1:
+            return devices.pop()
+        else:
+            return -1
+
+    def _maybe_warn_for_unseparable_batches(self, output_key: str):
+        """
+        This method warns once if a user implements a model which returns a dictionary with
+        values which we are unable to split back up into elements of the batch. This is controlled
+        by a class attribute `_warn_for_unseparable_batches` because it would be extremely verbose
+        otherwise.
+        """
+        if output_key not in self._warn_for_unseparable_batches:
+            logger.warning(
+                f"Encountered the {output_key} key in the model's return dictionary which "
+                "couldn't be split by the batch size. Key will be ignored."
+            )
+            # We only want to warn once for this key,
+            # so we record it in the set to suppress further warnings.
+            self._warn_for_unseparable_batches.add(output_key)
+
+    @classmethod
+    def _load(
+        cls,
+        config: Params,
+        serialization_dir: Union[str, PathLike],
+        weights_file: Optional[Union[str, PathLike]] = None,
+        cuda_device: int = -1,
+    ) -> "Model":
+        """
+        Instantiates an already-trained model, based on the experiment
+        configuration and some optional overrides.
+        """
+        weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)
+
+        # Load vocabulary from file
+        vocab_dir = os.path.join(serialization_dir, "vocabulary")
+        # If the config specifies a vocabulary subclass, we need to use it.
+        vocab_params = config.get("vocabulary", Params({}))
+        vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
+        vocab_class, _ = Vocabulary.resolve_class_name(vocab_choice)
+        vocab = vocab_class.from_files(
+            vocab_dir, vocab_params.get("padding_token"), vocab_params.get("oov_token")
+        )
+
+        model_params = config.get("model")
+
+        # The experiment config tells us how to _train_ a model, including where to get pre-trained
+        # embeddings/weights from. We're now _loading_ the model, so those weights will already be
+        # stored in our model. We don't need any pretrained weight file or initializers anymore,
+        # and we don't want the code to look for it, so we remove it from the parameters here.
+        remove_keys_from_params(model_params)
+        model = Model.from_params(
+            vocab=vocab, params=model_params, serialization_dir=serialization_dir
+        )
+
+        # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
+        # in sync with the weights
+        if cuda_device >= 0:
+            model.cuda(cuda_device)
+        else:
+            model.cpu()
+
+        # If a vocab + embedding extension was done, the model initialized from `from_params`
+        # and the one defined by the state dict in weights_file might not have the same
+        # embedding shapes. E.g. when a model embedder module was transferred along with a
+        # vocab extension, the initialized embedding weight shape would be smaller than the
+        # one in the state_dict. So the model's embedding extension must be called before
+        # load_state_dict. If the vocab and model embeddings are in sync, the following is a no-op.
+        model.extend_embedder_vocab()
+
+        # Load state dict. We pass `strict=False` so PyTorch doesn't raise a RuntimeError
+        # if the state dict is missing keys because we handle this case below.
+        model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
+        missing_keys, unexpected_keys = model.load_state_dict(model_state, strict=False)
+
+        # Modules might define a class variable called `authorized_missing_keys`,
+        # a list of regex patterns, that tells us to ignore missing keys that match
+        # any of the patterns.
+        # We sometimes need this in order to load older models with newer versions of AllenNLP.
+
+        def filter_out_authorized_missing_keys(module, prefix=""):
+            nonlocal missing_keys
+            for pat in getattr(module.__class__, "authorized_missing_keys", None) or []:
+                missing_keys = [
+                    k
+                    for k in missing_keys
+                    if k.startswith(prefix) and re.search(pat[len(prefix) :], k) is None
+                ]
+            for name, child in module._modules.items():
+                if child is not None:
+                    filter_out_authorized_missing_keys(child, prefix + name + ".")
+
+        filter_out_authorized_missing_keys(model)
+
+        if unexpected_keys or missing_keys:
+            raise RuntimeError(
+                f"Error loading state dict for {model.__class__.__name__}\n\t"
+                f"Missing keys: {missing_keys}\n\t"
+                f"Unexpected keys: {unexpected_keys}"
+            )
+
+        return model
+
+    @classmethod
+    def load(
+        cls,
+        config: Params,
+        serialization_dir: Union[str, PathLike],
+        weights_file: Optional[Union[str, PathLike]] = None,
+        cuda_device: int = -1,
+    ) -> "Model":
+        """
+        Instantiates an already-trained model, based on the experiment
+        configuration and some optional overrides.
+
+        # Parameters
+
+        config : `Params`
+            The configuration that was used to train the model. It should definitely
+            have a `model` section, and should probably have a `trainer` section
+            as well.
+        serialization_dir : `str`
+            The directory containing the serialized weights, parameters, and vocabulary
+            of the model.
+        weights_file : `str`, optional (default = `None`)
+            By default we load the weights from `best.th` in the serialization
+            directory, but you can override that value here.
+        cuda_device : `int`, optional (default = `-1`)
+            By default we load the model on the CPU, but if you want to load it
+            for GPU usage you can specify the id of your GPU here.
+
+        # Returns
+
+        model : `Model`
+            The model specified in the configuration, loaded with the serialized
+            vocabulary and the trained weights.
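+
+        A minimal usage sketch (paths are illustrative; assumes an AllenNLP-style
+        `Params.from_file`)::
+
+            config = Params.from_file("serialization_dir/config.json")
+            model = Model.load(config, serialization_dir="serialization_dir")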
+        """
+
+        # Peek at the class of the model.
+        model_type = (
+            config["model"] if isinstance(config["model"], str) else config["model"]["type"]
+        )
+
+        # Load using an overridable _load method.
+        # This allows subclasses of Model to override _load.
+
+        model_class: Type[Model] = cls.by_name(model_type)  # type: ignore
+        if not isinstance(model_class, type):
+            # If you're using from_archive to specify your model (e.g., for fine tuning), then you
+            # can't currently override the behavior of _load; we just use the default Model._load.
+            # If we really need to change this, we would need to implement a recursive
+            # get_model_class method, that recurses whenever it finds a from_archive model type.
+            model_class = Model
+        return model_class._load(config, serialization_dir, weights_file, cuda_device)
+
+    def extend_embedder_vocab(self, embedding_sources_mapping: Dict[str, str] = None) -> None:
+        """
+        Iterates through all embedding modules in the model and ensures they can embed
+        with the extended vocab. This is required in fine-tuning or transfer-learning
+        scenarios where the model was trained with the original vocabulary but must now
+        work with an extended vocabulary (original + new-data vocabulary).
+
+        # Parameters
+
+        embedding_sources_mapping : `Dict[str, str]`, optional (default = `None`)
+            Mapping from model_path to pretrained-file path of the embedding
+            modules. If the pretrained file used at the time of embedding initialization
+            isn't available now, the user should pass this mapping. The model path is the
+            path traversing the model attributes up to this embedding module,
+            e.g. "_text_field_embedder.token_embedder_tokens".
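+
+        A usage sketch (the embedding path is illustrative)::
+
+            model.extend_embedder_vocab({
+                "_text_field_embedder.token_embedder_tokens": "/path/to/embeddings.txt.gz",
+            })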
+        """
+        # self.named_modules() gives all sub-modules (including nested children)
+        # The path nesting is already separated by ".": eg. parent_module_name.child_module_name
+        embedding_sources_mapping = embedding_sources_mapping or {}
+        for model_path, module in self.named_modules():
+            if hasattr(module, "extend_vocab"):
+                pretrained_file = embedding_sources_mapping.get(model_path)
+                module.extend_vocab(
+                    self.vocab,
+                    extension_pretrained_file=pretrained_file,
+                    model_path=model_path,
+                )
+
+    @classmethod
+    def from_archive(cls, archive_file: str, vocab: Vocabulary = None) -> "Model":
+        """
+        Loads a model from an archive file.  This basically just calls
+        `return archival.load_archive(archive_file).model`.  It exists as a method here for
+        convenience, and so that we can register it for easy use for fine tuning an existing model
+        from a config file.
+
+        If `vocab` is given, we will extend the loaded model's vocabulary using the passed vocab
+        object (including calling `extend_embedder_vocab`, which extends embedding layers).
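+
+        A usage sketch (the archive path is illustrative)::
+
+            model = Model.from_archive("/path/to/model.tar.gz")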
+        """
+        from combo.models.archival import load_archive  # here to avoid circular imports
+
+        model = load_archive(archive_file).model
+        if vocab:
+            model.vocab.extend_from_vocab(vocab)
+            model.extend_embedder_vocab()
+        return model
+
+
+def remove_weights_related_keys_from_params(
+    params: Params, keys: List[str] = ["pretrained_file", "initializer"]
+):
+    remove_keys_from_params(params, keys)
+
+
+def remove_pretrained_embedding_params(params: Params):
+    """This function only exists for backwards compatibility.
+    Please use `remove_weights_related_keys_from_params()` instead."""
+    remove_keys_from_params(params, ["pretrained_file"])
diff --git a/combo/models/morpho.py b/combo/models/morpho.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fb9545eeec5bc0049a51abb69ba479817410484
--- /dev/null
+++ b/combo/models/morpho.py
@@ -0,0 +1,103 @@
+"""
+Adapted from COMBO
+Author: Mateusz Klimaszewski
+"""
+from typing import Dict, List, Optional, Union
+import torch
+
+from combo import data
+from combo.data import dataset
+from combo.models import base, utils
+from combo.models.combo_nn import Activation
+from combo.utils import ConfigurationError
+
+
+class MorphologicalFeatures(base.Predictor):
+    """Morphological features predicting model."""
+
+    def __init__(self, feedforward_network: base.FeedForward, slices: Dict[str, List[int]]):
+        super().__init__()
+        self.feedforward_network = feedforward_network
+        self.slices = slices
+
+    def forward(self,
+                x: Union[torch.Tensor, List[torch.Tensor]],
+                mask: Optional[torch.BoolTensor] = None,
+                labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+                sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
+        if mask is None:
+            mask = x.new_ones(x.size()[:-1])
+
+        x, feature_maps = self.feedforward_network(x)
+
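+        # `self.slices` maps each morphological category (e.g. Case, Number) to the
+        # column indices of its block in the network output; the argmax inside each
+        # block gives the predicted value for that category.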
+        prediction = []
+        for _, cat_indices in self.slices.items():
+            prediction.append(x[:, :, cat_indices].argmax(dim=-1))
+
+        output = {
+            "prediction": torch.stack(prediction, dim=-1),
+            "probability": x,
+            "embedding": feature_maps[-1],
+        }
+
+        if labels is not None:
+            if sample_weights is None:
+                sample_weights = labels.new_ones([mask.size(0)])
+            output["loss"] = self._loss(x, labels, mask, sample_weights)
+
+        return output
+
+    def _loss(self, pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor,
+              sample_weights: torch.Tensor) -> torch.Tensor:
+        assert pred.size() == true.size()
+        BATCH_SIZE, _, MORPHOLOGICAL_FEATURES = pred.size()
+
+        valid_positions = mask.sum()
+
+        pred = pred.reshape(-1, MORPHOLOGICAL_FEATURES)
+        true = true.reshape(-1, MORPHOLOGICAL_FEATURES)
+        mask = mask.reshape(-1)
+        loss = None
+        loss_func = utils.masked_cross_entropy
+        for cat, cat_indices in self.slices.items():
+            if cat not in ["__PAD__", "_"]:
+                if loss is None:
+                    loss = loss_func(pred[:, cat_indices],
+                                     true[:, cat_indices].argmax(dim=1),
+                                     mask)
+                else:
+                    loss += loss_func(pred[:, cat_indices],
+                                      true[:, cat_indices].argmax(dim=1),
+                                      mask)
+        loss = loss.reshape(BATCH_SIZE, -1) * sample_weights.unsqueeze(-1)
+        return loss.sum() / valid_positions
+
+    @classmethod
+    def from_vocab(cls,
+                   vocab: data.Vocabulary,
+                   vocab_namespace: str,
+                   input_dim: int,
+                   num_layers: int,
+                   hidden_dims: List[int],
+                   activations: Union[Activation, List[Activation]],
+                   dropout: Union[float, List[float]] = 0.0,
+                   ):
+        if len(hidden_dims) + 1 != num_layers:
+            raise ConfigurationError(
+                f"len(hidden_dims) ({len(hidden_dims):d}) + 1 != num_layers ({num_layers:d})"
+            )
+
+        assert vocab_namespace in vocab.get_namespaces()
+        hidden_dims = hidden_dims + [vocab.get_vocab_size(vocab_namespace)]
+
+        slices = dataset.get_slices_if_not_provided(vocab)
+
+        return cls(
+            feedforward_network=base.FeedForward(
+                input_dim=input_dim,
+                num_layers=num_layers,
+                hidden_dims=hidden_dims,
+                activations=activations,
+                dropout=dropout),
+            slices=slices
+        )
diff --git a/combo/models/parser.py b/combo/models/parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..42d2efb3944a3e81188d6cf8f673dcb2cfb75002
--- /dev/null
+++ b/combo/models/parser.py
@@ -0,0 +1,223 @@
+"""
+Adapted from COMBO
+Author: Mateusz Klimaszewski
+"""
+from typing import Tuple, Dict, Optional, Union, List
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from combo import data
+from combo.models import base, utils
+from combo.nn import chu_liu_edmonds
+
+
+class HeadPredictionModel(base.Predictor):
+    """Head prediction model."""
+
+    def __init__(self,
+                 head_projection_layer: base.Linear,
+                 dependency_projection_layer: base.Linear,
+                 cycle_loss_n: int = 0):
+        super().__init__()
+        self.head_projection_layer = head_projection_layer
+        self.dependency_projection_layer = dependency_projection_layer
+        self.cycle_loss_n = cycle_loss_n
+
+    def forward(self,
+                x: Union[torch.Tensor, List[torch.Tensor]],
+                mask: Optional[torch.BoolTensor] = None,
+                labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+                sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
+        if mask is None:
+            mask = x.new_ones(x.size()[:-1])
+
+        head_arc_emb = self.head_projection_layer(x)
+        dep_arc_emb = self.dependency_projection_layer(x)
+        x = dep_arc_emb.bmm(head_arc_emb.transpose(2, 1))
+
+        if self.training:
+            pred = x.argmax(-1)
+        else:
+            pred = []
+            # Add 1 to the lengths to account for the ROOT token, which is not in the mask.
+            lengths = mask.data.sum(dim=1).long().cpu().numpy() + 1
+            for idx, length in enumerate(lengths):
+                probs = x[idx, :].softmax(dim=-1).cpu().numpy()
+
+                # We do not want any word to be the parent of the root node (ROOT, 0).
+                # Setting the column to -1 instead of 0 also fixes an edge case where softmax
+                # pushes all non-ROOT predictions to exactly 0.0, which could otherwise
+                # produce many ROOT -> word edges.
+                probs[:, 0] = -1
+                heads, _ = chu_liu_edmonds.decode_mst(probs.T, length=length, has_labels=False)
+                heads[0] = 0
+                pred.append(heads)
+            pred = torch.from_numpy(np.stack(pred)).to(x.device)
+
+        output = {
+            "prediction": pred[:, 1:],
+            "probability": x
+        }
+
+        if labels is not None:
+            if sample_weights is None:
+                sample_weights = labels.new_ones([mask.size(0)])
+            output["loss"], output["cycle_loss"] = self._loss(x, labels, mask, sample_weights)
+
+        return output
+
+    def _cycle_loss(self, pred: torch.Tensor):
+        BATCH_SIZE, _, _ = pred.size()
+        loss = pred.new_zeros(BATCH_SIZE)
+        # Index from 1, skipping the __ROOT__ token.
+        pred = pred.softmax(-1)[:, 1:, 1:]
+        x = pred
+        for i in range(self.cycle_loss_n):
+            loss += self._batch_trace(x)
+
+            # Don't multiply on the last iteration.
+            if i < self.cycle_loss_n - 1:
+                x = x.bmm(pred)
+
+        return loss
+
+    @staticmethod
+    def _batch_trace(x: torch.Tensor) -> torch.Tensor:
+        assert len(x.size()) == 3
+        BATCH_SIZE, N, M = x.size()
+        assert N == M
+        identity = torch.eye(N, dtype=x.dtype, device=x.device)
+        identity = identity.reshape((1, N, N))
+        batch_identity = identity.repeat(BATCH_SIZE, 1, 1)
+        return (x * batch_identity).sum((-1, -2))
+
+    def _loss(self, pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor,
+              sample_weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        BATCH_SIZE, N, M = pred.size()
+        assert N == M
+        SENTENCE_LENGTH = N
+
+        valid_positions = mask.sum()
+
+        result = []
+        # Ignore first pred dimension as it is ROOT token prediction
+        for i in range(SENTENCE_LENGTH - 1):
+            pred_i = pred[:, i + 1, :].reshape(BATCH_SIZE, SENTENCE_LENGTH)
+            true_i = true[:, i].reshape(-1)
+            mask_i = mask[:, i]
+            cross_entropy_loss = utils.masked_cross_entropy(pred_i, true_i, mask_i)
+            result.append(cross_entropy_loss)
+        cycle_loss = self._cycle_loss(pred)
+        loss = torch.stack(result).transpose(1, 0) * sample_weights.unsqueeze(-1)
+        return loss.sum() / valid_positions + cycle_loss.mean(), cycle_loss.mean()
+
+
+class DependencyRelationModel(base.Predictor):
+    """Dependency relation parsing model."""
+
+    def __init__(self,
+                 root_idx: int,
+                 head_predictor: HeadPredictionModel,
+                 head_projection_layer: base.Linear,
+                 dependency_projection_layer: base.Linear,
+                 relation_prediction_layer: base.Linear):
+        super().__init__()
+        self.root_idx = root_idx
+        self.head_predictor = head_predictor
+        self.head_projection_layer = head_projection_layer
+        self.dependency_projection_layer = dependency_projection_layer
+        self.relation_prediction_layer = relation_prediction_layer
+
+    def forward(self,
+                x: Union[torch.Tensor, List[torch.Tensor]],
+                mask: Optional[torch.BoolTensor] = None,
+                labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+                sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
+        device = x.device
+        if mask is not None:
+            mask = mask[:, 1:]
+        relations_labels, head_labels = None, None
+        if labels is not None and labels[0] is not None:
+            relations_labels, head_labels = labels
+            if mask is None:
+                mask = head_labels.new_ones(head_labels.size())
+
+        head_output = self.head_predictor(x, mask, head_labels, sample_weights)
+        head_pred = head_output["probability"]
+        head_pred_soft = F.softmax(head_pred, dim=-1)
+
+        head_rel_emb = self.head_projection_layer(x)
+
+        dep_rel_emb = self.dependency_projection_layer(x)
+
+        dep_rel_pred = head_pred_soft.bmm(head_rel_emb)
+        dep_rel_pred = torch.cat((dep_rel_pred, dep_rel_emb), dim=-1)
+        relation_prediction = self.relation_prediction_layer(dep_rel_pred)
+        output = head_output
+        output["embedding"] = dep_rel_pred
+
+        if self.training:
+            output["prediction"] = (relation_prediction.argmax(-1)[:, 1:], head_output["prediction"])
+        else:
+            # Mask root label whenever head is not 0.
+            relation_prediction_output = relation_prediction[:, 1:].clone()
+            mask = (head_output["prediction"] == 0)
+            vocab_size = relation_prediction_output.size(-1)
+            root_idx = torch.tensor([self.root_idx], device=device)
+            relation_prediction_output[mask] = (relation_prediction_output
+                                                .masked_select(mask.unsqueeze(-1))
+                                                .reshape(-1, vocab_size)
+                                                .index_fill(-1, root_idx, 10e10))
+            relation_prediction_output[~mask] = (relation_prediction_output
+                                                 .masked_select(~(mask.unsqueeze(-1)))
+                                                 .reshape(-1, vocab_size)
+                                                 .index_fill(-1, root_idx, -10e10))
+            output["prediction"] = (relation_prediction_output.argmax(-1), head_output["prediction"])
+
+        if labels is not None and labels[0] is not None:
+            if sample_weights is None:
+                sample_weights = labels.new_ones([mask.size(0)])
+            loss = self._loss(relation_prediction[:, 1:], relations_labels, mask, sample_weights)
+            output["loss"] = (loss, head_output["loss"])
+
+        return output
+
+    @staticmethod
+    def _loss(pred: torch.Tensor,
+              true: torch.Tensor,
+              mask: torch.BoolTensor,
+              sample_weights: torch.Tensor) -> torch.Tensor:
+
+        valid_positions = mask.sum()
+
+        BATCH_SIZE, _, DEPENDENCY_RELATIONS = pred.size()
+        pred = pred.reshape(-1, DEPENDENCY_RELATIONS)
+        true = true.reshape(-1)
+        mask = mask.reshape(-1)
+        loss = utils.masked_cross_entropy(pred, true, mask)
+        loss = loss.reshape(BATCH_SIZE, -1) * sample_weights.unsqueeze(-1)
+        return loss.sum() / valid_positions
+
+    @classmethod
+    def from_vocab(cls,
+                   vocab: data.Vocabulary,
+                   vocab_namespace: str,
+                   head_predictor: HeadPredictionModel,
+                   head_projection_layer: base.Linear,
+                   dependency_projection_layer: base.Linear
+                   ):
+        """Creates parser combining model configuration and vocabulary data."""
+        assert vocab_namespace in vocab.get_namespaces()
+        relation_prediction_layer = base.Linear(
+            in_features=head_projection_layer.get_output_dim() + dependency_projection_layer.get_output_dim(),
+            out_features=vocab.get_vocab_size(vocab_namespace)
+        )
+        return cls(
+            head_predictor=head_predictor,
+            head_projection_layer=head_projection_layer,
+            dependency_projection_layer=dependency_projection_layer,
+            relation_prediction_layer=relation_prediction_layer,
+            root_idx=vocab.get_token_index("root", vocab_namespace)
+        )
diff --git a/combo/models/utils.py b/combo/models/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8366a7b264064fd51473643eed05cb852285891a
--- /dev/null
+++ b/combo/models/utils.py
@@ -0,0 +1,27 @@
+import torch
+import torch.nn.functional as F
+
+
+def masked_cross_entropy(pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
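+    # Note: log(mask + eps) adds ~0 to kept positions and a large negative constant
+    # to masked positions; the trailing multiplication by `mask` is what zeroes the
+    # loss contributed by padding.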
+    pred = pred + (mask.float().unsqueeze(-1) + 1e-45).log()
+    return F.cross_entropy(pred, true, reduction="none") * mask
+
+
+"""
+Adapted from AllenNLP
+"""
+def tiny_value_of_dtype(dtype: torch.dtype):
+    """
+    Returns a moderately tiny value for a given PyTorch data type that is used to avoid numerical
+    issues such as division by zero.
+    This is different from `info_value_of_dtype(dtype).tiny` because that value can cause NaN bugs.
+    Only supports floating point dtypes.
+    """
+    if not dtype.is_floating_point:
+        raise TypeError("Only supports floating point dtypes.")
+    if dtype == torch.float or dtype == torch.double:
+        return 1e-13
+    elif dtype == torch.half:
+        return 1e-4
+    else:
+        raise TypeError("Does not support dtype " + str(dtype))
\ No newline at end of file
diff --git a/combo/modules/seq2seq_encoder.py b/combo/modules/seq2seq_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..71413f3c5f0539caf337e10e7c05c09e50dd5c19
--- /dev/null
+++ b/combo/modules/seq2seq_encoder.py
@@ -0,0 +1,33 @@
+import torch
+
+
+class Seq2SeqEncoder(torch.nn.Module):
+    """
+    A `Seq2SeqEncoder` is a `Module` that takes as input a sequence of vectors and returns a
+    modified sequence of vectors.  Input shape : `(batch_size, sequence_length, input_dim)`; output
+    shape : `(batch_size, sequence_length, output_dim)`.
+
+    We add two methods to the basic `Module` API: `get_input_dim()` and `get_output_dim()`.
+    You might need this if you want to construct a `Linear` layer using the output of this encoder,
+    or to raise sensible errors for mismatched input dimensions.
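+
+    For example (assuming a concrete `encoder` subclass and a hypothetical
+    `num_labels`)::
+
+        projection = torch.nn.Linear(encoder.get_output_dim(), num_labels)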
+    """
+
+    def get_input_dim(self) -> int:
+        """
+        Returns the dimension of the vector input for each element in the sequence input
+        to a `Seq2SeqEncoder`. This is `not` the shape of the input tensor, but the
+        last element of that shape.
+        """
+        raise NotImplementedError
+
+    def get_output_dim(self) -> int:
+        """
+        Returns the dimension of each vector in the sequence output by this `Seq2SeqEncoder`.
+        This is `not` the shape of the returned tensor, but the last element of that shape.
+        """
+        raise NotImplementedError
+
+    def is_bidirectional(self) -> bool:
+        """
+        Returns `True` if this encoder is bidirectional.  If so, we assume the forward direction
+        of the encoder is the first half of the final dimension, and the backward direction is the
+        second half.
+        """
+        raise NotImplementedError
diff --git a/combo/nn/regularizers/regularizer_applicator.py b/combo/nn/regularizers/regularizer_applicator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae8d52d61a3615a6f45dab8c77166f7d4741b740
--- /dev/null
+++ b/combo/nn/regularizers/regularizer_applicator.py
@@ -0,0 +1,40 @@
+import re
+from typing import List, Tuple
+
+import torch
+
+from combo.nn.regularizers import Regularizer
+
+
+class RegularizerApplicator:
+    """
+    Applies regularizers to the parameters of a Module based on regex matches.
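+
+    A usage sketch (`L2Regularizer` stands in for any concrete `Regularizer`
+    subclass)::
+
+        applicator = RegularizerApplicator([("weight", L2Regularizer())])
+        penalty = applicator(model)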
+    """
+
+    def __init__(self, regexes: List[Tuple[str, Regularizer]] = None) -> None:
+        """
+        # Parameters
+        regexes : `List[Tuple[str, Regularizer]]`, optional (default = `None`)
+            A sequence of pairs (regex, Regularizer), where each Regularizer
+            applies to the parameters its regex matches (and that haven't previously
+            been matched).
+        """
+        self._regularizers = regexes or []
+
+    def __call__(self, module: torch.nn.Module) -> torch.Tensor:
+        """
+        # Parameters
+        module : `torch.nn.Module`, required
+            The module to regularize.
+        """
+        accumulator = 0.0
+        for name, parameter in module.named_parameters():
+            # We first check if the parameter needs gradient updates or not
+            if parameter.requires_grad:
+                # For each parameter find the first matching regex.
+                for regex, regularizer in self._regularizers:
+                    if re.search(regex, name):
+                        penalty = regularizer(parameter)
+                        accumulator = accumulator + penalty
+                        break
+        return accumulator
diff --git a/combo/nn/util.py b/combo/nn/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..69c8d017760606cfdbf4b09999d813e1932b71f7
--- /dev/null
+++ b/combo/nn/util.py
@@ -0,0 +1,259 @@
+"""
+Adapted from AllenNLP
+https://github.com/allenai/allennlp/blob/80fb6061e568cb9d6ab5d45b661e86eb61b92c82/allennlp/nn/util.py
+"""
+from typing import Union, Dict, Optional, List, Any
+
+import torch
+
+from combo.common.util import int_to_device
+from combo.utils import ConfigurationError
+
+
+def move_to_device(obj, device: Union[torch.device, int]):
+    """
+    Given a structure (possibly) containing Tensors,
+    move all the Tensors to the specified device (or do nothing, if they are already on
+    the target device).
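+
+    For example (`-1` maps to the CPU)::
+
+        batch = {"tokens": torch.zeros(2, 3), "lengths": [3, 2]}
+        batch = move_to_device(batch, -1)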
+    """
+    device = int_to_device(device)
+
+    if isinstance(obj, torch.Tensor):
+        # You may be wondering why we don't just always call `obj.to(device)` since that would
+        # be a no-op anyway if `obj` is already on `device`. Well that works fine except
+        # when PyTorch is not compiled with CUDA support, in which case even calling
+        # `obj.to(torch.device("cpu"))` would result in an error.
+        return obj if obj.device == device else obj.to(device=device)
+    elif isinstance(obj, dict):
+        for key, value in obj.items():
+            obj[key] = move_to_device(value, device)
+        return obj
+    elif isinstance(obj, list):
+        for i, item in enumerate(obj):
+            obj[i] = move_to_device(item, device)
+        return obj
+    elif isinstance(obj, tuple) and hasattr(obj, "_fields"):
+        # This is the best way to detect a NamedTuple, it turns out.
+        return obj.__class__(*(move_to_device(item, device) for item in obj))
+    elif isinstance(obj, tuple):
+        return tuple(move_to_device(item, device) for item in obj)
+    else:
+        return obj
+
+
+def device_mapping(cuda_device: int):
+    """
+    In order to `torch.load()` a GPU-trained model onto a CPU (or specific GPU),
+    you have to supply a `map_location` function. Call this with
+    the desired `cuda_device` to get the function that `torch.load()` needs.
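+
+    For example (the weights path is illustrative)::
+
+        state = torch.load("best.th", map_location=device_mapping(-1))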
+    """
+
+    def inner_device_mapping(storage: torch.Storage, location) -> torch.Storage:
+        if cuda_device >= 0:
+            return storage.cuda(cuda_device)
+        else:
+            return storage
+
+    return inner_device_mapping
+
+
+def get_lengths_from_binary_sequence_mask(mask: torch.BoolTensor) -> torch.LongTensor:
+    """
+    Compute sequence lengths for each batch element in a tensor using a
+    binary mask.
+    # Parameters
+    mask : `torch.BoolTensor`, required.
+        A 2D binary mask of shape (batch_size, sequence_length) to
+        calculate the per-batch sequence lengths from.
+    # Returns
+    `torch.LongTensor`
+        A torch.LongTensor of shape (batch_size,) representing the lengths
+        of the sequences in the batch.
+    """
+    return mask.sum(-1)
+
+
+def sort_batch_by_length(tensor: torch.Tensor, sequence_lengths: torch.Tensor):
+    """
+    Sort a batch first tensor by some specified lengths.
+    # Parameters
+    tensor : `torch.FloatTensor`, required.
+        A batch first Pytorch tensor.
+    sequence_lengths : `torch.LongTensor`, required.
+        A tensor representing the lengths of some dimension of the tensor which
+        we want to sort by.
+    # Returns
+    sorted_tensor : `torch.FloatTensor`
+        The original tensor sorted along the batch dimension with respect to sequence_lengths.
+    sorted_sequence_lengths : `torch.LongTensor`
+        The original sequence_lengths sorted by decreasing size.
+    restoration_indices : `torch.LongTensor`
+        Indices into the sorted_tensor such that
+        `sorted_tensor.index_select(0, restoration_indices) == original_tensor`
+    permutation_index : `torch.LongTensor`
+        The indices used to sort the tensor. This is useful if you want to sort many
+        tensors using the same ordering.
+    """
+
+    if not isinstance(tensor, torch.Tensor) or not isinstance(sequence_lengths, torch.Tensor):
+        raise ConfigurationError("Both the tensor and sequence lengths must be torch.Tensors.")
+
+    sorted_sequence_lengths, permutation_index = sequence_lengths.sort(0, descending=True)
+    sorted_tensor = tensor.index_select(0, permutation_index)
+
+    index_range = torch.arange(0, len(sequence_lengths), device=sequence_lengths.device)
+    # This is the equivalent of zipping with index, sorting by the original
+    # sequence lengths and returning the now sorted indices.
+    _, reverse_mapping = permutation_index.sort(0, descending=False)
+    restoration_indices = index_range.index_select(0, reverse_mapping)
+    return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index
+
+def get_text_field_mask(
+    text_field_tensors: Dict[str, Dict[str, torch.Tensor]],
+    num_wrapping_dims: int = 0,
+    padding_id: int = 0,
+) -> torch.BoolTensor:
+    """
+    Takes the dictionary of tensors produced by a `TextField` and returns a mask
+    with 0 where the tokens are padding, and 1 otherwise. `padding_id` specifies the id of padding tokens.
+    We also handle `TextFields` wrapped by an arbitrary number of `ListFields`, where the number of wrapping
+    `ListFields` is given by `num_wrapping_dims`.
+    If `num_wrapping_dims == 0`, the returned mask has shape `(batch_size, num_tokens)`.
+    If `num_wrapping_dims > 0` then the returned mask has `num_wrapping_dims` extra
+    dimensions, so the shape will be `(batch_size, ..., num_tokens)`.
+    There could be several entries in the tensor dictionary with different shapes (e.g., one for
+    word ids, one for character ids).  In order to get a token mask, we use the tensor in
+    the dictionary with the lowest number of dimensions.  After subtracting `num_wrapping_dims`,
+    if this tensor has two dimensions we assume it has shape `(batch_size, ..., num_tokens)`,
+    and use it for the mask.  If instead it has three dimensions, we assume it has shape
+    `(batch_size, ..., num_tokens, num_features)`, and sum over the last dimension to produce
+    the mask.  Most frequently this will be a character id tensor, but it could also be a
+    featurized representation of each token, etc.
+    If the input `text_field_tensors` contains the "mask" key, this is returned instead of inferring the mask.
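+
+    A shape sketch (hypothetical single-indexer input)::
+
+        tensors = {"tokens": {"tokens": torch.tensor([[5, 3, 0]])}}
+        get_text_field_mask(tensors)  # tensor([[True, True, False]])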
+    """
+    masks = []
+    for indexer_name, indexer_tensors in text_field_tensors.items():
+        if "mask" in indexer_tensors:
+            masks.append(indexer_tensors["mask"].bool())
+    if len(masks) == 1:
+        return masks[0]
+    elif len(masks) > 1:
+        # TODO(mattg): My guess is this will basically never happen, so I'm not writing logic to
+        # handle it.  Should be straightforward to handle, though.  If you see this error in
+        # practice, open an issue on github.
+        raise ValueError("found two mask outputs; not sure which to use!")
+
+    tensor_dims = [
+        (tensor.dim(), tensor)
+        for indexer_output in text_field_tensors.values()
+        for tensor in indexer_output.values()
+    ]
+    tensor_dims.sort(key=lambda x: x[0])
+
+    smallest_dim = tensor_dims[0][0] - num_wrapping_dims
+    if smallest_dim == 2:
+        token_tensor = tensor_dims[0][1]
+        return token_tensor != padding_id
+    elif smallest_dim == 3:
+        character_tensor = tensor_dims[0][1]
+        return (character_tensor != padding_id).any(dim=-1)
+    else:
+        raise ValueError("Expected a tensor with dimension 2 or 3, found {}".format(smallest_dim))
+
+
+def get_dropout_mask(dropout_probability: float, tensor_for_masking: torch.Tensor):
+    """
+    Computes and returns an element-wise dropout mask for a given tensor, where
+    each element in the mask is dropped out with probability dropout_probability.
+    Note that the mask is NOT applied to the tensor - the tensor is passed to retain
+    the correct CUDA tensor type for the mask.
+    # Parameters
+    dropout_probability : `float`, required.
+        Probability of dropping a dimension of the input.
+    tensor_for_masking : `torch.Tensor`, required.
+    # Returns
+    `torch.FloatTensor`
+        A torch.FloatTensor consisting of the binary mask scaled by 1/ (1 - dropout_probability).
+        This scaling ensures expected values and variances of the output of applying this mask
+        and the original tensor are the same.
+    """
+    binary_mask = (torch.rand(tensor_for_masking.size()) > dropout_probability).to(
+        tensor_for_masking.device
+    )
+    # Scale mask by 1/keep_prob to preserve output statistics.
+    dropout_mask = binary_mask.float().div(1.0 - dropout_probability)
+    return dropout_mask
+
+
+def find_embedding_layer(model: torch.nn.Module) -> torch.nn.Module:
+    """
+    Takes a model (typically an AllenNLP `Model`, but this works for any `torch.nn.Module`) and
+    makes a best guess about which module is the embedding layer.  For typical AllenNLP models,
+    this often is the `TextFieldEmbedder`, but if you're using a pre-trained contextualizer, we
+    really want layer 0 of that contextualizer, not the output.  So there are a bunch of hacks in
+    here for specific pre-trained contextualizers.
+    """
+    # We look for a few special cases of pre-trained contextualizers below. The generic
+    # AllenNLP fallback (searching for a `TextFieldEmbedder`) has not been ported here,
+    # so we return `None` when no special case matches.
+    from transformers.models.gpt2.modeling_gpt2 import GPT2Model
+    from transformers.models.bert.modeling_bert import BertEmbeddings
+    from transformers.models.albert.modeling_albert import AlbertEmbeddings
+    from transformers.models.roberta.modeling_roberta import RobertaEmbeddings
+
+    for module in model.modules():
+        if isinstance(module, BertEmbeddings):
+            return module.word_embeddings
+        if isinstance(module, RobertaEmbeddings):
+            return module.word_embeddings
+        if isinstance(module, AlbertEmbeddings):
+            return module.word_embeddings
+        if isinstance(module, GPT2Model):
+            return module.wte
+
+    return None
+
+
+def get_token_offsets_from_text_field_inputs(
+    text_field_inputs: List[Any],
+) -> Optional[torch.Tensor]:
+    """
+    Given a list of inputs to a TextFieldEmbedder, tries to find token offsets from those inputs, if
+    there are any.  You will have token offsets if you are using a mismatched token embedder; if
+    you're not, the return value from this function should be None.  This function is intended to be
+    called from a `forward_hook` attached to a `TextFieldEmbedder`, so the inputs are formatted just
+    as a list.
+    It's possible in theory that you could have multiple offsets as inputs to a single call to a
+    `TextFieldEmbedder`, but that's an extremely rare use case (I can't really imagine anyone
+    wanting to do that).  In that case, we'll only return the first one.  If you need different
+    behavior for your model, open an issue on github describing what you're doing.
+    """
+    for input_index, text_field_input in enumerate(text_field_inputs):
+        if not isinstance(text_field_input, dict):
+            continue
+        for input_value in text_field_input.values():
+            if not isinstance(input_value, dict):
+                continue
+            for embedder_arg_name, embedder_arg_value in input_value.items():
+                if embedder_arg_name == "offsets":
+                    return embedder_arg_value
+    return None
+
diff --git a/combo/training/trainer.py b/combo/training/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebb8ff0c3d33f593a8cf80cdd7720d41a8189cc1
--- /dev/null
+++ b/combo/training/trainer.py
@@ -0,0 +1,13 @@
+from pytorch_lightning import Trainer
+
+
+class Callback:
+    pass
+
+
+class TransferPatienceEpochCallback:
+    pass
+
+
+class GradientDescentTrainer(Trainer):
+    pass
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..5596b44786f04e4810aefe9f8d712f08ed310f71
--- /dev/null
+++ b/main.py
@@ -0,0 +1,16 @@
+# This is a sample Python script.
+
+# Press Shift+F10 to execute it or replace it with your code.
+# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
+
+
+def print_hi(name):
+    # Use a breakpoint in the code line below to debug your script.
+    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.
+
+
+# Press the green button in the gutter to run the script.
+if __name__ == '__main__':
+    print_hi('PyCharm')
+
+# See PyCharm help at https://www.jetbrains.com/help/pycharm/