diff --git a/combo/data/api.py b/combo/data/api.py
index 4ab7f1a33de77ccf4b17c284777200e39c668081..bfec5eeefac8d41f1a503eb314523fcc34b80c3a 100644
--- a/combo/data/api.py
+++ b/combo/data/api.py
@@ -21,12 +21,13 @@ class Token:
     deps: Optional[str] = None
     misc: Optional[str] = None
     semrel: Optional[str] = None
+    embeddings: Dict[str, List[float]] = field(default_factory=dict, repr=False)  # field name -> vector
 
 
 @dataclass
 class Sentence:
     tokens: List[Token] = field(default_factory=list)
-    sentence_embedding: List[float] = field(default_factory=list)
+    sentence_embedding: List[float] = field(default_factory=list, repr=False)
     metadata: Dict[str, Any] = field(default_factory=collections.OrderedDict)
 
     def to_json(self):
@@ -77,14 +78,16 @@ def tokens2conllu(tokens: List[str]) -> conllu.TokenList:
 
 
 def conllu2sentence(conllu_sentence: conllu.TokenList,
-                    sentence_embedding=None) -> Sentence:
+                    sentence_embedding=None, embeddings=None) -> Sentence:
+    if embeddings is None:
+        embeddings = {}
     if sentence_embedding is None:
         sentence_embedding = []
     tokens = []
-    for token in conllu_sentence.tokens:
+    for idx, token in enumerate(conllu_sentence.tokens):
         tokens.append(
             Token(
-                **token
+                **token, embeddings=embeddings[idx] if idx < len(embeddings) else {}
             )
         )
     return Sentence(
diff --git a/combo/models/base.py b/combo/models/base.py
index a5cb5fe61f85a98f78d143a54695d01948aa8dda..234fbcaaf739b84fe33bb8633411a8aeb0276b5a 100644
--- a/combo/models/base.py
+++ b/combo/models/base.py
@@ -1,11 +1,10 @@
-from typing import Dict, Optional, List, Union
+from typing import Dict, Optional, List, Union, Tuple
 
 import torch
 import torch.nn as nn
 from allennlp import common, data
 from allennlp import nn as allen_nn
 from allennlp.common import checks
-from allennlp.modules import feedforward
 from allennlp.nn import Activation
 
 from combo.models import utils
@@ -51,7 +50,7 @@ class Linear(nn.Linear, common.FromParams):
 class FeedForwardPredictor(Predictor):
     """Feedforward predictor. Should be used on top of Seq2Seq encoder."""
 
-    def __init__(self, feedforward_network: feedforward.FeedForward):
+    def __init__(self, feedforward_network: "FeedForward"):
         super().__init__()
         self.feedforward_network = feedforward_network
 
@@ -63,10 +62,11 @@ class FeedForwardPredictor(Predictor):
         if mask is None:
             mask = x.new_ones(x.size()[:-1])
 
-        x = self.feedforward_network(x)
+        x, feature_maps = self.feedforward_network(x)
         output = {
             "prediction": x.argmax(-1),
-            "probability": x
+            "probability": x,
+            "embedding": feature_maps[-1],
         }
 
         if labels is not None:
@@ -109,9 +109,112 @@ class FeedForwardPredictor(Predictor):
             f"There is not {vocab_namespace} in created vocabs, check if this field has any values to predict!"
         hidden_dims = hidden_dims + [vocab.get_vocab_size(vocab_namespace)]
 
-        return cls(feedforward.FeedForward(
+        return cls(FeedForward(
             input_dim=input_dim,
             num_layers=num_layers,
             hidden_dims=hidden_dims,
             activations=activations,
             dropout=dropout))
+
+
+class FeedForward(torch.nn.Module, common.FromParams):
+    """
+    Feed-forward network adapted from allennlp.modules.feedforward.FeedForward.
+
+    A stack of `Linear` layers, each followed by its activation and dropout.  On top of
+    the final output, `forward` also returns the tensor that was fed into every layer
+    (first entry: the original input, last entry: the input of the final layer), so
+    callers can read intermediate representations.
+
+    # Parameters
+
+    input_dim : `int`, required
+        Number of input features expected by the first `Linear` layer.
+    num_layers : `int`, required
+        How many `Linear` layers are stacked.
+    hidden_dims : `Union[int, List[int]]`, required
+        Output size of each `Linear` layer.  A single `int` is repeated for every layer;
+        a `List[int]` must contain exactly `num_layers` entries.
+    activations : `Union[Activation, List[Activation]]`, required
+        Activation applied after each `Linear` layer.  A single activation is reused for
+        every layer; a list must contain exactly `num_layers` entries.  Activations are
+        stored in a `torch.nn.ModuleList`, so each one has to be a `torch.nn.Module`.
+    dropout : `Union[float, List[float]]`, optional (default = `0.0`)
+        Dropout probability applied after each activation.  A single `float` is repeated
+        for every layer; a list must contain exactly `num_layers` entries.
+
+    # Examples
+
+    ```python
+    FeedForward(124, 2, [64, 32], torch.nn.ReLU(), 0.2)
+    #> FeedForward(
+    #>   (_activations): ModuleList(
+    #>     (0): ReLU()
+    #>     (1): ReLU()
+    #>   )
+    #>   (_linear_layers): ModuleList(
+    #>     (0): Linear(in_features=124, out_features=64, bias=True)
+    #>     (1): Linear(in_features=64, out_features=32, bias=True)
+    #>   )
+    #>   (_dropout): ModuleList(
+    #>     (0): Dropout(p=0.2, inplace=False)
+    #>     (1): Dropout(p=0.2, inplace=False)
+    #>   )
+    #> )
+    ```
+    """
+
+    def __init__(
+        self,
+        input_dim: int,
+        num_layers: int,
+        hidden_dims: Union[int, List[int]],
+        activations: Union[Activation, List[Activation]],
+        dropout: Union[float, List[float]] = 0.0,
+    ) -> None:
+        super().__init__()
+        # Broadcast scalar settings so that every layer gets its own entry.
+        hidden_dims = hidden_dims if isinstance(hidden_dims, list) else [hidden_dims] * num_layers
+        activations = activations if isinstance(activations, list) else [activations] * num_layers
+        dropout = dropout if isinstance(dropout, list) else [dropout] * num_layers
+        per_layer_settings = (
+            ("hidden_dims", hidden_dims),
+            ("activations", activations),
+            ("dropout", dropout),
+        )
+        for setting_name, values in per_layer_settings:
+            if len(values) != num_layers:
+                raise checks.ConfigurationError(
+                    "len(%s) (%d) != num_layers (%d)" % (setting_name, len(values), num_layers)
+                )
+        # Sub-modules are registered in the order: activations, linear layers, dropout.
+        self._activations = torch.nn.ModuleList(activations)
+        layer_input_dims = [input_dim] + hidden_dims[:-1]
+        self._linear_layers = torch.nn.ModuleList(
+            [
+                torch.nn.Linear(in_dim, out_dim)
+                for in_dim, out_dim in zip(layer_input_dims, hidden_dims)
+            ]
+        )
+        self._dropout = torch.nn.ModuleList([torch.nn.Dropout(p=rate) for rate in dropout])
+        self._output_dim = hidden_dims[-1]
+        self.input_dim = input_dim
+
+    def get_output_dim(self):
+        """Return the output size of the last `Linear` layer."""
+        return self._output_dim
+
+    def get_input_dim(self):
+        """Return the `input_dim` this network was built with."""
+        return self.input_dim
+
+    def forward(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+        """Run all layers; return `(output, inputs_of_each_layer)`."""
+        hidden = inputs
+        layer_inputs = []
+        stages = zip(self._linear_layers, self._activations, self._dropout)
+        for linear, activate, drop in stages:
+            # Store what goes *into* the layer, so the last entry is the final layer's input.
+            layer_inputs.append(hidden)
+            hidden = drop(activate(linear(hidden)))
+        return hidden, layer_inputs
diff --git a/combo/models/model.py b/combo/models/model.py
index 9866bcb4fba41ed2506b2d33290e6cd0fe237d29..c648453db5ab951ed68573cf79094f1bf21286d2 100644
--- a/combo/models/model.py
+++ b/combo/models/model.py
@@ -129,6 +129,11 @@ class ComboModel(allen_models.Model):
             "enhanced_head": enhanced_head_pred,
             "enhanced_deprel": enhanced_relations_pred,
             "sentence_embedding": torch.max(encoder_emb, dim=1)[0],
+            "upostag_token_embedding": upos_output["embedding"],
+            "xpostag_token_embedding": xpos_output["embedding"],
+            "semrel_token_embedding": semrel_output["embedding"],
+            "feats_token_embedding": morpho_output["embedding"],
+            "deprel_token_embedding": parser_output["embedding"],
         }
 
         if "rel_probability" in enhanced_parser_output:
@@ -196,8 +201,8 @@ class ComboModel(allen_models.Model):
         if callable_model:
             return callable_model(*args, **kwargs)
         if returns_tuple:
-            return {"prediction": (None, None), "loss": (None, None)}
-        return {"prediction": None, "loss": None}
+            return {"prediction": (None, None), "loss": (None, None), "embedding": (None, None)}
+        return {"prediction": None, "loss": None, "embedding": None}
 
     @staticmethod
     def _clean(output):
diff --git a/combo/models/morpho.py b/combo/models/morpho.py
index ea3451dcd0a1e5656dff718e9988ef7ed4406500..b0d307932937da398e8e265b622190422b00a15a 100644
--- a/combo/models/morpho.py
+++ b/combo/models/morpho.py
@@ -4,7 +4,6 @@ from typing import Dict, List, Optional, Union
 import torch
 from allennlp import data
 from allennlp.common import checks
-from allennlp.modules import feedforward
 from allennlp.nn import Activation
 
 from combo.data import dataset
@@ -15,7 +14,7 @@ from combo.models import base, utils
 class MorphologicalFeatures(base.Predictor):
     """Morphological features predicting model."""
 
-    def __init__(self, feedforward_network: feedforward.FeedForward, slices: Dict[str, List[int]]):
+    def __init__(self, feedforward_network: base.FeedForward, slices: Dict[str, List[int]]):
         super().__init__()
         self.feedforward_network = feedforward_network
         self.slices = slices
@@ -28,7 +27,7 @@ class MorphologicalFeatures(base.Predictor):
         if mask is None:
             mask = x.new_ones(x.size()[:-1])
 
-        x = self.feedforward_network(x)
+        x, feature_maps = self.feedforward_network(x)
 
         prediction = []
         for _, cat_indices in self.slices.items():
@@ -36,7 +35,8 @@ class MorphologicalFeatures(base.Predictor):
 
         output = {
             "prediction": torch.stack(prediction, dim=-1),
-            "probability": x
+            "probability": x,
+            "embedding": feature_maps[-1],
         }
 
         if labels is not None:
@@ -92,7 +92,7 @@ class MorphologicalFeatures(base.Predictor):
         slices = dataset.get_slices_if_not_provided(vocab)
 
         return cls(
-            feedforward_network=feedforward.FeedForward(
+            feedforward_network=base.FeedForward(
                 input_dim=input_dim,
                 num_layers=num_layers,
                 hidden_dims=hidden_dims,
diff --git a/combo/models/parser.py b/combo/models/parser.py
index 511edffc2f8d17edbc3fd0702e6425a4ec645e4e..b16f0adcff066c39558cb8709122780d69ee8702 100644
--- a/combo/models/parser.py
+++ b/combo/models/parser.py
@@ -153,6 +153,7 @@ class DependencyRelationModel(base.Predictor):
         dep_rel_pred = torch.cat((dep_rel_pred, dep_rel_emb), dim=-1)
         relation_prediction = self.relation_prediction_layer(dep_rel_pred)
         output = head_output
+        output["embedding"] = dep_rel_pred  # NOTE(review): unlike the prediction below, not sliced [:, 1:] - verify
 
         if self.training:
             output["prediction"] = (relation_prediction.argmax(-1)[:, 1:], head_output["prediction"])
diff --git a/combo/predict.py b/combo/predict.py
index e528a186a287c5b9d0f971d6f091240a40231959..f580c01cbd7a98766d528238a292f4194d0fbdba 100644
--- a/combo/predict.py
+++ b/combo/predict.py
@@ -82,8 +82,8 @@ class COMBO(predictor.Predictor):
         sentences = []
         predictions = super().predict_batch_instance(instances)
         for prediction, instance in zip(predictions, instances):
-            tree, sentence_embedding = self._predictions_as_tree(prediction, instance)
-            sentence = conllu2sentence(tree, sentence_embedding)
+            tree, sentence_embedding, embeddings = self._predictions_as_tree(prediction, instance)
+            sentence = conllu2sentence(tree, sentence_embedding, embeddings)
             sentences.append(sentence)
         return sentences
 
@@ -96,8 +96,8 @@ class COMBO(predictor.Predictor):
     @overrides
     def predict_instance(self, instance: allen_data.Instance, serialize: bool = True) -> data.Sentence:
         predictions = super().predict_instance(instance)
-        tree, sentence_embedding = self._predictions_as_tree(predictions, instance)
-        return conllu2sentence(tree, sentence_embedding)
+        tree, sentence_embedding, embeddings = self._predictions_as_tree(predictions, instance)
+        return conllu2sentence(tree, sentence_embedding, embeddings)
 
     @overrides
     def predict_json(self, inputs: common.JsonDict) -> data.Sentence:
@@ -141,6 +141,7 @@ class COMBO(predictor.Predictor):
         tree = instance.fields["metadata"]["input"]
         field_names = instance.fields["metadata"]["field_names"]
         tree_tokens = [t for t in tree if isinstance(t["id"], int)]
+        embeddings = [{} for _ in range(len(tree_tokens))]
         for field_name in field_names:
             if field_name not in predictions:
                 continue
@@ -149,6 +150,7 @@ class COMBO(predictor.Predictor):
                 if field_name in {"xpostag", "upostag", "semrel", "deprel"}:
                     value = self.vocab.get_token_from_index(field_predictions[idx], field_name + "_labels")
                     token[field_name] = value
+                    embeddings[idx][field_name] = predictions[f"{field_name}_token_embedding"][idx]
                 elif field_name == "head":
                     token[field_name] = int(field_predictions[idx])
                 elif field_name == "deps":
@@ -174,6 +176,7 @@ class COMBO(predictor.Predictor):
                         field_value = "|".join(np.array(features)[arg_indices].tolist())
 
                     token[field_name] = field_value
+                    embeddings[idx][field_name] = predictions[f"{field_name}_token_embedding"][idx]
                 elif field_name == "lemma":
                     prediction = field_predictions[idx]
                     word_chars = []
@@ -206,7 +209,7 @@ class COMBO(predictor.Predictor):
             empty_tokens = graph.restore_collapse_edges(tree_tokens)
             tree.tokens.extend(empty_tokens)
 
-        return tree, predictions["sentence_embedding"]
+        return tree, predictions["sentence_embedding"], embeddings
 
     @classmethod
     def with_spacy_tokenizer(cls, model: models.Model,