Commit 6d373257 authored by Mateusz Klimaszewski, committed by Mateusz Klimaszewski

Passing training for enhanced dependency parsing.

parent 07ffff19
2 merge requests: !9 Enhanced dependency parsing (develop to master), !8 Enhanced dependency parsing
import copy
import logging
from typing import Union, List, Dict, Iterable, Optional, Any, Tuple
......@@ -115,29 +116,34 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
fields_[target_name] = allen_fields.SequenceLabelField(target_values, text_field,
label_namespace=target_name + "_labels")
elif target_name == "deps":
heads = [0 if t["head"] == "_" else int(t["head"]) for t in tree_tokens]
deprels = [t["deprel"] for t in tree_tokens]
# Graphs require adding ROOT (AdjacencyField uses sequence length from TextField).
text_field_deps = copy.deepcopy(text_field)
text_field_deps.tokens.insert(0, _Token("ROOT"))
enhanced_heads: List[Tuple[int, int]] = []
enhanced_deprels: List[str] = []
for idx, t in enumerate(tree_tokens):
enhanced_heads.append((idx, heads[idx]))
enhanced_deprels.append(deprels[idx])
t_deps = t["deps"]
if t_deps and t_deps != "_":
t_deprels, t_heads = zip(*t_deps)
enhanced_heads.extend([(idx, t) for t in t_heads])
enhanced_deprels.extend(t_deprels)
for rel, head in t_deps:
# EmoryNLP skips the first edge if there are two edges between the same pair of nodes,
# so that one edge ends up in the tree and the other in the graph.
# This snippet follows that approach.
if enhanced_heads and enhanced_heads[-1] == (idx, head):
enhanced_heads.pop()
enhanced_deprels.pop()
enhanced_heads.append((idx, head))
enhanced_deprels.append(rel)
fields_["enhanced_heads"] = allen_fields.AdjacencyField(
indices=enhanced_heads,
sequence_field=text_field,
sequence_field=text_field_deps,
label_namespace="enhanced_heads_labels",
padding_value=0,
)
fields_["enhanced_deprels"] = allen_fields.AdjacencyField(
indices=enhanced_heads,
sequence_field=text_field,
sequence_field=text_field_deps,
labels=enhanced_deprels,
# Label namespace should match regular tree parsing.
# Label namespace matches regular tree parsing.
label_namespace="deprel_labels",
padding_value=0,
)
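For reference, a minimal self-contained sketch of the indexing logic above, using hypothetical toy token dicts instead of the conllu objects the reader actually receives; it shows how the tree heads plus the deps column become the (dependent, head) pairs and labels handed to both AdjacencyFields:

# Toy input (hypothetical): token 2 carries one extra graph edge, 1:mod.
tree_tokens = [
    {"head": 2, "deprel": "amod", "deps": []},            # token 1
    {"head": 0, "deprel": "root", "deps": [("mod", 1)]},  # token 2
]
heads = [t["head"] for t in tree_tokens]
enhanced_heads, enhanced_deprels = [], []
for idx, t in enumerate(tree_tokens):
    enhanced_heads.append((idx, heads[idx]))              # tree edge
    enhanced_deprels.append(t["deprel"])
    for rel, head in t["deps"]:
        if enhanced_heads and enhanced_heads[-1] == (idx, head):
            # Duplicate of the tree edge: keep only the graph label (EmoryNLP convention).
            enhanced_heads.pop()
            enhanced_deprels.pop()
        enhanced_heads.append((idx, head))                 # graph edge
        enhanced_deprels.append(rel)
print(enhanced_heads)    # [(0, 2), (1, 0), (1, 1)]
print(enhanced_deprels)  # ['amod', 'root', 'mod']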
......@@ -160,7 +166,9 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
token["feats"] = field
# metadata
fields_["metadata"] = allen_fields.MetadataField({"input": tree, "field_names": self.fields})
fields_["metadata"] = allen_fields.MetadataField({"input": tree,
"field_names": self.fields,
"tokens": tokens})
return allen_data.Instance(fields_)
......
......@@ -33,8 +33,10 @@ flags.DEFINE_string(name="output_file", default="output.log",
# Training flags
flags.DEFINE_list(name="training_data_path", default="./tests/fixtures/example.conllu",
help="Training data path(s)")
flags.DEFINE_alias(name="training_data", original_name="training_data_path")
flags.DEFINE_list(name="validation_data_path", default="",
help="Validation data path(s)")
flags.DEFINE_alias(name="validation_data", original_name="validation_data_path")
flags.DEFINE_string(name="pretrained_tokens", default="",
help="Pretrained tokens embeddings path")
flags.DEFINE_integer(name="embedding_dim", default=300,
......
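A minimal standalone sketch (with a throwaway, hypothetical flag set) of how the new aliases behave: the alias parses straight into the original flag, so --training_data and --training_data_path are interchangeable on the command line.

from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_list(name="training_data_path", default=[], help="Training data path(s)")
flags.DEFINE_alias(name="training_data", original_name="training_data_path")

FLAGS(["prog", "--training_data=a.conllu,b.conllu"])
print(FLAGS.training_data_path)  # ['a.conllu', 'b.conllu']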
"""Models module."""
from .base import FeedForwardPredictor
from .graph_parser import GraphDependencyRelationModel
from .parser import DependencyRelationModel
from .embeddings import CharacterBasedWordEmbeddings
from .encoder import ComboEncoder
......
"""Enhanced dependency parsing models."""
from typing import Tuple, Dict, Optional, Union, List
import numpy as np
import torch
import torch.nn.functional as F
from allennlp import data
from allennlp.nn import chu_liu_edmonds
from combo.models import base, utils
class GraphHeadPredictionModel(base.Predictor):
"""Head prediction model."""
def __init__(self,
head_projection_layer: base.Linear,
dependency_projection_layer: base.Linear,
cycle_loss_n: int = 0,
graph_weighting: float = 0.2):
super().__init__()
self.head_projection_layer = head_projection_layer
self.dependency_projection_layer = dependency_projection_layer
self.cycle_loss_n = cycle_loss_n
self.graph_weighting = graph_weighting
def forward(self,
x: Union[torch.Tensor, List[torch.Tensor]],
labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
mask: Optional[torch.BoolTensor] = None,
sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
if mask is None:
mask = x.new_ones(x.size()[-1])
heads_labels = None
if labels is not None and labels[0] is not None:
heads_labels = labels
head_arc_emb = self.head_projection_layer(x)
dep_arc_emb = self.dependency_projection_layer(x)
x = dep_arc_emb.bmm(head_arc_emb.transpose(2, 1))
pred = x.sigmoid() > 0.5
output = {
"prediction": pred,
"probability": x
}
if heads_labels is not None:
if sample_weights is None:
sample_weights = heads_labels.new_ones([mask.size(0)])
output["loss"], output["cycle_loss"] = self._loss(x, heads_labels, mask, sample_weights)
return output
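A toy sketch (hypothetical numbers) of the arc decision above: each candidate arc gets an independent score from the dot product of the dependent and head projections, and any arc whose sigmoid exceeds 0.5 is kept, so a token may receive several heads, which is exactly what the enhanced graph needs.

import torch

scores = torch.tensor([[-2.0, 1.5, 0.3],
                       [ 2.2, -1.0, 1.1],
                       [-0.5, 3.0, -2.0]])  # scores[d, h]: dependent d, candidate head h
print(scores.sigmoid() > 0.5)
# tensor([[False,  True,  True],
#         [ True, False,  True],
#         [False,  True, False]])  -- rows may contain more than one True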
def _cycle_loss(self, pred: torch.Tensor):
BATCH_SIZE, _, _ = pred.size()
loss = pred.new_zeros(BATCH_SIZE)
# Index from 1: use only non-__ROOT__ tokens
pred = pred.softmax(-1)[:, 1:, 1:]
x = pred
for i in range(self.cycle_loss_n):
loss += self._batch_trace(x)
# Don't multiply on the last iteration
if i < self.cycle_loss_n - 1:
x = x.bmm(pred)
return loss
@staticmethod
def _batch_trace(x: torch.Tensor) -> torch.Tensor:
assert len(x.size()) == 3
BATCH_SIZE, N, M = x.size()
assert N == M
identity = x.new_tensor(torch.eye(N))
identity = identity.reshape((1, N, N))
batch_identity = identity.repeat(BATCH_SIZE, 1, 1)
return (x * batch_identity).sum((-1, -2))
def _loss(self, pred: torch.Tensor, labels: torch.Tensor, mask: torch.BoolTensor,
sample_weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
BATCH_SIZE, N, M = pred.size()
assert N == M
SENTENCE_LENGTH = N
valid_positions = mask.sum()
result = []
true = labels
# Ignore first pred dimension as it is ROOT token prediction
for i in range(SENTENCE_LENGTH - 1):
pred_i = pred[:, i + 1, 1:].reshape(-1)
true_i = true[:, i + 1, 1:].reshape(-1)
mask_i = mask[:, i]
bce_loss = F.binary_cross_entropy_with_logits(pred_i, true_i, reduction="none").mean(-1) * mask_i
result.append(bce_loss)
cycle_loss = self._cycle_loss(pred)
loss = torch.stack(result).transpose(1, 0) * sample_weights.unsqueeze(-1)
return loss.sum() / valid_positions + cycle_loss.mean(), cycle_loss.mean()
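A minimal toy sketch (separate from the model) of what _cycle_loss measures: after a row-wise softmax, the trace of the k-th matrix power sums the probability mass sitting on length-k cycles, so mutual attachments between tokens are penalised.

import torch

logits = torch.tensor([[0.0, 5.0],
                       [5.0, 0.0]])        # two tokens pointing at each other
probs = logits.softmax(-1)                 # soft adjacency matrix
print(torch.trace(probs).item())           # ~0.01: almost no self-loops (length-1 cycles)
print(torch.trace(probs @ probs).item())   # ~1.97: a strong length-2 cycle that the loss accumulates

The model additionally drops the ROOT row and column after the softmax (pred.softmax(-1)[:, 1:, 1:]) before accumulating the traces.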
@base.Predictor.register("combo_graph_dependency_parsing_from_vocab", constructor="from_vocab")
class GraphDependencyRelationModel(base.Predictor):
"""Dependency relation parsing model."""
def __init__(self,
head_predictor: GraphHeadPredictionModel,
head_projection_layer: base.Linear,
dependency_projection_layer: base.Linear,
relation_prediction_layer: base.Linear):
super().__init__()
self.head_predictor = head_predictor
self.head_projection_layer = head_projection_layer
self.dependency_projection_layer = dependency_projection_layer
self.relation_prediction_layer = relation_prediction_layer
def forward(self,
x: Union[torch.Tensor, List[torch.Tensor]],
mask: Optional[torch.BoolTensor] = None,
labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
# if mask is not None:
# mask = mask[:, 1:]
relations_labels, head_labels, enhanced_heads_labels, enhanced_deprels_labels = None, None, None, None
if labels is not None and labels[0] is not None:
relations_labels, head_labels, enhanced_heads_labels = labels
# if mask is None:
# mask = head_labels.new_ones(head_labels.size())
head_output = self.head_predictor(x, enhanced_heads_labels, mask, sample_weights)
head_pred = head_output["probability"]
BATCH_SIZE, LENGTH, _ = head_pred.size()
head_rel_emb = self.head_projection_layer(x)
dep_rel_emb = self.dependency_projection_layer(x)
# All possible edge combinations for each batch
# Repeat interleave to have [emb1, emb1 ... (length times) ... emb1, emb2 ... ]
head_rel_pred = head_rel_emb.repeat_interleave(LENGTH, -2)
# Regular repeat to have all combinations [deprel1, deprel2, ... deprelL, deprel1 ...]
dep_rel_pred = dep_rel_emb.repeat(1, LENGTH, 1)
# All possible edge combinations for each batch
dep_rel_pred = torch.cat((head_rel_pred, dep_rel_pred), dim=-1)
relation_prediction = self.relation_prediction_layer(dep_rel_pred).reshape(BATCH_SIZE, LENGTH, LENGTH, -1)
output = head_output
output["prediction"] = (relation_prediction.argmax(-1), head_output["prediction"])
if labels is not None and labels[0] is not None:
if sample_weights is None:
sample_weights = labels.new_ones([mask.size(0)])
loss = self._loss(relation_prediction, relations_labels, enhanced_heads_labels, mask, sample_weights)
output["loss"] = (loss, head_output["loss"])
return output
@staticmethod
def _loss(pred: torch.Tensor,
true: torch.Tensor,
heads_true: torch.Tensor,
mask: torch.BoolTensor,
sample_weights: torch.Tensor) -> torch.Tensor:
true = true[true.long() > 0]
pred = pred[heads_true.long() == 1]
loss = F.cross_entropy(pred, true.long())
return loss.sum() / pred.size(0)
@classmethod
def from_vocab(cls,
vocab: data.Vocabulary,
vocab_namespace: str,
head_predictor: GraphHeadPredictionModel,
head_projection_layer: base.Linear,
dependency_projection_layer: base.Linear
):
"""Creates parser combining model configuration and vocabulary data."""
assert vocab_namespace in vocab.get_namespaces()
relation_prediction_layer = base.Linear(
in_features=head_projection_layer.get_output_dim() + dependency_projection_layer.get_output_dim(),
out_features=vocab.get_vocab_size(vocab_namespace)
)
return cls(
head_predictor=head_predictor,
head_projection_layer=head_projection_layer,
dependency_projection_layer=dependency_projection_layer,
relation_prediction_layer=relation_prediction_layer
)
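A hypothetical toy check of the pair-building trick used in forward() above: repeat_interleave on the head-projected embeddings and a plain repeat on the dependent-projected embeddings enumerate every (head, dependent) combination in the same order, so concatenating them yields one vector per candidate edge.

import torch

length = 3
head_emb = torch.tensor([[10.], [20.], [30.]]).unsqueeze(0)  # (batch=1, length, dim=1)
dep_emb = torch.tensor([[1.], [2.], [3.]]).unsqueeze(0)

head_rep = head_emb.repeat_interleave(length, -2)  # 10,10,10, 20,20,20, 30,30,30
dep_rep = dep_emb.repeat(1, length, 1)             # 1,2,3, 1,2,3, 1,2,3
pairs = torch.cat((head_rep, dep_rep), dim=-1)     # (1, length * length, 2)
print(pairs.reshape(length, length, 2))            # entry [i, j] pairs head i with dependent j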
......@@ -27,6 +27,7 @@ class SemanticMultitaskModel(allen_models.Model):
semantic_relation: Optional[base.Predictor] = None,
morphological_feat: Optional[base.Predictor] = None,
dependency_relation: Optional[base.Predictor] = None,
enhanced_dependency_relation: Optional[base.Predictor] = None,
regularizer: allen_nn.RegularizerApplicator = None) -> None:
super().__init__(vocab, regularizer)
self.text_field_embedder = text_field_embedder
......@@ -39,6 +40,7 @@ class SemanticMultitaskModel(allen_models.Model):
self.semantic_relation = semantic_relation
self.morphological_feat = morphological_feat
self.dependency_relation = dependency_relation
self.enhanced_dependency_relation = enhanced_dependency_relation
self._head_sentinel = torch.nn.Parameter(torch.randn([1, 1, self.seq_encoder.get_output_dim()]))
self.scores = metrics.SemanticMetrics()
self._partial_losses = None
......@@ -106,7 +108,14 @@ class SemanticMultitaskModel(allen_models.Model):
mask=word_mask,
labels=(deprel, head),
sample_weights=sample_weights)
enhanced_parser_output = self._optional(self.enhanced_dependency_relation,
encoder_emb,
returns_tuple=True,
mask=word_mask,
labels=(enhanced_deprels, head, enhanced_heads),
sample_weights=sample_weights)
relations_pred, head_pred = parser_output["prediction"]
enhanced_relations_pred, enhanced_head_pred = enhanced_parser_output["prediction"]
output = {
"upostag": upos_output["prediction"],
"xpostag": xpos_output["prediction"],
......@@ -115,6 +124,8 @@ class SemanticMultitaskModel(allen_models.Model):
"lemma": lemma_output["prediction"],
"head": head_pred,
"deprel": relations_pred,
"enhanced_head": enhanced_head_pred,
"enhanced_deprel": enhanced_relations_pred,
"sentence_embedding": torch.max(encoder_emb[:, 1:], dim=1)[0],
}
......@@ -136,9 +147,12 @@ class SemanticMultitaskModel(allen_models.Model):
"lemma": lemma.get("char").get("token_characters") if lemma else None,
"head": head,
"deprel": deprel,
"enhanced_head": enhanced_heads,
"enhanced_deprel": enhanced_deprels,
}
self.scores(output, labels, word_mask[:, 1:])
relations_loss, head_loss = parser_output["loss"]
enhanced_relations_loss, enhanced_head_loss = enhanced_parser_output["loss"]
losses = {
"upostag_loss": upos_output["loss"],
"xpostag_loss": xpos_output["loss"],
......@@ -147,6 +161,8 @@ class SemanticMultitaskModel(allen_models.Model):
"lemma_loss": lemma_output["loss"],
"head_loss": head_loss,
"deprel_loss": relations_loss,
"enhanced_head_loss": enhanced_head_loss,
"enhanced_deprel_loss": enhanced_relations_loss,
# Cycle loss is reported only for metrics purposes.
"cycle_loss": parser_output.get("cycle_loss")
}
......
......@@ -154,6 +154,10 @@ class SemanticMultitaskPredictor(predictor.Predictor):
token[field_name] = value
elif field_name in ["head"]:
token[field_name] = int(predictions[field_name][idx])
elif field_name == "deps":
# Handled after all other fields are decoded
continue
elif field_name in ["feats"]:
slices = self._model.morphological_feat.slices
features = []
......@@ -171,8 +175,6 @@ class SemanticMultitaskPredictor(predictor.Predictor):
field_value = "|".join(sorted(features))
token[field_name] = field_value
elif field_name == "head":
pass
elif field_name == "lemma":
prediction = predictions[field_name][idx]
word_chars = []
......@@ -191,6 +193,13 @@ class SemanticMultitaskPredictor(predictor.Predictor):
else:
raise NotImplementedError(f"Unknown field name {field_name}!")
if "enhanced_head" in predictions and predictions["enhanced_head"]:
import combo.utils.graph as graph
tree = graph.sdp_to_dag_deps(arc_scores=predictions["enhanced_head"],
rel_scores=predictions["enhanced_deprel"],
tree=tree,
root_label="ROOT")
return tree, predictions["sentence_embedding"]
@classmethod
......
"""Based on https://github.com/emorynlp/iwpt-shared-task-2020."""
import numpy as np
from conllu import TokenList
def sdp_to_dag_deps(arc_scores, rel_scores, tree: TokenList, root_label):
# adding ROOT
tree_tokens = tree.tokens
tree_heads = [0] + [t["head"] for t in tree_tokens]
graph = adjust_root_score_then_add_secondary_arcs(arc_scores, rel_scores, tree_heads,
root_label)
for i, (t, g) in enumerate(zip(tree_heads, graph)):
if not i:
continue
rels = [x[1] for x in g]
heads = [x[0] for x in g]
head = tree_tokens[i - 1]["head"]
index = heads.index(head)
deprel = tree_tokens[i - 1]["deprel"]
deprel = deprel.split('>')[-1]
# TODO is this necessary?
if len(heads) >= 2:
heads.pop(index)
rels.pop(index)
deps = '|'.join(f'{h}:{r}' for h, r in zip(heads, rels))
tree_tokens[i - 1]["deps"] = deps
tree_tokens[i - 1]["deprel"] = deprel
return tree
def adjust_root_score_then_add_secondary_arcs(arc_scores, rel_labels, tree_heads, root_idx):
if len(arc_scores) != len(tree_heads):
arc_scores = arc_scores[:len(tree_heads), :len(tree_heads)]
rel_labels = rel_labels[:len(tree_heads), :len(tree_heads)]
parse_preds = arc_scores > 0
parse_preds[:, 0] = False  # do not add extra arcs headed by ROOT; ROOT attachment comes from the tree
# rel_labels[:, :, root_idx] = -float('inf')
return add_secondary_arcs(arc_scores, rel_labels, tree_heads, root_idx, parse_preds)
def add_secondary_arcs(arc_scores, rel_labels, tree_heads, root_label, parse_preds):
if not isinstance(tree_heads, np.ndarray):
tree_heads = np.array(tree_heads)
dh = np.argwhere(parse_preds)
sdh = sorted([(arc_scores[x[0], x[1]], list(x)) for x in dh], reverse=True)
graph = [[] for _ in range(len(tree_heads))]
for d, h in enumerate(tree_heads):
if d:
graph[h].append(d)
for s, (d, h) in sdh:
if not d or not h or d in graph[h]:
continue
try:
path = next(_dfs(graph, d, h))
except StopIteration:
# no path from d to h
graph[h].append(d)
parse_graph = [[] for _ in range(len(tree_heads))]
num_root = 0
for h in range(len(tree_heads)):
for d in graph[h]:
rel = rel_labels[d, h]
if h == 0:
rel = root_label
assert num_root == 0
num_root += 1
parse_graph[d].append((h, rel))
parse_graph[d] = sorted(parse_graph[d])
return parse_graph
def _dfs(graph, start, end):
fringe = [(start, [])]
while fringe:
state, path = fringe.pop()
if path and state == end:
yield path
continue
for next_state in graph[state]:
if next_state in path:
continue
fringe.append((next_state, path + [next_state]))
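A hypothetical toy run of the cycle check used by add_secondary_arcs: a secondary arc head -> dependent is only added when _dfs finds no existing path from the dependent down to that head. The sketch assumes the combo package is importable; _dfs is a module-private helper used here purely for illustration.

from combo.utils.graph import _dfs

children = [[1], [2], []]                # adjacency list: 0 -> 1 -> 2 (head -> dependent)
print(next(_dfs(children, 0, 2)))        # [1, 2]: a path exists, so arc 2 -> 0 would close a cycle
print(next(_dfs(children, 2, 0), None))  # None: no path from 2 back to 0, so arc 0 -> 2 is safe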
......@@ -117,6 +117,8 @@ class AttachmentScores(metrics.Metric):
mask : `torch.BoolTensor`, optional (default = None).
A tensor of the same shape as `predicted_indices`.
"""
if gold_labels is None or gold_indices is None:
return
detached = self.detach_tensors(
predicted_indices, predicted_labels, gold_indices, gold_labels, mask
)
......@@ -198,6 +200,7 @@ class SemanticMetrics(metrics.Metric):
self.feats_score = SequenceBoolAccuracy(prod_last_dim=True)
self.lemma_score = SequenceBoolAccuracy(prod_last_dim=True)
self.attachment_scores = AttachmentScores()
self.enhanced_attachment_scores = AttachmentScores()
self.em_score = 0.0
def __call__( # type: ignore
......@@ -215,14 +218,25 @@ class SemanticMetrics(metrics.Metric):
gold_labels["head"],
gold_labels["deprel"],
mask)
self.enhanced_attachment_scores(predictions["enhanced_head"],
predictions["enhanced_deprel"],
gold_labels["enhanced_head"],
gold_labels["enhanced_deprel"],
mask=None)
enhanced_indices = (
self.enhanced_attachment_scores.correct_indices.reshape(mask.size(0), mask.size(1) + 1, -1)[:, 1:, 1:].sum(
-1).reshape(-1).bool()
if len(self.enhanced_attachment_scores.correct_indices.size()) > 0
else self.enhanced_attachment_scores.correct_indices
)
total = mask.sum()
correct_indices = (self.upos_score.correct_indices *
self.xpos_score.correct_indices *
self.semrel_score.correct_indices *
self.feats_score.correct_indices *
self.lemma_score.correct_indices *
self.attachment_scores.correct_indices
)
self.attachment_scores.correct_indices *
enhanced_indices)
total, correct_indices = self.detach_tensors(total, correct_indices)
self.em_score = (correct_indices.float().sum() / total).item()
......@@ -237,6 +251,8 @@ class SemanticMetrics(metrics.Metric):
"EM": self.em_score
}
metrics_dict.update(self.attachment_scores.get_metric(reset))
enhanced_metrics = {f"E{k}": v for k, v in self.enhanced_attachment_scores.get_metric(reset).items()}
metrics_dict.update(enhanced_metrics)
return metrics_dict
def reset(self) -> None:
......@@ -246,4 +262,5 @@ class SemanticMetrics(metrics.Metric):
self.lemma_score.reset()
self.feats_score.reset()
self.attachment_scores.reset()
self.enhanced_attachment_scores.reset()
self.em_score = 0.0
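A hypothetical toy illustration (shapes simplified to one boolean per token) of how the EM score combines the per-task correctness flags, now including flags derived from the enhanced graph scores: a token only counts as an exact match when every task's flag is set, and the denominator is the number of unpadded tokens.

import torch

upos_ok   = torch.tensor([True, True, False, True])
deprel_ok = torch.tensor([True, True, True, True])
graph_ok  = torch.tensor([True, False, True, True])  # per-token flags from the enhanced graph scores
mask      = torch.tensor([True, True, True, False])  # last position is padding

correct = upos_ok & deprel_ok & graph_ok & mask
em = correct.sum().float() / mask.sum()
print(em.item())  # ~0.33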
########################################################################################
# BASIC configuration #
########################################################################################
# Training data path, str
# Must be in CoNLL-U format (or its extended version with the semantic relation field).
# Accepts multiple paths concatenated with ',', e.g. "path1,path2"
local training_data_path = std.extVar("training_data_path");
# Validation data path, str
# Accepts multiple paths concatenated with ',', e.g. "path1,path2"
local validation_data_path = if std.length(std.extVar("validation_data_path")) > 0 then std.extVar("validation_data_path");
# Path to pretrained tokens, str or null
local pretrained_tokens = if std.length(std.extVar("pretrained_tokens")) > 0 then std.extVar("pretrained_tokens");
# Name of pretrained transformer model, str or null
local pretrained_transformer_name = if std.length(std.extVar("pretrained_transformer_name")) > 0 then std.extVar("pretrained_transformer_name");
# Learning rate value, float
local learning_rate = 0.002;
# Number of epochs, int
local num_epochs = std.parseInt(std.extVar("num_epochs"));
# Cuda device id, -1 for cpu, int
local cuda_device = std.parseInt(std.extVar("cuda_device"));
# Minimum number of words in batch, int
local word_batch_size = std.parseInt(std.extVar("word_batch_size"));
# Features used as input, list of str
# Choice "upostag", "xpostag", "lemma"
# Required "token", "char"
local features = std.split(std.extVar("features"), " ");
# Targets of the model, list of str
# Choice "feats", "lemma", "upostag", "xpostag", "semrel". "sent"
# Required "deprel", "head"
local targets = std.split(std.extVar("targets"), " ");
# Word embedding dimension, int
# If pretrained_tokens is not null, it must match the dimensionality of the provided embeddings
local embedding_dim = std.parseInt(std.extVar("embedding_dim"));
# Dropout rate on predictors, float
# All of the models on top of the encoder use this dropout
local predictors_dropout = 0.25;
# Xpostag embedding dimension, int
# (discarded if xpostag not in features)
local xpostag_dim = 32;
# Upostag embedding dimension, int
# (discarded if upostag not in features)
local upostag_dim = 32;
# Feats embedding dimension, int
# (discarded if feats not in features)
local feats_dim = 32;
# Lemma embedding dimension, int
# (discarded if lemma not in features)
local lemma_char_dim = 64;
# Character embedding dim, int
local char_dim = 64;
# Word embedding projection dim, int
local projected_embedding_dim = 100;
# Loss weights, dict[str, int]
local loss_weights = {
xpostag: 0.05,
upostag: 0.05,
lemma: 0.05,
feats: 0.2,
deprel: 0.8,
head: 0.2,
semrel: 0.05,
enhanced_head: 0.2,
enhanced_deprel: 0.8,
};
# Encoder hidden size, int
local hidden_size = 512;
# Number of layers in the encoder, int
local num_layers = 2;
# Cycle loss iterations, int
local cycle_loss_n = 0;
# Maximum length of the word, int
# Shorter words are padded, longer ones are truncated
local word_length = 30;
# Whether to use tensorboard, bool
local use_tensorboard = if std.extVar("use_tensorboard") == "True" then true else false;
# Path for tensorboard metrics, str
local metrics_dir = "./runs";
# Helper functions
local in_features(name) = !(std.length(std.find(name, features)) == 0);
local in_targets(name) = !(std.length(std.find(name, targets)) == 0);
local use_transformer = pretrained_transformer_name != null;
# Verify some configuration requirements
assert in_features("token"): "Key 'token' must be in features!";
assert in_features("char"): "Key 'char' must be in features!";
assert in_targets("deprel"): "Key 'deprel' must be in targets!";
assert in_targets("head"): "Key 'head' must be in targets!";
assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't use pretrained tokens and pretrained transformer at the same time!";
########################################################################################
# ADVANCED configuration #
########################################################################################
# Detailed dataset, training, vocabulary and model configuration.
{
# Configuration type (default or finetuning), str
type: std.extVar('type'),
# Datasets used for vocab creation, list of str
# Choice "train", "valid"
datasets_for_vocab_creation: ['train'],
# Path to training data, str
train_data_path: training_data_path,
# Path to validation data, str
validation_data_path: validation_data_path,
# Dataset reader configuration (conllu format)
dataset_reader: {
type: "conllu",
features: features,
targets: targets,
# Whether data contains semantic relation field, bool
use_sem: if in_targets("semrel") then true else false,
token_indexers: {
token: if use_transformer then {
type: "pretrained_transformer_mismatched",
model_name: pretrained_transformer_name,
} else {
# SingleIdTokenIndexer, token as single int
type: "single_id",
},
upostag: {
type: "single_id",
namespace: "upostag",
feature_name: "pos_",
},
xpostag: {
type: "single_id",
namespace: "xpostag",
feature_name: "tag_",
},
lemma: {
type: "characters_const_padding",
character_tokenizer: {
start_tokens: ["__START__"],
end_tokens: ["__END__"],
},
# +2 for start and end token
min_padding_length: word_length + 2,
},
char: {
type: "characters_const_padding",
character_tokenizer: {
start_tokens: ["__START__"],
end_tokens: ["__END__"],
},
# +2 for start and end token
min_padding_length: word_length + 2,
},
feats: {
type: "feats_indexer",
},
},
lemma_indexers: {
char: {
type: "characters_const_padding",
namespace: "lemma_characters",
character_tokenizer: {
start_tokens: ["__START__"],
end_tokens: ["__END__"],
},
# +2 for start and end token
min_padding_length: word_length + 2,
},
},
},
# Data loader configuration
data_loader: {
batch_sampler: {
type: "token_count",
word_batch_size: word_batch_size,
},
},
# Vocabulary configuration
vocabulary: std.prune({
type: "from_instances_extended",
only_include_pretrained_words: true,
pretrained_files: {
tokens: pretrained_tokens,
},
oov_token: "_",
padding_token: "__PAD__",
non_padded_namespaces: ["head_labels"],
}),
model: std.prune({
type: "semantic_multitask",
text_field_embedder: {
type: "basic",
token_embedders: {
xpostag: if in_features("xpostag") then {
type: "embedding",
padding_index: 0,
embedding_dim: xpostag_dim,
vocab_namespace: "xpostag",
},
upostag: if in_features("upostag") then {
type: "embedding",
padding_index: 0,
embedding_dim: upostag_dim,
vocab_namespace: "upostag",
},
token: if use_transformer then {
type: "transformers_word_embeddings",
model_name: pretrained_transformer_name,
projection_dim: projected_embedding_dim,
} else {
type: "embeddings_projected",
embedding_dim: embedding_dim,
projection_layer: {
in_features: embedding_dim,
out_features: projected_embedding_dim,
dropout_rate: 0.25,
activation: "tanh"
},
vocab_namespace: "tokens",
pretrained_file: pretrained_tokens,
trainable: if pretrained_tokens == null then true else false,
},
char: {
type: "char_embeddings_from_config",
embedding_dim: char_dim,
dilated_cnn_encoder: {
input_dim: char_dim,
filters: [512, 256, char_dim],
kernel_size: [3, 3, 3],
stride: [1, 1, 1],
padding: [1, 2, 4],
dilation: [1, 2, 4],
activations: ["relu", "relu", "linear"],
},
},
lemma: if in_features("lemma") then {
type: "char_embeddings_from_config",
embedding_dim: lemma_char_dim,
dilated_cnn_encoder: {
input_dim: lemma_char_dim,
filters: [512, 256, lemma_char_dim],
kernel_size: [3, 3, 3],
stride: [1, 1, 1],
padding: [1, 2, 4],
dilation: [1, 2, 4],
activations: ["relu", "relu", "linear"],
},
},
feats: if in_features("feats") then {
type: "feats_embedding",
padding_index: 0,
embedding_dim: feats_dim,
vocab_namespace: "feats",
},
},
},
loss_weights: loss_weights,
seq_encoder: {
type: "combo_encoder",
layer_dropout_probability: 0.33,
stacked_bilstm: {
input_size:
(char_dim + projected_embedding_dim +
(if in_features('xpostag') then xpostag_dim else 0) +
(if in_features('lemma') then lemma_char_dim else 0) +
(if in_features('upostag') then upostag_dim else 0) +
(if in_features('feats') then feats_dim else 0)),
hidden_size: hidden_size,
num_layers: num_layers,
recurrent_dropout_probability: 0.33,
layer_dropout_probability: 0.33
},
},
dependency_relation: {
type: "combo_dependency_parsing_from_vocab",
vocab_namespace: 'deprel_labels',
head_predictor: {
local projection_dim = 512,
cycle_loss_n: cycle_loss_n,
head_projection_layer: {
in_features: hidden_size * 2,
out_features: projection_dim,
activation: "tanh",
},
dependency_projection_layer: {
in_features: hidden_size * 2,
out_features: projection_dim,
activation: "tanh",
},
},
local projection_dim = 128,
head_projection_layer: {
in_features: hidden_size * 2,
out_features: projection_dim,
dropout_rate: predictors_dropout,
activation: "tanh"
},
dependency_projection_layer: {
in_features: hidden_size * 2,
out_features: projection_dim,
dropout_rate: predictors_dropout,
activation: "tanh"
},
},
enhanced_dependency_relation: if in_targets("deps") then {
type: "combo_graph_dependency_parsing_from_vocab",
vocab_namespace: 'deprel_labels',
head_predictor: {
local projection_dim = 512,
cycle_loss_n: cycle_loss_n,
head_projection_layer: {
in_features: hidden_size * 2,
out_features: projection_dim,
activation: "tanh",
},
dependency_projection_layer: {
in_features: hidden_size * 2,
out_features: projection_dim,
activation: "tanh",
},
},
local projection_dim = 128,
head_projection_layer: {
in_features: hidden_size * 2,
out_features: projection_dim,
dropout_rate: predictors_dropout,
activation: "tanh"
},
dependency_projection_layer: {
in_features: hidden_size * 2,
out_features: projection_dim,
dropout_rate: predictors_dropout,
activation: "tanh"
},
},
morphological_feat: if in_targets("feats") then {
type: "combo_morpho_from_vocab",
vocab_namespace: "feats_labels",
input_dim: hidden_size * 2,
hidden_dims: [128],
activations: ["tanh", "linear"],
dropout: [predictors_dropout, 0.0],
num_layers: 2,
},
lemmatizer: if in_targets("lemma") then {
type: "combo_lemma_predictor_from_vocab",
char_vocab_namespace: "token_characters",
lemma_vocab_namespace: "lemma_characters",
embedding_dim: 256,
input_projection_layer: {
in_features: hidden_size * 2,
out_features: 32,
dropout_rate: predictors_dropout,
activation: "tanh"
},
filters: [256, 256, 256],
kernel_size: [3, 3, 3, 1],
stride: [1, 1, 1, 1],
padding: [1, 2, 4, 0],
dilation: [1, 2, 4, 1],
activations: ["relu", "relu", "relu", "linear"],
},
upos_tagger: if in_targets("upostag") then {
input_dim: hidden_size * 2,
hidden_dims: [64],
activations: ["tanh", "linear"],
dropout: [predictors_dropout, 0.0],
num_layers: 2,
vocab_namespace: "upostag_labels"
},
xpos_tagger: if in_targets("xpostag") then {
input_dim: hidden_size * 2,
hidden_dims: [128],
activations: ["tanh", "linear"],
dropout: [predictors_dropout, 0.0],
num_layers: 2,
vocab_namespace: "xpostag_labels"
},
semantic_relation: if in_targets("semrel") then {
input_dim: hidden_size * 2,
hidden_dims: [64],
activations: ["tanh", "linear"],
dropout: [predictors_dropout, 0.0],
num_layers: 2,
vocab_namespace: "semrel_labels"
},
regularizer: {
regexes: [
[".*conv1d.*", {type: "l2", alpha: 1e-6}],
[".*forward.*", {type: "l2", alpha: 1e-6}],
[".*backward.*", {type: "l2", alpha: 1e-6}],
[".*char_embed.*", {type: "l2", alpha: 1e-5}],
],
},
}),
trainer: std.prune({
checkpointer: {
type: "finishing_only_checkpointer",
},
type: "gradient_descent_validate_n",
cuda_device: cuda_device,
grad_clipping: 5.0,
num_epochs: num_epochs,
optimizer: {
type: "adam",
lr: learning_rate,
betas: [0.9, 0.9],
},
patience: 1, # it will be overwritten by the callback
epoch_callbacks: [
{ type: "transfer_patience" },
],
learning_rate_scheduler: {
type: "combo_scheduler",
},
tensorboard_writer: if use_tensorboard then {
serialization_dir: metrics_dir,
should_log_learning_rate: false,
should_log_parameter_statistics: false,
summary_interval: 100,
},
validation_metric: "+EM",
}),
}
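A minimal sketch of feeding this template its external variables from Python, assuming the jsonnet bindings (the _jsonnet package) are installed and the template is saved as config.template.jsonnet (hypothetical path). Every std.extVar above must be supplied as a string; adding "deps" to targets switches on the enhanced_dependency_relation component.

import json
import _jsonnet

config = json.loads(_jsonnet.evaluate_file(
    "config.template.jsonnet",
    ext_vars={
        "training_data_path": "./tests/fixtures/example.conllu",
        "validation_data_path": "",
        "pretrained_tokens": "",
        "pretrained_transformer_name": "",
        "features": "token char",
        "targets": "head deprel deps",
        "embedding_dim": "300",
        "num_epochs": "1",
        "cuda_device": "-1",
        "word_batch_size": "2500",
        "use_tensorboard": "False",
        "type": "default",
    },
))
print(config["model"]["type"])  # "semantic_multitask"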
......@@ -3,7 +3,7 @@ from setuptools import find_packages, setup
REQUIREMENTS = [
'absl-py==0.9.0',
'allennlp==1.2.0',
'allennlp==1.2.1',
'conllu==2.3.2',
'dataclasses;python_version<"3.7"',
'dataclasses-json==0.5.2',
......
......@@ -4,3 +4,10 @@
2 Sentence verylonglemmawhichmustbetruncatedbythesystemto30 NOUN nom Number=Sing 0 root _ _
3 . . PUNCT . _ 1 punct _ _
# sent_id = test-s1
# text = Easy sentence.
1 Verylongwordwhichmustbetruncatedbythesystemto30 easy ADJ adj AdpType=Prep|Adp 2 amod _ _
2 Sentence verylonglemmawhichmustbetruncatedbythesystemto30 NOUN nom Number=Sing 0 root _ _
3 . . PUNCT . _ 1 punct 2:mod _
4 . . PUNCT . _ 1 punct 2:xmod _
import unittest
import combo.utils.graph as graph
import conllu
import numpy as np
class GraphTest(unittest.TestCase):
def test_adding_empty_graph_with_the_same_labels(self):
tree = conllu.TokenList(
tokens=[
{"head": 2, "deprel": "ROOT", "form": "word1"},
{"head": 3, "deprel": "yes", "form": "word2"},
{"head": 1, "deprel": "yes", "form": "word3"},
]
)
empty_graph = np.zeros((4, 4))
graph_labels = np.array([
["no", "no", "no", "no"],
["no", "no", "ROOT", "no"],
["no", "no", "no", "yes"],
["no", "yes", "no", "no"],
])
root_label = "ROOT"
expected_deps = ["2:ROOT", "3:yes", "1:yes"]
# when
tree = graph.sdp_to_dag_deps(empty_graph, graph_labels, tree, root_label)
actual_deps = [t["deps"] for t in tree.tokens]
# then
self.assertEqual(actual_deps, expected_deps)
def test_adding_empty_graph_with_different_labels(self):
tree = conllu.TokenList(
tokens=[
{"head": 2, "deprel": "ROOT", "form": "word1"},
{"head": 3, "deprel": "tree_label", "form": "word2"},
{"head": 1, "deprel": "tree_label", "form": "word3"},
]
)
empty_graph = np.zeros((4, 4))
graph_labels = np.array([
["no", "no", "no", "no"],
["no", "no", "ROOT", "no"],
["no", "no", "no", "graph_label"],
["no", "graph_label", "no", "no"],
])
root_label = "ROOT"
expected_deps = ["2:ROOT", "3:graph_label", "1:graph_label"]
# when
tree = graph.sdp_to_dag_deps(empty_graph, graph_labels, tree, root_label)
actual_deps = [t["deps"] for t in tree.tokens]
# then
self.assertEqual(actual_deps, expected_deps)
def test_extending_tree_with_graph(self):
# given
tree = conllu.TokenList(
tokens=[
{"head": 0, "deprel": "ROOT", "form": "word1"},
{"head": 1, "deprel": "tree_label", "form": "word2"},
{"head": 2, "deprel": "tree_label", "form": "word3"},
]
)
arc_scores = np.array([
[0, 0, 0, 0],
[1, 0, 0, 0],
[0, 1, 0, 0],
[0, 1, 1, 0],
])
graph_labels = np.array([
["no", "no", "no", "no"],
["ROOT", "no", "no", "no"],
["no", "tree_label", "no", "no"],
["no", "graph_label", "tree_label", "no"],
])
root_label = "ROOT"
expected_deps = ["0:ROOT", "1:tree_label", "1:graph_label"]
# when
tree = graph.sdp_to_dag_deps(arc_scores, graph_labels, tree, root_label)
actual_deps = [t["deps"] for t in tree.tokens]
# then
self.assertEqual(actual_deps, expected_deps)
\ No newline at end of file
......@@ -27,12 +27,16 @@ class SemanticMetricsTest(unittest.TestCase):
self.semrel, self.semrel_l = (("semrel", x) for x in [pred, gold])
self.head, self.head_l = (("head", x) for x in [pred, gold])
self.deprel, self.deprel_l = (("deprel", x) for x in [pred, gold])
# TODO(mklimasz) Add examples with correct dimension (with ROOT token)
self.enhanced_head, self.enhanced_head_l = (("enhanced_head", x) for x in [None, None])
self.enhanced_deprel, self.enhanced_deprel_l = (("enhanced_deprel", x) for x in [None, None])
self.feats, self.feats_l = (("feats", x) for x in [pred_seq, gold_seq])
self.lemma, self.lemma_l = (("lemma", x) for x in [pred_seq, gold_seq])
self.predictions = dict(
[self.upostag, self.xpostag, self.semrel, self.feats, self.lemma, self.head, self.deprel])
[self.upostag, self.xpostag, self.semrel, self.feats, self.lemma, self.head, self.deprel,
self.enhanced_head, self.enhanced_deprel])
self.gold_labels = dict([self.upostag_l, self.xpostag_l, self.semrel_l, self.feats_l, self.lemma_l, self.head_l,
self.deprel_l])
self.deprel_l, self.enhanced_head_l, self.enhanced_deprel_l])
self.eps = 1e-6
def test_every_prediction_correct(self):
......