Skip to content
Snippets Groups Projects
Commit 1f8b632e authored by Łukasz Pszenny's avatar Łukasz Pszenny
Browse files

Adding sentence attributes:

- relation_distribution - probabilities of an arc between each dependent and head
- relation_label_distribution - probabilities for each label in the sentence
parent 5cc01cd5
1 merge request: !45 "Adding sentence attributes"
Pipeline #6514 passed with stage
in 7 minutes and 59 seconds
......@@ -28,12 +28,16 @@ class Token:
class Sentence:
    """A parsed sentence: its tokens plus sentence-level model outputs.

    NOTE(review): fields use ``dataclasses.field``, so this class is
    presumably decorated with ``@dataclass`` just above this excerpt —
    confirm against the full file.
    """

    tokens: List[Token] = field(default_factory=list)
    # Sentence-level embedding vector; repr=False keeps large vectors out of repr().
    sentence_embedding: List[float] = field(default_factory=list, repr=False)
    # Per the commit message: probabilities of an arc between dependent and head.
    relation_distribution: List[float] = field(default_factory=list, repr=False)
    # Per the commit message: probabilities for each dependency label in the sentence.
    relation_label_distribution: List[float] = field(default_factory=list, repr=False)
    # Ordered so serialized metadata keeps its insertion order.
    metadata: Dict[str, Any] = field(default_factory=collections.OrderedDict)

    def to_json(self):
        """Return the sentence serialized as a JSON string.

        Tokens are converted via ``dataclasses.asdict``; embeddings,
        both relation distributions, and metadata are included as-is.
        """
        return json.dumps({
            "tokens": [dataclasses.asdict(t) for t in self.tokens],
            "sentence_embedding": self.sentence_embedding,
            "relation_distribution": self.relation_distribution,
            "relation_label_distribution": self.relation_label_distribution,
            "metadata": self.metadata,
        })
......@@ -79,7 +83,12 @@ def tokens2conllu(tokens: List[str]) -> conllu.TokenList:
def conllu2sentence(conllu_sentence: conllu.TokenList,
sentence_embedding=None, embeddings=None) -> Sentence:
sentence_embedding=None, embeddings=None, relation_distribution=None,
relation_label_distribution=None) -> Sentence:
if relation_distribution is None:
relation_distribution = []
if relation_label_distribution is None:
relation_label_distribution = []
if embeddings is None:
embeddings = {}
if sentence_embedding is None:
......@@ -94,5 +103,7 @@ def conllu2sentence(conllu_sentence: conllu.TokenList,
return Sentence(
tokens=tokens,
sentence_embedding=sentence_embedding,
relation_distribution=relation_distribution,
relation_label_distribution=relation_label_distribution,
metadata=conllu_sentence.metadata
)
......@@ -134,6 +134,8 @@ class ComboModel(allen_models.Model):
"semrel_token_embedding": semrel_output["embedding"],
"feats_token_embedding": morpho_output["embedding"],
"deprel_token_embedding": parser_output["embedding"],
"deprel_tree_distribution": parser_output["deprel_tree_distribution"],
"deprel_label_distribution": parser_output["deprel_label_distribution"]
}
if "rel_probability" in enhanced_parser_output:
......
......@@ -41,7 +41,7 @@ class HeadPredictionModel(base.Predictor):
# Adding non existing in mask ROOT to lengths
lengths = mask.data.sum(dim=1).long().cpu().numpy() + 1
for idx, length in enumerate(lengths):
probs = x[idx, :].softmax(dim=-1).cpu().numpy()
probs = x[idx, :].softmax(dim=-1).cpu().numpy()  # this is the matrix we are looking for
# We do not want any word to be parent of the root node (ROOT, 0).
# Also setting it to -1 instead of 0 fixes edge case where softmax made all
......@@ -154,6 +154,9 @@ class DependencyRelationModel(base.Predictor):
relation_prediction = self.relation_prediction_layer(dep_rel_pred)
output = head_output
output["embedding"] = dep_rel_pred
#import pdb;pdb.set_trace()
output["deprel_label_distribution"] = F.softmax(relation_prediction[:, 1:, 1:], dim=-1)
output["deprel_tree_distribution"] = head_pred_soft
if self.training:
output["prediction"] = (relation_prediction.argmax(-1)[:, 1:], head_output["prediction"])
......
......@@ -88,8 +88,8 @@ class COMBO(predictor.Predictor):
sentences = []
predictions = super().predict_batch_instance(instances)
for prediction, instance in zip(predictions, instances):
tree, sentence_embedding, embeddings = self._predictions_as_tree(prediction, instance)
sentence = conllu2sentence(tree, sentence_embedding, embeddings)
tree, sentence_embedding, embeddings, relation_distribution, relation_label_distribution = self._predictions_as_tree(prediction, instance)
sentence = conllu2sentence(tree, sentence_embedding, embeddings, relation_distribution, relation_label_distribution)
sentences.append(sentence)
return sentences
......@@ -102,8 +102,8 @@ class COMBO(predictor.Predictor):
@overrides
def predict_instance(self, instance: allen_data.Instance, serialize: bool = True) -> data.Sentence:
predictions = super().predict_instance(instance)
tree, sentence_embedding, embeddings = self._predictions_as_tree(predictions, instance, )
return conllu2sentence(tree, sentence_embedding, embeddings)
tree, sentence_embedding, embeddings, relation_distribution, relation_label_distribution = self._predictions_as_tree(predictions, instance, )
return conllu2sentence(tree, sentence_embedding, embeddings, relation_distribution, relation_label_distribution)
@overrides
def predict_json(self, inputs: common.JsonDict) -> data.Sentence:
......@@ -148,9 +148,15 @@ class COMBO(predictor.Predictor):
field_names = instance.fields["metadata"]["field_names"]
tree_tokens = [t for t in tree if isinstance(t["id"], int)]
embeddings = {t["id"]: {} for t in tree}
deprel_tree_distribution = None
deprel_label_distribution = None
for field_name in field_names:
if field_name not in predictions:
continue
if field_name == "deprel":
sentence_length = len(tree_tokens)
deprel_tree_distribution = np.matrix(predictions["deprel_tree_distribution"])[:sentence_length+1,:sentence_length+1]
deprel_label_distribution = np.matrix(predictions["deprel_label_distribution"])[:sentence_length,:]
field_predictions = predictions[field_name]
for idx, token in enumerate(tree_tokens):
if field_name in {"xpostag", "upostag", "semrel", "deprel"}:
......@@ -224,7 +230,8 @@ class COMBO(predictor.Predictor):
empty_tokens = graph.restore_collapse_edges(tree_tokens)
tree.tokens.extend(empty_tokens)
return tree, predictions["sentence_embedding"], embeddings
return tree, predictions["sentence_embedding"], embeddings, \
deprel_tree_distribution, deprel_label_distribution
@classmethod
def with_spacy_tokenizer(cls, model: models.Model,
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment