diff --git a/README.md b/README.md index 19847b8e5df935feb36bb9e5ec1e89cc3f1d35ed..a9c21135005d9abf50f2234bedca817b7d180327 100644 --- a/README.md +++ b/README.md @@ -10,19 +10,24 @@ </p> ## Quick start -Clone this repository and install COMBO (we suggest using virtualenv/conda with Python 3.6+): +Clone this repository and install COMBO (we suggest creating a virtualenv/conda environment with Python 3.6+, as a bundle of required packages will be installed): ```bash git clone https://gitlab.clarin-pl.eu/syntactic-tools/clarinbiz/combo.git cd combo python setup.py develop ``` -Run the following lines in your Python console to make predictions with a pre-trained model: +Run the following commands in your Python console to make predictions with a pre-trained model: ```python -import combo.predict as predict +from combo.predict import COMBO -nlp = predict.SemanticMultitaskPredictor.from_pretrained("polish-herbert-base") -sentence = nlp("Moje zdanie.") -print(sentence.tokens) +nlp = COMBO.from_pretrained("polish-herbert-base") +sentence = nlp("COVID-19 to ostra choroba zakaźna układu oddechowego wywołana zakażeniem wirusem SARS-CoV-2.") +``` +Predictions are accessible as a list of token attributes: +```python +print("{:5} {:15} {:15} {:10} {:10} {:10}".format('ID', 'TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL')) +for token in sentence.tokens: + print("{:5} {:15} {:15} {:10} {:10} {:10}".format(str(token.id), token.token, token.lemma, token.upostag, str(token.head), token.deprel)) ``` ## Details @@ -31,4 +36,3 @@ print(sentence.tokens) - [**Pre-trained models**](docs/models.md) - [**Training**](docs/training.md) - [**Prediction**](docs/prediction.md) - diff --git a/combo/data/api.py b/combo/data/api.py index 10a3a727c9220601ebf243752a7e605e127a1774..7d44917ecc42a555be3c20e8500f595b8ee1edf1 100644 --- a/combo/data/api.py +++ b/combo/data/api.py @@ -1,13 +1,13 @@ import collections +import dataclasses +import json from dataclasses import dataclass, field from typing import Optional, List, Dict, Any, Union, Tuple import conllu -from dataclasses_json import dataclass_json from overrides import overrides -@dataclass_json @dataclass class Token: id: Optional[Union[int, Tuple]] = None @@ -23,13 +23,19 @@ class Token: semrel: Optional[str] = None -@dataclass_json @dataclass class Sentence: tokens: List[Token] = field(default_factory=list) sentence_embedding: List[float] = field(default_factory=list) metadata: Dict[str, Any] = field(default_factory=collections.OrderedDict) + def to_json(self): + return json.dumps({ + "tokens": [dataclasses.asdict(t) for t in self.tokens], + "sentence_embedding": self.sentence_embedding, + "metadata": self.metadata, + }) + class _TokenList(conllu.TokenList): @@ -41,7 +47,7 @@ class _TokenList(conllu.TokenList): def sentence2conllu(sentence: Sentence, keep_semrel: bool = True) -> conllu.TokenList: tokens = [] for token in sentence.tokens: - token_dict = collections.OrderedDict(token.to_dict()) + token_dict = collections.OrderedDict(dataclasses.asdict(token)) # Remove semrel to have default conllu format. 
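For context on the `dataclasses_json` removal in `combo/data/api.py` above: serialization now relies only on the standard library (`dataclasses.asdict` plus `json.dumps`). A minimal, self-contained sketch of the same pattern; the simplified `Token` and `Sentence` below are illustrative stand-ins, not the actual COMBO classes:
```python
import dataclasses
import json
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Token:
    id: Optional[int] = None
    token: Optional[str] = None
    lemma: Optional[str] = None


@dataclass
class Sentence:
    tokens: List[Token] = field(default_factory=list)

    def to_json(self) -> str:
        # dataclasses.asdict recurses into nested dataclasses,
        # so no third-party serialization library is needed.
        return json.dumps({"tokens": [dataclasses.asdict(t) for t in self.tokens]})


print(Sentence(tokens=[Token(1, "Moje", "mój")]).to_json())
```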
if not keep_semrel: del token_dict["semrel"] @@ -50,6 +56,10 @@ def sentence2conllu(sentence: Sentence, keep_semrel: bool = True) -> conllu.Toke for t in tokens: if type(t["id"]) == list: t["id"] = tuple(t["id"]) + if t["deps"]: + for dep in t["deps"]: + if len(dep) > 1 and type(dep[1]) == list: + dep[1] = tuple(dep[1]) return _TokenList(tokens=tokens, metadata=sentence.metadata) @@ -64,9 +74,18 @@ def tokens2conllu(tokens: List[str]) -> conllu.TokenList: def conllu2sentence(conllu_sentence: conllu.TokenList, - sentence_embedding: List[float]) -> Sentence: + sentence_embedding=None) -> Sentence: + if sentence_embedding is None: + sentence_embedding = [] + tokens = [] + for token in conllu_sentence.tokens: + tokens.append( + Token( + **token + ) + ) return Sentence( - tokens=[Token.from_dict(t) for t in conllu_sentence.tokens], + tokens=tokens, sentence_embedding=sentence_embedding, metadata=conllu_sentence.metadata ) diff --git a/combo/data/dataset.py b/combo/data/dataset.py index 459a755c7f71c40e449d0542bf9af21d05e1f2c9..48b68b14e592dc6f98e3f62e8b5c3bd23899cb4c 100644 --- a/combo/data/dataset.py +++ b/combo/data/dataset.py @@ -1,9 +1,11 @@ +import copy import logging -from typing import Union, List, Dict, Iterable, Optional, Any +from typing import Union, List, Dict, Iterable, Optional, Any, Tuple import conllu +import torch from allennlp import data as allen_data -from allennlp.common import checks +from allennlp.common import checks, util from allennlp.data import fields as allen_fields, vocabulary from conllu import parser from dataclasses import dataclass @@ -35,6 +37,9 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader): if "token" not in features and "char" not in features: raise checks.ConfigurationError("There must be at least one ('char' or 'token') text-based feature!") + if "deps" in targets and not ("head" in targets and "deprel" in targets): + raise checks.ConfigurationError("Add 'head' and 'deprel' to targets when using 'deps'!") + intersection = set(features).intersection(set(targets)) if len(intersection) != 0: raise checks.ConfigurationError( @@ -49,6 +54,8 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader): field_parsers = parser.DEFAULT_FIELD_PARSERS # Do not make it nullable field_parsers.pop("xpostag", None) + # Ignore parsing misc + field_parsers.pop("misc", None) if self.use_sem: fields = list(fields) fields.append("semrel") @@ -102,13 +109,46 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader): elif target_name == "feats": target_values = self._feat_values(tree_tokens) fields_[target_name] = fields.SequenceMultiLabelField(target_values, - self._feats_to_index_multi_label, + self._feats_indexer, + self._feats_as_tensor_wrapper, text_field, label_namespace="feats_labels") elif target_name == "head": target_values = [0 if v == "_" else int(v) for v in target_values] fields_[target_name] = allen_fields.SequenceLabelField(target_values, text_field, label_namespace=target_name + "_labels") + elif target_name == "deps": + # Graphs require adding ROOT (AdjacencyField uses sequence length from TextField). + text_field_deps = allen_fields.TextField([_Token("ROOT")] + copy.deepcopy(tokens), + self._token_indexers) + enhanced_heads: List[Tuple[int, int]] = [] + enhanced_deprels: List[str] = [] + for idx, t in enumerate(tree_tokens): + t_deps = t["deps"] + if t_deps and t_deps != "_": + for rel, head in t_deps: + # EmoryNLP skips the first edge, if there are two edges between the same + # nodes. 
Thanks to that one is in a tree and another in a graph. + # This snippet follows that approach. + if enhanced_heads and enhanced_heads[-1] == (idx, head): + enhanced_heads.pop() + enhanced_deprels.pop() + enhanced_heads.append((idx, head)) + enhanced_deprels.append(rel) + fields_["enhanced_heads"] = allen_fields.AdjacencyField( + indices=enhanced_heads, + sequence_field=text_field_deps, + label_namespace="enhanced_heads_labels", + padding_value=0, + ) + fields_["enhanced_deprels"] = allen_fields.AdjacencyField( + indices=enhanced_heads, + sequence_field=text_field_deps, + labels=enhanced_deprels, + # Label namespace matches regular tree parsing. + label_namespace="deprel_labels", + padding_value=0, + ) else: fields_[target_name] = allen_fields.SequenceLabelField(target_values, text_field, label_namespace=target_name + "_labels") @@ -128,7 +168,9 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader): token["feats"] = field # metadata - fields_["metadata"] = allen_fields.MetadataField({"input": tree, "field_names": self.fields}) + fields_["metadata"] = allen_fields.MetadataField({"input": tree, + "field_names": self.fields, + "tokens": tokens}) return allen_data.Instance(fields_) @@ -151,12 +193,26 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader): return features @staticmethod - def _feats_to_index_multi_label(vocab: allen_data.Vocabulary): + def _feats_as_tensor_wrapper(field: fields.SequenceMultiLabelField): + def as_tensor(padding_lengths): + desired_num_tokens = padding_lengths["num_tokens"] + assert len(field._indexed_multi_labels) > 0 + classes_count = len(field._indexed_multi_labels[0]) + default_value = [0.0] * classes_count + padded_tags = util.pad_sequence_to_length(field._indexed_multi_labels, desired_num_tokens, + lambda: default_value) + tensor = torch.LongTensor(padded_tags) + return tensor + + return as_tensor + + @staticmethod + def _feats_indexer(vocab: allen_data.Vocabulary): label_namespace = "feats_labels" vocab_size = vocab.get_vocab_size(label_namespace) slices = get_slices_if_not_provided(vocab) - def _m_from_n_ones_encoding(multi_label: List[str]) -> List[int]: + def _m_from_n_ones_encoding(multi_label: List[str], sentence_length: int) -> List[int]: one_hot_encoding = [0] * vocab_size for cat, cat_indices in slices.items(): if cat not in ["__PAD__", "_"]: diff --git a/combo/data/fields/sequence_multilabel_field.py b/combo/data/fields/sequence_multilabel_field.py index 4e98a148aee35e42af0b4828a031368fe0eafc12..b200580cf1edfba1e710b42bc19c4c4efdb0db4f 100644 --- a/combo/data/fields/sequence_multilabel_field.py +++ b/combo/data/fields/sequence_multilabel_field.py @@ -5,7 +5,7 @@ from typing import Set, List, Callable, Iterator, Union, Dict import torch from allennlp import data -from allennlp.common import checks, util +from allennlp.common import checks from allennlp.data import fields from overrides import overrides @@ -17,15 +17,16 @@ class SequenceMultiLabelField(data.Field[torch.Tensor]): A `SequenceMultiLabelField` is an extension of the :class:`MultiLabelField` that allows for multiple labels while keeping sequence dimension. - This field will get converted into a sequence of vectors of length equal to the vocabulary size with - M from N encoding for the labels (all zeros, and ones for the labels). + To allow configuration to different circumstances, class takes few delegates functions. 
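The delegate design described in the docstring above (and wired up through `_feats_indexer` and `_feats_as_tensor_wrapper` in `dataset.py`) can be illustrated without AllenNLP. A hedged sketch of the two nested callables; the helper names and the tiny vocabulary here are made up for illustration:
```python
from typing import Callable, Dict, List

import torch


def make_indexer(vocab: Dict[str, int]) -> Callable[[List[str], int], List[int]]:
    """Nested callable: given a vocabulary, return a per-token multi-hot encoder."""
    def encode(multi_label: List[str], sentence_length: int) -> List[int]:
        # sentence_length is part of the field's calling convention;
        # this simple encoder does not need it.
        one_hot = [0] * len(vocab)
        for label in multi_label:
            one_hot[vocab[label]] = 1
        return one_hot
    return encode


def make_as_tensor(indexed: List[List[int]]) -> Callable[[Dict[str, int]], torch.Tensor]:
    """Nested callable: given indexed labels, return a padding-aware tensoriser."""
    def as_tensor(padding_lengths: Dict[str, int]) -> torch.Tensor:
        num_tokens = padding_lengths["num_tokens"]
        width = len(indexed[0])
        padded = indexed + [[0] * width] * (num_tokens - len(indexed))
        return torch.LongTensor(padded)
    return as_tensor


vocab = {"Case=Nom": 0, "Number=Sing": 1, "Gender=Fem": 2}
indexer = make_indexer(vocab)
indexed = [indexer(["Case=Nom", "Number=Sing"], 2), indexer(["Gender=Fem"], 2)]
print(make_as_tensor(indexed)({"num_tokens": 4}))  # LongTensor of shape (4, 3)
```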
# Parameters multi_labels : `List[List[str]]` multi_label_indexer : `Callable[[data.Vocabulary], Callable[[List[str]], List[int]]]` - Nested callable which based on vocab creates mapper for multilabel field in the sequence from strings - to indexed, int values. + Nested callable which based on vocab and sequence length maps values of the fields in the sequence + from strings to indexed, int values. + as_tensor: `Callable[["SequenceMultiLabelField"], Callable[[Dict[str, int]], torch.Tensor]]` + Nested callable which based on the field itself, maps indexed data to a tensor. sequence_field : `SequenceField` A field containing the sequence that this `SequenceMultiLabelField` is labeling. Most often, this is a `TextField`, for tagging individual tokens in a sentence. @@ -43,7 +44,8 @@ class SequenceMultiLabelField(data.Field[torch.Tensor]): def __init__( self, multi_labels: List[List[str]], - multi_label_indexer: Callable[[data.Vocabulary], Callable[[List[str]], List[int]]], + multi_label_indexer: Callable[[data.Vocabulary], Callable[[List[str], int], List[int]]], + as_tensor: Callable[["SequenceMultiLabelField"], Callable[[Dict[str, int]], torch.Tensor]], sequence_field: fields.SequenceField, label_namespace: str = "labels", ) -> None: @@ -53,6 +55,7 @@ class SequenceMultiLabelField(data.Field[torch.Tensor]): self._label_namespace = label_namespace self._indexed_multi_labels = None self._maybe_warn_for_namespace(label_namespace) + self.as_tensor_wrapper = as_tensor(self) if len(multi_labels) != sequence_field.sequence_length(): raise checks.ConfigurationError( "Label length and sequence length " @@ -101,7 +104,7 @@ class SequenceMultiLabelField(data.Field[torch.Tensor]): indexed = [] for multi_label in self.multi_labels: - indexed.append(indexer(multi_label)) + indexed.append(indexer(multi_label, len(self.multi_labels))) self._indexed_multi_labels = indexed @overrides @@ -110,19 +113,13 @@ class SequenceMultiLabelField(data.Field[torch.Tensor]): @overrides def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor: - desired_num_tokens = padding_lengths["num_tokens"] - assert len(self._indexed_multi_labels) > 0 - classes_count = len(self._indexed_multi_labels[0]) - default_value = [0.0] * classes_count - padded_tags = util.pad_sequence_to_length(self._indexed_multi_labels, desired_num_tokens, lambda: default_value) - tensor = torch.LongTensor(padded_tags) - return tensor + return self.as_tensor_wrapper(padding_lengths) @overrides def empty_field(self) -> "SequenceMultiLabelField": - # The empty_list here is needed for mypy empty_list: List[List[str]] = [[]] sequence_label_field = SequenceMultiLabelField(empty_list, lambda x: lambda y: y, + lambda x: lambda y: y, self.sequence_field.empty_field()) sequence_label_field._indexed_labels = empty_list return sequence_label_field diff --git a/combo/main.py b/combo/main.py index 44ad091f8c5004bf9f4e70323ec828bf44288447..17c960ac7caa84513692841abb989955c7925721 100644 --- a/combo/main.py +++ b/combo/main.py @@ -18,7 +18,7 @@ from combo.utils import checks logger = logging.getLogger(__name__) _FEATURES = ["token", "char", "upostag", "xpostag", "lemma", "feats"] -_TARGETS = ["deprel", "feats", "head", "lemma", "upostag", "xpostag", "semrel", "sent"] +_TARGETS = ["deprel", "feats", "head", "lemma", "upostag", "xpostag", "semrel", "sent", "deps"] FLAGS = flags.FLAGS flags.DEFINE_enum(name="mode", default=None, enum_values=["train", "predict"], @@ -33,8 +33,10 @@ flags.DEFINE_string(name="output_file", default="output.log", # Training flags 
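The training flags defined just below also register absl aliases, so `--training_data` and `--training_data_path` (and likewise the validation pair) populate the same value. A standalone sketch of that mechanism with a trimmed-down flag set, not COMBO's full CLI:
```python
from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_list(name="training_data_path", default=[], help="Training data path(s)")
flags.DEFINE_alias(name="training_data", original_name="training_data_path")


def main(_):
    # Both spellings resolve to the same underlying flag value.
    print(FLAGS.training_data_path, FLAGS.training_data)


if __name__ == "__main__":
    app.run(main)
```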
flags.DEFINE_list(name="training_data_path", default="./tests/fixtures/example.conllu", help="Training data path(s)") +flags.DEFINE_alias(name="training_data", original_name="training_data_path") flags.DEFINE_list(name="validation_data_path", default="", help="Validation data path(s)") +flags.DEFINE_alias(name="validation_data", original_name="validation_data_path") flags.DEFINE_string(name="pretrained_tokens", default="", help="Pretrained tokens embeddings path") flags.DEFINE_integer(name="embedding_dim", default=300, @@ -134,7 +136,7 @@ def run(_): params = common.Params.from_file(FLAGS.config_path, ext_vars=_get_ext_vars())["dataset_reader"] params.pop("type") dataset_reader = dataset.UniversalDependenciesDatasetReader.from_params(params) - predictor = predict.SemanticMultitaskPredictor( + predictor = predict.COMBO( model=model, dataset_reader=dataset_reader ) diff --git a/combo/models/__init__.py b/combo/models/__init__.py index 5aa7b283c70af76a27d97f63783368bdbe4ffa3f..ec7a1380e1cfc80b0302806e46cca4e5fc2d3568 100644 --- a/combo/models/__init__.py +++ b/combo/models/__init__.py @@ -1,8 +1,9 @@ """Models module.""" from .base import FeedForwardPredictor +from .graph_parser import GraphDependencyRelationModel from .parser import DependencyRelationModel from .embeddings import CharacterBasedWordEmbeddings from .encoder import ComboEncoder from .lemma import LemmatizerModel -from .model import SemanticMultitaskModel +from .model import ComboModel from .morpho import MorphologicalFeatures diff --git a/combo/models/base.py b/combo/models/base.py index 10e9d371a1cb1665358819898817e4b454b9244c..a5cb5fe61f85a98f78d143a54695d01948aa8dda 100644 --- a/combo/models/base.py +++ b/combo/models/base.py @@ -27,11 +27,11 @@ class Linear(nn.Linear, common.FromParams): def __init__(self, in_features: int, out_features: int, - activation: Optional[allen_nn.Activation] = lambda x: x, + activation: Optional[allen_nn.Activation] = None, dropout_rate: Optional[float] = 0.0): super().__init__(in_features, out_features) - self.activation = activation - self.dropout = nn.Dropout(p=dropout_rate) if dropout_rate else lambda x: x + self.activation = activation if activation else self.identity + self.dropout = nn.Dropout(p=dropout_rate) if dropout_rate else self.identity def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: x = super().forward(x) @@ -41,6 +41,10 @@ class Linear(nn.Linear, common.FromParams): def get_output_dim(self) -> int: return self.out_features + @staticmethod + def identity(x): + return x + @Predictor.register("feedforward_predictor") @Predictor.register("feedforward_predictor_from_vocab", constructor="from_vocab") diff --git a/combo/models/embeddings.py b/combo/models/embeddings.py index 5cad95928dab03d8e5046eb1a281c07e9ffe33ff..6ad25590e3f29bcde42266b8ee9cc720787b4388 100644 --- a/combo/models/embeddings.py +++ b/combo/models/embeddings.py @@ -196,10 +196,10 @@ class FeatsTokenEmbedder(token_embedders.Embedding): def forward(self, tokens: torch.Tensor) -> torch.Tensor: # (batch_size, sentence_length, features_vocab_length) - mask = (tokens > 0).float() + mask = tokens.gt(0) # (batch_size, sentence_length, features_vocab_length, embedding_dim) x = super().forward(tokens) # (batch_size, sentence_length, embedding_dim) return x.sum(dim=-2) / ( - (mask.sum(dim=-1) + util.tiny_value_of_dtype(mask.dtype)).unsqueeze(dim=-1) + (mask.sum(dim=-1) + util.tiny_value_of_dtype(torch.float)).unsqueeze(dim=-1) ) diff --git a/combo/models/graph_parser.py b/combo/models/graph_parser.py new file mode 
100644 index 0000000000000000000000000000000000000000..edcdc2d0785dd73d91b3e79249d196ba55ec148c --- /dev/null +++ b/combo/models/graph_parser.py @@ -0,0 +1,188 @@ +"""Enhanced dependency parsing models.""" +from typing import Tuple, Dict, Optional, Union, List + +import numpy as np +import torch +import torch.nn.functional as F +from allennlp import data +from allennlp.nn import chu_liu_edmonds + +from combo.models import base, utils + + +class GraphHeadPredictionModel(base.Predictor): + """Head prediction model.""" + + def __init__(self, + head_projection_layer: base.Linear, + dependency_projection_layer: base.Linear, + cycle_loss_n: int = 0, + graph_weighting: float = 0.2): + super().__init__() + self.head_projection_layer = head_projection_layer + self.dependency_projection_layer = dependency_projection_layer + self.cycle_loss_n = cycle_loss_n + self.graph_weighting = graph_weighting + + def forward(self, + x: Union[torch.Tensor, List[torch.Tensor]], + labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, + mask: Optional[torch.BoolTensor] = None, + sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]: + if mask is None: + mask = x.new_ones(x.size()[-1]) + heads_labels = None + if labels is not None and labels[0] is not None: + heads_labels = labels + + head_arc_emb = self.head_projection_layer(x) + dep_arc_emb = self.dependency_projection_layer(x) + x = dep_arc_emb.bmm(head_arc_emb.transpose(2, 1)) + pred = x.sigmoid() > 0.5 + + output = { + "prediction": pred, + "probability": x + } + + if heads_labels is not None: + if sample_weights is None: + sample_weights = heads_labels.new_ones([mask.size(0)]) + output["loss"], output["cycle_loss"] = self._loss(x, heads_labels, mask, sample_weights) + + return output + + def _cycle_loss(self, pred: torch.Tensor): + BATCH_SIZE, _, _ = pred.size() + loss = pred.new_zeros(BATCH_SIZE) + # Index from 1: as using non __ROOT__ tokens + pred = pred.softmax(-1)[:, 1:, 1:] + x = pred + for i in range(self.cycle_loss_n): + loss += self._batch_trace(x) + + # Don't multiple on last iteration + if i < self.cycle_loss_n - 1: + x = x.bmm(pred) + + return loss + + @staticmethod + def _batch_trace(x: torch.Tensor) -> torch.Tensor: + assert len(x.size()) == 3 + BATCH_SIZE, N, M = x.size() + assert N == M + identity = x.new_tensor(torch.eye(N)) + identity = identity.reshape((1, N, N)) + batch_identity = identity.repeat(BATCH_SIZE, 1, 1) + return (x * batch_identity).sum((-1, -2)) + + def _loss(self, pred: torch.Tensor, labels: torch.Tensor, mask: torch.BoolTensor, + sample_weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + BATCH_SIZE, N, M = pred.size() + assert N == M + SENTENCE_LENGTH = N + + valid_positions = mask.sum() + + result = [] + true = labels + # Ignore first pred dimension as it is ROOT token prediction + for i in range(SENTENCE_LENGTH - 1): + pred_i = pred[:, i + 1, 1:].reshape(-1) + true_i = true[:, i + 1, 1:].reshape(-1) + mask_i = mask[:, i] + bce_loss = F.binary_cross_entropy_with_logits(pred_i, true_i, reduction="none").mean(-1) * mask_i + result.append(bce_loss) + cycle_loss = self._cycle_loss(pred) + loss = torch.stack(result).transpose(1, 0) * sample_weights.unsqueeze(-1) + return loss.sum() / valid_positions + cycle_loss.mean(), cycle_loss.mean() + + +@base.Predictor.register("combo_graph_dependency_parsing_from_vocab", constructor="from_vocab") +class GraphDependencyRelationModel(base.Predictor): + """Dependency relation parsing model.""" + + def __init__(self, + 
head_predictor: GraphHeadPredictionModel, + head_projection_layer: base.Linear, + dependency_projection_layer: base.Linear, + relation_prediction_layer: base.Linear): + super().__init__() + self.head_predictor = head_predictor + self.head_projection_layer = head_projection_layer + self.dependency_projection_layer = dependency_projection_layer + self.relation_prediction_layer = relation_prediction_layer + + def forward(self, + x: Union[torch.Tensor, List[torch.Tensor]], + mask: Optional[torch.BoolTensor] = None, + labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, + sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]: + relations_labels, head_labels, enhanced_heads_labels, enhanced_deprels_labels = None, None, None, None + if labels is not None and labels[0] is not None: + relations_labels, head_labels, enhanced_heads_labels = labels + + head_output = self.head_predictor(x, enhanced_heads_labels, mask, sample_weights) + head_pred = head_output["probability"] + BATCH_SIZE, LENGTH, _ = head_pred.size() + + head_rel_emb = self.head_projection_layer(x) + + dep_rel_emb = self.dependency_projection_layer(x) + + # All possible edges combinations for each batch + # Repeat interleave to have [emb1, emb1 ... (length times) ... emb1, emb2 ... ] + head_rel_pred = head_rel_emb.repeat_interleave(LENGTH, -2) + # Regular repeat to have all combinations [deprel1, deprel2, ... deprelL, deprel1 ...] + dep_rel_pred = dep_rel_emb.repeat(1, LENGTH, 1) + + # All possible edges combinations for each batch + dep_rel_pred = torch.cat((head_rel_pred, dep_rel_pred), dim=-1) + + relation_prediction = self.relation_prediction_layer(dep_rel_pred).reshape(BATCH_SIZE, LENGTH, LENGTH, -1) + output = head_output + + output["prediction"] = (relation_prediction.argmax(-1), head_output["prediction"]) + output["rel_probability"] = relation_prediction + + if labels is not None and labels[0] is not None: + if sample_weights is None: + sample_weights = labels.new_ones([mask.size(0)]) + loss = self._loss(relation_prediction, relations_labels, enhanced_heads_labels, mask, sample_weights) + output["loss"] = (loss, head_output["loss"]) + + return output + + @staticmethod + def _loss(pred: torch.Tensor, + true: torch.Tensor, + heads_true: torch.Tensor, + mask: torch.BoolTensor, + sample_weights: torch.Tensor) -> torch.Tensor: + correct_heads_mask = heads_true.long() == 1 + true = true[correct_heads_mask] + pred = pred[correct_heads_mask] + loss = F.cross_entropy(pred, true.long()) + return loss.sum() / pred.size(0) + + @classmethod + def from_vocab(cls, + vocab: data.Vocabulary, + vocab_namespace: str, + head_predictor: GraphHeadPredictionModel, + head_projection_layer: base.Linear, + dependency_projection_layer: base.Linear + ): + """Creates parser combining model configuration and vocabulary data.""" + assert vocab_namespace in vocab.get_namespaces() + relation_prediction_layer = base.Linear( + in_features=head_projection_layer.get_output_dim() + dependency_projection_layer.get_output_dim(), + out_features=vocab.get_vocab_size(vocab_namespace) + ) + return cls( + head_predictor=head_predictor, + head_projection_layer=head_projection_layer, + dependency_projection_layer=dependency_projection_layer, + relation_prediction_layer=relation_prediction_layer + ) diff --git a/combo/models/model.py b/combo/models/model.py index 77b43e3c1a95e09b15c310409af0090f097d47fa..9866bcb4fba41ed2506b2d33290e6cd0fe237d29 100644 --- a/combo/models/model.py +++ b/combo/models/model.py @@ -12,7 
+12,7 @@ from combo.utils import metrics @allen_models.Model.register("semantic_multitask") -class SemanticMultitaskModel(allen_models.Model): +class ComboModel(allen_models.Model): """Main COMBO model.""" def __init__(self, @@ -27,6 +27,7 @@ class SemanticMultitaskModel(allen_models.Model): semantic_relation: Optional[base.Predictor] = None, morphological_feat: Optional[base.Predictor] = None, dependency_relation: Optional[base.Predictor] = None, + enhanced_dependency_relation: Optional[base.Predictor] = None, regularizer: allen_nn.RegularizerApplicator = None) -> None: super().__init__(vocab, regularizer) self.text_field_embedder = text_field_embedder @@ -39,6 +40,7 @@ class SemanticMultitaskModel(allen_models.Model): self.semantic_relation = semantic_relation self.morphological_feat = morphological_feat self.dependency_relation = dependency_relation + self.enhanced_dependency_relation = enhanced_dependency_relation self._head_sentinel = torch.nn.Parameter(torch.randn([1, 1, self.seq_encoder.get_output_dim()])) self.scores = metrics.SemanticMetrics() self._partial_losses = None @@ -53,12 +55,16 @@ class SemanticMultitaskModel(allen_models.Model): feats: torch.Tensor = None, head: torch.Tensor = None, deprel: torch.Tensor = None, - semrel: torch.Tensor = None, ) -> Dict[str, torch.Tensor]: + semrel: torch.Tensor = None, + enhanced_heads: torch.Tensor = None, + enhanced_deprels: torch.Tensor = None) -> Dict[str, torch.Tensor]: # Prepare masks - char_mask: torch.BoolTensor = sentence["char"]["token_characters"] > 0 + char_mask = sentence["char"]["token_characters"].gt(0) word_mask = util.get_text_field_mask(sentence) + device = word_mask.device + # If enabled weight samples loss by log(sentence_length) sample_weights = word_mask.sum(-1).float().log() if self.use_sample_weight else None @@ -69,42 +75,49 @@ class SemanticMultitaskModel(allen_models.Model): # Concatenate the head sentinel (ROOT) onto the sentence representation. 
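The comment above introduces the ROOT-sentinel handling; after the refactor the model keeps both the plain `encoder_emb` (for the taggers) and a ROOT-prefixed copy (for the parsers). A small shape-only sketch of that step, with made-up tensor sizes rather than the model's real inputs:
```python
import torch

batch_size, sentence_length, encoding_dim = 2, 5, 8

# Stand-ins for the encoder output and the word mask.
encoder_emb = torch.randn(batch_size, sentence_length, encoding_dim)
word_mask = torch.ones(batch_size, sentence_length, dtype=torch.bool)

# A learned ROOT representation is expanded over the batch and prepended,
# so arc scorers can attach tokens to position 0 (the artificial ROOT).
head_sentinel = torch.nn.Parameter(torch.randn(1, 1, encoding_dim))
encoder_emb_with_root = torch.cat(
    [head_sentinel.expand(batch_size, 1, encoding_dim), encoder_emb], dim=1)
word_mask_with_root = torch.cat(
    [word_mask.new_ones(batch_size, 1), word_mask], dim=1)

# Taggers keep the (batch, length, dim) view; parsers get (batch, length + 1, dim).
print(encoder_emb.shape, encoder_emb_with_root.shape, word_mask_with_root.shape)
```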
head_sentinel = self._head_sentinel.expand(batch_size, 1, encoding_dim) - encoder_emb = torch.cat([head_sentinel, encoder_emb], 1) - word_mask = torch.cat([word_mask.new_ones((batch_size, 1)), word_mask], 1) + encoder_emb_with_root = torch.cat([head_sentinel, encoder_emb], 1) + word_mask_with_root = torch.cat([torch.ones((batch_size, 1), device=device), word_mask], 1) upos_output = self._optional(self.upos_tagger, - encoder_emb[:, 1:], - mask=word_mask[:, 1:], + encoder_emb, + mask=word_mask, labels=upostag, sample_weights=sample_weights) xpos_output = self._optional(self.xpos_tagger, - encoder_emb[:, 1:], - mask=word_mask[:, 1:], + encoder_emb, + mask=word_mask, labels=xpostag, sample_weights=sample_weights) semrel_output = self._optional(self.semantic_relation, - encoder_emb[:, 1:], - mask=word_mask[:, 1:], + encoder_emb, + mask=word_mask, labels=semrel, sample_weights=sample_weights) morpho_output = self._optional(self.morphological_feat, - encoder_emb[:, 1:], - mask=word_mask[:, 1:], + encoder_emb, + mask=word_mask, labels=feats, sample_weights=sample_weights) lemma_output = self._optional(self.lemmatizer, - (encoder_emb[:, 1:], sentence.get("char").get("token_characters") - if sentence.get("char") else None), - mask=word_mask[:, 1:], + (encoder_emb, sentence.get("char").get("token_characters") + if sentence.get("char") else None), + mask=word_mask, labels=lemma.get("char").get("token_characters") if lemma else None, sample_weights=sample_weights) parser_output = self._optional(self.dependency_relation, - encoder_emb, + encoder_emb_with_root, returns_tuple=True, - mask=word_mask, + mask=word_mask_with_root, labels=(deprel, head), sample_weights=sample_weights) + enhanced_parser_output = self._optional(self.enhanced_dependency_relation, + encoder_emb_with_root, + returns_tuple=True, + mask=word_mask_with_root, + labels=(enhanced_deprels, head, enhanced_heads), + sample_weights=sample_weights) relations_pred, head_pred = parser_output["prediction"] + enhanced_relations_pred, enhanced_head_pred = enhanced_parser_output["prediction"] output = { "upostag": upos_output["prediction"], "xpostag": xpos_output["prediction"], @@ -113,9 +126,14 @@ class SemanticMultitaskModel(allen_models.Model): "lemma": lemma_output["prediction"], "head": head_pred, "deprel": relations_pred, - "sentence_embedding": torch.max(encoder_emb[:, 1:], dim=1)[0], + "enhanced_head": enhanced_head_pred, + "enhanced_deprel": enhanced_relations_pred, + "sentence_embedding": torch.max(encoder_emb, dim=1)[0], } + if "rel_probability" in enhanced_parser_output: + output["enhanced_deprel_prob"] = enhanced_parser_output["rel_probability"] + if self._has_labels([upostag, xpostag, lemma, feats, head, deprel, semrel]): # Feats mapping @@ -134,9 +152,12 @@ class SemanticMultitaskModel(allen_models.Model): "lemma": lemma.get("char").get("token_characters") if lemma else None, "head": head, "deprel": deprel, + "enhanced_head": enhanced_heads, + "enhanced_deprel": enhanced_deprels, } - self.scores(output, labels, word_mask[:, 1:]) + self.scores(output, labels, word_mask) relations_loss, head_loss = parser_output["loss"] + enhanced_relations_loss, enhanced_head_loss = enhanced_parser_output["loss"] losses = { "upostag_loss": upos_output["loss"], "xpostag_loss": xpos_output["loss"], @@ -145,6 +166,8 @@ class SemanticMultitaskModel(allen_models.Model): "lemma_loss": lemma_output["loss"], "head_loss": head_loss, "deprel_loss": relations_loss, + "enhanced_head_loss": enhanced_head_loss, + "enhanced_deprel_loss": enhanced_relations_loss, # 
Cycle loss is only for the metrics purposes. "cycle_loss": parser_output.get("cycle_loss") } diff --git a/combo/models/parser.py b/combo/models/parser.py index 486b2481b96bf17bb19fd8557916f21dcb6c4584..dfb53ab8ded369b01eae4851dd1d7a9936c05bbe 100644 --- a/combo/models/parser.py +++ b/combo/models/parser.py @@ -115,11 +115,13 @@ class DependencyRelationModel(base.Predictor): """Dependency relation parsing model.""" def __init__(self, + root_idx: int, head_predictor: HeadPredictionModel, head_projection_layer: base.Linear, dependency_projection_layer: base.Linear, relation_prediction_layer: base.Linear): super().__init__() + self.root_idx = root_idx self.head_predictor = head_predictor self.head_projection_layer = head_projection_layer self.dependency_projection_layer = dependency_projection_layer @@ -130,6 +132,7 @@ class DependencyRelationModel(base.Predictor): mask: Optional[torch.BoolTensor] = None, labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]: + device = x.device if mask is not None: mask = mask[:, 1:] relations_labels, head_labels = None, None @@ -151,7 +154,23 @@ class DependencyRelationModel(base.Predictor): relation_prediction = self.relation_prediction_layer(dep_rel_pred) output = head_output - output["prediction"] = (relation_prediction.argmax(-1)[:, 1:], head_output["prediction"]) + if self.training: + output["prediction"] = (relation_prediction.argmax(-1)[:, 1:], head_output["prediction"]) + else: + # Mask root label whenever head is not 0. + relation_prediction_output = relation_prediction[:, 1:] + mask = (head_output["prediction"] == 0) + vocab_size = relation_prediction_output.size(-1) + root_idx = torch.tensor([self.root_idx], device=device) + relation_prediction_output[mask] = (relation_prediction_output + .masked_select(mask.unsqueeze(-1)) + .reshape(-1, vocab_size) + .index_fill(-1, root_idx, 10e10)) + relation_prediction_output[~mask] = (relation_prediction_output + .masked_select(~(mask.unsqueeze(-1))) + .reshape(-1, vocab_size) + .index_fill(-1, root_idx, -10e10)) + output["prediction"] = (relation_prediction_output.argmax(-1), head_output["prediction"]) if labels is not None and labels[0] is not None: if sample_weights is None: @@ -195,5 +214,6 @@ class DependencyRelationModel(base.Predictor): head_predictor=head_predictor, head_projection_layer=head_projection_layer, dependency_projection_layer=dependency_projection_layer, - relation_prediction_layer=relation_prediction_layer + relation_prediction_layer=relation_prediction_layer, + root_idx=vocab.get_token_index("root", vocab_namespace) ) diff --git a/combo/predict.py b/combo/predict.py index b6c7172c2477efdea4c5e11e0ea1575450602db0..21941d91d56170e7c552af2a3ac1af229816f76d 100644 --- a/combo/predict.py +++ b/combo/predict.py @@ -3,6 +3,7 @@ import os from typing import List, Union, Tuple import conllu +import numpy as np from allennlp import data as allen_data, common, models from allennlp.common import util from allennlp.data import tokenizers @@ -11,20 +12,20 @@ from overrides import overrides from combo import data from combo.data import sentence2conllu, tokens2conllu, conllu2sentence -from combo.utils import download +from combo.utils import download, graph logger = logging.getLogger(__name__) @predictor.Predictor.register("semantic-multitask-predictor") @predictor.Predictor.register("semantic-multitask-predictor-spacy", constructor="with_spacy_tokenizer") -class 
SemanticMultitaskPredictor(predictor.Predictor): +class COMBO(predictor.Predictor): def __init__(self, model: models.Model, dataset_reader: allen_data.DatasetReader, tokenizer: allen_data.Tokenizer = tokenizers.WhitespaceTokenizer(), - batch_size: int = 500, + batch_size: int = 32, line_to_conllu: bool = False) -> None: super().__init__(model, dataset_reader) self.batch_size = batch_size @@ -51,7 +52,7 @@ class SemanticMultitaskPredictor(predictor.Predictor): def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]): if isinstance(sentence, str): - return data.Sentence.from_dict(self.predict_json({"sentence": sentence})) + return self.predict_json({"sentence": sentence}) elif isinstance(sentence, list): if len(sentence) == 0: return [] @@ -154,6 +155,10 @@ class SemanticMultitaskPredictor(predictor.Predictor): token[field_name] = value elif field_name in ["head"]: token[field_name] = int(predictions[field_name][idx]) + elif field_name == "deps": + # Handled after every other decoding + continue + elif field_name in ["feats"]: slices = self._model.morphological_feat.slices features = [] @@ -168,11 +173,11 @@ class SemanticMultitaskPredictor(predictor.Predictor): if len(features) == 0: field_value = "_" else: - field_value = "|".join(sorted(features)) + lowercase_features = [f.lower() for f in features] + arg_indices = sorted(range(len(lowercase_features)), key=lowercase_features.__getitem__) + field_value = "|".join(np.array(features)[arg_indices].tolist()) token[field_name] = field_value - elif field_name == "head": - pass elif field_name == "lemma": prediction = predictions[field_name][idx] word_chars = [] @@ -191,6 +196,20 @@ class SemanticMultitaskPredictor(predictor.Predictor): else: raise NotImplementedError(f"Unknown field name {field_name}!") + if "enhanced_head" in predictions and predictions["enhanced_head"]: + # TODO off-by-one hotfix, refactor + h = np.array(predictions["enhanced_head"]) + h = np.concatenate((h[-1:], h[:-1])) + r = np.array(predictions["enhanced_deprel_prob"]) + r = np.concatenate((r[-1:], r[:-1])) + graph.sdp_to_dag_deps(arc_scores=h, + rel_scores=r, + tree_tokens=tree_tokens, + root_idx=self.vocab.get_token_index("root", "deprel_labels"), + vocab_index=self.vocab.get_index_to_token_vocabulary("deprel_labels")) + empty_tokens = graph.restore_collapse_edges(tree_tokens) + tree.tokens.extend(empty_tokens) + return tree, predictions["sentence_embedding"] @classmethod @@ -200,7 +219,7 @@ class SemanticMultitaskPredictor(predictor.Predictor): @classmethod def from_pretrained(cls, path: str, tokenizer=tokenizers.SpacyTokenizer(), - batch_size: int = 500, + batch_size: int = 32, cuda_device: int = -1): util.import_module_and_submodules("combo.commands") util.import_module_and_submodules("combo.models") diff --git a/combo/utils/graph.py b/combo/utils/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..651c14a7d79b7ea3c277b9466f5e050435a7a01b --- /dev/null +++ b/combo/utils/graph.py @@ -0,0 +1,115 @@ +"""Based on https://github.com/emorynlp/iwpt-shared-task-2020.""" +from typing import List + +import numpy as np + + +def sdp_to_dag_deps(arc_scores, rel_scores, tree_tokens: List, root_idx=0, vocab_index=None) -> None: + # adding ROOT + tree_heads = [0] + [t["head"] for t in tree_tokens] + graph = adjust_root_score_then_add_secondary_arcs(arc_scores, rel_scores, tree_heads, + root_idx) + for i, (t, g) in enumerate(zip(tree_heads, graph)): + if not i: + continue + rels = [vocab_index.get(x[1], "root") if vocab_index 
else x[1] for x in g] + heads = [x[0] for x in g] + head = tree_tokens[i - 1]["head"] + index = heads.index(head) + deprel = tree_tokens[i - 1]["deprel"] + deprel = deprel.split('>')[-1] + # TODO - Consider if there should be a condition, + # It doesn't seem to make any sense as DEPS should contain DEPREL + # (although sometimes with different/more detailed label) + # if len(heads) >= 2: + # heads.pop(index) + # rels.pop(index) + deps = '|'.join(f'{h}:{r}' for h, r in zip(heads, rels)) + tree_tokens[i - 1]["deps"] = deps + tree_tokens[i - 1]["deprel"] = deprel + return + + +def adjust_root_score_then_add_secondary_arcs(arc_scores, rel_scores, tree_heads, root_idx): + if len(arc_scores) != tree_heads: + arc_scores = arc_scores[:len(tree_heads)][:len(tree_heads)] + rel_scores = rel_scores[:len(tree_heads)][:len(tree_heads)] + # Self-loops aren't allowed, mask with 0. This is an in-place operation. + np.fill_diagonal(arc_scores, 0) + parse_preds = np.array(arc_scores) > 0 + parse_preds[:, 0] = False # set heads to False + rel_scores[:, :, root_idx] = -float('inf') + return add_secondary_arcs(arc_scores, rel_scores, tree_heads, root_idx, parse_preds) + + +def add_secondary_arcs(arc_scores, rel_scores, tree_heads, root_idx, parse_preds): + if not isinstance(tree_heads, np.ndarray): + tree_heads = np.array(tree_heads) + dh = np.argwhere(parse_preds) + sdh = sorted([(arc_scores[x[0]][x[1]], list(x)) for x in dh], reverse=True) + graph = [[] for _ in range(len(tree_heads))] + rel_pred = np.argmax(rel_scores, axis=-1) + for d, h in enumerate(tree_heads): + if d: + graph[h].append(d) + for s, (d, h) in sdh: + if not d or not h or d in graph[h]: + continue + try: + path = next(_dfs(graph, d, h)) + except StopIteration: + # no path from d to h + graph[h].append(d) + parse_graph = [[] for _ in range(len(tree_heads))] + num_root = 0 + for h in range(len(tree_heads)): + for d in graph[h]: + rel = rel_pred[d][h] + if h == 0: + rel = root_idx + assert num_root == 0 + num_root += 1 + parse_graph[d].append((h, rel)) + parse_graph[d] = sorted(parse_graph[d]) + return parse_graph + + +def _dfs(graph, start, end): + fringe = [(start, [])] + while fringe: + state, path = fringe.pop() + if path and state == end: + yield path + continue + for next_state in graph[state]: + if next_state in path: + continue + fringe.append((next_state, path + [next_state])) + + +def restore_collapse_edges(tree_tokens): + empty_tokens = [] + for token in tree_tokens: + deps = token["deps"].split("|") + for i, d in enumerate(deps): + if ">" in d: + # {head}:{empty_node_relation}>{current_node_relation} + # should map to + # For new, empty node: + # {head}:{empty_node_relation} + # For current node: + # {new_empty_node_id}:{current_node_relation} + # TODO consider where to put new_empty_node_id (currently at the end) + head, relation = d.split(':', 1) + ehead = f"{len(tree_tokens)}.{len(empty_tokens) + 1}" + empty_node_relation, current_node_relation = relation.split(">", 1) + deps[i] = f"{ehead}:{current_node_relation}" + empty_tokens.append( + { + "id": ehead, + "deps": f"{head}:{empty_node_relation}" + } + ) + deps = sorted([d.split(":", 1) for d in deps], key=lambda x: float(x[0])) + token["deps"] = "|".join([f"{k}:{v}" for k, v in deps]) + return empty_tokens diff --git a/combo/utils/metrics.py b/combo/utils/metrics.py index 28f8efa022c22cff7e6d7b2b0d2977ba50b39dc1..682e8859264a3414bf86d3a1e408ce5b3588a6f3 100644 --- a/combo/utils/metrics.py +++ b/combo/utils/metrics.py @@ -117,6 +117,8 @@ class AttachmentScores(metrics.Metric): 
mask : `torch.BoolTensor`, optional (default = None). A tensor of the same shape as `predicted_indices`. """ + if gold_labels is None or gold_indices is None: + return detached = self.detach_tensors( predicted_indices, predicted_labels, gold_indices, gold_labels, mask ) @@ -138,10 +140,14 @@ class AttachmentScores(metrics.Metric): correct_indices = predicted_indices.eq(gold_indices).long() * mask unlabeled_exact_match = (correct_indices + ~mask).prod(dim=-1) + if len(correct_indices.size()) > 2: + unlabeled_exact_match = unlabeled_exact_match.prod(dim=-1) correct_labels = predicted_labels.eq(gold_labels).long() * mask correct_labels_and_indices = correct_indices * correct_labels self.correct_indices = correct_labels_and_indices.flatten() labeled_exact_match = (correct_labels_and_indices + ~mask).prod(dim=-1) + if len(correct_indices.size()) > 2: + labeled_exact_match = labeled_exact_match.prod(dim=-1) self._unlabeled_correct += correct_indices.sum() self._exact_unlabeled_correct += unlabeled_exact_match.sum() @@ -198,6 +204,8 @@ class SemanticMetrics(metrics.Metric): self.feats_score = SequenceBoolAccuracy(prod_last_dim=True) self.lemma_score = SequenceBoolAccuracy(prod_last_dim=True) self.attachment_scores = AttachmentScores() + # Ignore PADDING and OOV + self.enhanced_attachment_scores = AttachmentScores(ignore_classes=[0, 1]) self.em_score = 0.0 def __call__( # type: ignore @@ -215,14 +223,25 @@ class SemanticMetrics(metrics.Metric): gold_labels["head"], gold_labels["deprel"], mask) + self.enhanced_attachment_scores(predictions["enhanced_head"], + predictions["enhanced_deprel"], + gold_labels["enhanced_head"], + gold_labels["enhanced_deprel"], + mask=None) + enhanced_indices = ( + self.enhanced_attachment_scores.correct_indices.reshape(mask.size(0), mask.size(1) + 1, -1)[:, 1:, 1:].sum( + -1).reshape(-1).bool() + if len(self.enhanced_attachment_scores.correct_indices.size()) > 0 + else self.enhanced_attachment_scores.correct_indices + ) total = mask.sum() correct_indices = (self.upos_score.correct_indices * self.xpos_score.correct_indices * self.semrel_score.correct_indices * self.feats_score.correct_indices * self.lemma_score.correct_indices * - self.attachment_scores.correct_indices - ) + self.attachment_scores.correct_indices * + enhanced_indices) total, correct_indices = self.detach_tensors(total, correct_indices) self.em_score = (correct_indices.float().sum() / total).item() @@ -237,6 +256,8 @@ class SemanticMetrics(metrics.Metric): "EM": self.em_score } metrics_dict.update(self.attachment_scores.get_metric(reset)) + enhanced_metrics = {f"E{k}": v for k, v in self.enhanced_attachment_scores.get_metric(reset).items()} + metrics_dict.update(enhanced_metrics) return metrics_dict def reset(self) -> None: @@ -246,4 +267,5 @@ class SemanticMetrics(metrics.Metric): self.lemma_score.reset() self.feats_score.reset() self.attachment_scores.reset() + self.enhanced_attachment_scores.reset() self.em_score = 0.0 diff --git a/config.graph.template.jsonnet b/config.graph.template.jsonnet new file mode 100644 index 0000000000000000000000000000000000000000..bc8c46580f17f22924a9f68628d64ce7f1060d55 --- /dev/null +++ b/config.graph.template.jsonnet @@ -0,0 +1,422 @@ +######################################################################################## +# BASIC configuration # +######################################################################################## +# Training data path, str +# Must be in CONNLU format (or it's extended version with semantic relation field). 
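The new `config.graph.template.jsonnet` that starts here is driven entirely by `std.extVar` values, which COMBO normally supplies from its CLI flags. As a rough sketch of rendering the template directly, assuming the `jsonnet` Python bindings and using made-up variable values:
```python
import json

import _jsonnet  # pip install jsonnet

ext_vars = {
    "training_data_path": "train.conllu",
    "validation_data_path": "valid.conllu",
    "pretrained_tokens": "",
    "pretrained_transformer_name": "allegro/herbert-base-cased",
    "num_epochs": "5",
    "cuda_device": "-1",
    "word_batch_size": "2500",
    "features": "token char",
    "targets": "deprel head upostag lemma feats deps",
    "embedding_dim": "300",
    "use_tensorboard": "False",
    "type": "default",
}
config = json.loads(
    _jsonnet.evaluate_file("config.graph.template.jsonnet", ext_vars=ext_vars))
print(config["dataset_reader"]["targets"])
```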
+# Can accept multiple paths when concatenated with ',', "path1,path2" +local training_data_path = std.extVar("training_data_path"); +# Validation data path, str +# Can accept multiple paths when concatenated with ',', "path1,path2" +local validation_data_path = if std.length(std.extVar("validation_data_path")) > 0 then std.extVar("validation_data_path"); +# Path to pretrained tokens, str or null +local pretrained_tokens = if std.length(std.extVar("pretrained_tokens")) > 0 then std.extVar("pretrained_tokens"); +# Name of pretrained transformer model, str or null +local pretrained_transformer_name = if std.length(std.extVar("pretrained_transformer_name")) > 0 then std.extVar("pretrained_transformer_name"); +# Learning rate value, float +local learning_rate = 0.002; +# Number of epochs, int +local num_epochs = std.parseInt(std.extVar("num_epochs")); +# Cuda device id, -1 for cpu, int +local cuda_device = std.parseInt(std.extVar("cuda_device")); +# Minimum number of words in batch, int +local word_batch_size = std.parseInt(std.extVar("word_batch_size")); +# Features used as input, list of str +# Choice "upostag", "xpostag", "lemma" +# Required "token", "char" +local features = std.split(std.extVar("features"), " "); +# Targets of the model, list of str +# Choice "feats", "lemma", "upostag", "xpostag", "semrel", "sent" +# Required "deprel", "head" +local targets = std.split(std.extVar("targets"), " "); +# Word embedding dimension, int +# If pretrained_tokens is not null, it must match their dimensionality +local embedding_dim = std.parseInt(std.extVar("embedding_dim")); +# Dropout rate on predictors, float +# All of the models on top of the encoder use this dropout +local predictors_dropout = 0.25; +# Xpostag embedding dimension, int +# (discarded if xpostag not in features) +local xpostag_dim = 32; +# Upostag embedding dimension, int +# (discarded if upostag not in features) +local upostag_dim = 32; +# Feats embedding dimension, int +# (discarded if feats not in features) +local feats_dim = 32; +# Lemma embedding dimension, int +# (discarded if lemma not in features) +local lemma_char_dim = 64; +# Character embedding dim, int +local char_dim = 64; +# Word embedding projection dim, int +local projected_embedding_dim = 100; +# Loss weights, dict[str, float] +local loss_weights = { + xpostag: 0.05, + upostag: 0.05, + lemma: 0.05, + feats: 0.2, + deprel: 0.8, + head: 0.2, + semrel: 0.05, + enhanced_head: 0.2, + enhanced_deprel: 0.8, +}; +# Encoder hidden size, int +local hidden_size = 512; +# Number of layers in the encoder, int +local num_layers = 2; +# Cycle loss iterations, int +local cycle_loss_n = 0; +# Maximum length of the word, int +# Shorter words are padded, longer ones truncated +local word_length = 30; +# Whether to use tensorboard, bool +local use_tensorboard = if std.extVar("use_tensorboard") == "True" then true else false; + +# Helper functions +local in_features(name) = !(std.length(std.find(name, features)) == 0); +local in_targets(name) = !(std.length(std.find(name, targets)) == 0); +local use_transformer = pretrained_transformer_name != null; + +# Verify some configuration requirements +assert in_features("token"): "Key 'token' must be in features!"; +assert in_features("char"): "Key 'char' must be in features!"; + +assert in_targets("deprel"): "Key 'deprel' must be in targets!"; +assert in_targets("head"): "Key 'head' must be in targets!"; + +assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't use pretrained tokens and pretrained transformer at the

same time!"; + +######################################################################################## +# ADVANCED configuration # +######################################################################################## + +# Detailed dataset, training, vocabulary and model configuration. +{ + # Configuration type (default or finetuning), str + type: std.extVar('type'), + # Datasets used for vocab creation, list of str + # Choice "train", "valid" + datasets_for_vocab_creation: ['train'], + # Path to training data, str + train_data_path: training_data_path, + # Path to validation data, str + validation_data_path: validation_data_path, + # Dataset reader configuration (conllu format) + dataset_reader: { + type: "conllu", + features: features, + targets: targets, + # Whether data contains semantic relation field, bool + use_sem: if in_targets("semrel") then true else false, + token_indexers: { + token: if use_transformer then { + type: "pretrained_transformer_mismatched_fixed", + model_name: pretrained_transformer_name, + tokenizer_kwargs: if std.startsWith(pretrained_transformer_name, "allegro/herbert") + then {use_fast: false} else {}, + } else { + # SingleIdTokenIndexer, token as single int + type: "single_id", + }, + upostag: { + type: "single_id", + namespace: "upostag", + feature_name: "pos_", + }, + xpostag: { + type: "single_id", + namespace: "xpostag", + feature_name: "tag_", + }, + lemma: { + type: "characters_const_padding", + character_tokenizer: { + start_tokens: ["__START__"], + end_tokens: ["__END__"], + }, + # +2 for start and end token + min_padding_length: word_length + 2, + }, + char: { + type: "characters_const_padding", + character_tokenizer: { + start_tokens: ["__START__"], + end_tokens: ["__END__"], + }, + # +2 for start and end token + min_padding_length: word_length + 2, + }, + feats: { + type: "feats_indexer", + }, + }, + lemma_indexers: { + char: { + type: "characters_const_padding", + namespace: "lemma_characters", + character_tokenizer: { + start_tokens: ["__START__"], + end_tokens: ["__END__"], + }, + # +2 for start and end token + min_padding_length: word_length + 2, + }, + }, + }, + # Data loader configuration + data_loader: { + batch_sampler: { + type: "token_count", + word_batch_size: word_batch_size, + }, + }, + # Vocabulary configuration + vocabulary: std.prune({ + type: "from_instances_extended", + only_include_pretrained_words: true, + pretrained_files: { + tokens: pretrained_tokens, + }, + oov_token: "_", + padding_token: "__PAD__", + non_padded_namespaces: ["head_labels"], + }), + model: std.prune({ + type: "semantic_multitask", + text_field_embedder: { + type: "basic", + token_embedders: { + xpostag: if in_features("xpostag") then { + type: "embedding", + padding_index: 0, + embedding_dim: xpostag_dim, + vocab_namespace: "xpostag", + }, + upostag: if in_features("upostag") then { + type: "embedding", + padding_index: 0, + embedding_dim: upostag_dim, + vocab_namespace: "upostag", + }, + token: if use_transformer then { + type: "transformers_word_embeddings", + model_name: pretrained_transformer_name, + projection_dim: projected_embedding_dim, + tokenizer_kwargs: if std.startsWith(pretrained_transformer_name, "allegro/herbert") + then {use_fast: false} else {}, + } else { + type: "embeddings_projected", + embedding_dim: embedding_dim, + projection_layer: { + in_features: embedding_dim, + out_features: projected_embedding_dim, + dropout_rate: 0.25, + activation: "tanh" + }, + vocab_namespace: "tokens", + pretrained_file: pretrained_tokens, + trainable: 
if pretrained_tokens == null then true else false, + }, + char: { + type: "char_embeddings_from_config", + embedding_dim: char_dim, + dilated_cnn_encoder: { + input_dim: char_dim, + filters: [512, 256, char_dim], + kernel_size: [3, 3, 3], + stride: [1, 1, 1], + padding: [1, 2, 4], + dilation: [1, 2, 4], + activations: ["relu", "relu", "linear"], + }, + }, + lemma: if in_features("lemma") then { + type: "char_embeddings_from_config", + embedding_dim: lemma_char_dim, + dilated_cnn_encoder: { + input_dim: lemma_char_dim, + filters: [512, 256, lemma_char_dim], + kernel_size: [3, 3, 3], + stride: [1, 1, 1], + padding: [1, 2, 4], + dilation: [1, 2, 4], + activations: ["relu", "relu", "linear"], + }, + }, + feats: if in_features("feats") then { + type: "feats_embedding", + padding_index: 0, + embedding_dim: feats_dim, + vocab_namespace: "feats", + }, + }, + }, + loss_weights: loss_weights, + seq_encoder: { + type: "combo_encoder", + layer_dropout_probability: 0.33, + stacked_bilstm: { + input_size: + (char_dim + projected_embedding_dim + + (if in_features('xpostag') then xpostag_dim else 0) + + (if in_features('lemma') then lemma_char_dim else 0) + + (if in_features('upostag') then upostag_dim else 0) + + (if in_features('feats') then feats_dim else 0)), + hidden_size: hidden_size, + num_layers: num_layers, + recurrent_dropout_probability: 0.33, + layer_dropout_probability: 0.33 + }, + }, + dependency_relation: { + type: "combo_dependency_parsing_from_vocab", + vocab_namespace: 'deprel_labels', + head_predictor: { + local projection_dim = 512, + cycle_loss_n: cycle_loss_n, + head_projection_layer: { + in_features: hidden_size * 2, + out_features: projection_dim, + activation: "tanh", + }, + dependency_projection_layer: { + in_features: hidden_size * 2, + out_features: projection_dim, + activation: "tanh", + }, + }, + local projection_dim = 128, + head_projection_layer: { + in_features: hidden_size * 2, + out_features: projection_dim, + dropout_rate: predictors_dropout, + activation: "tanh" + }, + dependency_projection_layer: { + in_features: hidden_size * 2, + out_features: projection_dim, + dropout_rate: predictors_dropout, + activation: "tanh" + }, + }, + enhanced_dependency_relation: if in_targets("deps") then { + type: "combo_graph_dependency_parsing_from_vocab", + vocab_namespace: 'deprel_labels', + head_predictor: { + local projection_dim = 512, + cycle_loss_n: cycle_loss_n, + head_projection_layer: { + in_features: hidden_size * 2, + out_features: projection_dim, + activation: "tanh", + }, + dependency_projection_layer: { + in_features: hidden_size * 2, + out_features: projection_dim, + activation: "tanh", + }, + }, + local projection_dim = 128, + head_projection_layer: { + in_features: hidden_size * 2, + out_features: projection_dim, + dropout_rate: predictors_dropout, + activation: "tanh" + }, + dependency_projection_layer: { + in_features: hidden_size * 2, + out_features: projection_dim, + dropout_rate: predictors_dropout, + activation: "tanh" + }, + }, + morphological_feat: if in_targets("feats") then { + type: "combo_morpho_from_vocab", + vocab_namespace: "feats_labels", + input_dim: hidden_size * 2, + hidden_dims: [128], + activations: ["tanh", "linear"], + dropout: [predictors_dropout, 0.0], + num_layers: 2, + }, + lemmatizer: if in_targets("lemma") then { + type: "combo_lemma_predictor_from_vocab", + char_vocab_namespace: "token_characters", + lemma_vocab_namespace: "lemma_characters", + embedding_dim: 256, + input_projection_layer: { + in_features: hidden_size * 2, + out_features: 
32, + dropout_rate: predictors_dropout, + activation: "tanh" + }, + filters: [256, 256, 256], + kernel_size: [3, 3, 3, 1], + stride: [1, 1, 1, 1], + padding: [1, 2, 4, 0], + dilation: [1, 2, 4, 1], + activations: ["relu", "relu", "relu", "linear"], + }, + upos_tagger: if in_targets("upostag") then { + input_dim: hidden_size * 2, + hidden_dims: [64], + activations: ["tanh", "linear"], + dropout: [predictors_dropout, 0.0], + num_layers: 2, + vocab_namespace: "upostag_labels" + }, + xpos_tagger: if in_targets("xpostag") then { + input_dim: hidden_size * 2, + hidden_dims: [128], + activations: ["tanh", "linear"], + dropout: [predictors_dropout, 0.0], + num_layers: 2, + vocab_namespace: "xpostag_labels" + }, + semantic_relation: if in_targets("semrel") then { + input_dim: hidden_size * 2, + hidden_dims: [64], + activations: ["tanh", "linear"], + dropout: [predictors_dropout, 0.0], + num_layers: 2, + vocab_namespace: "semrel_labels" + }, + regularizer: { + regexes: [ + [".*conv1d.*", {type: "l2", alpha: 1e-6}], + [".*forward.*", {type: "l2", alpha: 1e-6}], + [".*backward.*", {type: "l2", alpha: 1e-6}], + [".*char_embed.*", {type: "l2", alpha: 1e-5}], + ], + }, + }), + trainer: std.prune({ + checkpointer: { + type: "finishing_only_checkpointer", + }, + type: "gradient_descent_validate_n", + cuda_device: cuda_device, + grad_clipping: 5.0, + num_epochs: num_epochs, + optimizer: { + type: "adam", + lr: learning_rate, + betas: [0.9, 0.9], + }, + patience: 1, # it will be overwriten by callback + epoch_callbacks: [ + { type: "transfer_patience" }, + ], + learning_rate_scheduler: { + type: "combo_scheduler", + }, + tensorboard_writer: if use_tensorboard then { + should_log_learning_rate: false, + should_log_parameter_statistics: false, + summary_interval: 100, + }, + validation_metric: "+EM", + }), +} diff --git a/config.template.jsonnet b/config.template.jsonnet index 8e5ddc9f3d120156a4d00b1d231a54d90a66b631..f41ba62672eb4f93e130261ac85a5abc00e1efee 100644 --- a/config.template.jsonnet +++ b/config.template.jsonnet @@ -71,8 +71,6 @@ local cycle_loss_n = 0; local word_length = 30; # Whether to use tensorboard, bool local use_tensorboard = if std.extVar("use_tensorboard") == "True" then true else false; -# Path for tensorboard metrics, str -local metrics_dir = "./runs"; # Helper functions local in_features(name) = !(std.length(std.find(name, features)) == 0); @@ -382,7 +380,6 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't type: "combo_scheduler", }, tensorboard_writer: if use_tensorboard then { - serialization_dir: metrics_dir, should_log_learning_rate: false, should_log_parameter_statistics: false, summary_interval: 100, diff --git a/docs/models.md b/docs/models.md index 485f7614cc0b65c6413f88f0139a7dd7cd8a1711..25a7f7092ef295a0cf1ff2b3ba13e0adb05a5bc6 100644 --- a/docs/models.md +++ b/docs/models.md @@ -1,19 +1,26 @@ # Models -Pre-trained models are available [here](http://mozart.ipipan.waw.pl/~mklimaszewski/models/). +COMBO provides pre-trained models for: +- morphosyntactic prediction (i.e. part-of-speech tagging, morphosyntactic analysis, lemmatisation and dependency parsing) trained on the treebanks from [Universal Dependencies repository](https://universaldependencies.org), +- enhanced dependency parsing trained on IWPT 2020 shared task [data](https://universaldependencies.org/iwpt20/data.html). 
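In addition to loading the models listed above, prediction output can be converted back to CoNLL-U with the same helper the predictor uses internally. A hedged sketch, assuming a downloaded `polish-herbert-base` model and that `sentence2conllu` remains exported from `combo.data`:
```python
from combo.data import sentence2conllu
from combo.predict import COMBO

nlp = COMBO.from_pretrained("polish-herbert-base")
sentence = nlp("Moje zdanie.")

# sentence2conllu returns a conllu.TokenList; serialize() renders CoNLL-U text.
print(sentence2conllu(sentence, keep_semrel=False).serialize())
```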
-## Automatic download
-Python `from_pretrained` method will download the pre-trained model if the provided name (without the extension .tar.gz) matches one of the names in [here](http://mozart.ipipan.waw.pl/~mklimaszewski/models/).
-```python
-import combo.predict as predict
+## Manual download
 
-nlp = predict.SemanticMultitaskPredictor.from_pretrained("polish-herbert-base")
-```
-Otherwise it looks for a model in local env.
+The pre-trained models can be downloaded from [here](http://mozart.ipipan.waw.pl/~mklimaszewski/models/).
 
-## Console prediction/Local model
-If you want to use the console version of COMBO, you need to download a pre-trained model manually
+
+If you want to use the console version of COMBO, you need to download a pre-trained model manually:
 ```bash
 wget http://mozart.ipipan.waw.pl/~mklimaszewski/models/polish-herbert-base.tar.gz
 ```
-and pass it as a parameter (see [prediction doc](prediction.md)).
+
+The downloaded model should be passed as a parameter to COMBO (see the [prediction doc](prediction.md)).
+
+## Automatic download
+The pre-trained models can also be downloaded automatically with the Python `from_pretrained` method. Select a model name (without the .tar.gz extension) from the list of [pre-trained models](http://mozart.ipipan.waw.pl/~mklimaszewski/models/) and pass the name as an argument to the `from_pretrained` method:
+```python
+from combo.predict import COMBO
+
+nlp = COMBO.from_pretrained("polish-herbert-base")
+```
+If the model name doesn't match any model on the list of [pre-trained models](http://mozart.ipipan.waw.pl/~mklimaszewski/models/), COMBO looks for a model in the local environment.
diff --git a/docs/prediction.md b/docs/prediction.md
index 89cc74c27e8de8e4fafb44c60aea8ed260b67a3d..6de5d0e1892389ba5cd18c25b88947db3f717074 100644
--- a/docs/prediction.md
+++ b/docs/prediction.md
@@ -32,9 +32,19 @@ Use either `--predictor_name semantic-multitask-predictor` or `--predictor_name
 
 ## Python
 ```python
-import combo.predict as predict
+from combo.predict import COMBO
 
 model_path = "your_model.tar.gz"
-nlp = predict.SemanticMultitaskPredictor.from_pretrained(model_path)
+nlp = COMBO.from_pretrained(model_path)
 sentence = nlp("Sentence to parse.")
 ```
+
+Using your own tokenization:
+```python
+from combo.predict import COMBO
+
+model_path = "your_model.tar.gz"
+nlp = COMBO.from_pretrained(model_path)
+tokenized_sentence = ["Sentence", "to", "parse", "."]
+sentence = nlp([tokenized_sentence])
+```
diff --git a/docs/training.md b/docs/training.md
index 9dc430a782baacb7344e95d29a7bf9066b1df613..d3f69e0913c59681279b1fd966be0f4901ade11e 100644
--- a/docs/training.md
+++ b/docs/training.md
@@ -1,6 +1,6 @@
 # Training
 
-Command:
+Basic command:
 ```bash
 combo --mode train \
   --training_data_path your_training_path \
@@ -32,18 +32,38 @@ Examples (for clarity without training/validation data paths):
     combo --mode train --pretrained_transformer_name your_choosen_pretrained_transformer
     ```
 
-* predict only dependency tree:
+* train only a dependency parser:
     ```bash
    combo --mode train --targets head,deprel
    ```
 
-* use part-of-speech tags for predicting only dependency tree
+* use additional features (e.g. part-of-speech tags) for training a dependency parser (`token` and `char` are default features)
    ```bash
    combo --mode train --targets head,deprel --features token,char,upostag
    ```
-    
+
+## Enhanced UD
+
+Training a model with Enhanced UD prediction **requires** data pre-processing.
+
+```bash
+combo --mode train \
+  --training_data_path your_preprocessed_training_path \
+  --validation_data_path your_preprocessed_validation_path \
+  --targets feats,upostag,xpostag,head,deprel,lemma,deps \
+  --config_path config.graph.template.jsonnet
+```
+### Data pre-processing
+Download the data from the [IWPT20 Shared Task](https://universaldependencies.org/iwpt20/data.html).
+The package contains the `enhanced_collapse_empty_nodes.pl` script, which is required as a pre-processing step.
+Apply this script to the training and validation data:
+
+```bash
+perl enhanced_collapse_empty_nodes.pl training.conllu > training.fixed.conllu
+```
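+
+The validation file is pre-processed in the same way (file names here are placeholders):
+
+```bash
+perl enhanced_collapse_empty_nodes.pl validation.conllu > validation.fixed.conllu
+```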
+
 
 ## Configuration
 
 ### Advanced
diff --git a/scripts/train.py b/scripts/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..939088800f772c113693eb6d0858304ed82f766d
--- /dev/null
+++ b/scripts/train.py
@@ -0,0 +1,172 @@
+"""Script to train Dependency Parsing models based on UD 2.x data."""
+import pathlib
+
+from absl import app
+from absl import flags
+
+from scripts import utils
+
+TREEBANKS = [
+    "UD_Afrikaans-AfriBooms",
+    "UD_Arabic-NYUAD",
+    "UD_Arabic-PADT",
+    "UD_Armenian-ArmTDP",
+    "UD_Basque-BDT",
+    "UD_Belarusian-HSE",
+    "UD_Breton-KEB",
+    "UD_Bulgarian-BTB",
+    "UD_Catalan-AnCora",
+    "UD_Croatian-SET",
+    "UD_Czech-CAC",
+    "UD_Czech-CLTT",
+    "UD_Czech-FicTree",
+    "UD_Czech-PDT",
+    "UD_Danish-DDT",
+    "UD_Dutch-Alpino",
+    "UD_Dutch-LassySmall",
+    "UD_English-ESL",
+    "UD_English-EWT",
+    "UD_English-GUM",
+    "UD_English-LinES",
+    "UD_English-ParTUT",
+    "UD_English-Pronouns",
+    "UD_Estonian-EDT",
+    "UD_Estonian-EWT",
+    "UD_Finnish-FTB",
+    "UD_Finnish-TDT",
+    "UD_French-FQB",
+    "UD_French-FTB",
+    "UD_French-GSD",
+    "UD_French-ParTUT",
+    "UD_French-Sequoia",
+    "UD_French-Spoken",
+    "UD_Galician-CTG",
+    "UD_Galician-TreeGal",
+    "UD_German-GSD",
+    "UD_German-HDT",
+    "UD_German-LIT",
+    "UD_Greek-GDT",
+    "UD_Hebrew-HTB",
+    "UD_Hindi_English-HIENCS",
+    "UD_Hindi-HDTB",
+    "UD_Hungarian-Szeged",
+    "UD_Indonesian-GSD",
+    "UD_Irish-IDT",
+    "UD_Italian-ISDT",
+    "UD_Italian-ParTUT",
+    "UD_Italian-PoSTWITA",
+    "UD_Italian-TWITTIRO",
+    "UD_Italian-VIT",
+    "UD_Japanese-BCCWJ",
+    "UD_Japanese-GSD",
+    "UD_Japanese-Modern",
+    "UD_Kazakh-KTB",
+    "UD_Korean-GSD",
+    "UD_Korean-Kaist",
+    "UD_Latin-ITTB",
+    "UD_Latin-Perseus",
+    "UD_Latin-PROIEL",
+    "UD_Latvian-LVTB",
+    "UD_Lithuanian-ALKSNIS",
+    "UD_Lithuanian-HSE",
+    "UD_Maltese-MUDT",
+    "UD_Marathi-UFAL",
+    "UD_Persian-Seraji",
+    "UD_Polish-LFG",
+    "UD_Polish-PDB",
+    "UD_Portuguese-Bosque",
+    "UD_Portuguese-GSD",
+    "UD_Romanian-Nonstandard",
+    "UD_Romanian-RRT",
+    "UD_Romanian-SiMoNERo",
+    "UD_Russian-GSD",
+    "UD_Russian-SynTagRus",
+    "UD_Russian-Taiga",
+    "UD_Serbian-SET",
+    "UD_Slovak-SNK",
+    "UD_Slovenian-SSJ",
+    "UD_Slovenian-SST",
+    "UD_Spanish-AnCora",
+    "UD_Spanish-GSD",
+    "UD_Swedish-LinES",
+    "UD_Swedish_Sign_Language-SSLC",
+    "UD_Swedish-Talbanken",
+    "UD_Tamil-TTB",
+    "UD_Telugu-MTG",
+    "UD_Turkish-GB",
+    "UD_Turkish-IMST",
+    "UD_Ukrainian-IU",
+    "UD_Urdu-UDTB",
+    "UD_Uyghur-UDT",
+    "UD_Vietnamese-VTB",
+]
+
+FLAGS = flags.FLAGS
+flags.DEFINE_list(name="treebanks", default=TREEBANKS,
+                  help=f"Treebanks to train. Possible values: {TREEBANKS}.")
+flags.DEFINE_string(name="data_dir", default="",
+                    help="Path to UD data directory.")
+flags.DEFINE_string(name="serialization_dir", default="/tmp/",
+                    help="Model serialization directory.")
+flags.DEFINE_string(name="embeddings_dir", default="",
+                    help="Path to embeddings directory (with languages as subdirectories).")
+flags.DEFINE_integer(name="cuda_device", default=-1,
+                     help="Cuda device id (-1 for cpu).")
+
+
+def run(_):
+    treebanks_dir = pathlib.Path(FLAGS.data_dir)
+    for treebank in FLAGS.treebanks:
+        assert treebank in TREEBANKS, f"Unknown treebank {treebank}."
+        treebank_dir = treebanks_dir / treebank
+        treebank_parts = treebank.split("_")[1].split("-")
+        language = treebank_parts[0]
+
+        files = list(treebank_dir.iterdir())
+
+        training_file = [f for f in files if "train" in f.name and ".conllu" in f.name]
+        assert len(training_file) == 1, f"Couldn't find training file."
+        training_file_path = training_file[0]
+
+        valid_file = [f for f in files if "dev" in f.name and ".conllu" in f.name]
+        assert len(valid_file) == 1, f"Couldn't find validation file."
+        valid_file_path = valid_file[0]
+
+        embeddings_dir = FLAGS.embeddings_dir
+        embeddings_file = None
+        if embeddings_dir:
+            embeddings_dir = pathlib.Path(embeddings_dir) / language
+            embeddings_file = [f for f in embeddings_dir.iterdir() if "vectors" in f.name and ".vec.gz" in f.name]
+            assert len(embeddings_file) == 1, f"Couldn't find embeddings file."
+            embeddings_file = embeddings_file[0]
+
+        language = training_file_path.name.split("_")[0]
+
+        serialization_dir = pathlib.Path(FLAGS.serialization_dir) / treebank
+        serialization_dir.mkdir(exist_ok=True, parents=True)
+
+        command = f"""time combo --mode train
+        --cuda_device {FLAGS.cuda_device}
+        --training_data_path {training_file_path}
+        --validation_data_path {valid_file_path}
+        {f"--pretrained_tokens {embeddings_file}" if embeddings_dir
+        else f"--pretrained_transformer_name {utils.LANG2TRANSFORMER[language]}"}
+        --serialization_dir {serialization_dir}
+        --config_path {pathlib.Path.cwd() / 'config.template.jsonnet'}
+        --word_batch_size 2500
+        --notensorboard
+        """
+
+        # no XPOS datasets
+        if treebank in ["UD_Hungarian-Szeged", "UD_Armenian-ArmTDP"]:
+            command = command + " --targets deprel,head,upostag,lemma,feats"
+
+        utils.execute_command(command)
+
+
+def main():
+    app.run(run)
+
+
+if __name__ == "__main__":
+    main()
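+
+# Example invocation (an illustrative sketch; paths are placeholders and the script is assumed
+# to be run as a module from the repository root):
+#   python -m scripts.train --data_dir /path/to/ud-treebanks --serialization_dir /tmp/models --cuda_device 0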
diff --git a/scripts/train_eud.py b/scripts/train_eud.py
new file mode 100644
index 0000000000000000000000000000000000000000..4904e0bff6a9d78a7d0a56bff2ed0357992b615b
--- /dev/null
+++ b/scripts/train_eud.py
@@ -0,0 +1,126 @@
+"""Script to train Enhanced Dependency Parsing models based on IWPT'20 Shared Task data.
+
+Might require:
+conda install -c bioconda perl-list-moreutils
+conda install -c bioconda perl-namespace-autoclean
+conda install -c bioconda perl-moose
+conda install -c dan_blanchard perl-moosex-semiaffordanceaccessor
+"""
+
+import os
+import pathlib
+from typing import List
+
+from absl import app
+from absl import flags
+
+from scripts import utils
+
+LANG2TREEBANK = {
+    "ar": ["Arabic-PADT"],
+    "bg": ["Bulgarian-BTB"],
+    "cs": ["Czech-FicTree", "Czech-CAC", "Czech-PDT", "Czech-PUD"],
+    "nl": ["Dutch-Alpino", "Dutch-LassySmall"],
+    "en": ["English-EWT", "English-PUD"],
+    "et": ["Estonian-EDT", "Estonian-EWT"],
+    "fi": ["Finnish-TDT", "Finnish-PUD"],
+    "fr": ["French-Sequoia", "French-FQB"],
+    "it": ["Italian-ISDT"],
+    "lv": ["Latvian-LVTB"],
+    "lt": ["Lithuanian-ALKSNIS"],
+    "pl": ["Polish-LFG", "Polish-PDB", "Polish-PUD"],
+    "ru": ["Russian-SynTagRus"],
+    "sk": ["Slovak-SNK"],
+    "sv": ["Swedish-Talbanken", "Swedish-PUD"],
+    "ta": ["Tamil-TTB"],
+    "uk": ["Ukrainian-IU"],
+}
+
+FLAGS = flags.FLAGS
+flags.DEFINE_list(name="lang", default=list(LANG2TREEBANK.keys()),
+                  help=f"Language of models to train. Possible values: {LANG2TREEBANK.keys()}.")
+flags.DEFINE_string(name="data_dir", default="",
+                    help="Path to 'iwpt2020stdata' directory.")
+flags.DEFINE_string(name="serialization_dir", default="/tmp/",
+                    help="Model serialization dir.")
+flags.DEFINE_integer(name="cuda_device", default=-1,
+                     help="Cuda device id (-1 for cpu).")
+
+
+def path_to_str(path: pathlib.Path) -> str:
+    return str(path.resolve())
+
+
+def merge_files(files: List[str], output: pathlib.Path):
+    if not output.exists():
+        os.system(f"cat {' '.join(files)} > {output}")
+
+
+def collapse_nodes(data_dir: pathlib.Path, treebank_file: pathlib.Path, output: str):
+    output_path = pathlib.Path(output)
+    if not output_path.exists():
+        utils.execute_command(f"perl {path_to_str(data_dir / 'tools' / 'enhanced_collapse_empty_nodes.pl')} "
+                              f"{path_to_str(treebank_file)}", output)
+
+
+def run(_):
+    languages = FLAGS.lang
+    for lang in languages:
+        assert lang in LANG2TREEBANK, f"'{lang}' must be one of {list(LANG2TREEBANK.keys())}."
+        assert lang in utils.LANG2TRANSFORMER, f"Transformer for '{lang}' isn't defined. See 'LANG2TRANSFORMER' dict."
+        data_dir = pathlib.Path(FLAGS.data_dir)
+        assert data_dir.is_dir(), f"'{data_dir}' is not a directory!"
+
+        treebanks = LANG2TREEBANK[lang]
+        train_paths = []
+        dev_paths = []
+        test_paths = []
+        for treebank in treebanks:
+            treebank_dir = data_dir / f"UD_{treebank}"
+            assert treebank_dir.exists() and treebank_dir.is_dir(), f"'{treebank_dir}' directory doesn't exist."
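+            # Run the IWPT'20 'enhanced_collapse_empty_nodes.pl' pre-processing on every
+            # train/dev/test *.conllu split and collect the resulting *.fixed.conllu paths,
+            # so they can be merged into a single file per language below.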
+            for treebank_file in treebank_dir.iterdir():
+                name = treebank_file.name
+                if "conllu" in name and "fixed" not in name:
+                    output = path_to_str(treebank_file).replace('.conllu', '.fixed.conllu')
+                    if "train" in name:
+                        collapse_nodes(data_dir, treebank_file, output)
+                        train_paths.append(output)
+                    elif "dev" in name:
+                        collapse_nodes(data_dir, treebank_file, output)
+                        dev_paths.append(output)
+                    elif "test" in name:
+                        collapse_nodes(data_dir, treebank_file, output)
+                        test_paths.append(output)
+
+        lang_data_dir = pathlib.Path(data_dir / lang)
+        lang_data_dir.mkdir(exist_ok=True)
+
+        train_path = lang_data_dir / "train.conllu"
+        dev_path = lang_data_dir / "dev.conllu"
+        test_path = lang_data_dir / "test.conllu"
+
+        merge_files(train_paths, output=train_path)
+        merge_files(dev_paths, output=dev_path)
+        merge_files(test_paths, output=test_path)
+
+        serialization_dir = pathlib.Path(FLAGS.serialization_dir) / lang
+        serialization_dir.mkdir(exist_ok=True, parents=True)
+        utils.execute_command("".join(f"""combo --mode train
+        --training_data {train_path}
+        --validation_data {dev_path}
+        --targets feats,upostag,xpostag,head,deprel,lemma,deps
+        --pretrained_transformer_name {utils.LANG2TRANSFORMER[lang]}
+        --serialization_dir {serialization_dir}
+        --cuda_device {FLAGS.cuda_device}
+        --word_batch_size 2500
+        --config_path {pathlib.Path.cwd() / 'config.graph.template.jsonnet'}
+        --notensorboard
+        """.splitlines()))
+
+
+def main():
+    app.run(run)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/utils.py b/scripts/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dda2b89693fc1431b810709d9e7002d9f5f8071
--- /dev/null
+++ b/scripts/utils.py
@@ -0,0 +1,16 @@
+"""Utils for scripts."""
+import subprocess
+
+LANG2TRANSFORMER = {
+    "en": "bert-base-cased",
+    "pl": "allegro/herbert-base-cased",
+}
+
+
+def execute_command(command, output_file=None):
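+    # Splits the command naively on whitespace, so arguments and paths must not contain spaces;
+    # when output_file is given, stdout of the subprocess is redirected to that file.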
+    command = [c for c in command.split() if c.strip()]
+    if output_file:
+        with open(output_file, "w") as f:
+            subprocess.run(command, check=True, stdout=f)
+    else:
+        subprocess.run(command, check=True)
diff --git a/setup.cfg b/setup.cfg
index b7e478982ccf9ab1963c74e1084dfccb6e42c583..6876d0d7447015400e616dbd7479de01d19c2948 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +1,5 @@
 [aliases]
 test=pytest
+
+[metadata]
+description-file = README.md
diff --git a/setup.py b/setup.py
index 9529a0c4f5f2138cf913d94d8665a0810552cdd9..fdaa2be51e5098e9aa2d2ddc3b691188f11c7b10 100644
--- a/setup.py
+++ b/setup.py
@@ -3,10 +3,9 @@ from setuptools import find_packages, setup
 
 REQUIREMENTS = [
     'absl-py==0.9.0',
-    'allennlp==1.2.0',
+    'allennlp==1.2.1',
     'conllu==2.3.2',
     'dataclasses;python_version<"3.7"',
-    'dataclasses-json==0.5.2',
     'joblib==0.14.1',
     'jsonnet==0.15.0',
     'requests==2.23.0',
@@ -20,11 +19,23 @@ REQUIREMENTS = [
 
 setup(
     name='COMBO',
-    version='0.0.1',
+    version='1.0.0b1',
     install_requires=REQUIREMENTS,
     packages=find_packages(exclude=['tests']),
+    license="GPL-3.0",
+    url='https://gitlab.clarin-pl.eu/syntactic-tools/combo',
+    keywords="nlp natural-language-processing dependency-parsing",
     setup_requires=['pytest-runner', 'pytest-pylint'],
     tests_require=['pytest', 'pylint'],
     python_requires='>=3.6',
     entry_points={'console_scripts': ['combo = combo.main:main']},
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'Intended Audience :: Science/Research',
+        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
+        'Topic :: Scientific/Engineering :: Artificial Intelligence',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+    ]
 )
diff --git a/tests/data/fields/test_sequence_multilabel_field.py b/tests/data/fields/test_sequence_multilabel_field.py
index d2a1f8bc6b4d853e427d6a7214592f1b881db52c..fff8ff4ee285215aa4015d1c55609f3c2d28a3a6 100644
--- a/tests/data/fields/test_sequence_multilabel_field.py
+++ b/tests/data/fields/test_sequence_multilabel_field.py
@@ -4,6 +4,7 @@ from typing import List
 
 import torch
 from allennlp import data as allen_data
+from allennlp.common import util
 from allennlp.data import fields as allen_fields
 
 from combo.data import fields
@@ -22,7 +23,7 @@ class IndexingSequenceMultiLabelFieldTest(unittest.TestCase):
         def _indexer(vocab: allen_data.Vocabulary):
             vocab_size = vocab.get_vocab_size(self.namespace)
 
-            def _mapper(multi_label: List[str]) -> List[int]:
+            def _mapper(multi_label: List[str], _: int) -> List[int]:
                 one_hot = [0] * vocab_size
                 for label in multi_label:
                     index = vocab.get_token_index(label, self.namespace)
@@ -31,7 +32,21 @@ class IndexingSequenceMultiLabelFieldTest(unittest.TestCase):
 
             return _mapper
 
+        def _as_tensor(field: fields.SequenceMultiLabelField):
+
+            def _wrapped(padding_lengths):
+                desired_num_tokens = padding_lengths["num_tokens"]
+                classes_count = len(field._indexed_multi_labels[0])
+                default_value = [0.0] * classes_count
+                padded_tags = util.pad_sequence_to_length(field._indexed_multi_labels, desired_num_tokens,
+                                                          lambda: default_value)
+                tensor = torch.LongTensor(padded_tags)
+                return tensor
+
+            return _wrapped
+
         self.indexer = _indexer
+        self.as_tensor = _as_tensor
         self.sequence_field = _SequenceFieldTestWrapper(self.vocab.get_vocab_size(self.namespace))
 
     def test_indexing(self):
@@ -39,6 +54,7 @@ class IndexingSequenceMultiLabelFieldTest(unittest.TestCase):
         field = fields.SequenceMultiLabelField(
             multi_labels=[["t1", "t2"], [], ["t0"]],
             multi_label_indexer=self.indexer,
+            as_tensor=self.as_tensor,
             sequence_field=self.sequence_field,
             label_namespace=self.namespace
         )
@@ -55,6 +71,7 @@ class IndexingSequenceMultiLabelFieldTest(unittest.TestCase):
         field = fields.SequenceMultiLabelField(
             multi_labels=[["t1", "t2"], [], ["t0"]],
             multi_label_indexer=self.indexer,
+            as_tensor=self.as_tensor,
             sequence_field=self.sequence_field,
             label_namespace=self.namespace
         )
@@ -72,6 +89,7 @@ class IndexingSequenceMultiLabelFieldTest(unittest.TestCase):
         field = fields.SequenceMultiLabelField(
             multi_labels=[["t1", "t2"], [], ["t0"]],
             multi_label_indexer=self.indexer,
+            as_tensor=self.as_tensor,
             sequence_field=self.sequence_field,
             label_namespace=self.namespace
         )
diff --git a/tests/fixtures/example.conllu b/tests/fixtures/example.conllu
index 1125392e17d71f9db09c5236e1fb27ac0968d410..32e0653525e1160135fbe6f05dc2fa07b6a7fd9d 100644
--- a/tests/fixtures/example.conllu
+++ b/tests/fixtures/example.conllu
@@ -4,3 +4,10 @@
 2	Sentence	verylonglemmawhichmustbetruncatedbythesystemto30	NOUN	nom	Number=Sing	0	root	_	_
 3	.	.	PUNCT	.	_	1	punct	_	_
 
+# sent_id = test-s1
+# text = Easy sentence.
+1	Verylongwordwhichmustbetruncatedbythesystemto30	easy	ADJ	adj	AdpType=Prep|Adp	2	amod	_	_
+2	Sentence	verylonglemmawhichmustbetruncatedbythesystemto30	NOUN	nom	Number=Sing	0	root	_	_
+3	.	.	PUNCT	.	_	1	punct	2:mod	_
+4	.	.	PUNCT	.	_	1	punct	2:xmod	_
+
diff --git a/tests/test_predict.py b/tests/test_predict.py
index 2a56bd9baff30a6d34d3ad6bb16ce4ccdea71792..332ced3cfa010723fa51b77ce1742b8d976e1025 100644
--- a/tests/test_predict.py
+++ b/tests/test_predict.py
@@ -22,7 +22,7 @@ class PredictionTest(unittest.TestCase):
             data.Token(id=2, token=".")
         ])]
         api_wrapped_tokenized_sentence = [data.conllu2sentence(data.tokens2conllu(["Test", "."]), [])]
-        nlp = predict.SemanticMultitaskPredictor.from_pretrained(os.path.join(self.FIXTURES_ROOT, "model.tar.gz"))
+        nlp = predict.COMBO.from_pretrained(os.path.join(self.FIXTURES_ROOT, "model.tar.gz"))
 
         # when
         results = [
diff --git a/tests/utils/test_graph.py b/tests/utils/test_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..74e37446684f68c6d4ea4abe77c69ba9d3ae4c2b
--- /dev/null
+++ b/tests/utils/test_graph.py
@@ -0,0 +1,106 @@
+import unittest
+import combo.utils.graph as graph
+
+import conllu
+import numpy as np
+
+
+class GraphTest(unittest.TestCase):
+
+    def test_adding_empty_graph_with_the_same_labels(self):
+        tree = conllu.TokenList(
+            tokens=[
+                {"head": 0, "deprel": "root", "form": "word1"},
+                {"head": 3, "deprel": "yes", "form": "word2"},
+                {"head": 1, "deprel": "yes", "form": "word3"},
+            ]
+        )
+        vocab_index = {0: "root", 1: "yes", 2: "yes", 3: "yes"}
+        empty_graph = np.zeros((4, 4))
+        graph_labels = np.zeros((4, 4, 4))
+        expected_deps = ["0:root", "3:yes", "1:yes"]
+
+        # when
+        graph.sdp_to_dag_deps(empty_graph, graph_labels, tree.tokens, root_idx=0, vocab_index=vocab_index)
+        actual_deps = [t["deps"] for t in tree.tokens]
+
+        # then
+        self.assertEqual(expected_deps, actual_deps)
+
+    def test_adding_empty_graph_with_different_labels(self):
+        tree = conllu.TokenList(
+            tokens=[
+                {"head": 0, "deprel": "root", "form": "word1"},
+                {"head": 3, "deprel": "tree_label", "form": "word2"},
+                {"head": 1, "deprel": "tree_label", "form": "word3"},
+            ]
+        )
+        vocab_index = {0: "root", 1: "tree_label", 2: "graph_label"}
+        empty_graph = np.zeros((4, 4))
+        graph_labels = np.zeros((4, 4, 3))
+        graph_labels[2][3][2] = 10e10
+        graph_labels[3][1][2] = 10e10
+        expected_deps = ["0:root", "3:graph_label", "1:graph_label"]
+
+        # when
+        graph.sdp_to_dag_deps(empty_graph, graph_labels, tree.tokens, root_idx=0, vocab_index=vocab_index)
+        actual_deps = [t["deps"] for t in tree.tokens]
+
+        # then
+        self.assertEqual(actual_deps, expected_deps)
+
+    def test_extending_tree_with_graph(self):
+        # given
+        tree = conllu.TokenList(
+            tokens=[
+                {"head": 0, "deprel": "root", "form": "word1"},
+                {"head": 1, "deprel": "tree_label", "form": "word2"},
+                {"head": 2, "deprel": "tree_label", "form": "word3"},
+            ]
+        )
+        vocab_index = {0: "root", 1: "tree_label", 2: "graph_label"}
+        arc_scores = np.array([
+            [0, 0, 0, 0],
+            [1, 0, 0, 0],
+            [0, 1, 0, 0],
+            [0, 1, 1, 0],
+        ])
+        graph_labels = np.zeros((4, 4, 3))
+        graph_labels[3][1][2] = 10e10
+        expected_deps = ["0:root", "1:tree_label", "1:graph_label|2:tree_label"]
+
+        # when
+        graph.sdp_to_dag_deps(arc_scores, graph_labels, tree.tokens, root_idx=0, vocab_index=vocab_index)
+        actual_deps = [t["deps"] for t in tree.tokens]
+
+        # then
+        self.assertEqual(actual_deps, expected_deps)
+
+    def test_extending_tree_with_self_loop_edge_shouldnt_add_edge(self):
+        # given
+        tree = conllu.TokenList(
+            tokens=[
+                {"head": 0, "deprel": "root", "form": "word1"},
+                {"head": 1, "deprel": "tree_label", "form": "word2"},
+                {"head": 2, "deprel": "tree_label", "form": "word3"},
+            ]
+        )
+        vocab_index = {0: "root", 1: "tree_label", 2: "graph_label"}
+        arc_scores = np.array([
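+            # Rows index dependents (0 is the ROOT node), columns index candidate heads;
+            # the non-zero entry at [3][3] proposes a self-loop for the last token,
+            # which the conversion is expected to ignore.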
+            [0, 0, 0, 0],
+            [1, 0, 0, 0],
+            [0, 1, 0, 0],
+            [0, 0, 1, 1],
+        ])
+        graph_labels = np.zeros((4, 4, 3))
+        graph_labels[3][3][2] = 10e10
+        expected_deps = ["0:root", "1:tree_label", "2:tree_label"]
+        # TODO: the current implementation still adds the self-loop, i.e. it produces
+        # actual_deps = ["0:root", "1:tree_label", "2:tree_label|3:graph_label"]
+
+        # when
+        graph.sdp_to_dag_deps(arc_scores, graph_labels, tree.tokens, root_idx=0, vocab_index=vocab_index)
+        actual_deps = [t["deps"] for t in tree.tokens]
+
+        # then
+        self.assertEqual(expected_deps, actual_deps)
diff --git a/tests/utils/test_metrics.py b/tests/utils/test_metrics.py
index 5b8411bb44c4df7baf7ee1c8d751ac06daa320ec..242eaa3dcffaf452c19a52e2f56625de00cd0433 100644
--- a/tests/utils/test_metrics.py
+++ b/tests/utils/test_metrics.py
@@ -27,12 +27,16 @@ class SemanticMetricsTest(unittest.TestCase):
         self.semrel, self.semrel_l = (("semrel", x) for x in [pred, gold])
         self.head, self.head_l = (("head", x) for x in [pred, gold])
         self.deprel, self.deprel_l = (("deprel", x) for x in [pred, gold])
+        # TODO(mklimasz) Set up an example with size 3x5x5
+        self.enhanced_head, self.enhanced_head_l = (("enhanced_head", x) for x in [None, None])
+        self.enhanced_deprel, self.enhanced_deprel_l = (("enhanced_deprel", x) for x in [None, None])
         self.feats, self.feats_l = (("feats", x) for x in [pred_seq, gold_seq])
         self.lemma, self.lemma_l = (("lemma", x) for x in [pred_seq, gold_seq])
         self.predictions = dict(
-            [self.upostag, self.xpostag, self.semrel, self.feats, self.lemma, self.head, self.deprel])
+            [self.upostag, self.xpostag, self.semrel, self.feats, self.lemma, self.head, self.deprel,
+             self.enhanced_head, self.enhanced_deprel])
         self.gold_labels = dict([self.upostag_l, self.xpostag_l, self.semrel_l, self.feats_l, self.lemma_l, self.head_l,
-                                 self.deprel_l])
+                                 self.deprel_l, self.enhanced_head_l, self.enhanced_deprel_l])
         self.eps = 1e-6
 
     def test_every_prediction_correct(self):