Commit 1bf88a88 authored by Maja Jabłońska

General structure

parent e4b2e1c5
1 merge request: !46 Merge COMBO 3.0 into master
Showing changed files with 222 additions and 587 deletions
@@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.9 (combolightning)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="combo" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (combolightning)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="combo" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>
\ No newline at end of file
from .train import FinetuningTrainModel
\ No newline at end of file
from pytorch_lightning import Trainer
class FinetuningTrainModel(Trainer):
"""
Class intended only for finetuning; the only difference is that the vocabulary
is saved from the concatenated (archive and current) datasets.
"""
pass
\ No newline at end of file
from .samplers import TokenCountBatchSampler
from .token import Token
from .token_indexers import *
from .api import *
import copy
import logging
import pathlib
from typing import Union, List, Dict, Iterable, Optional, Any, Tuple
import conllu
import torch
from allennlp import data as allen_data
from allennlp.common import checks, util
from allennlp.data import fields as allen_fields, vocabulary
from conllu import parser
from dataclasses import dataclass
from overrides import overrides
from combo.data import fields
logger = logging.getLogger(__name__)
@allen_data.DatasetReader.register("conllu")
class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
def __init__(
self,
token_indexers: Dict[str, allen_data.TokenIndexer] = None,
lemma_indexers: Dict[str, allen_data.TokenIndexer] = None,
features: List[str] = None,
targets: List[str] = None,
use_sem: bool = False,
**kwargs,
) -> None:
super().__init__(**kwargs)
if features is None:
features = ["token", "char"]
if targets is None:
targets = ["head", "deprel", "upostag", "xpostag", "lemma", "feats"]
if "token" not in features and "char" not in features:
raise checks.ConfigurationError("There must be at least one ('char' or 'token') text-based feature!")
if "deps" in targets and not ("head" in targets and "deprel" in targets):
raise checks.ConfigurationError("Add 'head' and 'deprel' to targets when using 'deps'!")
intersection = set(features).intersection(set(targets))
if len(intersection) != 0:
raise checks.ConfigurationError(
"Features and targets cannot share elements! "
"Remove {} from either features or targets.".format(intersection)
)
self.use_sem = use_sem
# *.conllu readers config
fields = list(parser.DEFAULT_FIELDS)
fields[1] = "token" # use 'token' instead of 'form'
field_parsers = parser.DEFAULT_FIELD_PARSERS
# Do not make it nullable
field_parsers.pop("xpostag", None)
# Ignore parsing misc
field_parsers.pop("misc", None)
if self.use_sem:
fields = list(fields)
fields.append("semrel")
field_parsers["semrel"] = lambda line, i: line[i]
self.field_parsers = field_parsers
self.fields = tuple(fields)
self._token_indexers = token_indexers
self._lemma_indexers = lemma_indexers
self._targets = targets
self._features = features
self.generate_labels = True
# Filter out token indexers that are not required, to avoid a
# mismatched token keys ConfigurationError.
for indexer_name in list(self._token_indexers.keys()):
if indexer_name not in self._features:
del self._token_indexers[indexer_name]
@overrides
def _read(self, file_path: str) -> Iterable[allen_data.Instance]:
file_path = file_path.split(",")  # a single path and a comma-separated list both end up as a list
for conllu_file in file_path:
file = pathlib.Path(conllu_file)
assert conllu_file and file.exists(), f"File with path '{conllu_file}' does not exist!"
with file.open("r", encoding="utf-8") as f:
for annotation in conllu.parse_incr(f, fields=self.fields, field_parsers=self.field_parsers):
yield self.text_to_instance(annotation)
@overrides
def text_to_instance(self, tree: conllu.TokenList) -> allen_data.Instance:
fields_: Dict[str, allen_data.Field] = {}
tree_tokens = [t for t in tree if isinstance(t["id"], int)]
tokens = [_Token(t["token"],
pos_=t.get("upostag"),
tag_=t.get("xpostag"),
lemma_=t.get("lemma"),
feats_=t.get("feats"))
for t in tree_tokens]
# features
text_field = allen_fields.TextField(tokens, self._token_indexers)
fields_["sentence"] = text_field
# targets
if self.generate_labels:
for target_name in self._targets:
if target_name != "sent":
target_values = [t[target_name] for t in tree_tokens]
if target_name == "lemma":
target_values = [allen_data.Token(v) for v in target_values]
fields_[target_name] = allen_fields.TextField(target_values, self._lemma_indexers)
elif target_name == "feats":
target_values = self._feat_values(tree_tokens)
fields_[target_name] = fields.SequenceMultiLabelField(target_values,
self._feats_indexer,
self._feats_as_tensor_wrapper,
text_field,
label_namespace="feats_labels")
elif target_name == "head":
target_values = [0 if v == "_" else int(v) for v in target_values]
fields_[target_name] = allen_fields.SequenceLabelField(target_values, text_field,
label_namespace=target_name + "_labels")
elif target_name == "deps":
# Graphs require adding ROOT (AdjacencyField uses sequence length from TextField).
text_field_deps = allen_fields.TextField([_Token("ROOT")] + copy.deepcopy(tokens),
self._token_indexers)
enhanced_heads: List[Tuple[int, int]] = []
enhanced_deprels: List[str] = []
for idx, t in enumerate(tree_tokens):
t_deps = t["deps"]
if t_deps and t_deps != "_":
for rel, head in t_deps:
# EmoryNLP skips the first edge if there are two edges between the same
# nodes, so that one edge ends up in the tree and the other in the graph.
# This snippet follows that approach.
if enhanced_heads and enhanced_heads[-1] == (idx, head):
enhanced_heads.pop()
enhanced_deprels.pop()
enhanced_heads.append((idx, head))
enhanced_deprels.append(rel)
fields_["enhanced_heads"] = allen_fields.AdjacencyField(
indices=enhanced_heads,
sequence_field=text_field_deps,
label_namespace="enhanced_heads_labels",
padding_value=0,
)
fields_["enhanced_deprels"] = allen_fields.AdjacencyField(
indices=enhanced_heads,
sequence_field=text_field_deps,
labels=enhanced_deprels,
# Label namespace matches regular tree parsing.
label_namespace="enhanced_deprel_labels",
padding_value=0,
)
else:
fields_[target_name] = allen_fields.SequenceLabelField(target_values, text_field,
label_namespace=target_name + "_labels")
# Restore the feats fields to their string representation;
# parser.serialize_field doesn't handle a key without a value.
for token in tree.tokens:
if "feats" in token:
feats = token["feats"]
if feats:
feats_values = []
for k, v in feats.items():
feats_values.append('='.join((k, v)) if v else k)
field = "|".join(feats_values)
else:
field = "_"
token["feats"] = field
# metadata
fields_["metadata"] = allen_fields.MetadataField({"input": tree,
"field_names": self.fields,
"tokens": tokens})
return allen_data.Instance(fields_)
@staticmethod
def _feat_values(tree: List[Dict[str, Any]]):
features = []
for token in tree:
token_features = []
if token["feats"] is not None:
for feat, value in token["feats"].items():
if feat in ["_", "__ROOT__"]:
pass
else:
# Handle the case where the feature is binary (has no associated value)
if value:
token_features.append(feat + "=" + value)
else:
token_features.append(feat)
features.append(token_features)
return features
@staticmethod
def _feats_as_tensor_wrapper(field: fields.SequenceMultiLabelField):
def as_tensor(padding_lengths):
desired_num_tokens = padding_lengths["num_tokens"]
assert len(field._indexed_multi_labels) > 0
classes_count = len(field._indexed_multi_labels[0])
default_value = [0.0] * classes_count
padded_tags = util.pad_sequence_to_length(field._indexed_multi_labels, desired_num_tokens,
lambda: default_value)
tensor = torch.tensor(padded_tags, dtype=torch.long)
return tensor
return as_tensor
@staticmethod
def _feats_indexer(vocab: allen_data.Vocabulary):
label_namespace = "feats_labels"
vocab_size = vocab.get_vocab_size(label_namespace)
slices = get_slices_if_not_provided(vocab)
def _m_from_n_ones_encoding(multi_label: List[str], sentence_length: int) -> List[int]:
one_hot_encoding = [0] * vocab_size
for cat, cat_indices in slices.items():
if cat not in ["__PAD__", "_"]:
label_from_cat = [label for label in multi_label if cat == label.split("=")[0]]
if label_from_cat:
label_from_cat = label_from_cat[0]
index = vocab.get_token_index(label_from_cat, label_namespace)
else:
# Get Cat=None index
index = vocab.get_token_index(cat + "=None", label_namespace)
one_hot_encoding[index] = 1
return one_hot_encoding
return _m_from_n_ones_encoding
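For orientation, a hypothetical instantiation of the reader might look as follows (the indexer choice and the file path are assumptions, not part of this commit; AllenNLP's SingleIdTokenIndexer is used purely as an example):

from allennlp.data.token_indexers import SingleIdTokenIndexer

# Keep only the "token" feature; the constructor drops indexers for unused features.
reader = UniversalDependenciesDatasetReader(
    token_indexers={"token": SingleIdTokenIndexer(namespace="token")},
    features=["token"],
    targets=["head", "deprel", "upostag"],
)
# Each CoNLL-U sentence becomes one Instance with "sentence", target and "metadata" fields.
instances = list(reader.read("train.conllu"))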
@allen_data.Vocabulary.register("from_instances_extended", constructor="from_instances_extended")
class Vocabulary(allen_data.Vocabulary):
@classmethod
def from_instances_extended(
cls,
instances: Iterable[allen_data.Instance],
min_count: Dict[str, int] = None,
max_vocab_size: Union[int, Dict[str, int]] = None,
non_padded_namespaces: Iterable[str] = vocabulary.DEFAULT_NON_PADDED_NAMESPACES,
pretrained_files: Optional[Dict[str, str]] = None,
only_include_pretrained_words: bool = False,
min_pretrained_embeddings: Dict[str, int] = None,
padding_token: Optional[str] = vocabulary.DEFAULT_PADDING_TOKEN,
oov_token: Optional[str] = vocabulary.DEFAULT_OOV_TOKEN,
) -> "Vocabulary":
"""
Extension that manually fills gaps in the 'feats_labels' namespace.
"""
# Manually load tokens from the pretrained file (using a different strategy:
# all words from the embedding file are added, without checking whether they
# were seen in any dataset).
tokens_to_add = None
if pretrained_files and "tokens" in pretrained_files:
pretrained_set = set(vocabulary._read_pretrained_tokens(pretrained_files["tokens"]))
tokens_to_add = {"tokens": list(pretrained_set)}
pretrained_files = None
vocab = super().from_instances(
instances=instances,
min_count=min_count,
max_vocab_size=max_vocab_size,
non_padded_namespaces=non_padded_namespaces,
pretrained_files=pretrained_files,
only_include_pretrained_words=only_include_pretrained_words,
tokens_to_add=tokens_to_add,
min_pretrained_embeddings=min_pretrained_embeddings,
padding_token=padding_token,
oov_token=oov_token
)
# Extending the vocab with features that do not show up explicitly.
# To know all the features, the full dataset has to be read first.
# An auxiliary '=None' feature is added for each category, which is
# needed to perform classification.
get_slices_if_not_provided(vocab)
return vocab
def get_slices_if_not_provided(vocab: allen_data.Vocabulary):
if hasattr(vocab, "slices"):
return vocab.slices
if "feats_labels" in vocab.get_namespaces():
idx2token = vocab.get_index_to_token_vocabulary("feats_labels")
for _, v in dict(idx2token).items():
if v not in ["_", "__PAD__"]:
empty_value = v.split("=")[0] + "=None"
vocab.add_token_to_namespace(empty_value, "feats_labels")
slices = {}
for idx, name in vocab.get_index_to_token_vocabulary("feats_labels").items():
# There are two types of features: with an assignment (Case=Acc) or without one (None).
# Here we group their indices by name (the part before the assignment sign).
name = name.split("=")[0]
if name in slices:
slices[name].append(idx)
else:
slices[name] = [idx]
vocab.slices = slices
return vocab.slices
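A small, self-contained illustration of the grouping performed above (the index-to-token mapping below is made up and only stands in for the real "feats_labels" namespace):

idx2token = {0: "__PAD__", 1: "Case=Acc", 2: "Case=Gen", 3: "Number=Sing",
             4: "Case=None", 5: "Number=None"}
slices = {}
for idx, name in idx2token.items():
    name = name.split("=")[0]          # group by the category before '='
    slices.setdefault(name, []).append(idx)
# slices == {'__PAD__': [0], 'Case': [1, 2, 4], 'Number': [3, 5]}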
@dataclass(init=False, repr=False)
class _Token(allen_data.Token):
__slots__ = allen_data.Token.__slots__ + ['feats_']
feats_: Optional[str]
class DatasetReader:
pass
def __init__(self, text: str = None, idx: int = None, idx_end: int = None, lemma_: str = None, pos_: str = None,
tag_: str = None, dep_: str = None, ent_type_: str = None, text_id: int = None, type_id: int = None,
feats_: str = None) -> None:
super().__init__(text, idx, idx_end, lemma_, pos_, tag_, dep_, ent_type_, text_id, type_id)
self.feats_ = feats_
class UniversalDependenciesDatasetReader(DatasetReader):
pass
\ No newline at end of file
from .base_field import Field
from .sequence_multilabel_field import SequenceMultiLabelField
from abc import ABCMeta
class Field(metaclass=ABCMeta):
pass
"""Sequence multilabel field implementation."""
import logging
import textwrap
from typing import Set, List, Callable, Iterator, Union, Dict
import torch
from allennlp import data
from allennlp.common import checks
from allennlp.data import fields
from overrides import overrides
from combo.data.fields import Field
logger = logging.getLogger(__name__)
class SequenceMultiLabelField(data.Field[torch.Tensor]):
class SequenceMultiLabelField(Field):
"""
A `SequenceMultiLabelField` is an extension of the :class:`MultiLabelField` that allows for multiple labels
while keeping the sequence dimension.
@@ -93,43 +91,23 @@ class SequenceMultiLabelField(data.Field[torch.Tensor]):
@overrides
def count_vocab_items(self, counter: Dict[str, Dict[str, int]]):
if self._indexed_multi_labels is None:
for multi_label in self.multi_labels:
for label in multi_label:
counter[self._label_namespace][label] += 1 # type: ignore
pass
@overrides
def index(self, vocab: data.Vocabulary):
indexer = self.multi_label_indexer(vocab)
indexed = []
for multi_label in self.multi_labels:
indexed.append(indexer(multi_label, len(self.multi_labels)))
self._indexed_multi_labels = indexed
pass
@overrides
def get_padding_lengths(self) -> Dict[str, int]:
return {"num_tokens": self.sequence_field.sequence_length()}
pass
@overrides
def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
return self.as_tensor_wrapper(padding_lengths)
pass
@overrides
def empty_field(self) -> "SequenceMultiLabelField":
empty_list: List[List[str]] = [[]]
sequence_label_field = SequenceMultiLabelField(empty_list, lambda x: lambda y: y,
lambda x: lambda y: y,
self.sequence_field.empty_field())
sequence_label_field._indexed_labels = empty_list
return sequence_label_field
pass
def __str__(self) -> str:
length = self.sequence_field.sequence_length()
formatted_labels = "".join(
"\t\t" + labels + "\n" for labels in textwrap.wrap(repr(self.multi_labels), 100)
)
return (
f"SequenceMultiLabelField of length {length} with "
f"labels:\n {formatted_labels} \t\tin namespace: '{self._label_namespace}'."
)
pass
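To make the tensor layout concrete, here is a toy sketch of what the field holds after indexing and padding (the class count and labels are illustrative only, not taken from the repository):

# 2-token sentence, 5 classes in "feats_labels", padded to 4 tokens
indexed = [[1, 0, 0, 1, 0],   # token 1: e.g. Case=Acc, Number=Sing
           [0, 1, 0, 0, 1]]   # token 2: e.g. Case=Nom, Number=None
default_value = [0.0] * 5
padded = indexed + [default_value] * (4 - len(indexed))
# torch.tensor(padded, dtype=torch.long) has shape (num_tokens, num_classes) == (4, 5)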
from .base_sampler import Sampler
from .samplers import TokenCountBatchSampler
from abc import ABCMeta
class Sampler(metaclass=ABCMeta):
pass
@@ -2,11 +2,10 @@ from typing import List
import numpy as np
from allennlp import data as allen_data
from combo.data.samplers import Sampler
@allen_data.BatchSampler.register("token_count")
class TokenCountBatchSampler(allen_data.BatchSampler):
class TokenCountBatchSampler(Sampler):
def __init__(self, dataset, word_batch_size: int = 2500, shuffle_dataset: bool = True):
self._index = 0
......
class Token:
pass
from .base_indexer import TokenIndexer
from .pretrained_transformer_mismatched_indexer import PretrainedTransformerMismatchedIndexer
from .token_characters_indexer import TokenCharactersIndexer
from .token_features_indexer import TokenFeatsIndexer
from abc import ABCMeta
class TokenIndexer(metaclass=ABCMeta):
pass
class PretrainedTransformerMismatchedIndexer(TokenIndexer):
pass
from typing import Optional, Dict, Any, List, Tuple
from combo.data import TokenIndexer
from allennlp import data
from allennlp.data import token_indexers, tokenizers, IndexedTokenList, vocabulary
from overrides import overrides
class PretrainedTransformerMismatchedIndexer(TokenIndexer):
pass
@data.TokenIndexer.register("pretrained_transformer_mismatched_fixed")
class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransformerMismatchedIndexer):
def __init__(self, model_name: str, namespace: str = "tags", max_length: int = None,
tokenizer_kwargs: Optional[Dict[str, Any]] = None, **kwargs) -> None:
# The matched version vs. the mismatched one.
super().__init__(model_name, namespace, max_length, tokenizer_kwargs, **kwargs)
self._matched_indexer = PretrainedTransformerIndexer(
model_name,
namespace=namespace,
max_length=max_length,
tokenizer_kwargs=tokenizer_kwargs,
**kwargs,
)
self._allennlp_tokenizer = self._matched_indexer._allennlp_tokenizer
self._tokenizer = self._matched_indexer._tokenizer
self._num_added_start_tokens = self._matched_indexer._num_added_start_tokens
self._num_added_end_tokens = self._matched_indexer._num_added_end_tokens
class PretrainedTransformerIndexer(TokenIndexer):
pass
@overrides
def tokens_to_indices(self,
tokens,
vocabulary: vocabulary ) -> IndexedTokenList:
"""
The method is overridden to raise an error when the number of wordpiece tokens needed to embed
a sentence exceeds the model's maximal input length.
"""
self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)
wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize(
[t.ensure_text() for t in tokens])
if len(wordpieces) > self._tokenizer.max_len_single_sentence:
raise ValueError("Following sentence consists of more wordpiece tokens that the model can process:\n" +\
" ".join([str(x) for x in tokens[:10]]) + " ... \n" + \
f"Maximal input: {self._tokenizer.max_len_single_sentence}\n"+ \
f"Current input: {len(wordpieces)}")
offsets = [x if x is not None else (-1, -1) for x in offsets]
output: IndexedTokenList = {
"token_ids": [t.text_id for t in wordpieces],
"mask": [True] * len(tokens), # for original tokens (i.e. word-level)
"type_ids": [t.type_id for t in wordpieces],
"offsets": offsets,
"wordpiece_mask": [True] * len(wordpieces), # for wordpieces (i.e. subword-level)
}
return self._matched_indexer._postprocess_output(output)
class PretrainedTransformerIndexer(token_indexers.PretrainedTransformerIndexer):
def __init__(
self,
model_name: str,
namespace: str = "tags",
max_length: int = None,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
super().__init__(model_name, namespace, max_length, tokenizer_kwargs, **kwargs)
self._namespace = namespace
self._allennlp_tokenizer = PretrainedTransformerTokenizer(
model_name, tokenizer_kwargs=tokenizer_kwargs
)
self._tokenizer = self._allennlp_tokenizer.tokenizer
self._added_to_vocabulary = False
self._num_added_start_tokens = len(self._allennlp_tokenizer.single_sequence_start_tokens)
self._num_added_end_tokens = len(self._allennlp_tokenizer.single_sequence_end_tokens)
self._max_length = max_length
if self._max_length is not None:
num_added_tokens = len(self._allennlp_tokenizer.tokenize("a")) - 1
self._effective_max_length = ( # we need to take into account special tokens
self._max_length - num_added_tokens
)
if self._effective_max_length <= 0:
raise ValueError(
"max_length needs to be greater than the number of special tokens inserted."
)
class PretrainedTransformerTokenizer(tokenizers.PretrainedTransformerTokenizer):
def _intra_word_tokenize(
self, string_tokens: List[str]
) -> Tuple[List[data.Token], List[Optional[Tuple[int, int]]]]:
tokens: List[data.Token] = []
offsets: List[Optional[Tuple[int, int]]] = []
for token_string in string_tokens:
wordpieces = self.tokenizer.encode_plus(
token_string,
add_special_tokens=False,
return_tensors=None,
return_offsets_mapping=False,
return_attention_mask=False,
)
wp_ids = wordpieces["input_ids"]
if len(wp_ids) > 0:
offsets.append((len(tokens), len(tokens) + len(wp_ids) - 1))
tokens.extend(
data.Token(text=wp_text, text_id=wp_id)
for wp_id, wp_text in zip(wp_ids, self.tokenizer.convert_ids_to_tokens(wp_ids))
)
else:
offsets.append(None)
return tokens, offsets
class PretrainedTransformerTokenizer(TokenIndexer):
pass
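A minimal sketch of the offset bookkeeping done in _intra_word_tokenize, with a stub piece-splitter standing in for the HuggingFace tokenizer (the splitting rule below is made up):

def fake_pieces(word):
    # Pretend WordPiece: first two characters, then "##"-prefixed chunks of three.
    return [word[:2]] + ["##" + word[i:i + 3] for i in range(2, len(word), 3)]

tokens, offsets = [], []
for word in ["unbelievable", "!"]:
    pieces = fake_pieces(word)
    offsets.append((len(tokens), len(tokens) + len(pieces) - 1))
    tokens.extend(pieces)
# offsets == [(0, 4), (5, 5)]; each pair is the inclusive wordpiece span of one word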
"""Custom character token indexer."""
import itertools
from typing import List, Dict
from combo.data import TokenIndexer
import torch
from allennlp import data
from allennlp.common import util
from allennlp.data import tokenizers
from allennlp.data.token_indexers import token_characters_indexer
from overrides import overrides
@data.TokenIndexer.register("characters_const_padding")
class TokenCharactersIndexer(token_characters_indexer.TokenCharactersIndexer):
class TokenCharactersIndexer(TokenIndexer):
"""Wrapper around allennlp token indexer with const padding."""
def __init__(self,
namespace: str = "token_characters",
character_tokenizer: tokenizers.CharacterTokenizer = tokenizers.CharacterTokenizer(),
start_tokens: List[str] = None,
end_tokens: List[str] = None,
min_padding_length: int = 0,
token_min_padding_length: int = 0):
super().__init__(namespace, character_tokenizer, start_tokens, end_tokens, min_padding_length,
token_min_padding_length)
@overrides
def get_padding_lengths(self, indexed_tokens: data.IndexedTokenList) -> Dict[str, int]:
padding_lengths = {"token_characters": len(indexed_tokens["token_characters"]),
"num_token_characters": self._min_padding_length}
return padding_lengths
@overrides
def as_padded_tensor_dict(
self, tokens: data.IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
# Pad the tokens.
padded_tokens = util.pad_sequence_to_length(
tokens["token_characters"],
padding_lengths["token_characters"],
default_value=lambda: [],
)
# Pad the characters within the tokens.
desired_token_length = padding_lengths["num_token_characters"]
longest_token: List[int] = max(tokens["token_characters"], key=len, default=[]) # type: ignore
padding_value = 0
if desired_token_length > len(longest_token):
# Since we want to pad to greater than the longest token, we add a
# "dummy token" so we can take advantage of the fast implementation of itertools.zip_longest.
padded_tokens.append([padding_value] * desired_token_length)
# pad the list of lists to the longest sublist, appending 0's
padded_tokens = list(zip(*itertools.zip_longest(*padded_tokens, fillvalue=padding_value)))
if desired_token_length > len(longest_token):
# Removes the "dummy token".
padded_tokens.pop()
# Truncates all the tokens to the desired length, and return the result.
return {
"token_characters": torch.LongTensor(
[list(token[:desired_token_length]) for token in padded_tokens]
)
}
pass
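For reference, the constant-length character padding above can be reproduced with plain Python (the character ids below are made up):

import itertools

token_characters = [[5, 6], [7]]            # two tokens: "ab", "c"
desired_token_length = 4                    # min_padding_length
padded = list(token_characters)
padded.append([0] * desired_token_length)   # dummy token forces the target width
padded = list(zip(*itertools.zip_longest(*padded, fillvalue=0)))
padded.pop()                                # drop the dummy token again
# padded == [(5, 6, 0, 0), (7, 0, 0, 0)]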
"""Features indexer."""
import collections
from typing import List, Dict
import torch
from allennlp import data
from allennlp.common import util
from overrides import overrides
from combo.data import TokenIndexer
@data.TokenIndexer.register("feats_indexer")
class TokenFeatsIndexer(data.TokenIndexer):
def __init__(
self,
namespace: str = "feats",
feature_name: str = "feats_",
token_min_padding_length: int = 0,
) -> None:
super().__init__(token_min_padding_length)
self.namespace = namespace
self._feature_name = feature_name
@overrides
def count_vocab_items(self, token: data.Token, counter: Dict[str, Dict[str, int]]):
feats = self._feat_values(token)
for feat in feats:
counter[self.namespace][feat] += 1
@overrides
def tokens_to_indices(self, tokens: List[data.Token], vocabulary: data.Vocabulary) -> data.IndexedTokenList:
indices: List[List[int]] = []
vocab_size = vocabulary.get_vocab_size(self.namespace)
for token in tokens:
token_indices = []
feats = self._feat_values(token)
for feat in feats:
token_indices.append(vocabulary.get_token_index(feat, self.namespace))
indices.append(util.pad_sequence_to_length(token_indices, vocab_size))
return {"tokens": indices}
@overrides
def get_empty_token_list(self) -> data.IndexedTokenList:
return {"tokens": [[]]}
def _feat_values(self, token):
feats = getattr(token, self._feature_name)
if feats is None:
feats = collections.OrderedDict()
features = []
for feat, value in feats.items():
if feat in ["_", "__ROOT__"]:
pass
else:
# Handle the case where the feature is binary (has no associated value)
if value:
features.append(feat + "=" + value)
else:
features.append(feat)
return features
@overrides
def as_padded_tensor_dict(
self, tokens: data.IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
tensor_dict = {}
for key, val in tokens.items():
vocab_size = len(val[0])
tensor = torch.tensor(util.pad_sequence_to_length(val,
padding_lengths[key],
default_value=lambda: [0] * vocab_size,
)
)
tensor_dict[key] = tensor
return tensor_dict
class TokenFeatsIndexer(TokenIndexer):
pass
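The feats lookup in _feat_values boils down to the following pure-Python transformation (the concrete feature values are examples, not taken from the repository):

feats = {"Case": "Acc", "Number": "Sing", "Foreign": None}   # token.feats_
features = [f"{k}={v}" if v else k
            for k, v in feats.items()
            if k not in ("_", "__ROOT__")]
# features == ['Case=Acc', 'Number=Sing', 'Foreign']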
import logging
import os
import pathlib
import tempfile
from typing import Dict
import torch
from absl import app
from absl import flags
from combo import models
from combo.models.base import Predictor
from combo.utils import checks
logger = logging.getLogger(__name__)
_FEATURES = ["token", "char", "upostag", "xpostag", "lemma", "feats"]
_TARGETS = ["deprel", "feats", "head", "lemma", "upostag", "xpostag", "semrel", "sent", "deps"]
FLAGS = flags.FLAGS
flags.DEFINE_enum(name="mode", default=None, enum_values=["train", "predict"],
help="Specify COMBO mode: train or predict")
# Common flags
flags.DEFINE_integer(name="cuda_device", default=-1,
help="Cuda device id (default -1 cpu)")
flags.DEFINE_string(name="output_file", default="output.log",
help="Predictions result file.")
# Training flags
flags.DEFINE_list(name="training_data_path", default=[],
help="Training data path(s)")
flags.DEFINE_alias(name="training_data", original_name="training_data_path")
flags.DEFINE_list(name="validation_data_path", default="",
help="Validation data path(s)")
flags.DEFINE_alias(name="validation_data", original_name="validation_data_path")
flags.DEFINE_string(name="pretrained_tokens", default="",
help="Pretrained tokens embeddings path")
flags.DEFINE_integer(name="embedding_dim", default=300,
help="Embeddings dim")
flags.DEFINE_integer(name="num_epochs", default=400,
help="Epochs num")
flags.DEFINE_integer(name="word_batch_size", default=2500,
help="Minimum words in batch")
flags.DEFINE_string(name="pretrained_transformer_name", default="",
help="Pretrained transformer model name (see transformers from HuggingFace library for list of "
"available models) for transformers based embeddings.")
flags.DEFINE_list(name="features", default=["token", "char"],
help=f"Features used to train model (required 'token' and 'char'). Possible values: {_FEATURES}.")
flags.DEFINE_list(name="targets", default=["deprel", "feats", "head", "lemma", "upostag", "xpostag"],
help=f"Targets of the model (required `deprel` and `head`). Possible values: {_TARGETS}.")
flags.DEFINE_string(name="serialization_dir", default=None,
help="Model serialization directory (default - system temp dir).")
flags.DEFINE_boolean(name="tensorboard", default=False,
help="When provided model will log tensorboard metrics.")
# Finetune after training flags
flags.DEFINE_list(name="finetuning_training_data_path", default="",
help="Training data path(s)")
flags.DEFINE_list(name="finetuning_validation_data_path", default="",
help="Validation data path(s)")
flags.DEFINE_string(name="config_path", default=str(pathlib.Path(__file__).parent / "config.template.jsonnet"),
help="Config file path.")
# Test after training flags
flags.DEFINE_string(name="test_path", default=None,
help="Test path file.")
# Experimental
flags.DEFINE_boolean(name="use_pure_config", default=False,
help="Ignore ext flags (experimental).")
# Prediction flags
flags.DEFINE_string(name="model_path", default=None,
help="Pretrained model path.")
flags.DEFINE_string(name="input_file", default=None,
help="File to predict path")
flags.DEFINE_boolean(name="conllu_format", default=True,
help="Prediction based on conllu format (instead of raw text).")
flags.DEFINE_integer(name="batch_size", default=1,
help="Prediction batch size.")
flags.DEFINE_boolean(name="silent", default=True,
help="Silent prediction to file (without printing to console).")
flags.DEFINE_enum(name="predictor_name", default="combo-spacy",
enum_values=["combo", "combo-spacy"],
help="Use predictor with whitespace or spacy tokenizer.")
def run(_):
pass
def _get_predictor() -> Predictor:
# Check for GPU
# allen_checks.check_for_gpu(FLAGS.cuda_device)
checks.file_exists(FLAGS.model_path)
# load model from archive
# archive = models.load_archive(
# FLAGS.model_path,
# cuda_device=FLAGS.cuda_device,
# )
# return predictors.Predictor.from_archive(
# archive, FLAGS.predictor_name
# )
return Predictor()
def _get_ext_vars(finetuning: bool = False) -> Dict:
if FLAGS.use_pure_config:
return {}
return {
"training_data_path": (
",".join(FLAGS.training_data_path if not finetuning else FLAGS.finetuning_training_data_path)),
"validation_data_path": (
",".join(FLAGS.validation_data_path if not finetuning else FLAGS.finetuning_validation_data_path)),
"pretrained_tokens": FLAGS.pretrained_tokens,
"pretrained_transformer_name": FLAGS.pretrained_transformer_name,
"features": " ".join(FLAGS.features),
"targets": " ".join(FLAGS.targets),
"type": "finetuning" if finetuning else "default",
"embedding_dim": str(FLAGS.embedding_dim),
"cuda_device": str(FLAGS.cuda_device),
"num_epochs": str(FLAGS.num_epochs),
"word_batch_size": str(FLAGS.word_batch_size),
"use_tensorboard": str(FLAGS.tensorboard),
}
def main():
"""Parse flags."""
flags.register_validator(
"features",
lambda values: all(
value in _FEATURES for value in values),
message="Flags --features contains unknown value(s)."
)
flags.register_validator(
"mode",
lambda value: value is not None,
message="Flag --mode must be set with either `predict` or `train` value")
flags.register_validator(
"targets",
lambda values: all(
value in _TARGETS for value in values),
message="Flag --targets contains unknown value(s)."
)
app.run(run)
if __name__ == "__main__":
main()
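A hypothetical training invocation using the flags defined above (the entry-point module path is an assumption; point it at wherever this script lives):

python -m combo.main --mode train \
    --training_data_path train.conllu \
    --validation_data_path valid.conllu \
    --targets deprel,head,upostag,lemma \
    --serialization_dir /tmp/combo_model \
    --num_epochs 10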