diff --git a/combo/data/api.py b/combo/data/api.py
index b04192f04b2db84981ea34cfcfa1297769078f26..0f407cdc7f3b3a71e88c96f12a0b45d175ff2c1d 100644
--- a/combo/data/api.py
+++ b/combo/data/api.py
@@ -2,14 +2,14 @@ import collections
 import dataclasses
 import json
 from dataclasses import dataclass, field
-from typing import Optional, List, Dict, Any, Union, Tuple
+from typing import List, Dict, Any
 
+import conllu
 from conllu.models import Metadata
+from overrides import overrides
 
 from combo.data.tokenizers import Token
-import conllu
-from overrides import overrides
 
 
 @dataclass
 class Sentence:
diff --git a/combo/data/samplers/__init__.py b/combo/data/samplers/__init__.py
index 7e90dca36a0bf3ca8e0c7cbf939ca0f4305d03e8..db8c55c67dbd4158915dfd120b483d6bfe5accd7 100644
--- a/combo/data/samplers/__init__.py
+++ b/combo/data/samplers/__init__.py
@@ -1,2 +1,2 @@
 from .batch_sampler import BatchSampler
-from .samplers import TokenCountBatchSampler
+from .token_count_batch_sampler import TokenCountBatchSampler
diff --git a/combo/data/samplers/samplers.py b/combo/data/samplers/samplers.py
deleted file mode 100644
index 3dcab19fc6621cb25c7d34321f0b4feeab20612c..0000000000000000000000000000000000000000
--- a/combo/data/samplers/samplers.py
+++ /dev/null
@@ -1,58 +0,0 @@
-"""
-Adapted from COMBO
-Author: Mateusz Klimaszewski
-"""
-
-from typing import List
-
-import numpy as np
-
-from combo.data.samplers import BatchSampler
-
-
-class TokenCountBatchSampler(BatchSampler):
-
-    def __init__(self, dataset, word_batch_size: int = 2500, shuffle_dataset: bool = True):
-        self._index = 0
-        self.shuffle_dataset = shuffle_dataset
-        self.batch_dataset = self._batchify(dataset, word_batch_size)
-        if shuffle_dataset:
-            self._shuffle()
-
-    @staticmethod
-    def _batchify(dataset, word_batch_size) -> List[List[int]]:
-        dataset = list(dataset)
-        batches = []
-        batch = []
-        words_count = 0
-        lengths = [len(instance.fields["sentence"].tokens) for instance in dataset]
-        argsorted_lengths = np.argsort(lengths)
-        for idx in argsorted_lengths:
-            words_count += lengths[idx]
-            batch.append(idx)
-            if words_count > word_batch_size:
-                batches.append(batch)
-                words_count = 0
-                batch = []
-        return batches
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        if self._index >= len(self.batch_dataset):
-            if self.shuffle_dataset:
-                self._index = 0
-                self._shuffle()
-            raise StopIteration()
-
-        batch = self.batch_dataset[self._index]
-        self._index += 1
-        return batch
-
-    def _shuffle(self):
-        indices = np.random.permutation(range(len(self.batch_dataset)))
-        self.batch_dataset = np.array(self.batch_dataset)[indices].tolist()
-
-    def __len__(self):
-        return len(self.batch_dataset)
diff --git a/combo/data/tokenizers/lambo_tokenizer.py b/combo/data/tokenizers/lambo_tokenizer.py
index fc3bbb4639052d7b8fb132456339b64da7738dc8..f37b07640e8cf7f22add0cc56ba2a9d049c3d201 100644
--- a/combo/data/tokenizers/lambo_tokenizer.py
+++ b/combo/data/tokenizers/lambo_tokenizer.py
@@ -6,7 +6,6 @@ from combo.config import Registry
 from combo.config.from_parameters import register_arguments
 from combo.data.tokenizers.token import Token
 from combo.data.tokenizers.tokenizer import Tokenizer
-from combo.data.api import Sentence
 
 
 @Registry.register('lambo_tokenizer')
diff --git a/combo/data/tokenizers/token.py b/combo/data/tokenizers/token.py
index 33182ddd3a30dafa7886fed5c337af43b1b116d3..16c3a8e57131de512201fc27981d363d7fa7e79b 100644
--- a/combo/data/tokenizers/token.py
+++ b/combo/data/tokenizers/token.py
@@ -2,7 +2,6 @@
 Adapted from AllenNLP
 https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/token_class.py
 """
-from collections import defaultdict
 from typing import Any, Dict, List, Optional, Tuple, Union
 import logging
 from dataclasses import dataclass, field
diff --git a/combo/main.py b/combo/main.py
index b471f67861a82f744f24ef1be0b172c581e08644..9e40120761bc44c9575bf339cf8611e9bbd0cce6 100755
--- a/combo/main.py
+++ b/combo/main.py
@@ -16,16 +16,16 @@ from combo.training.trainable_combo import TrainableCombo
 from combo.utils import checks, ComboLogger
 
 from combo.config import resolve
-from combo.default_model import default_ud_dataset_reader, default_data_loader
+from combo.default_model import default_ud_dataset_reader, default_data_loader, default_model
 from combo.modules.archival import load_archive, archive
 from combo.predict import COMBO
 from combo.data import api
-from config import override_parameters
-from data import LamboTokenizer, Vocabulary, DatasetReader
-from data.dataset_loaders import DataLoader
-from modules.model import Model
-from utils import ConfigurationError
-from utils.matrices import extract_combo_matrices
+from combo.config import override_parameters
+from combo.data import LamboTokenizer, Vocabulary, DatasetReader
+from combo.data.dataset_loaders import DataLoader
+from combo.modules.model import Model
+from combo.utils import ConfigurationError
+from combo.utils.matrices import extract_combo_matrices
 
 logging.setLoggerClass(ComboLogger)
 logger = logging.getLogger(__name__)
@@ -275,6 +275,17 @@ def run(_):
     if FLAGS.config_path:
         logger.info(f'Reading parameters from configuration path {FLAGS.config_path}', prefix=prefix)
         model, dataset_reader, training_data_loader, validation_data_loader, vocabulary = read_model_from_config(prefix)
+    else:
+        dataset_reader, training_data_loader, validation_data_loader, vocabulary = get_defaults(
+            dataset_reader,
+            training_data_loader,
+            validation_data_loader,
+            vocabulary,
+            FLAGS.training_data_path,
+            FLAGS.validation_data_path,
+            prefix
+        )
+        model = default_model(FLAGS.pretrained_transformer_name, vocabulary)
 
     if FLAGS.use_pure_config and model is None:
         logger.error('Error in configuration - model could not be read from parameters. ' +
diff --git a/combo/modules/graph_parser.py b/combo/modules/graph_parser.py
index 8796f1e6937ec289fc62d421941e14c001fb2be7..3f03211c78005d2d00b6e78483dc4fbd0ddd8e9b 100644
--- a/combo/modules/graph_parser.py
+++ b/combo/modules/graph_parser.py
@@ -11,12 +11,13 @@ from combo import data
 from combo.config import Registry
 from combo.config.from_parameters import register_arguments
 from combo.nn import base
-from combo.nn.base import Predictor
-
+from combo.predictors import Predictor
 import torch
 import torch.nn.functional as F
 
 
+
+
 @Registry.register("graph_head_predictor")
 class GraphHeadPredictionModel(Predictor):
     """Head prediction model."""
diff --git a/combo/nn/util.py b/combo/nn/util.py
deleted file mode 100644
index 69c8d017760606cfdbf4b09999d813e1932b71f7..0000000000000000000000000000000000000000
--- a/combo/nn/util.py
+++ /dev/null
@@ -1,259 +0,0 @@
-"""
-Adapted from AllenNLP
-https://github.com/allenai/allennlp/blob/80fb6061e568cb9d6ab5d45b661e86eb61b92c82/allennlp/nn/util.py
-"""
-from typing import Union, Dict, Optional, List, Any
-
-import torch
-
-from combo.common.util import int_to_device
-from combo.utils import ConfigurationError
-
-
-def move_to_device(obj, device: Union[torch.device, int]):
-    """
-    Given a structure (possibly) containing Tensors,
-    move all the Tensors to the specified device (or do nothing, if they are already on
-    the target device).
-    """
-    device = int_to_device(device)
-
-    if isinstance(obj, torch.Tensor):
-        # You may be wondering why we don't just always call `obj.to(device)` since that would
-        # be a no-op anyway if `obj` is already on `device`. Well that works fine except
-        # when PyTorch is not compiled with CUDA support, in which case even calling
-        # `obj.to(torch.device("cpu"))` would result in an error.
-        return obj if obj.device == device else obj.to(device=device)
-    elif isinstance(obj, dict):
-        for key, value in obj.items():
-            obj[key] = move_to_device(value, device)
-        return obj
-    elif isinstance(obj, list):
-        for i, item in enumerate(obj):
-            obj[i] = move_to_device(item, device)
-        return obj
-    elif isinstance(obj, tuple) and hasattr(obj, "_fields"):
-        # This is the best way to detect a NamedTuple, it turns out.
-        return obj.__class__(*(move_to_device(item, device) for item in obj))
-    elif isinstance(obj, tuple):
-        return tuple(move_to_device(item, device) for item in obj)
-    else:
-        return obj
-
-
-def device_mapping(cuda_device: int):
-    """
-    In order to `torch.load()` a GPU-trained model onto a CPU (or specific GPU),
-    you have to supply a `map_location` function. Call this with
-    the desired `cuda_device` to get the function that `torch.load()` needs.
-    """
-
-    def inner_device_mapping(storage: torch.Storage, location) -> torch.Storage:
-        if cuda_device >= 0:
-            return storage.cuda(cuda_device)
-        else:
-            return storage
-
-    return inner_device_mapping
-
-
-def get_lengths_from_binary_sequence_mask(mask: torch.BoolTensor) -> torch.LongTensor:
-    """
-    Compute sequence lengths for each batch element in a tensor using a
-    binary mask.
-    # Parameters
-    mask : `torch.BoolTensor`, required.
-        A 2D binary mask of shape (batch_size, sequence_length) to
-        calculate the per-batch sequence lengths from.
-    # Returns
-    `torch.LongTensor`
-        A torch.LongTensor of shape (batch_size,) representing the lengths
-        of the sequences in the batch.
-    """
-    return mask.sum(-1)
-
-
-def sort_batch_by_length(tensor: torch.Tensor, sequence_lengths: torch.Tensor):
-    """
-    Sort a batch first tensor by some specified lengths.
-    # Parameters
-    tensor : `torch.FloatTensor`, required.
-        A batch first Pytorch tensor.
-    sequence_lengths : `torch.LongTensor`, required.
-        A tensor representing the lengths of some dimension of the tensor which
-        we want to sort by.
-    # Returns
-    sorted_tensor : `torch.FloatTensor`
-        The original tensor sorted along the batch dimension with respect to sequence_lengths.
-    sorted_sequence_lengths : `torch.LongTensor`
-        The original sequence_lengths sorted by decreasing size.
-    restoration_indices : `torch.LongTensor`
-        Indices into the sorted_tensor such that
-        `sorted_tensor.index_select(0, restoration_indices) == original_tensor`
-    permutation_index : `torch.LongTensor`
-        The indices used to sort the tensor. This is useful if you want to sort many
-        tensors using the same ordering.
-    """
-
-    if not isinstance(tensor, torch.Tensor) or not isinstance(sequence_lengths, torch.Tensor):
-        raise ConfigurationError("Both the tensor and sequence lengths must be torch.Tensors.")
-
-    sorted_sequence_lengths, permutation_index = sequence_lengths.sort(0, descending=True)
-    sorted_tensor = tensor.index_select(0, permutation_index)
-
-    index_range = torch.arange(0, len(sequence_lengths), device=sequence_lengths.device)
-    # This is the equivalent of zipping with index, sorting by the original
-    # sequence lengths and returning the now sorted indices.
-    _, reverse_mapping = permutation_index.sort(0, descending=False)
-    restoration_indices = index_range.index_select(0, reverse_mapping)
-    return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index
-
-def get_text_field_mask(
-    text_field_tensors: Dict[str, Dict[str, torch.Tensor]],
-    num_wrapping_dims: int = 0,
-    padding_id: int = 0,
-) -> torch.BoolTensor:
-    """
-    Takes the dictionary of tensors produced by a `TextField` and returns a mask
-    with 0 where the tokens are padding, and 1 otherwise. `padding_id` specifies the id of padding tokens.
-    We also handle `TextFields` wrapped by an arbitrary number of `ListFields`, where the number of wrapping
-    `ListFields` is given by `num_wrapping_dims`.
-    If `num_wrapping_dims == 0`, the returned mask has shape `(batch_size, num_tokens)`.
-    If `num_wrapping_dims > 0` then the returned mask has `num_wrapping_dims` extra
-    dimensions, so the shape will be `(batch_size, ..., num_tokens)`.
-    There could be several entries in the tensor dictionary with different shapes (e.g., one for
-    word ids, one for character ids). In order to get a token mask, we use the tensor in
-    the dictionary with the lowest number of dimensions. After subtracting `num_wrapping_dims`,
-    if this tensor has two dimensions we assume it has shape `(batch_size, ..., num_tokens)`,
-    and use it for the mask. If instead it has three dimensions, we assume it has shape
-    `(batch_size, ..., num_tokens, num_features)`, and sum over the last dimension to produce
-    the mask. Most frequently this will be a character id tensor, but it could also be a
-    featurized representation of each token, etc.
-    If the input `text_field_tensors` contains the "mask" key, this is returned instead of inferring the mask.
-    """
-    masks = []
-    for indexer_name, indexer_tensors in text_field_tensors.items():
-        if "mask" in indexer_tensors:
-            masks.append(indexer_tensors["mask"].bool())
-    if len(masks) == 1:
-        return masks[0]
-    elif len(masks) > 1:
-        # TODO(mattg): My guess is this will basically never happen, so I'm not writing logic to
-        # handle it. Should be straightforward to handle, though. If you see this error in
-        # practice, open an issue on github.
-        raise ValueError("found two mask outputs; not sure which to use!")
-
-    tensor_dims = [
-        (tensor.dim(), tensor)
-        for indexer_output in text_field_tensors.values()
-        for tensor in indexer_output.values()
-    ]
-    tensor_dims.sort(key=lambda x: x[0])
-
-    smallest_dim = tensor_dims[0][0] - num_wrapping_dims
-    if smallest_dim == 2:
-        token_tensor = tensor_dims[0][1]
-        return token_tensor != padding_id
-    elif smallest_dim == 3:
-        character_tensor = tensor_dims[0][1]
-        return (character_tensor != padding_id).any(dim=-1)
-    else:
-        raise ValueError("Expected a tensor with dimension 2 or 3, found {}".format(smallest_dim))
-
-
-def get_dropout_mask(dropout_probability: float, tensor_for_masking: torch.Tensor):
-    """
-    Computes and returns an element-wise dropout mask for a given tensor, where
-    each element in the mask is dropped out with probability dropout_probability.
-    Note that the mask is NOT applied to the tensor - the tensor is passed to retain
-    the correct CUDA tensor type for the mask.
-    # Parameters
-    dropout_probability : `float`, required.
-        Probability of dropping a dimension of the input.
-    tensor_for_masking : `torch.Tensor`, required.
-    # Returns
-    `torch.FloatTensor`
-        A torch.FloatTensor consisting of the binary mask scaled by 1/ (1 - dropout_probability).
-        This scaling ensures expected values and variances of the output of applying this mask
-        and the original tensor are the same.
-    """
-    binary_mask = (torch.rand(tensor_for_masking.size()) > dropout_probability).to(
-        tensor_for_masking.device
-    )
-    # Scale mask by 1/keep_prob to preserve output statistics.
-    dropout_mask = binary_mask.float().div(1.0 - dropout_probability)
-    return dropout_mask
-
-
-def find_embedding_layer(model: torch.nn.Module) -> torch.nn.Module:
-    """
-    Takes a model (typically an AllenNLP `Model`, but this works for any `torch.nn.Module`) and
-    makes a best guess about which module is the embedding layer. For typical AllenNLP models,
-    this often is the `TextFieldEmbedder`, but if you're using a pre-trained contextualizer, we
-    really want layer 0 of that contextualizer, not the output. So there are a bunch of hacks in
-    here for specific pre-trained contextualizers.
-    """
-    # We'll look for a few special cases in a first pass, then fall back to just finding a
-    # TextFieldEmbedder in a second pass if we didn't find a special case.
-    from transformers.models.gpt2.modeling_gpt2 import GPT2Model
-    from transformers.models.bert.modeling_bert import BertEmbeddings
-    from transformers.models.albert.modeling_albert import AlbertEmbeddings
-    from transformers.models.roberta.modeling_roberta import RobertaEmbeddings
-
-    for module in model.modules():
-        if isinstance(module, BertEmbeddings):
-            return module.word_embeddings
-        if isinstance(module, RobertaEmbeddings):
-            return module.word_embeddings
-        if isinstance(module, AlbertEmbeddings):
-            return module.word_embeddings
-        if isinstance(module, GPT2Model):
-            return module.wte
-
-    return None
-
-    # for module in model.modules():
-    #     if isinstance(module, TextFieldEmbedder):
-    #
-    #         if isinstance(module, BasicTextFieldEmbedder):
-    #             # We'll have a check for single Embedding cases, because we can be more efficient
-    #             # in cases like this. If this check fails, then for something like hotflip we need
-    #             # to actually run the text field embedder and construct a vector for each token.
-    #             if len(module._token_embedders) == 1:
-    #                 embedder = list(module._token_embedders.values())[0]
-    #                 if isinstance(embedder, Embedding):
-    #                     if embedder._projection is None:
-    #                         # If there's a projection inside the Embedding, then we need to return
-    #                         # the whole TextFieldEmbedder, because there's more computation that
-    #                         # needs to be done than just multiply by an embedding matrix.
-    #                         return embedder
-    #         return module
-    raise RuntimeError("No embedding module found!")
-
-
-def get_token_offsets_from_text_field_inputs(
-    text_field_inputs: List[Any],
-) -> Optional[torch.Tensor]:
-    """
-    Given a list of inputs to a TextFieldEmbedder, tries to find token offsets from those inputs, if
-    there are any. You will have token offsets if you are using a mismatched token embedder; if
-    you're not, the return value from this function should be None. This function is intended to be
-    called from a `forward_hook` attached to a `TextFieldEmbedder`, so the inputs are formatted just
-    as a list.
-    It's possible in theory that you could have multiple offsets as inputs to a single call to a
-    `TextFieldEmbedder`, but that's an extremely rare use case (I can't really imagine anyone
-    wanting to do that). In that case, we'll only return the first one. If you need different
-    behavior for your model, open an issue on github describing what you're doing.
-    """
-    for input_index, text_field_input in enumerate(text_field_inputs):
-        if not isinstance(text_field_input, dict):
-            continue
-        for input_value in text_field_input.values():
-            if not isinstance(input_value, dict):
-                continue
-            for embedder_arg_name, embedder_arg_value in input_value.items():
-                if embedder_arg_name == "offsets":
-                    return embedder_arg_value
-    return None
-
diff --git a/combo/utils/cached_transformers.py b/combo/utils/cached_transformers.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/combo/utils/matrices.py b/combo/utils/matrices.py
index 395e23d2b4628313551aa061328cbeef87895086..61b357d7263f5e2da145f2668ee9b704c19df9f2 100644
--- a/combo/utils/matrices.py
+++ b/combo/utils/matrices.py
@@ -8,8 +8,8 @@ import numpy as np
 import pandas as pd
 from pathlib import Path
 
-from data import Sentence
-from utils import ComboLogger
+from combo.data import Sentence
+from combo.utils import ComboLogger
 
 
 def extract_combo_matrices(predictions: List[Sentence],
diff --git a/docs/Configuration.md b/docs/Configuration.md
index c7e5761b38ccc71166b1ca483dfed38039faf774..128c02558768663827b055e26358aaf5ac96a913 100644
--- a/docs/Configuration.md
+++ b/docs/Configuration.md
@@ -20,4 +20,4 @@ The ```"type"``` field serves as a dependency-injection like mechanism and sets
 which class and constructor is used.
 The ```"parameters"``` dictionary is passed to the constructor method.
 Every parameter is attempted to be resolved from the class registry and only if that's
-not possible they are passed as-is.
\ No newline at end of file
+not possible they are passed as-is.
diff --git a/docs/training.md b/docs/Training.md
similarity index 100%
rename from docs/training.md
rename to docs/Training.md
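
For context on the sampler move above: TokenCountBatchSampler now lives in
combo/data/samplers/token_count_batch_sampler.py and is still re-exported from
combo.data.samplers, so existing imports keep working. A minimal usage sketch,
assuming AllenNLP-style instances whose fields["sentence"].tokens holds the
token list; the make_instance helper is hypothetical, only the constructor
signature comes from the deleted source:

    from types import SimpleNamespace

    from combo.data.samplers import TokenCountBatchSampler

    # Hypothetical stand-in for dataset instances: the sampler only reads
    # instance.fields["sentence"].tokens, as _batchify shows above.
    def make_instance(n_tokens):
        sentence = SimpleNamespace(tokens=list(range(n_tokens)))
        return SimpleNamespace(fields={"sentence": sentence})

    dataset = [make_instance(n) for n in (5, 12, 7, 30, 3)]

    # Yields lists of dataset indices, sorted by sentence length and grouped
    # so that each batch holds roughly word_batch_size tokens.
    sampler = TokenCountBatchSampler(dataset, word_batch_size=20, shuffle_dataset=False)
    for batch_indices in sampler:
        print(batch_indices)  # two batches: indices [4, 0, 2, 1] and [3]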
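
The docs/Configuration.md hunk touched above documents the registry convention
these modules rely on: "type" picks the registered class and constructor,
"parameters" is passed to that constructor, and each parameter is first
resolved against the class registry, falling back to passing it through
verbatim. A hypothetical fragment for illustration; 'lambo_tokenizer' is
registered in the diff above, but the parameter name is made up:

    # "type" selects the class registered under that name, e.g. via
    # @Registry.register('lambo_tokenizer'); "parameters" go to its
    # constructor, each resolved from the registry when possible and
    # passed as-is otherwise.
    tokenizer_config = {
        "type": "lambo_tokenizer",
        "parameters": {
            "language": "English",  # made-up parameter, for illustration only
        },
    }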
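
The deleted combo/nn/util.py mirrored AllenNLP's nn.util helpers; the part
most worth remembering downstream is how get_text_field_mask infers a padding
mask when no explicit "mask" tensor is present. The rule reduces to a
comparison against padding_id, shown here in plain torch (an illustration of
the deleted docstring's two cases, not COMBO API):

    import torch

    padding_id = 0

    # 2D case, (batch_size, num_tokens) word ids: compare directly.
    token_ids = torch.tensor([[5, 7, 0], [3, 0, 0]])
    word_mask = token_ids != padding_id
    # [[True, True, False], [True, False, False]]

    # 3D case, (batch_size, num_tokens, num_features) character ids:
    # a token is real if any of its character ids is non-padding.
    char_ids = torch.tensor([[[2, 4], [0, 0]]])
    char_mask = (char_ids != padding_id).any(dim=-1)
    # [[True, False]]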