Commit 3c47dbf1 authored by Maja Jablonska

Import fixes

parent f21ca541
1 merge request: !46 Merge COMBO 3.0 into master
@@ -2,14 +2,14 @@ import collections
import dataclasses
import json
from dataclasses import dataclass, field
from typing import Optional, List, Dict, Any, Union, Tuple
from typing import List, Dict, Any
import conllu
from conllu.models import Metadata
from overrides import overrides
from combo.data.tokenizers import Token
import conllu
from overrides import overrides
@dataclass
class Sentence:
......
from .batch_sampler import BatchSampler
from .samplers import TokenCountBatchSampler
from .token_count_batch_sampler import TokenCountBatchSampler
"""
Adapted from COMBO
Author: Mateusz Klimaszewski
"""
from typing import List
import numpy as np
from combo.data.samplers import BatchSampler
class TokenCountBatchSampler(BatchSampler):
def __init__(self, dataset, word_batch_size: int = 2500, shuffle_dataset: bool = True):
self._index = 0
self.shuffle_dataset = shuffle_dataset
self.batch_dataset = self._batchify(dataset, word_batch_size)
if shuffle_dataset:
self._shuffle()
@staticmethod
def _batchify(dataset, word_batch_size) -> List[List[int]]:
dataset = list(dataset)
batches = []
batch = []
words_count = 0
lengths = [len(instance.fields["sentence"].tokens) for instance in dataset]
argsorted_lengths = np.argsort(lengths)
for idx in argsorted_lengths:
words_count += lengths[idx]
batch.append(idx)
if words_count > word_batch_size:
batches.append(batch)
words_count = 0
batch = []
return batches
def __iter__(self):
return self
def __next__(self):
if self._index >= len(self.batch_dataset):
if self.shuffle_dataset:
self._index = 0
self._shuffle()
raise StopIteration()
batch = self.batch_dataset[self._index]
self._index += 1
return batch
def _shuffle(self):
indices = np.random.permutation(range(len(self.batch_dataset)))
self.batch_dataset = np.array(self.batch_dataset)[indices].tolist()
def __len__(self):
return len(self.batch_dataset)
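For context, here is a minimal usage sketch of the sampler above. The `_FakeField`/`_FakeInstance` classes are illustrative stand-ins for dataset instances exposing `fields["sentence"].tokens`, which is the only layout `_batchify` relies on; note that, as written, `_batchify` sorts sentences by length and drops a trailing batch that never exceeds `word_batch_size`.

```python
from dataclasses import dataclass, field
from typing import Dict, List

from combo.data.samplers import TokenCountBatchSampler


@dataclass
class _FakeField:
    # Stand-in for the "sentence" field: only .tokens is accessed by the sampler.
    tokens: List[str]


@dataclass
class _FakeInstance:
    fields: Dict[str, _FakeField] = field(default_factory=dict)


# Ten toy "sentences" of varying length.
dataset = [
    _FakeInstance({"sentence": _FakeField(["tok"] * n)})
    for n in (3, 7, 2, 11, 5, 9, 4, 13, 6, 8)
]

# Group sentences so that each batch holds roughly 20 tokens.
sampler = TokenCountBatchSampler(dataset, word_batch_size=20, shuffle_dataset=False)
for batch in sampler:
    token_count = sum(len(dataset[i].fields["sentence"].tokens) for i in batch)
    print(batch, token_count)
```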
@@ -6,7 +6,6 @@ from combo.config import Registry
from combo.config.from_parameters import register_arguments
from combo.data.tokenizers.token import Token
from combo.data.tokenizers.tokenizer import Tokenizer
from combo.data.api import Sentence
@Registry.register('lambo_tokenizer')
......
@@ -2,7 +2,6 @@
Adapted from AllenNLP
https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/token_class.py
"""
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union
import logging
from dataclasses import dataclass, field
......
@@ -16,16 +16,16 @@ from combo.training.trainable_combo import TrainableCombo
from combo.utils import checks, ComboLogger
from combo.config import resolve
from combo.default_model import default_ud_dataset_reader, default_data_loader
from combo.default_model import default_ud_dataset_reader, default_data_loader, default_model
from combo.modules.archival import load_archive, archive
from combo.predict import COMBO
from combo.data import api
from config import override_parameters
from data import LamboTokenizer, Vocabulary, DatasetReader
from data.dataset_loaders import DataLoader
from modules.model import Model
from utils import ConfigurationError
from utils.matrices import extract_combo_matrices
from combo.config import override_parameters
from combo.data import LamboTokenizer, Vocabulary, DatasetReader
from combo.data.dataset_loaders import DataLoader
from combo.modules.model import Model
from combo.utils import ConfigurationError
from combo.utils.matrices import extract_combo_matrices
logging.setLoggerClass(ComboLogger)
logger = logging.getLogger(__name__)
@@ -275,6 +275,17 @@ def run(_):
if FLAGS.config_path:
logger.info(f'Reading parameters from configuration path {FLAGS.config_path}', prefix=prefix)
model, dataset_reader, training_data_loader, validation_data_loader, vocabulary = read_model_from_config(prefix)
else:
dataset_reader, training_data_loader, validation_data_loader, vocabulary = get_defaults(
dataset_reader,
training_data_loader,
validation_data_loader,
vocabulary,
FLAGS.training_data_path,
FLAGS.validation_data_path,
prefix
)
model = default_model(FLAGS.pretrained_transformer_name, vocabulary)
if FLAGS.use_pure_config and model is None:
logger.error('Error in configuration - model could not be read from parameters. ' +
......
@@ -11,12 +11,13 @@ from combo import data
from combo.config import Registry
from combo.config.from_parameters import register_arguments
from combo.nn import base
from combo.nn.base import Predictor
from combo.predictors import Predictor
import torch
import torch.nn.functional as F
@Registry.register("graph_head_predictor")
class GraphHeadPredictionModel(Predictor):
"""Head prediction model."""
......
"""
Adapted from AllenNLP
https://github.com/allenai/allennlp/blob/80fb6061e568cb9d6ab5d45b661e86eb61b92c82/allennlp/nn/util.py
"""
from typing import Union, Dict, Optional, List, Any
import torch
from combo.common.util import int_to_device
from combo.utils import ConfigurationError
def move_to_device(obj, device: Union[torch.device, int]):
"""
Given a structure (possibly) containing Tensors,
move all the Tensors to the specified device (or do nothing, if they are already on
the target device).
"""
device = int_to_device(device)
if isinstance(obj, torch.Tensor):
# You may be wondering why we don't just always call `obj.to(device)` since that would
# be a no-op anyway if `obj` is already on `device`. Well that works fine except
# when PyTorch is not compiled with CUDA support, in which case even calling
# `obj.to(torch.device("cpu"))` would result in an error.
return obj if obj.device == device else obj.to(device=device)
elif isinstance(obj, dict):
for key, value in obj.items():
obj[key] = move_to_device(value, device)
return obj
elif isinstance(obj, list):
for i, item in enumerate(obj):
obj[i] = move_to_device(item, device)
return obj
elif isinstance(obj, tuple) and hasattr(obj, "_fields"):
# This is the best way to detect a NamedTuple, it turns out.
return obj.__class__(*(move_to_device(item, device) for item in obj))
elif isinstance(obj, tuple):
return tuple(move_to_device(item, device) for item in obj)
else:
return obj
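A brief usage sketch for `move_to_device` (the import path is hypothetical, since this diff does not show where the module lives; `-1` is assumed to map to the CPU via `int_to_device`, as in AllenNLP):

```python
import torch

from combo.nn.util import move_to_device  # hypothetical import path

# A nested batch of dicts and lists containing tensors.
batch = {
    "tokens": {"ids": torch.tensor([[1, 2, 3]]), "mask": torch.tensor([[True, True, False]])},
    "lengths": [torch.tensor(3)],
}

# Moves every tensor in the structure; -1 is assumed to mean "CPU" here.
batch = move_to_device(batch, -1)
print(batch["tokens"]["ids"].device)  # cpu
```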
def device_mapping(cuda_device: int):
"""
In order to `torch.load()` a GPU-trained model onto a CPU (or specific GPU),
you have to supply a `map_location` function. Call this with
the desired `cuda_device` to get the function that `torch.load()` needs.
"""
def inner_device_mapping(storage: torch.Storage, location) -> torch.Storage:
if cuda_device >= 0:
return storage.cuda(cuda_device)
else:
return storage
return inner_device_mapping
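The docstring above describes the `map_location` pattern; a short sketch of that call (the checkpoint filename is a placeholder and the import path is hypothetical):

```python
import torch

from combo.nn.util import device_mapping  # hypothetical import path

# Load a GPU-trained checkpoint onto the CPU; pass 0, 1, ... instead of -1
# to map the saved storages onto a specific GPU.
state_dict = torch.load("model_state.th", map_location=device_mapping(-1))
```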
def get_lengths_from_binary_sequence_mask(mask: torch.BoolTensor) -> torch.LongTensor:
"""
Compute sequence lengths for each batch element in a tensor using a
binary mask.
# Parameters
mask : `torch.BoolTensor`, required.
A 2D binary mask of shape (batch_size, sequence_length) to
calculate the per-batch sequence lengths from.
# Returns
`torch.LongTensor`
A torch.LongTensor of shape (batch_size,) representing the lengths
of the sequences in the batch.
"""
return mask.sum(-1)
def sort_batch_by_length(tensor: torch.Tensor, sequence_lengths: torch.Tensor):
"""
Sort a batch first tensor by some specified lengths.
# Parameters
tensor : `torch.FloatTensor`, required.
A batch first Pytorch tensor.
sequence_lengths : `torch.LongTensor`, required.
A tensor representing the lengths of some dimension of the tensor which
we want to sort by.
# Returns
sorted_tensor : `torch.FloatTensor`
The original tensor sorted along the batch dimension with respect to sequence_lengths.
sorted_sequence_lengths : `torch.LongTensor`
The original sequence_lengths sorted by decreasing size.
restoration_indices : `torch.LongTensor`
Indices into the sorted_tensor such that
`sorted_tensor.index_select(0, restoration_indices) == original_tensor`
permutation_index : `torch.LongTensor`
The indices used to sort the tensor. This is useful if you want to sort many
tensors using the same ordering.
"""
if not isinstance(tensor, torch.Tensor) or not isinstance(sequence_lengths, torch.Tensor):
raise ConfigurationError("Both the tensor and sequence lengths must be torch.Tensors.")
sorted_sequence_lengths, permutation_index = sequence_lengths.sort(0, descending=True)
sorted_tensor = tensor.index_select(0, permutation_index)
index_range = torch.arange(0, len(sequence_lengths), device=sequence_lengths.device)
# This is the equivalent of zipping with index, sorting by the original
# sequence lengths and returning the now sorted indices.
_, reverse_mapping = permutation_index.sort(0, descending=False)
restoration_indices = index_range.index_select(0, reverse_mapping)
return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index
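A small sketch of `sort_batch_by_length` and of how `restoration_indices` undoes the sort (import path hypothetical):

```python
import torch

from combo.nn.util import sort_batch_by_length  # hypothetical import path

tensor = torch.randn(3, 5, 4)            # (batch, time, features)
lengths = torch.tensor([2, 5, 3])        # true lengths of the three sequences

sorted_tensor, sorted_lengths, restoration, permutation = sort_batch_by_length(tensor, lengths)
print(sorted_lengths)                    # tensor([5, 3, 2])

# index_select with the restoration indices recovers the original batch order.
assert torch.equal(sorted_tensor.index_select(0, restoration), tensor)
```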
def get_text_field_mask(
text_field_tensors: Dict[str, Dict[str, torch.Tensor]],
num_wrapping_dims: int = 0,
padding_id: int = 0,
) -> torch.BoolTensor:
"""
Takes the dictionary of tensors produced by a `TextField` and returns a mask
with 0 where the tokens are padding, and 1 otherwise. `padding_id` specifies the id of padding tokens.
We also handle `TextFields` wrapped by an arbitrary number of `ListFields`, where the number of wrapping
`ListFields` is given by `num_wrapping_dims`.
If `num_wrapping_dims == 0`, the returned mask has shape `(batch_size, num_tokens)`.
If `num_wrapping_dims > 0` then the returned mask has `num_wrapping_dims` extra
dimensions, so the shape will be `(batch_size, ..., num_tokens)`.
There could be several entries in the tensor dictionary with different shapes (e.g., one for
word ids, one for character ids). In order to get a token mask, we use the tensor in
the dictionary with the lowest number of dimensions. After subtracting `num_wrapping_dims`,
if this tensor has two dimensions we assume it has shape `(batch_size, ..., num_tokens)`,
and use it for the mask. If instead it has three dimensions, we assume it has shape
`(batch_size, ..., num_tokens, num_features)`, and sum over the last dimension to produce
the mask. Most frequently this will be a character id tensor, but it could also be a
featurized representation of each token, etc.
If the input `text_field_tensors` contains the "mask" key, this is returned instead of inferring the mask.
"""
masks = []
for indexer_name, indexer_tensors in text_field_tensors.items():
if "mask" in indexer_tensors:
masks.append(indexer_tensors["mask"].bool())
if len(masks) == 1:
return masks[0]
elif len(masks) > 1:
# TODO(mattg): My guess is this will basically never happen, so I'm not writing logic to
# handle it. Should be straightforward to handle, though. If you see this error in
# practice, open an issue on github.
raise ValueError("found two mask outputs; not sure which to use!")
tensor_dims = [
(tensor.dim(), tensor)
for indexer_output in text_field_tensors.values()
for tensor in indexer_output.values()
]
tensor_dims.sort(key=lambda x: x[0])
smallest_dim = tensor_dims[0][0] - num_wrapping_dims
if smallest_dim == 2:
token_tensor = tensor_dims[0][1]
return token_tensor != padding_id
elif smallest_dim == 3:
character_tensor = tensor_dims[0][1]
return (character_tensor != padding_id).any(dim=-1)
else:
raise ValueError("Expected a tensor with dimension 2 or 3, found {}".format(smallest_dim))
def get_dropout_mask(dropout_probability: float, tensor_for_masking: torch.Tensor):
"""
Computes and returns an element-wise dropout mask for a given tensor, where
each element in the mask is dropped out with probability dropout_probability.
Note that the mask is NOT applied to the tensor - the tensor is passed to retain
the correct CUDA tensor type for the mask.
# Parameters
dropout_probability : `float`, required.
Probability of dropping a dimension of the input.
tensor_for_masking : `torch.Tensor`, required.
# Returns
`torch.FloatTensor`
A torch.FloatTensor consisting of the binary mask scaled by 1/ (1 - dropout_probability).
This scaling ensures expected values and variances of the output of applying this mask
and the original tensor are the same.
"""
binary_mask = (torch.rand(tensor_for_masking.size()) > dropout_probability).to(
tensor_for_masking.device
)
# Scale mask by 1/keep_prob to preserve output statistics.
dropout_mask = binary_mask.float().div(1.0 - dropout_probability)
return dropout_mask
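A brief sketch of applying the returned dropout mask (import path hypothetical):

```python
import torch

from combo.nn.util import get_dropout_mask  # hypothetical import path

hidden = torch.randn(2, 4)

# Roughly 30% of the elements are zeroed; survivors are scaled by 1 / 0.7,
# so hidden * mask has the same expected value as hidden.
mask = get_dropout_mask(0.3, hidden)
dropped = hidden * mask
```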
def find_embedding_layer(model: torch.nn.Module) -> torch.nn.Module:
"""
Takes a model (typically an AllenNLP `Model`, but this works for any `torch.nn.Module`) and
makes a best guess about which module is the embedding layer. For typical AllenNLP models,
this often is the `TextFieldEmbedder`, but if you're using a pre-trained contextualizer, we
really want layer 0 of that contextualizer, not the output. So there are a bunch of hacks in
here for specific pre-trained contextualizers.
"""
# We'll look for a few special cases in a first pass, then fall back to just finding a
# TextFieldEmbedder in a second pass if we didn't find a special case.
from transformers.models.gpt2.modeling_gpt2 import GPT2Model
from transformers.models.bert.modeling_bert import BertEmbeddings
from transformers.models.albert.modeling_albert import AlbertEmbeddings
from transformers.models.roberta.modeling_roberta import RobertaEmbeddings
for module in model.modules():
if isinstance(module, BertEmbeddings):
return module.word_embeddings
if isinstance(module, RobertaEmbeddings):
return module.word_embeddings
if isinstance(module, AlbertEmbeddings):
return module.word_embeddings
if isinstance(module, GPT2Model):
return module.wte
return None
# for module in model.modules():
# if isinstance(module, TextFieldEmbedder):
#
# if isinstance(module, BasicTextFieldEmbedder):
# # We'll have a check for single Embedding cases, because we can be more efficient
# # in cases like this. If this check fails, then for something like hotflip we need
# # to actually run the text field embedder and construct a vector for each token.
# if len(module._token_embedders) == 1:
# embedder = list(module._token_embedders.values())[0]
# if isinstance(embedder, Embedding):
# if embedder._projection is None:
# # If there's a projection inside the Embedding, then we need to return
# # the whole TextFieldEmbedder, because there's more computation that
# # needs to be done than just multiply by an embedding matrix.
# return embedder
# return module
raise RuntimeError("No embedding module found!")
def get_token_offsets_from_text_field_inputs(
text_field_inputs: List[Any],
) -> Optional[torch.Tensor]:
"""
Given a list of inputs to a TextFieldEmbedder, tries to find token offsets from those inputs, if
there are any. You will have token offsets if you are using a mismatched token embedder; if
you're not, the return value from this function should be None. This function is intended to be
called from a `forward_hook` attached to a `TextFieldEmbedder`, so the inputs are formatted just
as a list.
It's possible in theory that you could have multiple offsets as inputs to a single call to a
`TextFieldEmbedder`, but that's an extremely rare use case (I can't really imagine anyone
wanting to do that). In that case, we'll only return the first one. If you need different
behavior for your model, open an issue on github describing what you're doing.
"""
for input_index, text_field_input in enumerate(text_field_inputs):
if not isinstance(text_field_input, dict):
continue
for input_value in text_field_input.values():
if not isinstance(input_value, dict):
continue
for embedder_arg_name, embedder_arg_value in input_value.items():
if embedder_arg_name == "offsets":
return embedder_arg_value
return None
@@ -8,8 +8,8 @@ import numpy as np
import pandas as pd
from pathlib import Path
from data import Sentence
from utils import ComboLogger
from combo.data import Sentence
from combo.utils import ComboLogger
def extract_combo_matrices(predictions: List[Sentence],
......
@@ -20,4 +20,4 @@ The ```"type"``` field serves as a dependency-injection-like mechanism and
selects which class and constructor are used.
The ```"parameters"``` dictionary is passed to the constructor method. Every
parameter is first looked up in the class registry; only if that is not
possible is it passed as-is.
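As a concrete illustration of this mechanism, a hypothetical configuration fragment (the `"type"` value `lambo_tokenizer` is a name registered elsewhere in this commit; the parameter names below are placeholders, not taken from a real COMBO configuration):

```python
# Hypothetical configuration fragment, not taken from a real COMBO config file.
tokenizer_config = {
    "type": "lambo_tokenizer",   # selects the registered class and constructor
    "parameters": {              # forwarded to that constructor
        # Values that can be resolved from the class registry are constructed
        # first; anything else is passed to the constructor as-is.
        "some_option": "placeholder value",
    },
}
```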
File moved