Commit 7496eabb authored by Maja Jabłońska

Add Embedders

parent 0bdbb324
1 merge request: !46 Merge COMBO 3.0 into master
from .base import FeedForwardPredictor
from .graph_parser import GraphDependencyRelationModel
from .parser import DependencyRelationModel
from .embeddings import CharacterBasedWordEmbeddings
from .embeddings import (GloVe6BEmbedder, GloVe840BEmbedder, GloVeTwitter27BEmbedder,
                         GloVe42BEmbedder, FastTextEmbedder, CharNGramEmbedder)
from .encoder import ComboEncoder
from .lemma import LemmatizerModel
from .model import ComboModel
......
......
@@ -2,7 +2,7 @@ from typing import Dict, Optional, List, Union, Tuple
import torch
import torch.nn as nn
import utils
import combo.models.utils as utils
import combo.models.combo_nn as combo_nn
import combo.utils.checks as checks
from combo import data
......
"""
Adapted from COMBO 1.0
Author: Mateusz Klimaszewski
"""
from typing import List
import torch
import torch.nn as nn
from combo.models.combo_nn import Activation
class DilatedCnnEncoder(nn.Module):
    def __init__(self,
                 input_dim: int,
                 filters: List[int],
                 kernel_size: List[int],
                 stride: List[int],
                 padding: List[int],
                 dilation: List[int],
                 activations: List[Activation]):
        super().__init__()
        conv1d_layers = []
        input_dims = [input_dim] + filters[:-1]
        output_dims = filters
        for idx in range(len(activations)):
            conv1d_layers.append(nn.Conv1d(
                in_channels=input_dims[idx],
                out_channels=output_dims[idx],
                kernel_size=(kernel_size[idx],),
                stride=(stride[idx],),
                padding=padding[idx],
                dilation=(dilation[idx],)))
        self.conv1d_layers = nn.ModuleList(conv1d_layers)
        self.activations = activations
        assert len(self.activations) == len(self.conv1d_layers)

    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
        for layer, activation in zip(self.conv1d_layers, self.activations):
            x = activation(layer(x))
        return x
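
A minimal usage sketch (editorial, not part of this commit): the hyperparameters below are illustrative assumptions, plain nn.ReLU() stands in for combo's Activation wrapper, Conv1d consumes tensors shaped (batch, channels, length), and the paddings are chosen so that length is preserved as the dilation grows.

encoder = DilatedCnnEncoder(input_dim=32,            # assumed character-embedding size
                            filters=[64, 64, 128],
                            kernel_size=[3, 3, 3],
                            stride=[1, 1, 1],
                            padding=[1, 2, 4],
                            dilation=[1, 2, 4],
                            activations=[nn.ReLU(), nn.ReLU(), nn.ReLU()])
chars = torch.randn(8, 32, 20)                        # (batch, input_dim, sequence_length)
out = encoder(chars)                                  # -> torch.Size([8, 128, 20])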
class Embedding:
    pass
from typing import Optional
class TokenEmbedder:
    pass
import torch
from overrides import overrides
from torch import nn
from torchtext.vocab import Vectors, GloVe, FastText, CharNGram
class CharacterBasedWordEmbeddings(TokenEmbedder):
    pass
from combo.data import Vocabulary
from combo.models.dilated_cnn import DilatedCnnEncoder
from combo.models.utils import tiny_value_of_dtype
from combo.utils import ConfigurationError
class ProjectedWordEmbedder(TokenEmbedder):
    pass
"""
Adapted from AllenNLP
"""
class TimeDistributed(torch.nn.Module):
    """
    Given an input shaped like `(batch_size, time_steps, [rest])` and a `Module` that takes
    inputs like `(batch_size, [rest])`, `TimeDistributed` reshapes the input to be
    `(batch_size * time_steps, [rest])`, applies the contained `Module`, then reshapes it back.
    Note that while the above gives shapes with `batch_size` first, this `Module` also works if
    `batch_size` is second - we always just combine the first two dimensions, then split them.
    It also reshapes keyword arguments unless they are not tensors or their name is specified in
    the optional `pass_through` iterable.
    """

    def __init__(self, module):
        super().__init__()
        self._module = module

    @overrides
    def forward(self, *inputs, pass_through: List[str] = None, **kwargs):
        pass_through = pass_through or []

        reshaped_inputs = [self._reshape_tensor(input_tensor) for input_tensor in inputs]

        # Need some input to then get the batch_size and time_steps.
        some_input = None
        if inputs:
            some_input = inputs[-1]

        reshaped_kwargs = {}
        for key, value in kwargs.items():
            if isinstance(value, torch.Tensor) and key not in pass_through:
                if some_input is None:
                    some_input = value
                value = self._reshape_tensor(value)
            reshaped_kwargs[key] = value

        reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs)

        if some_input is None:
            raise RuntimeError("No input tensor to time-distribute")

        # Now get the output back into the right shape.
        # (batch_size, time_steps, **output_size)
        new_size = some_input.size()[:2] + reshaped_outputs.size()[1:]
        outputs = reshaped_outputs.contiguous().view(new_size)
        return outputs

    @staticmethod
    def _reshape_tensor(input_tensor):
        input_size = input_tensor.size()
        if len(input_size) <= 2:
            raise RuntimeError(f"No dimension to distribute: {input_size}")
        # Squash batch_size and time_steps into a single axis; result has shape
        # (batch_size * time_steps, **input_size).
        squashed_shape = [-1] + list(input_size[2:])
        return input_tensor.contiguous().view(*squashed_shape)
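
A short editorial sketch of the reshaping described above, wrapping a module that only accepts `(batch_size, [rest])` inputs (shapes are assumed for illustration):

projection = TimeDistributed(torch.nn.Linear(100, 20))
x = torch.randn(4, 7, 100)    # (batch_size, time_steps, embedding_dim)
y = projection(x)             # reshaped to (28, 100), projected, reshaped back
assert y.shape == (4, 7, 20)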
class TokenEmbedder(nn.Module):
    def __init__(self):
        super(TokenEmbedder, self).__init__()

    @property
    def output_dim(self) -> int:
        raise NotImplementedError()

    def forward(self,
                x: torch.Tensor,
                char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
        raise NotImplementedError()
class _TorchEmbedder(TokenEmbedder):
    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 padding_idx: Optional[int] = None,
                 max_norm: Optional[float] = None,
                 norm_type: float = 2.,
                 scale_grad_by_freq: bool = False,
                 sparse: bool = False,
                 vocab_namespace: str = "tokens",
                 vocab: Vocabulary = None,
                 weight: Optional[torch.Tensor] = None,
                 trainable: bool = True,
                 projection_dim: Optional[int] = None):
        super(_TorchEmbedder, self).__init__()
        self._embedding_dim = embedding_dim
        self._embedding = nn.Embedding(num_embeddings=num_embeddings,
                                       embedding_dim=embedding_dim,
                                       padding_idx=padding_idx,
                                       max_norm=max_norm,
                                       norm_type=norm_type,
                                       scale_grad_by_freq=scale_grad_by_freq,
                                       sparse=sparse)
        self.__vocab_namespace = vocab_namespace
        self.__vocab = vocab

        if weight is not None:
            if weight.shape != (num_embeddings, embedding_dim):
                raise ConfigurationError(
                    "Weight matrix must be of shape (num_embeddings, embedding_dim). "
                    f"Got: {tuple(weight.shape)}"
                )
            self.__weight = torch.nn.Parameter(weight, requires_grad=trainable)
        else:
            self.__weight = torch.nn.Parameter(torch.FloatTensor(num_embeddings, embedding_dim),
                                               requires_grad=trainable)
            torch.nn.init.xavier_uniform_(self.__weight)

        if padding_idx is not None:
            self.__weight.data[padding_idx].fill_(0)

        if projection_dim:
            self._projection = torch.nn.Linear(embedding_dim, projection_dim)
            self._output_dim = projection_dim
        else:
            self._projection = None
            self._output_dim = embedding_dim

    @property
    @overrides
    def output_dim(self) -> int:
        return self._output_dim

    @overrides
    def forward(self,
                x: torch.Tensor,
                char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
        embedded = self._embedding(x)
        if self._projection:
            projection = self._projection
            for _ in range(embedded.dim() - 2):
                # wrap once per extra leading dimension so the projection sees 2D inputs
                projection = TimeDistributed(projection)
            embedded = projection(embedded)
        return embedded
class _TorchtextVectorsEmbedder(TokenEmbedder):
    """
    Torchtext Vectors object wrapper
    """

    def __init__(self,
                 torchtext_embedder: Vectors,
                 lower_case_backup: bool = False):
        """
        :param torchtext_embedder: Torchtext Vectors object
        :param lower_case_backup: whether to fall back to the lowercased token
        when the original token is not found. Default: False.
        """
        super(_TorchtextVectorsEmbedder, self).__init__()
        self.__torchtext_embedder = torchtext_embedder
        self.__lower_case_backup = lower_case_backup

    @property
    @overrides
    def output_dim(self) -> int:
        return self.__torchtext_embedder.dim

    @overrides
    def forward(self,
                x: torch.Tensor,
                char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
        return self.__torchtext_embedder.get_vecs_by_tokens(x, self.__lower_case_backup)


class GloVe42BEmbedder(_TorchtextVectorsEmbedder):
    def __init__(self, dim: int = 300):
        super(GloVe42BEmbedder, self).__init__(GloVe("42B", dim))


class GloVe840BEmbedder(_TorchtextVectorsEmbedder):
    def __init__(self, dim: int = 300):
        super(GloVe840BEmbedder, self).__init__(GloVe("840B", dim))


class GloVeTwitter27BEmbedder(_TorchtextVectorsEmbedder):
    def __init__(self, dim: int = 300):
        super(GloVeTwitter27BEmbedder, self).__init__(GloVe("twitter.27B", dim))


class GloVe6BEmbedder(_TorchtextVectorsEmbedder):
    def __init__(self, dim: int = 300):
        super(GloVe6BEmbedder, self).__init__(GloVe("6B", dim))


class FastTextEmbedder(_TorchtextVectorsEmbedder):
    def __init__(self, language: str = "en"):
        super(FastTextEmbedder, self).__init__(FastText(language))


class CharNGramEmbedder(_TorchtextVectorsEmbedder):
    def __init__(self):
        super(CharNGramEmbedder, self).__init__(CharNGram())
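
For reference, a hedged sketch of the torchtext lookup these thin wrappers delegate to (editorial; the vector files are downloaded and cached on first use):

glove = GloVe("6B", dim=50)
vectors = glove.get_vecs_by_tokens(["the", "parser"], lower_case_backup=True)
# vectors.shape == torch.Size([2, 50]); out-of-vocabulary tokens map to zero vectors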
class CharacterBasedWordEmbedder(TokenEmbedder):
    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 dilated_cnn_encoder: DilatedCnnEncoder):
        super(CharacterBasedWordEmbedder, self).__init__()
        self.__embedding_dim = embedding_dim
        self.__dilated_cnn_encoder = dilated_cnn_encoder
        self.char_embed = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)

    @property
    @overrides
    def output_dim(self) -> int:
        return self.__embedding_dim

    @overrides
    def forward(self,
                x: torch.Tensor,
                char_mask: Optional[torch.BoolTensor] = None) -> torch.Tensor:
        if char_mask is None:
            char_mask = x.new_ones(x.size())

        x = self.char_embed(x)
        x = x * char_mask.unsqueeze(-1).float()
        x = self.__dilated_cnn_encoder(x.transpose(2, 3))
        return torch.max(x, dim=-1)[0]
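
An editorial illustration of the final pooling step: the max over the last axis keeps, for each convolutional filter, its strongest response across character positions, yielding one fixed-size vector per word.

feats = torch.tensor([[1.0, 4.0, 2.0],
                      [3.0, 0.0, 5.0]])    # (filters, character positions) for one word
word_vector = torch.max(feats, dim=-1)[0]  # -> tensor([4., 5.])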
class PretrainedTransformerMismatchedEmbedder(TokenEmbedder):
......
@@ -20,5 +250,36 @@ class TransformersWordEmbedder(PretrainedTransformerMismatchedEmbedder):
    pass

class FeatsTokenEmbedder(TokenEmbedder):
    pass
\ No newline at end of file

class FeatsTokenEmbedder(_TorchEmbedder):
    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 padding_idx: Optional[int] = None,
                 max_norm: Optional[float] = None,
                 norm_type: float = 2.,
                 scale_grad_by_freq: bool = False,
                 sparse: bool = False,
                 vocab_namespace: str = "feats",
                 vocab: Vocabulary = None,
                 weight: Optional[torch.Tensor] = None,
                 trainable: bool = True):
        super(FeatsTokenEmbedder, self).__init__(num_embeddings,
                                                 embedding_dim,
                                                 padding_idx,
                                                 max_norm,
                                                 norm_type,
                                                 scale_grad_by_freq,
                                                 sparse,
                                                 vocab_namespace,
                                                 vocab,
                                                 weight,
                                                 trainable)

    @overrides
    def forward(self,
                x: torch.Tensor) -> torch.Tensor:
        # sum the feature embeddings and divide by the number of non-padding (index > 0) features
        mask = x.gt(0)
        x = super().forward(x)
        return x.sum(dim=-2) / (
            (mask.sum(dim=-1) + tiny_value_of_dtype(torch.float)).unsqueeze(dim=-1)
        )
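
A worked editorial sketch of the masked average computed in forward above (values are made up; index 0 is treated as padding and assumed to embed to zeros):

embedded = torch.tensor([[1.0, 1.0],
                         [3.0, 5.0],
                         [0.0, 0.0]])   # (num_feats, dim); the last row is padding
mask = torch.tensor([1, 1, 0])          # from x.gt(0)
mean = embedded.sum(dim=-2) / (mask.sum(dim=-1) + 1e-13)
# -> tensor([2., 3.]), the average of the two real feature embeddings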
from combo.models.base import Model
class ComboModel(Model):
class ComboModel:
    pass
\ No newline at end of file
......
@@ -5,3 +5,23 @@ import torch.nn.functional as F
def masked_cross_entropy(pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
    pred = pred + (mask.float().unsqueeze(-1) + 1e-45).log()
    return F.cross_entropy(pred, true, reduction="none") * mask
"""
Adapted from AllenNLP
"""
def tiny_value_of_dtype(dtype: torch.dtype):
"""
Returns a moderately tiny value for a given PyTorch data type that is used to avoid numerical
issues such as division by zero.
This is different from `info_value_of_dtype(dtype).tiny` because it causes some NaN bugs.
Only supports floating point dtypes.
"""
if not dtype.is_floating_point:
raise TypeError("Only supports floating point dtypes.")
if dtype == torch.float or dtype == torch.double:
return 1e-13
elif dtype == torch.half:
return 1e-4
else:
raise TypeError("Does not support dtype " + str(dtype))
\ No newline at end of file
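
A brief editorial sketch of the intended use: add the tiny value to a denominator that may be zero, as FeatsTokenEmbedder does above.

counts = torch.tensor([0.0, 2.0])
totals = torch.tensor([[0.0, 0.0],
                       [4.0, 6.0]])
means = totals / (counts + tiny_value_of_dtype(torch.float)).unsqueeze(-1)
# rows with a zero count come out as zeros instead of NaN; the second row is [2., 3.]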