Commit 16988605 authored by Maja Jabłońska, committed by Martyna Wiącek

First commit

parent b5b12d54
1 merge request: !46 Merge COMBO 3.0 into master
from collections import defaultdict, OrderedDict
from typing import Dict, Union, Optional, Iterable, Callable, Any, Set
from torchtext.vocab import Vocab as TorchtextVocab
from torchtext.vocab import vocab as torchtext_vocab
DEFAULT_NON_PADDED_NAMESPACES = ("*tags", "*labels")
DEFAULT_PADDING_TOKEN = "@@PADDING@@"
DEFAULT_OOV_TOKEN = "@@UNKNOWN@@"
NAMESPACE_PADDING_FILE = "non_padded_namespaces.txt"
DEFAULT_NAMESPACE = "tokens"
def match_namespace(pattern: str, namespace: str) -> bool:
    if not isinstance(pattern, str) or not isinstance(namespace, str):
        raise ValueError("Pattern and namespace must be string types, got %s and %s." %
                         (type(pattern), type(namespace)))
if pattern == namespace:
return True
    if len(pattern) > 2 and pattern[0] == '*' and namespace.endswith(pattern[1:]):
return True
return False
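# Illustrative behaviour (assumed examples, not part of the original file):
#   match_namespace("tokens", "tokens")          -> True   (exact match)
#   match_namespace("*labels", "upostag_labels") -> True   (wildcard suffix match)
#   match_namespace("*labels", "tokens")         -> False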
class _NamespaceDependentDefaultDict(defaultdict[str, TorchtextVocab]):
def __init__(self,
non_padded_namespaces: Iterable[str],
padding_token: str,
oov_token: str):
self._non_padded_namespaces = set(non_padded_namespaces)
self._padding_token = padding_token
self._oov_token = oov_token
super().__init__()
def __missing__(self, namespace: str):
        # Namespaces matching a non-padded pattern (e.g. "*tags", "*labels") start out empty;
        # every other namespace is initialized with the padding and OOV tokens.
        if any(match_namespace(npn, namespace) for npn in self._non_padded_namespaces):
            value = torchtext_vocab(OrderedDict())
        else:
            # The mapping passed to torchtext's vocab() is token -> frequency, so both special
            # tokens need a count of at least min_freq (default 1) to be kept.
            value = torchtext_vocab(
                OrderedDict([
                    (self._padding_token, 1),
                    (self._oov_token, 1)])
            )
        dict.__setitem__(self, namespace, value)
        return value
def add_non_padded_namespaces(self, non_padded_namespaces: Set[str]):
self._non_padded_namespaces.update(non_padded_namespaces)
class Vocabulary:
def __init__(self,
counter: Dict[str, Dict[str, int]] = None,
min_count: Dict[str, int] = None,
max_vocab_size: Union[int, Dict[str, int]] = None,
non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
oov_token: Optional[str] = DEFAULT_OOV_TOKEN):
        # counter – mapping from namespace to a dictionary of token -> occurrence count.
        # min_count – per-namespace minimum count required to include a token in the vocabulary.
        # max_vocab_size – maximum vocabulary size, either a single limit or one per namespace.
        # non_padded_namespaces – namespaces (exact names or "*"-prefixed wildcards) that do not
        #                         receive padding/OOV tokens.
        # padding_token / oov_token – special tokens added to every padded namespace.
self._padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
self._oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN
self._non_padded_namespaces = set(non_padded_namespaces)
self._vocab = _NamespaceDependentDefaultDict(self._non_padded_namespaces,
self._padding_token,
self._oov_token)
def _extend(self,
tokens_to_add: Dict[str, Dict[str, int]]):
for namespace, tokens in tokens_to_add.items():
            for token in tokens:
                # Vocab.append_token raises if the token already exists, so skip duplicates.
                if token not in self._vocab[namespace]:
                    self._vocab[namespace].append_token(token)
# def add_token_to_namespace(self, token: str, namespace: str = DEFAULT_NAMESPACE):
# """
# Add the token if not present and return the index even if token was already in the namespace.
#
# :param token: token to be added
# :param namespace: namespace to add the token to
# :return: index of the token in the namespace
# """
#
# if not isinstance(token, str):
# raise ValueError("Vocabulary tokens must be strings. Got %s with type %s" % (repr(token), type(token)))
#
@classmethod
def empty(cls):
return cls()
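# A minimal usage sketch (illustrative only): build an empty vocabulary and extend the
# default "tokens" namespace; the token counts in the dictionary are arbitrary examples.
if __name__ == "__main__":
    vocabulary = Vocabulary.empty()
    vocabulary._extend({DEFAULT_NAMESPACE: {"cat": 2, "dog": 1}})
    # A padded namespace also holds the padding and OOV tokens, so 4 entries in total.
    print(len(vocabulary._vocab[DEFAULT_NAMESPACE]))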
from typing import Dict, Optional, List, Union, Tuple
import torch
import torch.nn as nn
import utils
import combo.models.combo_nn as combo_nn
import combo.checks as checks
# Assumed import path for the Vocabulary module referenced below as data.Vocabulary.
import combo.data as data
class Predictor(nn.Module):
def forward(self,
x: Union[torch.Tensor, List[torch.Tensor]],
mask: Optional[torch.BoolTensor] = None,
labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
raise NotImplementedError()
class Linear(nn.Linear):
def __init__(self,
in_features: int,
out_features: int,
activation: Optional[combo_nn.Activation] = None,
dropout_rate: Optional[float] = 0.0):
super().__init__(in_features, out_features)
self.activation = activation if activation else self.identity
self.dropout = nn.Dropout(p=dropout_rate) if dropout_rate else self.identity
def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
x = super().forward(x)
x = self.activation(x)
return self.dropout(x)
def get_output_dim(self) -> int:
return self.out_features
@staticmethod
def identity(x):
return x
class FeedForward(torch.nn.Module):
"""
Modified copy of allennlp.modules.feedforward.FeedForward
This `Module` is a feed-forward neural network, just a sequence of `Linear` layers with
activation functions in between.
# Parameters
input_dim : `int`, required
The dimensionality of the input. We assume the input has shape `(batch_size, input_dim)`.
num_layers : `int`, required
The number of `Linear` layers to apply to the input.
hidden_dims : `Union[int, List[int]]`, required
The output dimension of each of the `Linear` layers. If this is a single `int`, we use
it for all `Linear` layers. If it is a `List[int]`, `len(hidden_dims)` must be
`num_layers`.
activations : `Union[Activation, List[Activation]]`, required
The activation function to use after each `Linear` layer. If this is a single function,
we use it after all `Linear` layers. If it is a `List[Activation]`,
`len(activations)` must be `num_layers`. Activation must have torch.nn.Module type.
dropout : `Union[float, List[float]]`, optional (default = `0.0`)
If given, we will apply this amount of dropout after each layer. Semantics of `float`
versus `List[float]` is the same as with other parameters.
# Examples
```python
FeedForward(124, 2, [64, 32], torch.nn.ReLU(), 0.2)
#> FeedForward(
#> (_activations): ModuleList(
#> (0): ReLU()
#> (1): ReLU()
#> )
#> (_linear_layers): ModuleList(
#> (0): Linear(in_features=124, out_features=64, bias=True)
#> (1): Linear(in_features=64, out_features=32, bias=True)
#> )
#> (_dropout): ModuleList(
#> (0): Dropout(p=0.2, inplace=False)
#> (1): Dropout(p=0.2, inplace=False)
#> )
#> )
```
"""
def __init__(
self,
input_dim: int,
num_layers: int,
hidden_dims: Union[int, List[int]],
activations: Union[combo_nn.Activation, List[combo_nn.Activation]],
dropout: Union[float, List[float]] = 0.0,
) -> None:
super().__init__()
if not isinstance(hidden_dims, list):
hidden_dims = [hidden_dims] * num_layers # type: ignore
if not isinstance(activations, list):
activations = [activations] * num_layers # type: ignore
if not isinstance(dropout, list):
dropout = [dropout] * num_layers # type: ignore
if len(hidden_dims) != num_layers:
raise checks.ConfigurationError(
"len(hidden_dims) (%d) != num_layers (%d)" % (len(hidden_dims), num_layers)
)
if len(activations) != num_layers:
raise checks.ConfigurationError(
"len(activations) (%d) != num_layers (%d)" % (len(activations), num_layers)
)
if len(dropout) != num_layers:
raise checks.ConfigurationError(
"len(dropout) (%d) != num_layers (%d)" % (len(dropout), num_layers)
)
self._activations = torch.nn.ModuleList(activations)
input_dims = [input_dim] + hidden_dims[:-1]
linear_layers = []
for layer_input_dim, layer_output_dim in zip(input_dims, hidden_dims):
linear_layers.append(torch.nn.Linear(layer_input_dim, layer_output_dim))
self._linear_layers = torch.nn.ModuleList(linear_layers)
dropout_layers = [torch.nn.Dropout(p=value) for value in dropout]
self._dropout = torch.nn.ModuleList(dropout_layers)
self._output_dim = hidden_dims[-1]
self.input_dim = input_dim
def get_output_dim(self):
return self._output_dim
def get_input_dim(self):
return self.input_dim
def forward(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
output = inputs
feature_maps = []
for layer, activation, dropout in zip(
self._linear_layers, self._activations, self._dropout
):
feature_maps.append(output)
output = dropout(activation(layer(output)))
return output, feature_maps
class FeedForwardPredictor(Predictor):
"""Feedforward predictor. Should be used on top of Seq2Seq encoder."""
def __init__(self, feedforward_network: "FeedForward"):
super().__init__()
self.feedforward_network = feedforward_network
def forward(self,
x: Union[torch.Tensor, List[torch.Tensor]],
mask: Optional[torch.BoolTensor] = None,
labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
if mask is None:
mask = x.new_ones(x.size()[:-1])
x, feature_maps = self.feedforward_network(x)
output = {
"prediction": x.argmax(-1),
"probability": x,
"embedding": feature_maps[-1],
}
if labels is not None:
if sample_weights is None:
sample_weights = labels.new_ones([mask.size(0)])
output["loss"] = self._loss(x, labels, mask, sample_weights)
return output
def _loss(self,
pred: torch.Tensor,
true: torch.Tensor,
mask: torch.BoolTensor,
sample_weights: torch.Tensor) -> torch.Tensor:
BATCH_SIZE, _, CLASSES = pred.size()
valid_positions = mask.sum()
pred = pred.reshape(-1, CLASSES)
true = true.reshape(-1)
mask = mask.reshape(-1)
loss = utils.masked_cross_entropy(pred, true, mask)
loss = loss.reshape(BATCH_SIZE, -1) * sample_weights.unsqueeze(-1)
return loss.sum() / valid_positions
@classmethod
def from_vocab(cls,
vocab: data.Vocabulary,
vocab_namespace: str,
input_dim: int,
num_layers: int,
hidden_dims: List[int],
activations: Union[combo_nn.Activation, List[combo_nn.Activation]],
dropout: Union[float, List[float]] = 0.0,
):
if len(hidden_dims) + 1 != num_layers:
raise checks.ConfigurationError(
f"len(hidden_dims) ({len(hidden_dims):d}) + 1 != num_layers ({num_layers:d})"
)
        assert vocab_namespace in vocab.get_namespaces(), \
            f"There is no '{vocab_namespace}' namespace in the created vocabs; check whether this field has any values to predict!"
hidden_dims = hidden_dims + [vocab.get_vocab_size(vocab_namespace)]
return cls(FeedForward(
input_dim=input_dim,
num_layers=num_layers,
hidden_dims=hidden_dims,
activations=activations,
dropout=dropout))
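# A minimal usage sketch (illustrative only; tensor shapes are arbitrary and torch.nn.ReLU
# stands in here for a combo_nn.Activation implementation).
if __name__ == "__main__":
    predictor = FeedForwardPredictor(FeedForward(
        input_dim=10,
        num_layers=2,
        hidden_dims=[16, 5],
        activations=[nn.ReLU(), nn.ReLU()]))
    encoded = torch.randn(2, 7, 10)        # (batch, sequence length, encoder dim)
    labels = torch.randint(0, 5, (2, 7))   # gold class index per token
    output = predictor(x=encoded, labels=labels)
    print(output["prediction"].shape)      # torch.Size([2, 7])
    print(output["loss"].item())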
import torch
import torch.nn as nn
class Activation(nn.Module):
    def __call__(self, tensor: torch.Tensor) -> torch.Tensor:
raise NotImplementedError
import torch
import torch.nn.functional as F
def masked_cross_entropy(pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
pred = pred + (mask.float().unsqueeze(-1) + 1e-45).log()
return F.cross_entropy(pred, true, reduction="none") * mask
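# Illustrative check (assumed shapes): per-position losses are returned unreduced, and
# positions where the mask is False are multiplied by zero, so padding does not contribute.
#   pred = torch.randn(4, 3)                       # 4 positions, 3 classes
#   true = torch.tensor([0, 2, 1, 1])
#   mask = torch.tensor([True, True, False, False])
#   masked_cross_entropy(pred, true, mask)         # shape (4,); last two entries are 0.0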
# Configuration
Configuration is handled through dependency injection, using the `dependency_injector` package.
Configuration files can be in JSON or INI format.
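For example, settings could be exposed through a `Configuration` provider along these lines (a minimal sketch; the container layout and the `training.batch_size` option are only illustrative, not the actual COMBO configuration schema):

```python
from dependency_injector import containers, providers


class Container(containers.DeclarativeContainer):
    config = providers.Configuration()


container = Container()
container.config.from_ini("config.ini")    # or: container.config.from_json("config.json")
print(container.config.training.batch_size())
```

Here `config.ini` is assumed to contain a `[training]` section with a `batch_size` option.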
# sent_id = test-s1
# text = Easy sentence.
1 Verylongwordwhichmustbetruncatedbythesystemto30 easy ADJ adj AdpType=Prep|Adp 2 amod _ _
2 Sentence verylonglemmawhichmustbetruncatedbythesystemto30 NOUN nom Number=Sing 0 root _ _
3 . . PUNCT . _ 1 punct _ _
# sent_id = test-s1
# text = Easy sentence.
1 Verylongwordwhichmustbetruncatedbythesystemto30 easy ADJ adj AdpType=Prep|Adp 2 amod _ _
2 Sentence verylonglemmawhichmustbetruncatedbythesystemto30 NOUN nom Number=Sing 0 root _ _
3 . . PUNCT . _ 1 punct 2:mod _
4 . . PUNCT . _ 1 punct 2:xmod _
main.py 0 → 100644
# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
def print_hi(name):
# Use a breakpoint in the code line below to debug your script.
print(f'Hi, {name}') # Press Ctrl+F8 to toggle the breakpoint.
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
print_hi('PyCharm')
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
conllu~=4.4.1
dependency-injector~=4.41.0
overrides~=7.3.1
torch~=1.13.1
torchtext~=0.14.1
numpy~=1.24.1
pytorch-lightning~=1.9.0