Commit 16988605 authored by Maja Jabłońska, committed by Martyna Wiącek

First commit

parent b5b12d54
1 merge request: !46 Merge COMBO 3.0 into master
from collections import defaultdict, OrderedDict
from typing import Dict, Union, Optional, Iterable, Callable, Any, Set
from torchtext.vocab import Vocab as TorchtextVocab
from torchtext.vocab import vocab as torchtext_vocab
DEFAULT_NON_PADDED_NAMESPACES = ("*tags", "*labels")
DEFAULT_PADDING_TOKEN = "@@PADDING@@"
DEFAULT_OOV_TOKEN = "@@UNKNOWN@@"
NAMESPACE_PADDING_FILE = "non_padded_namespaces.txt"
DEFAULT_NAMESPACE = "tokens"
def match_namespace(pattern: str, namespace: str) -> bool:
    if not isinstance(pattern, str) or not isinstance(namespace, str):
        raise ValueError("Pattern and namespace must be string types, got %s and %s." %
                         (type(pattern), type(namespace)))
if pattern == namespace:
return True
    if len(pattern) > 2 and pattern[0] == '*' and namespace.endswith(pattern[1:]):
return True
return False
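# Illustrative behaviour (assumed examples, not part of the original file):
#   match_namespace("tokens", "tokens")          -> True   (exact match)
#   match_namespace("*labels", "upostag_labels") -> True   (wildcard suffix match)
#   match_namespace("*labels", "tokens")         -> False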
class _NamespaceDependentDefaultDict(defaultdict[str, TorchtextVocab]):
def __init__(self,
non_padded_namespaces: Iterable[str],
padding_token: str,
oov_token: str):
self._non_padded_namespaces = set(non_padded_namespaces)
self._padding_token = padding_token
self._oov_token = oov_token
super().__init__()
def __missing__(self, namespace: str):
        # Namespaces matching a non-padded pattern (e.g. "*tags", "*labels") start out empty;
        # every other namespace is initialized with the padding and OOV tokens.
        if any(match_namespace(npn, namespace) for npn in self._non_padded_namespaces):
            value = torchtext_vocab(OrderedDict())
        else:
            # The mapping passed to torchtext's vocab() is token -> frequency, so both special
            # tokens need a count of at least min_freq (default 1) to be kept.
            value = torchtext_vocab(
                OrderedDict([
                    (self._padding_token, 1),
                    (self._oov_token, 1)])
            )
        dict.__setitem__(self, namespace, value)
        return value
def add_non_padded_namespaces(self, non_padded_namespaces: Set[str]):
self._non_padded_namespaces.update(non_padded_namespaces)
class Vocabulary:
def __init__(self,
counter: Dict[str, Dict[str, int]] = None,
min_count: Dict[str, int] = None,
max_vocab_size: Union[int, Dict[str, int]] = None,
non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
oov_token: Optional[str] = DEFAULT_OOV_TOKEN):
        # counter – mapping from namespace to a dictionary of token -> occurrence count.
        # min_count – per-namespace minimum count required to include a token in the vocabulary.
        # max_vocab_size – maximum vocabulary size, either a single limit or one per namespace.
        # non_padded_namespaces – namespaces (exact names or "*"-prefixed wildcards) that do not
        #                         receive padding/OOV tokens.
        # padding_token / oov_token – special tokens added to every padded namespace.
self._padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
self._oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN
self._non_padded_namespaces = set(non_padded_namespaces)
self._vocab = _NamespaceDependentDefaultDict(self._non_padded_namespaces,
self._padding_token,
self._oov_token)
def _extend(self,
tokens_to_add: Dict[str, Dict[str, int]]):
for namespace, tokens in tokens_to_add.items():
            for token in tokens:
                # Vocab.append_token raises if the token already exists, so skip duplicates.
                if token not in self._vocab[namespace]:
                    self._vocab[namespace].append_token(token)
# def add_token_to_namespace(self, token: str, namespace: str = DEFAULT_NAMESPACE):
# """
# Add the token if not present and return the index even if token was already in the namespace.
#
# :param token: token to be added
# :param namespace: namespace to add the token to
# :return: index of the token in the namespace
# """
#
# if not isinstance(token, str):
# raise ValueError("Vocabulary tokens must be strings. Got %s with type %s" % (repr(token), type(token)))
#
@classmethod
def empty(cls):
return cls()
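# A minimal usage sketch (illustrative only): build an empty vocabulary and extend the
# default "tokens" namespace; the token counts in the dictionary are arbitrary examples.
if __name__ == "__main__":
    vocabulary = Vocabulary.empty()
    vocabulary._extend({DEFAULT_NAMESPACE: {"cat": 2, "dog": 1}})
    # A padded namespace also holds the padding and OOV tokens, so 4 entries in total.
    print(len(vocabulary._vocab[DEFAULT_NAMESPACE]))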
from typing import Dict, Optional, List, Union, Tuple
import torch
import torch.nn as nn
import utils
import combo.models.combo_nn as combo_nn
import combo.checks as checks
# Assumed import path for the Vocabulary module referenced below as data.Vocabulary.
import combo.data as data
class Predictor(nn.Module):
def forward(self,
x: Union[torch.Tensor, List[torch.Tensor]],
mask: Optional[torch.BoolTensor] = None,
labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
raise NotImplementedError()
class Linear(nn.Linear):
def __init__(self,
in_features: int,
out_features: int,
activation: Optional[combo_nn.Activation] = None,
dropout_rate: Optional[float] = 0.0):
super().__init__(in_features, out_features)
self.activation = activation if activation else self.identity
self.dropout = nn.Dropout(p=dropout_rate) if dropout_rate else self.identity
def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
x = super().forward(x)
x = self.activation(x)
return self.dropout(x)
def get_output_dim(self) -> int:
return self.out_features
@staticmethod
def identity(x):
return x
class FeedForward(torch.nn.Module):
"""
Modified copy of allennlp.modules.feedforward.FeedForward
This `Module` is a feed-forward neural network, just a sequence of `Linear` layers with
activation functions in between.
# Parameters
input_dim : `int`, required
The dimensionality of the input. We assume the input has shape `(batch_size, input_dim)`.
num_layers : `int`, required
The number of `Linear` layers to apply to the input.
hidden_dims : `Union[int, List[int]]`, required
The output dimension of each of the `Linear` layers. If this is a single `int`, we use
it for all `Linear` layers. If it is a `List[int]`, `len(hidden_dims)` must be
`num_layers`.
activations : `Union[Activation, List[Activation]]`, required
The activation function to use after each `Linear` layer. If this is a single function,
we use it after all `Linear` layers. If it is a `List[Activation]`,
`len(activations)` must be `num_layers`. Activation must have torch.nn.Module type.
dropout : `Union[float, List[float]]`, optional (default = `0.0`)
If given, we will apply this amount of dropout after each layer. Semantics of `float`
versus `List[float]` is the same as with other parameters.
# Examples
```python
FeedForward(124, 2, [64, 32], torch.nn.ReLU(), 0.2)
#> FeedForward(
#> (_activations): ModuleList(
#> (0): ReLU()
#> (1): ReLU()
#> )
#> (_linear_layers): ModuleList(
#> (0): Linear(in_features=124, out_features=64, bias=True)
#> (1): Linear(in_features=64, out_features=32, bias=True)
#> )
#> (_dropout): ModuleList(
#> (0): Dropout(p=0.2, inplace=False)
#> (1): Dropout(p=0.2, inplace=False)
#> )
#> )
```
"""
def __init__(
self,
input_dim: int,
num_layers: int,
hidden_dims: Union[int, List[int]],
activations: Union[combo_nn.Activation, List[combo_nn.Activation]],
dropout: Union[float, List[float]] = 0.0,
) -> None:
super().__init__()
if not isinstance(hidden_dims, list):
hidden_dims = [hidden_dims] * num_layers # type: ignore
if not isinstance(activations, list):
activations = [activations] * num_layers # type: ignore
if not isinstance(dropout, list):
dropout = [dropout] * num_layers # type: ignore
if len(hidden_dims) != num_layers:
raise checks.ConfigurationError(
"len(hidden_dims) (%d) != num_layers (%d)" % (len(hidden_dims), num_layers)
)
if len(activations) != num_layers:
raise checks.ConfigurationError(
"len(activations) (%d) != num_layers (%d)" % (len(activations), num_layers)
)
if len(dropout) != num_layers:
raise checks.ConfigurationError(
"len(dropout) (%d) != num_layers (%d)" % (len(dropout), num_layers)
)
self._activations = torch.nn.ModuleList(activations)
input_dims = [input_dim] + hidden_dims[:-1]
linear_layers = []
for layer_input_dim, layer_output_dim in zip(input_dims, hidden_dims):
linear_layers.append(torch.nn.Linear(layer_input_dim, layer_output_dim))
self._linear_layers = torch.nn.ModuleList(linear_layers)
dropout_layers = [torch.nn.Dropout(p=value) for value in dropout]
self._dropout = torch.nn.ModuleList(dropout_layers)
self._output_dim = hidden_dims[-1]
self.input_dim = input_dim
def get_output_dim(self):
return self._output_dim
def get_input_dim(self):
return self.input_dim
def forward(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
output = inputs
feature_maps = []
for layer, activation, dropout in zip(
self._linear_layers, self._activations, self._dropout
):
feature_maps.append(output)
output = dropout(activation(layer(output)))
return output, feature_maps
class FeedForwardPredictor(Predictor):
"""Feedforward predictor. Should be used on top of Seq2Seq encoder."""
def __init__(self, feedforward_network: "FeedForward"):
super().__init__()
self.feedforward_network = feedforward_network
def forward(self,
x: Union[torch.Tensor, List[torch.Tensor]],
mask: Optional[torch.BoolTensor] = None,
labels: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
sample_weights: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None) -> Dict[str, torch.Tensor]:
if mask is None:
mask = x.new_ones(x.size()[:-1])
x, feature_maps = self.feedforward_network(x)
output = {
"prediction": x.argmax(-1),
"probability": x,
"embedding": feature_maps[-1],
}
if labels is not None:
if sample_weights is None:
sample_weights = labels.new_ones([mask.size(0)])
output["loss"] = self._loss(x, labels, mask, sample_weights)
return output
def _loss(self,
pred: torch.Tensor,
true: torch.Tensor,
mask: torch.BoolTensor,
sample_weights: torch.Tensor) -> torch.Tensor:
BATCH_SIZE, _, CLASSES = pred.size()
valid_positions = mask.sum()
pred = pred.reshape(-1, CLASSES)
true = true.reshape(-1)
mask = mask.reshape(-1)
loss = utils.masked_cross_entropy(pred, true, mask)
loss = loss.reshape(BATCH_SIZE, -1) * sample_weights.unsqueeze(-1)
return loss.sum() / valid_positions
@classmethod
def from_vocab(cls,
vocab: data.Vocabulary,
vocab_namespace: str,
input_dim: int,
num_layers: int,
hidden_dims: List[int],
activations: Union[combo_nn.Activation, List[combo_nn.Activation]],
dropout: Union[float, List[float]] = 0.0,
):
if len(hidden_dims) + 1 != num_layers:
raise checks.ConfigurationError(
f"len(hidden_dims) ({len(hidden_dims):d}) + 1 != num_layers ({num_layers:d})"
)
        assert vocab_namespace in vocab.get_namespaces(), \
            f"There is no '{vocab_namespace}' namespace in the created vocabs; check whether this field has any values to predict!"
hidden_dims = hidden_dims + [vocab.get_vocab_size(vocab_namespace)]
return cls(FeedForward(
input_dim=input_dim,
num_layers=num_layers,
hidden_dims=hidden_dims,
activations=activations,
dropout=dropout))
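# A minimal usage sketch (illustrative only; tensor shapes are arbitrary and torch.nn.ReLU
# stands in here for a combo_nn.Activation implementation).
if __name__ == "__main__":
    predictor = FeedForwardPredictor(FeedForward(
        input_dim=10,
        num_layers=2,
        hidden_dims=[16, 5],
        activations=[nn.ReLU(), nn.ReLU()]))
    encoded = torch.randn(2, 7, 10)        # (batch, sequence length, encoder dim)
    labels = torch.randint(0, 5, (2, 7))   # gold class index per token
    output = predictor(x=encoded, labels=labels)
    print(output["prediction"].shape)      # torch.Size([2, 7])
    print(output["loss"].item())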
import torch
import torch.nn as nn
class Activation(nn.Module):
    def __call__(self, tensor: torch.Tensor) -> torch.Tensor:
raise NotImplementedError
import torch
import torch.nn.functional as F
def masked_cross_entropy(pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
pred = pred + (mask.float().unsqueeze(-1) + 1e-45).log()
return F.cross_entropy(pred, true, reduction="none") * mask
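# Illustrative check (assumed shapes): per-position losses are returned unreduced, and
# positions where the mask is False are multiplied by zero, so padding does not contribute.
#   pred = torch.randn(4, 3)                       # 4 positions, 3 classes
#   true = torch.tensor([0, 2, 1, 1])
#   mask = torch.tensor([True, True, False, False])
#   masked_cross_entropy(pred, true, mask)         # shape (4,); last two entries are 0.0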
# Configuration
Configuration is handled through dependency injection, using the `dependency_injector` package.
Configuration files can be in JSON or INI format.
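For example, settings could be exposed through a `Configuration` provider along these lines (a minimal sketch; the container layout and the `training.batch_size` option are only illustrative, not the actual COMBO configuration schema):

```python
from dependency_injector import containers, providers


class Container(containers.DeclarativeContainer):
    config = providers.Configuration()


container = Container()
container.config.from_ini("config.ini")    # or: container.config.from_json("config.json")
print(container.config.training.batch_size())
```

Here `config.ini` is assumed to contain a `[training]` section with a `batch_size` option.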
# sent_id = test-s1
# text = Easy sentence.
1 Verylongwordwhichmustbetruncatedbythesystemto30 easy ADJ adj AdpType=Prep|Adp 2 amod _ _
2 Sentence verylonglemmawhichmustbetruncatedbythesystemto30 NOUN nom Number=Sing 0 root _ _
3 . . PUNCT . _ 1 punct _ _
# sent_id = test-s1
# text = Easy sentence.
1 Verylongwordwhichmustbetruncatedbythesystemto30 easy ADJ adj AdpType=Prep|Adp 2 amod _ _
2 Sentence verylonglemmawhichmustbetruncatedbythesystemto30 NOUN nom Number=Sing 0 root _ _
3 . . PUNCT . _ 1 punct 2:mod _
4 . . PUNCT . _ 1 punct 2:xmod _
main.py 0 → 100644
# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
def print_hi(name):
# Use a breakpoint in the code line below to debug your script.
print(f'Hi, {name}') # Press Ctrl+F8 to toggle the breakpoint.
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
print_hi('PyCharm')
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
conllu~=4.4.1
dependency-injector~=4.41.0
overrides~=7.3.1
torch~=1.13.1
torchtext~=0.14.1
numpy~=1.24.1
pytorch-lightning~=1.9.0