From f39ffe688fefc4c54c21d076308df9a5fba261aa Mon Sep 17 00:00:00 2001
From: pszenny <pszenny@e-science.pl>
Date: Wed, 20 Oct 2021 01:34:33 +0200
Subject: [PATCH] Release 1.0.5 Error while too many tokens are processed by
 BERT

---
 README.md                                     |  2 +-
 combo/config.template.jsonnet                 |  2 +-
 ...etrained_transformer_mismatched_indexer.py | 37 ++++++++++++++++++-
 docs/installation.md                          |  6 +--
 setup.py                                      |  2 +-
 5 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index ce43fb4..e758bc6 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
 Clone this repository and install COMBO (we suggest creating a virtualenv/conda environment with Python 3.6+, as a bundle of required packages will be installed):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 Run the following commands in your Python console to make predictions with a pre-trained model:
 ```python
diff --git a/combo/config.template.jsonnet b/combo/config.template.jsonnet
index c602d9c..4e44f42 100644
--- a/combo/config.template.jsonnet
+++ b/combo/config.template.jsonnet
@@ -389,4 +389,4 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
     random_seed: 8787,
     pytorch_seed: 8787,
     numpy_seed: 8787,
-}
+}
\ No newline at end of file
diff --git a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
index 3eee80e..b9a4e3c 100644
--- a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
+++ b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
@@ -1,12 +1,14 @@
 from typing import Optional, Dict, Any, List, Tuple
 
 from allennlp import data
-from allennlp.data import token_indexers, tokenizers
+from allennlp.data import token_indexers, tokenizers, IndexedTokenList, vocabulary
+from overrides import overrides
+
+from typing import List
 
 
 @data.TokenIndexer.register("pretrained_transformer_mismatched_fixed")
 class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransformerMismatchedIndexer):
-    """TODO(mklimasz) Remove during next allennlp update, fixed on allennlp master."""
 
     def __init__(self, model_name: str, namespace: str = "tags", max_length: int = None,
                  tokenizer_kwargs: Optional[Dict[str, Any]] = None, **kwargs) -> None:
@@ -24,6 +26,37 @@ class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransforme
         self._num_added_start_tokens = self._matched_indexer._num_added_start_tokens
         self._num_added_end_tokens = self._matched_indexer._num_added_end_tokens
 
+    @overrides
+    def tokens_to_indices(self,
+                          tokens,
+                          vocabulary: vocabulary ) -> IndexedTokenList:
+        """
+        Method is overridden in order to raise an error while the number of tokens needed to embed a sentence exceeds the
+        maximal input of a model.
+        """
+        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)
+
+        wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize(
+            [t.ensure_text() for t in tokens])
+
+        if len(wordpieces) > self._tokenizer.max_len_single_sentence:
+            raise ValueError("Following sentence consists of more wordpiece tokens that the model can process:\n" +\
+                             " ".join([str(x) for x in tokens[:10]]) + " ... \n" + \
+                             f"Maximal input: {self._tokenizer.max_len_single_sentence}\n"+ \
+                             f"Current input: {len(wordpieces)}")
+
+        offsets = [x if x is not None else (-1, -1) for x in offsets]
+
+        output: IndexedTokenList = {
+            "token_ids": [t.text_id for t in wordpieces],
+            "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
+            "type_ids": [t.type_id for t in wordpieces],
+            "offsets": offsets,
+            "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
+        }
+
+        return self._matched_indexer._postprocess_output(output)
+
 
 class PretrainedTransformerIndexer(token_indexers.PretrainedTransformerIndexer):
 
diff --git a/docs/installation.md b/docs/installation.md
index 422bed2..6354582 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -2,7 +2,7 @@
 Clone this repository and install COMBO (we suggest using virtualenv/conda with Python 3.6+):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 combo --helpfull
 ```
 
@@ -11,7 +11,7 @@ combo --helpfull
 python -m venv venv
 source venv/bin/activate
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ### Conda example:
@@ -19,7 +19,7 @@ pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
 conda create -n combo python=3.8
 conda activate combo
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.1
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ## Problems & solutions
diff --git a/setup.py b/setup.py
index 876909d..0e28601 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ REQUIREMENTS = [
 
 setup(
     name='combo',
-    version='1.0.4',
+    version='1.0.5',
     author='Mateusz Klimaszewski',
     author_email='M.Klimaszewski@ii.pw.edu.pl',
     install_requires=REQUIREMENTS,
-- 
GitLab