From 6a60de673f49c9b9f519c7c8190e16678468e3be Mon Sep 17 00:00:00 2001
From: pszenny <pszenny@e-science.pl>
Date: Wed, 20 Oct 2021 01:34:33 +0200
Subject: [PATCH] Release 1.0.5: raise an error when too many tokens are
 processed by BERT

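The mismatched transformer indexer now fails fast, with a descriptive
ValueError, when a sentence produces more wordpiece tokens than the underlying
transformer model accepts, instead of failing later inside the model. A minimal
sketch of the new behaviour, following the prediction example from the README
(the model name is only illustrative; any transformer-based COMBO model behaves
the same way):

    from combo.predict import COMBO

    nlp = COMBO.from_pretrained("polish-herbert-base")
    try:
        # An input whose wordpiece count exceeds the transformer's limit
        # (typically 512 for BERT-like models) now raises immediately.
        nlp("verylongword " * 2000)
    except ValueError as error:
        # The message reports the model's maximum input length and the
        # wordpiece count of the offending sentence.
        print(error)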
---
 README.md                                     | 27 ++++++++------
 combo/config.template.jsonnet                 |  2 +-
 ...etrained_transformer_mismatched_indexer.py | 37 ++++++++++++++++++-
 docs/installation.md                          |  6 +--
 setup.py                                      |  2 +-
 5 files changed, 55 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 76e436b..e758bc6 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
 Clone this repository and install COMBO (we suggest creating a virtualenv/conda environment with Python 3.6+, as a bundle of required packages will be installed):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 Run the following commands in your Python console to make predictions with a pre-trained model:
 ```python
@@ -42,21 +42,24 @@ We encourage you to use the [beginner's tutorial](https://colab.research.google.
 
 ## Citing
 
-### Accepted at EMNLP'21 demo session :tada: :fire:
-
-If you use COMBO in your research, please cite [COMBO: State-of-the-Art Morphosyntactic Analysis](https://arxiv.org/abs/2109.05361)
+If you use COMBO in your research, please cite [COMBO: State-of-the-Art Morphosyntactic Analysis](https://aclanthology.org/2021.emnlp-demo.7)
 ```bibtex
-@misc{klimaszewski2021combo,
-      title={COMBO: State-of-the-Art Morphosyntactic Analysis}, 
-      author={Mateusz Klimaszewski and Alina Wróblewska},
-      year={2021},
-      eprint={2109.05361},
-      archivePrefix={arXiv},
-      primaryClass={cs.CL}
+@inproceedings{klimaszewski-wroblewska-2021-combo-state,
+    title = "{COMBO}: State-of-the-Art Morphosyntactic Analysis",
+    author = "Klimaszewski, Mateusz  and
+      Wr{\'o}blewska, Alina",
+    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = nov,
+    year = "2021",
+    address = "Online and Punta Cana, Dominican Republic",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2021.emnlp-demo.7",
+    pages = "50--62",
+    abstract = "We introduce COMBO {--} a fully neural NLP system for accurate part-of-speech tagging, morphological analysis, lemmatisation, and (enhanced) dependency parsing. It predicts categorical morphosyntactic features whilst also exposes their vector representations, extracted from hidden layers. COMBO is an easy to install Python package with automatically downloadable pre-trained models for over 40 languages. It maintains a balance between efficiency and quality. As it is an end-to-end system and its modules are jointly trained, its training is competitively fast. As its models are optimised for accuracy, they achieve often better prediction quality than SOTA. The COMBO library is available at: https://gitlab.clarin-pl.eu/syntactic-tools/combo.",
 }
 ```
 
-If you use an EUD module in your research, please cite [COMBO: A New Module for EUD Parsing](https://aclanthology.org/2021.iwpt-1.16/)
+If you use an EUD module in your research, please cite [COMBO: A New Module for EUD Parsing](https://aclanthology.org/2021.iwpt-1.16)
 ```bibtex
 @inproceedings{klimaszewski-wroblewska-2021-combo,
     title = "{COMBO}: A New Module for {EUD} Parsing",
diff --git a/combo/config.template.jsonnet b/combo/config.template.jsonnet
index c602d9c..4e44f42 100644
--- a/combo/config.template.jsonnet
+++ b/combo/config.template.jsonnet
@@ -389,4 +389,4 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
     random_seed: 8787,
     pytorch_seed: 8787,
     numpy_seed: 8787,
-}
+}
\ No newline at end of file
diff --git a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
index 3eee80e..b9a4e3c 100644
--- a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
+++ b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
@@ -1,12 +1,14 @@
 from typing import Optional, Dict, Any, List, Tuple
 
 from allennlp import data
-from allennlp.data import token_indexers, tokenizers
+from allennlp.data import token_indexers, tokenizers, IndexedTokenList, vocabulary
+from overrides import overrides
+
+from typing import List
 
 
 @data.TokenIndexer.register("pretrained_transformer_mismatched_fixed")
 class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransformerMismatchedIndexer):
-    """TODO(mklimasz) Remove during next allennlp update, fixed on allennlp master."""
 
     def __init__(self, model_name: str, namespace: str = "tags", max_length: int = None,
                  tokenizer_kwargs: Optional[Dict[str, Any]] = None, **kwargs) -> None:
@@ -24,6 +26,37 @@ class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransforme
         self._num_added_start_tokens = self._matched_indexer._num_added_start_tokens
         self._num_added_end_tokens = self._matched_indexer._num_added_end_tokens
 
+    @overrides
+    def tokens_to_indices(self,
+                          tokens: List[data.Token],
+                          vocabulary: vocabulary.Vocabulary) -> IndexedTokenList:
+        """
+        Overridden to raise an error when the number of wordpiece tokens needed to embed a sentence
+        exceeds the maximum input length of the model.
+        """
+        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)
+
+        wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize(
+            [t.ensure_text() for t in tokens])
+
+        if len(wordpieces) > self._tokenizer.max_len_single_sentence:
+            raise ValueError("The following sentence consists of more wordpiece tokens than the model can process:\n" +
+                             " ".join([str(x) for x in tokens[:10]]) + " ... \n" +
+                             f"Maximum input length: {self._tokenizer.max_len_single_sentence}\n" +
+                             f"Current input length: {len(wordpieces)}")
+
+        offsets = [x if x is not None else (-1, -1) for x in offsets]
+
+        output: IndexedTokenList = {
+            "token_ids": [t.text_id for t in wordpieces],
+            "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
+            "type_ids": [t.type_id for t in wordpieces],
+            "offsets": offsets,
+            "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
+        }
+
+        return self._matched_indexer._postprocess_output(output)
+
 
 class PretrainedTransformerIndexer(token_indexers.PretrainedTransformerIndexer):
 
diff --git a/docs/installation.md b/docs/installation.md
index 422bed2..6354582 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -2,7 +2,7 @@
 Clone this repository and install COMBO (we suggest using virtualenv/conda with Python 3.6+):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 combo --helpfull
 ```
 
@@ -11,7 +11,7 @@ combo --helpfull
 python -m venv venv
 source venv/bin/activate
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ### Conda example:
@@ -19,7 +19,7 @@ pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
 conda create -n combo python=3.8
 conda activate combo
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.1
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ## Problems & solutions
diff --git a/setup.py b/setup.py
index 876909d..0e28601 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ REQUIREMENTS = [
 
 setup(
     name='combo',
-    version='1.0.4',
+    version='1.0.5',
     author='Mateusz Klimaszewski',
     author_email='M.Klimaszewski@ii.pw.edu.pl',
     install_requires=REQUIREMENTS,
-- 
GitLab