From f39ffe688fefc4c54c21d076308df9a5fba261aa Mon Sep 17 00:00:00 2001
From: pszenny <pszenny@e-science.pl>
Date: Wed, 20 Oct 2021 01:34:33 +0200
Subject: [PATCH 1/4] Release 1.0.5: raise an error when too many tokens are
 processed by BERT

---
 README.md                                     |  2 +-
 combo/config.template.jsonnet                 |  2 +-
 ...etrained_transformer_mismatched_indexer.py | 37 ++++++++++++++++++-
 docs/installation.md                          |  6 +--
 setup.py                                      |  2 +-
 5 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index ce43fb4..e758bc6 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Clone this repository and install COMBO (we suggest creating a virtualenv/conda
 environment with Python 3.6+, as a bundle of required packages will be installed):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 Run the following commands in your Python console to make predictions with a pre-trained model:
 ```python
diff --git a/combo/config.template.jsonnet b/combo/config.template.jsonnet
index c602d9c..4e44f42 100644
--- a/combo/config.template.jsonnet
+++ b/combo/config.template.jsonnet
@@ -389,4 +389,4 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
     random_seed: 8787,
     pytorch_seed: 8787,
     numpy_seed: 8787,
-}
+}
\ No newline at end of file
diff --git a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
index 3eee80e..b9a4e3c 100644
--- a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
+++ b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
@@ -1,12 +1,14 @@
 from typing import Optional, Dict, Any, List, Tuple
 
 from allennlp import data
-from allennlp.data import token_indexers, tokenizers
+from allennlp.data import token_indexers, tokenizers, IndexedTokenList, vocabulary
+from overrides import overrides
+
+from typing import List
 
 
 @data.TokenIndexer.register("pretrained_transformer_mismatched_fixed")
 class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransformerMismatchedIndexer):
-    """TODO(mklimasz) Remove during next allennlp update, fixed on allennlp master."""
 
     def __init__(self, model_name: str, namespace: str = "tags", max_length: int = None,
                  tokenizer_kwargs: Optional[Dict[str, Any]] = None, **kwargs) -> None:
@@ -24,6 +26,37 @@ class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransforme
         self._num_added_start_tokens = self._matched_indexer._num_added_start_tokens
         self._num_added_end_tokens = self._matched_indexer._num_added_end_tokens
+
+    @overrides
+    def tokens_to_indices(self,
+                          tokens,
+                          vocabulary: vocabulary) -> IndexedTokenList:
+        """
+        The method is overridden to raise an error when the number of wordpieces needed to embed
+        a sentence exceeds the maximal input length of the model.
+        """
+        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)
+
+        wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize(
+            [t.ensure_text() for t in tokens])
+
+        if len(wordpieces) > self._tokenizer.max_len_single_sentence:
+            raise ValueError("The following sentence consists of more wordpiece tokens than the model can process:\n" + \
+                             " ".join([str(x) for x in tokens[:10]]) + " ... \n" + \
+                             f"Maximal input: {self._tokenizer.max_len_single_sentence}\n" + \
+                             f"Current input: {len(wordpieces)}")
+
+        offsets = [x if x is not None else (-1, -1) for x in offsets]
+
+        output: IndexedTokenList = {
+            "token_ids": [t.text_id for t in wordpieces],
+            "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
+            "type_ids": [t.type_id for t in wordpieces],
+            "offsets": offsets,
+            "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
+        }
+
+        return self._matched_indexer._postprocess_output(output)
 
 
 class PretrainedTransformerIndexer(token_indexers.PretrainedTransformerIndexer):
 
diff --git a/docs/installation.md b/docs/installation.md
index 422bed2..6354582 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -2,7 +2,7 @@ Clone this repository and install COMBO (we suggest using virtualenv/conda with
 Python 3.6+):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 combo --helpfull
 ```
 
@@ -11,7 +11,7 @@ combo --helpfull
 python -m venv venv
 source venv/bin/activate
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ### Conda example:
@@ -19,7 +19,7 @@ pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
 conda create -n combo python=3.8
 conda activate combo
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.1
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ## Problems & solutions
diff --git a/setup.py b/setup.py
index 876909d..0e28601 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ REQUIREMENTS = [
 
 setup(
     name='combo',
-    version='1.0.4',
+    version='1.0.5',
     author='Mateusz Klimaszewski',
     author_email='M.Klimaszewski@ii.pw.edu.pl',
     install_requires=REQUIREMENTS,
--
GitLab
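The check introduced in `tokens_to_indices` compares the number of wordpieces produced for a whole sentence against the transformer's single-sentence limit and fails fast with a readable message instead of letting the model crash on an over-long input. A minimal sketch of the same length check using the HuggingFace tokenizer directly (the model name and the artificially long sentence are illustrative only, not taken from the patch):

```python
from transformers import AutoTokenizer

# Illustrative checkpoint; any BERT-style model with a 512-token limit behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

words = ["example"] * 1000  # artificially long "sentence"
wordpieces = tokenizer.tokenize(" ".join(words))

# max_len_single_sentence is model_max_length minus the special tokens ([CLS], [SEP]).
if len(wordpieces) > tokenizer.max_len_single_sentence:
    raise ValueError(
        f"Sentence needs {len(wordpieces)} wordpieces, "
        f"but the model accepts at most {tokenizer.max_len_single_sentence}."
    )
```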
\n" + \ + f"Maximal input: {self._tokenizer.max_len_single_sentence}\n"+ \ + f"Current input: {len(wordpieces)}") + + offsets = [x if x is not None else (-1, -1) for x in offsets] + + output: IndexedTokenList = { + "token_ids": [t.text_id for t in wordpieces], + "mask": [True] * len(tokens), # for original tokens (i.e. word-level) + "type_ids": [t.type_id for t in wordpieces], + "offsets": offsets, + "wordpiece_mask": [True] * len(wordpieces), # for wordpieces (i.e. subword-level) + } + + return self._matched_indexer._postprocess_output(output) + class PretrainedTransformerIndexer(token_indexers.PretrainedTransformerIndexer): diff --git a/docs/installation.md b/docs/installation.md index 422bed2..6354582 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -2,7 +2,7 @@ Clone this repository and install COMBO (we suggest using virtualenv/conda with Python 3.6+): ```bash pip install -U pip setuptools wheel -pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4 +pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5 combo --helpfull ``` @@ -11,7 +11,7 @@ combo --helpfull python -m venv venv source venv/bin/activate pip install -U pip setuptools wheel -pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4 +pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5 ``` ### Conda example: @@ -19,7 +19,7 @@ pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4 conda create -n combo python=3.8 conda activate combo pip install -U pip setuptools wheel -pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.1 +pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5 ``` ## Problems & solutions diff --git a/setup.py b/setup.py index 876909d..0e28601 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ REQUIREMENTS = [ setup( name='combo', - version='1.0.4', + version='1.0.5', author='Mateusz Klimaszewski', author_email='M.Klimaszewski@ii.pw.edu.pl', install_requires=REQUIREMENTS, -- GitLab From e9b1721f566fe9a20d5ad9ba8bf0d5cfa1c40a85 Mon Sep 17 00:00:00 2001 From: martynawiacek <Hkkm6072> Date: Thu, 20 Jan 2022 17:28:20 +0100 Subject: [PATCH 2/4] Add try/catch clause for sentences with large number of wordpieces. 
From a3edda3911ef0b55faf11993c313ef7d58357128 Mon Sep 17 00:00:00 2001
From: pszenny <pszenny@e-science.pl>
Date: Thu, 3 Feb 2022 01:13:53 +0100
Subject: [PATCH 3/4] Support for Python 3.9

---
 setup.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index 0e28601..f4a82e5 100644
--- a/setup.py
+++ b/setup.py
@@ -7,15 +7,20 @@ REQUIREMENTS = [
     'conllu==2.3.2',
     'dataclasses;python_version<"3.7"',
     'jsonnet==0.15.0',
-    'numpy==1.19.4',
+    'filelock==3.0;python_version>="3.9"',
+    'numpy==1.19.4;python_version<"3.9"',
+    'numpy==1.22.0;python_version>="3.9"',
     'overrides==3.1.0',
     'requests==2.23.0',
     'sentencepiece==0.1.83;python_version<"3.8"',
-    'sentencepiece==0.1.85;python_version>="3.8"',
+    'sentencepiece==0.1.85;python_version>="3.8" and python_version<"3.9"',
+    'sentencepiece==0.1.94;python_version>="3.9"',
     'scipy<1.6.0;python_version<"3.7"',  # SciPy 1.6.0 works for 3.7+
+    'scipy==1.6.0;python_version>="3.7"',
     'spacy==2.3.2',
-    'scikit-learn<=0.23.2',
-    'torch==1.7.0',
+    'scikit-learn<=0.23.2;python_version<"3.9"',
+    'scikit-learn==0.23.2;python_version>="3.9"',
+    'torch==1.7.1',
     'tqdm==4.43.0',
     'transformers==4.0.1',
     'urllib3==1.25.11',
@@ -31,7 +36,10 @@ setup(
     license='GPL-3.0',
     url='https://gitlab.clarin-pl.eu/syntactic-tools/combo',
     keywords="nlp natural-language-processing dependency-parsing",
-    setup_requires=['pytest-runner', 'pytest-pylint'],
+    setup_requires=['pytest-runner',
+                    'pytest-pylint',
+                    'numpy==1.22.0;python_version>="3.9"',
+                    'scipy==1.6.0;python_version>="3.7"'],
     tests_require=['pytest', 'pylint'],
     python_requires='>=3.6',
     package_data={'combo': ['config.graph.template.jsonnet', 'config.template.jsonnet']},
@@ -44,5 +52,6 @@ setup(
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
     ]
-)
+)
\ No newline at end of file
--
GitLab
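The version pins above rely on PEP 508 environment markers, so one source distribution resolves to different dependency sets depending on the interpreter. A small sketch of how such markers evaluate, using the `packaging` library (the marker strings mirror the ones added in this patch; the requirement list here is illustrative, not the full set):

```python
from packaging.markers import Marker

requirements = [
    'numpy==1.19.4;python_version<"3.9"',
    'numpy==1.22.0;python_version>="3.9"',
    'sentencepiece==0.1.85;python_version>="3.8" and python_version<"3.9"',
]

for requirement in requirements:
    pin, marker = requirement.split(";", 1)
    # Marker.evaluate() tests the condition against the running interpreter.
    if Marker(marker).evaluate():
        print("would install:", pin)
```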
From 4dc8c92f51e6e881bdd23734127c5af1fab1bfe7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Pszenny?= <pszenny@e-science.pl>
Date: Fri, 11 Feb 2022 11:11:37 +0000
Subject: [PATCH 4/4] Update README.md

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index e758bc6..0b0cfc7 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,11 @@ Clone this repository and install COMBO (we suggest creating a virtualenv/conda
 pip install -U pip setuptools wheel
 pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
+For Python 3.9 you may also need to install cython:
+```bash
+pip install -U pip cython
+```
+
 Run the following commands in your Python console to make predictions with a pre-trained model:
 ```python
 from combo.predict import COMBO
--
GitLab
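Taken together, the series bumps COMBO to 1.0.5, makes over-long sentences fail with a clear message, and adds Python 3.9 support plus the extra cython hint in the README. A quick way to confirm that a fresh environment picked up the expected interpreter and package versions (package names follow this series; exact versions depend on the markers in setup.py):

```python
import sys
from importlib.metadata import version, PackageNotFoundError  # Python 3.8+

print("interpreter:", sys.version.split()[0])
for package in ("combo", "numpy", "scipy", "torch", "transformers"):
    try:
        print(f"{package}: {version(package)}")
    except PackageNotFoundError:
        print(f"{package}: not installed")
```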