From f39ffe688fefc4c54c21d076308df9a5fba261aa Mon Sep 17 00:00:00 2001
From: pszenny <pszenny@e-science.pl>
Date: Wed, 20 Oct 2021 01:34:33 +0200
Subject: [PATCH 1/4] Release 1.0.5: raise an error when too many tokens are
 processed by BERT

---
 README.md                                     |  2 +-
 combo/config.template.jsonnet                 |  2 +-
 ...etrained_transformer_mismatched_indexer.py | 37 ++++++++++++++++++-
 docs/installation.md                          |  6 +--
 setup.py                                      |  2 +-
 5 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index ce43fb4..e758bc6 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Clone this repository and install COMBO (we suggest creating a virtualenv/conda
 environment with Python 3.6+, as a bundle of required packages will be installed):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 Run the following commands in your Python console to make predictions with a pre-trained model:
 ```python
diff --git a/combo/config.template.jsonnet b/combo/config.template.jsonnet
index c602d9c..4e44f42 100644
--- a/combo/config.template.jsonnet
+++ b/combo/config.template.jsonnet
@@ -389,4 +389,4 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
     random_seed: 8787,
     pytorch_seed: 8787,
     numpy_seed: 8787,
-}
+}
\ No newline at end of file
diff --git a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
index 3eee80e..b9a4e3c 100644
--- a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
+++ b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
@@ -1,12 +1,14 @@
 from typing import Optional, Dict, Any, List, Tuple
 
 from allennlp import data
-from allennlp.data import token_indexers, tokenizers
+from allennlp.data import token_indexers, tokenizers, IndexedTokenList, vocabulary
+from overrides import overrides
+
+from typing import List
 
 
 @data.TokenIndexer.register("pretrained_transformer_mismatched_fixed")
 class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransformerMismatchedIndexer):
-    """TODO(mklimasz) Remove during next allennlp update, fixed on allennlp master."""
 
     def __init__(self, model_name: str, namespace: str = "tags", max_length: int = None,
                  tokenizer_kwargs: Optional[Dict[str, Any]] = None, **kwargs) -> None:
@@ -24,6 +26,37 @@ class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransforme
         self._num_added_start_tokens = self._matched_indexer._num_added_start_tokens
         self._num_added_end_tokens = self._matched_indexer._num_added_end_tokens
+
+    @overrides
+    def tokens_to_indices(self,
+                          tokens,
+                          vocabulary: vocabulary) -> IndexedTokenList:
+        """
+        The method is overridden to raise an error when the number of wordpieces needed to embed
+        a sentence exceeds the maximal input length of the model.
+        """
+        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)
+
+        wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize(
+            [t.ensure_text() for t in tokens])
+
+        if len(wordpieces) > self._tokenizer.max_len_single_sentence:
+            raise ValueError("The following sentence consists of more wordpiece tokens than the model can process:\n" + \
+                             " ".join([str(x) for x in tokens[:10]]) + " ... \n" + \
+                             f"Maximal input: {self._tokenizer.max_len_single_sentence}\n" + \
+                             f"Current input: {len(wordpieces)}")
+
+        offsets = [x if x is not None else (-1, -1) for x in offsets]
+
+        output: IndexedTokenList = {
+            "token_ids": [t.text_id for t in wordpieces],
+            "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
+            "type_ids": [t.type_id for t in wordpieces],
+            "offsets": offsets,
+            "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
+        }
+
+        return self._matched_indexer._postprocess_output(output)
 
 
 class PretrainedTransformerIndexer(token_indexers.PretrainedTransformerIndexer):
 
diff --git a/docs/installation.md b/docs/installation.md
index 422bed2..6354582 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -2,7 +2,7 @@ Clone this repository and install COMBO (we suggest using virtualenv/conda with
 Python 3.6+):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 combo --helpfull
 ```
 
@@ -11,7 +11,7 @@ combo --helpfull
 python -m venv venv
 source venv/bin/activate
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ### Conda example:
@@ -19,7 +19,7 @@ pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
 conda create -n combo python=3.8
 conda activate combo
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.1
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ## Problems & solutions
diff --git a/setup.py b/setup.py
index 876909d..0e28601 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ REQUIREMENTS = [
 
 setup(
     name='combo',
-    version='1.0.4',
+    version='1.0.5',
     author='Mateusz Klimaszewski',
     author_email='M.Klimaszewski@ii.pw.edu.pl',
     install_requires=REQUIREMENTS,
--
GitLab
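The check introduced in `tokens_to_indices` compares the number of wordpieces produced for a whole sentence against the transformer's single-sentence limit and fails fast with a readable message instead of letting the model crash on an over-long input. A minimal sketch of the same length check using the HuggingFace tokenizer directly (the model name and the artificially long sentence are illustrative only, not taken from the patch):

```python
from transformers import AutoTokenizer

# Illustrative checkpoint; any BERT-style model with a 512-token limit behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

words = ["example"] * 1000  # artificially long "sentence"
wordpieces = tokenizer.tokenize(" ".join(words))

# max_len_single_sentence is model_max_length minus the special tokens ([CLS], [SEP]).
if len(wordpieces) > tokenizer.max_len_single_sentence:
    raise ValueError(
        f"Sentence needs {len(wordpieces)} wordpieces, "
        f"but the model accepts at most {tokenizer.max_len_single_sentence}."
    )
```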
\n" + \ + f"Maximal input: {self._tokenizer.max_len_single_sentence}\n"+ \ + f"Current input: {len(wordpieces)}") + + offsets = [x if x is not None else (-1, -1) for x in offsets] + + output: IndexedTokenList = { + "token_ids": [t.text_id for t in wordpieces], + "mask": [True] * len(tokens), # for original tokens (i.e. word-level) + "type_ids": [t.type_id for t in wordpieces], + "offsets": offsets, + "wordpiece_mask": [True] * len(wordpieces), # for wordpieces (i.e. subword-level) + } + + return self._matched_indexer._postprocess_output(output) + class PretrainedTransformerIndexer(token_indexers.PretrainedTransformerIndexer): diff --git a/docs/installation.md b/docs/installation.md index 422bed2..6354582 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -2,7 +2,7 @@ Clone this repository and install COMBO (we suggest using virtualenv/conda with Python 3.6+): ```bash pip install -U pip setuptools wheel -pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4 +pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5 combo --helpfull ``` @@ -11,7 +11,7 @@ combo --helpfull python -m venv venv source venv/bin/activate pip install -U pip setuptools wheel -pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4 +pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5 ``` ### Conda example: @@ -19,7 +19,7 @@ pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4 conda create -n combo python=3.8 conda activate combo pip install -U pip setuptools wheel -pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.1 +pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5 ``` ## Problems & solutions diff --git a/setup.py b/setup.py index 876909d..0e28601 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ REQUIREMENTS = [ setup( name='combo', - version='1.0.4', + version='1.0.5', author='Mateusz Klimaszewski', author_email='M.Klimaszewski@ii.pw.edu.pl', install_requires=REQUIREMENTS, -- GitLab From e9b1721f566fe9a20d5ad9ba8bf0d5cfa1c40a85 Mon Sep 17 00:00:00 2001 From: martynawiacek <Hkkm6072> Date: Thu, 20 Jan 2022 17:28:20 +0100 Subject: [PATCH 2/4] Add try/catch clause for sentences with large number of wordpieces. 
From a3edda3911ef0b55faf11993c313ef7d58357128 Mon Sep 17 00:00:00 2001
From: pszenny <pszenny@e-science.pl>
Date: Thu, 3 Feb 2022 01:13:53 +0100
Subject: [PATCH 3/4] Support for Python 3.9

---
 setup.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index 0e28601..f4a82e5 100644
--- a/setup.py
+++ b/setup.py
@@ -7,15 +7,20 @@ REQUIREMENTS = [
     'conllu==2.3.2',
     'dataclasses;python_version<"3.7"',
     'jsonnet==0.15.0',
-    'numpy==1.19.4',
+    'filelock==3.0;python_version>="3.9"',
+    'numpy==1.19.4;python_version<"3.9"',
+    'numpy==1.22.0;python_version>="3.9"',
     'overrides==3.1.0',
     'requests==2.23.0',
     'sentencepiece==0.1.83;python_version<"3.8"',
-    'sentencepiece==0.1.85;python_version>="3.8"',
+    'sentencepiece==0.1.85;python_version>="3.8" and python_version<"3.9"',
+    'sentencepiece==0.1.94;python_version>="3.9"',
     'scipy<1.6.0;python_version<"3.7"',  # SciPy 1.6.0 works for 3.7+
+    'scipy==1.6.0;python_version>="3.7"',
     'spacy==2.3.2',
-    'scikit-learn<=0.23.2',
-    'torch==1.7.0',
+    'scikit-learn<=0.23.2;python_version<"3.9"',
+    'scikit-learn==0.23.2;python_version>="3.9"',
+    'torch==1.7.1',
     'tqdm==4.43.0',
     'transformers==4.0.1',
     'urllib3==1.25.11',
@@ -31,7 +36,10 @@ setup(
     license='GPL-3.0',
     url='https://gitlab.clarin-pl.eu/syntactic-tools/combo',
     keywords="nlp natural-language-processing dependency-parsing",
-    setup_requires=['pytest-runner', 'pytest-pylint'],
+    setup_requires=['pytest-runner',
+                    'pytest-pylint',
+                    'numpy==1.22.0;python_version>="3.9"',
+                    'scipy==1.6.0;python_version>="3.7"'],
     tests_require=['pytest', 'pylint'],
     python_requires='>=3.6',
     package_data={'combo': ['config.graph.template.jsonnet', 'config.template.jsonnet']},
@@ -44,5 +52,6 @@ setup(
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
     ]
-)
+)
\ No newline at end of file
--
GitLab
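The version pins above rely on PEP 508 environment markers, so one source distribution resolves to different dependency sets depending on the interpreter. A small sketch of how such markers evaluate, using the `packaging` library (the marker strings mirror the ones added in this patch; the requirement list here is illustrative, not the full set):

```python
from packaging.markers import Marker

requirements = [
    'numpy==1.19.4;python_version<"3.9"',
    'numpy==1.22.0;python_version>="3.9"',
    'sentencepiece==0.1.85;python_version>="3.8" and python_version<"3.9"',
]

for requirement in requirements:
    pin, marker = requirement.split(";", 1)
    # Marker.evaluate() tests the condition against the running interpreter.
    if Marker(marker).evaluate():
        print("would install:", pin)
```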
From 4dc8c92f51e6e881bdd23734127c5af1fab1bfe7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Pszenny?= <pszenny@e-science.pl>
Date: Fri, 11 Feb 2022 11:11:37 +0000
Subject: [PATCH 4/4] Update README.md

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index e758bc6..0b0cfc7 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,11 @@ Clone this repository and install COMBO (we suggest creating a virtualenv/conda
 pip install -U pip setuptools wheel
 pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
+For Python 3.9 you may also need to install cython:
+```bash
+pip install -U pip cython
+```
+
 Run the following commands in your Python console to make predictions with a pre-trained model:
 ```python
 from combo.predict import COMBO
--
GitLab
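Taken together, the series bumps COMBO to 1.0.5, makes over-long sentences fail with a clear message, and adds Python 3.9 support plus the extra cython hint in the README. A quick way to confirm that a fresh environment picked up the expected interpreter and package versions (package names follow this series; exact versions depend on the markers in setup.py):

```python
import sys
from importlib.metadata import version, PackageNotFoundError  # Python 3.8+

print("interpreter:", sys.version.split()[0])
for package in ("combo", "numpy", "scipy", "torch", "transformers"):
    try:
        print(f"{package}: {version(package)}")
    except PackageNotFoundError:
        print(f"{package}: not installed")
```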