diff --git a/README.md b/README.md
index ce43fb4f325d19e65193d75e6bc436a0ceef56dd..e758bc63fe132f0f7de42b3700bba0753cba4a72 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
 Clone this repository and install COMBO (we suggest creating a virtualenv/conda environment with Python 3.6+, as a bundle of required packages will be installed):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 Run the following commands in your Python console to make predictions with a pre-trained model:
 ```python
diff --git a/combo/config.template.jsonnet b/combo/config.template.jsonnet
index c602d9cdc25fbee465f8b0d5fd51f3368e0811c7..4e44f42bbac89d4bf852d31fee29d5d2ef4d4671 100644
--- a/combo/config.template.jsonnet
+++ b/combo/config.template.jsonnet
@@ -389,4 +389,4 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
     random_seed: 8787,
     pytorch_seed: 8787,
     numpy_seed: 8787,
-}
+}
\ No newline at end of file
diff --git a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
index 3eee80efd65ad905c447849d9e4cdd93673be522..b9a4e3ce0aacfb6d49c635ca4e88990c6dcd2660 100644
--- a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
+++ b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
@@ -1,12 +1,14 @@
 from typing import Optional, Dict, Any, List, Tuple
 
 from allennlp import data
-from allennlp.data import token_indexers, tokenizers
+from allennlp.data import token_indexers, tokenizers, IndexedTokenList, vocabulary
+from overrides import overrides
+
+from typing import List
 
 
 @data.TokenIndexer.register("pretrained_transformer_mismatched_fixed")
 class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransformerMismatchedIndexer):
-    """TODO(mklimasz) Remove during next allennlp update, fixed on allennlp master."""
 
     def __init__(self, model_name: str, namespace: str = "tags", max_length: int = None,
                  tokenizer_kwargs: Optional[Dict[str, Any]] = None, **kwargs) -> None:
@@ -24,6 +26,50 @@ class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransforme
         self._num_added_start_tokens = self._matched_indexer._num_added_start_tokens
         self._num_added_end_tokens = self._matched_indexer._num_added_end_tokens
 
+    @overrides
+    def tokens_to_indices(self,
+                          tokens: List[data.Token],
+                          vocabulary: vocabulary.Vocabulary) -> IndexedTokenList:
+        """Index one sentence as wordpieces, failing fast when it is too long.
+
+        Overridden in order to raise an error when the number of wordpieces needed
+        to embed a sentence exceeds the maximal input of the model.
+
+        :param tokens: word-level tokens of a single sentence.
+        :param vocabulary: vocabulary passed to the matched indexer so it can add the
+            transformer encoding if needed.
+        :return: indexed wordpieces, post-processed by the matched indexer.
+        :raises ValueError: if there are more wordpieces than
+            ``max_len_single_sentence`` of the underlying tokenizer.
+        """
+        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)
+
+        wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize(
+            [t.ensure_text() for t in tokens])
+
+        # NOTE(review): wordpieces presumably already contain the special tokens added by
+        # intra_word_tokenize, while max_len_single_sentence excludes them - confirm the bound.
+        max_wordpieces = self._tokenizer.max_len_single_sentence
+        if len(wordpieces) > max_wordpieces:
+            raise ValueError(
+                "Following sentence consists of more wordpiece tokens than the model can process:\n"
+                + " ".join(str(x) for x in tokens[:10]) + " ... \n"
+                + f"Maximal input: {max_wordpieces}\n"
+                + f"Current input: {len(wordpieces)}")
+
+        # Replace missing (None) offsets with the (-1, -1) placeholder.
+        offsets = [x if x is not None else (-1, -1) for x in offsets]
+
+        output: IndexedTokenList = {
+            "token_ids": [t.text_id for t in wordpieces],
+            "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
+            "type_ids": [t.type_id for t in wordpieces],
+            "offsets": offsets,
+            "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
+        }
+
+        return self._matched_indexer._postprocess_output(output)
+
 
 class PretrainedTransformerIndexer(token_indexers.PretrainedTransformerIndexer):
 
diff --git a/docs/installation.md b/docs/installation.md
index 422bed22423873a334c3894e48b02ceed5e4e7b9..6354582dc8f25526c51aa90d4ac5a5b3b16106d5 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -2,7 +2,7 @@
 Clone this repository and install COMBO (we suggest using virtualenv/conda with Python 3.6+):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 combo --helpfull
 ```
 
@@ -11,7 +11,7 @@ combo --helpfull
 python -m venv venv
 source venv/bin/activate
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ### Conda example:
@@ -19,7 +19,7 @@ pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
 conda create -n combo python=3.8
 conda activate combo
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.1
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ## Problems & solutions
diff --git a/setup.py b/setup.py
index 876909dc196e631e2df7c83aaeae6ae53e2cfe17..0e2860139b5a928034998bfba1a1a06d99c5616f 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ REQUIREMENTS = [
 
 setup(
     name='combo',
-    version='1.0.4',
+    version='1.0.5',
     author='Mateusz Klimaszewski',
     author_email='M.Klimaszewski@ii.pw.edu.pl',
     install_requires=REQUIREMENTS,