diff --git a/combo/data/__init__.py b/combo/data/__init__.py index 7fb7833cc78b5aec3cacf4df3eebf88fe28b12b9..4864d179632006735d451bdbce663c8f9fcca3ed 100644 --- a/combo/data/__init__.py +++ b/combo/data/__init__.py @@ -3,11 +3,7 @@ from .samplers import TokenCountBatchSampler from .instance import Instance from .token_indexers import (SingleIdTokenIndexer, TokenIndexer, TokenFeatsIndexer) from .tokenizers import (Tokenizer, Token, CharacterTokenizer, PretrainedTransformerTokenizer, - SpacyTokenizer, WhitespaceTokenizer, LamboTokenizer) + WhitespaceTokenizer, LamboTokenizer) from .dataset_readers import (ConllDatasetReader, DatasetReader, TextClassificationJSONReader, UniversalDependenciesDatasetReader) from .api import (Sentence, tokens2conllu, conllu2sentence, sentence2conllu) - - -class TokenCharactersIndexer: - pass \ No newline at end of file diff --git a/combo/data/dataset_readers/text_classification_json_reader.py b/combo/data/dataset_readers/text_classification_json_reader.py index d01590fe169426fb4b164fc1c6c34636dcb51c5e..e0a5aac3b9f5eced9cdd0361b4c4b42ea8518816 100644 --- a/combo/data/dataset_readers/text_classification_json_reader.py +++ b/combo/data/dataset_readers/text_classification_json_reader.py @@ -30,35 +30,16 @@ class TextClassificationJSONReader(DatasetReader): def __init__(self, tokenizer: Optional[Tokenizer] = None, token_indexers: Optional[Dict[str, TokenIndexer]] = None, - sentence_segmenter: Optional[Tokenizer] = None, max_sequence_length: Optional[int] = None, skip_label_indexing: bool = False, text_key: str = "text", label_key: str = "label") -> None: - if ((sentence_segmenter is not None) and - (not _is_sentence_segmenter(sentence_segmenter))): - raise ConfigurationError(f'Passed sentence segmenter has no' - f'split_sentences method!') - super().__init__(tokenizer, token_indexers) - self.__sentence_segmenter = sentence_segmenter self.__max_sequence_length = max_sequence_length self.__skip_label_indexing = skip_label_indexing self.__text_key = text_key self.__label_key = label_key - @property - def sentence_segmenter(self) -> Optional[Tokenizer]: - return self.__sentence_segmenter - - @sentence_segmenter.setter - def sentence_segmenter(self, value: Optional[Tokenizer]): - if ((value is not None) and - (not _is_sentence_segmenter(value))): - raise ConfigurationError(f'Passed sentence segmenter has no' - f'split_sentences method!') - self.__sentence_segmenter = value - @property def max_sequence_length(self) -> Optional[int]: return self.__max_sequence_length @@ -127,22 +108,15 @@ class TextClassificationJSONReader(DatasetReader): - label ('LabelField') """ fields: Dict[str, Field] = {} - if self.sentence_segmenter is not None: - sentences: List[Field] = [] - - # TODO: some subclass for sentence segmenter for tokenizers? - sentence_splits = self.sentence_segmenter.split_sentences(text) - for sentence in sentence_splits: - word_tokens = self.tokenizer.tokenize(sentence) - if self.max_sequence_length is not None: - word_tokens = self._truncate(word_tokens) - sentences.append(TextField(word_tokens)) - fields["tokens"] = ListField(sentences) - else: - tokens = self.tokenizer.tokenize(text) + sentences: List[Field] = [] + + # TODO: some subclass for sentence segmenter for tokenizers? 
+ sentence_splits = self.tokenizer.tokenize(text) + for word_tokens in sentence_splits: if self.max_sequence_length is not None: - tokens = self._truncate(tokens) - fields["tokens"] = TextField(tokens) + word_tokens = self._truncate(word_tokens) + sentences.append(TextField(word_tokens)) + fields["tokens"] = ListField(sentences) if label is not None: fields["label"] = LabelField(label, @@ -163,9 +137,8 @@ class TextClassificationJSONReader(DatasetReader): @overrides def apply_token_indexers(self, instance: Instance) -> None: - if self.sentence_segmenter is not None: + if isinstance(instance.fields["tokens"], ListField): for text_field in instance.fields["tokens"]: # type: ignore text_field._token_indexers = self.token_indexers - else: instance.fields["tokens"]._token_indexers = self.token_indexers # type: ignore diff --git a/combo/data/dataset_readers/universal_dependencies_dataset_reader.py b/combo/data/dataset_readers/universal_dependencies_dataset_reader.py index d8f9d0e9b129e4fcbccb671dbc6ab4711a759663..22960a217e272c81995d4624a93f05ff0035da61 100644 --- a/combo/data/dataset_readers/universal_dependencies_dataset_reader.py +++ b/combo/data/dataset_readers/universal_dependencies_dataset_reader.py @@ -128,10 +128,10 @@ class UniversalDependenciesDatasetReader(DatasetReader, ABC): assert conllu_file and file.exists(), f"File with path '{conllu_file}' does not exist!" with file.open("r", encoding="utf-8") as f: for annotation in conllu.parse_incr(f, fields=self.fields, field_parsers=self.field_parsers): - yield self.text_to_instance(annotation) + yield self.text_to_instance([Token.from_conllu_token(t) for t in annotation if isinstance(t.get("id"), int)]) - def text_to_instance(self, tree: conllu.models.TokenList) -> Instance: + def text_to_instance(self, tree: List[Token]) -> Instance: fields_: Dict[str, Field] = {} # features diff --git a/combo/data/tokenizers/__init__.py b/combo/data/tokenizers/__init__.py index 0afccddf3b8ae428ff5d857f9285eaf819e2581c..4deeda91dd809fafd67efae7ade23bf3125bce3a 100644 --- a/combo/data/tokenizers/__init__.py +++ b/combo/data/tokenizers/__init__.py @@ -1,7 +1,6 @@ from .tokenizer import Tokenizer, Token from .character_tokenizer import CharacterTokenizer from .pretrained_transformer_tokenizer import PretrainedTransformerTokenizer -from .spacy_tokenizer import SpacyTokenizer from .sentence_splitter import SentenceSplitter, SpacySentenceSplitter from .whitespace_tokenizer import WhitespaceTokenizer from .lambo_tokenizer import LamboTokenizer diff --git a/combo/data/tokenizers/lambo_tokenizer.py b/combo/data/tokenizers/lambo_tokenizer.py index c5bb322f7377f8d608148d9697e75a95d81fb62b..e0beb838ec2ec31779a53a8917d3bb8be4d8d882 100644 --- a/combo/data/tokenizers/lambo_tokenizer.py +++ b/combo/data/tokenizers/lambo_tokenizer.py @@ -33,7 +33,7 @@ def _sentence_tokens(token: Token, in zip(subword_idxs, token.subwords)] return tokens else: - return [Token(idx=next(_token_idx()), text=token.text)] + return [Token(idx=next(_token_idx()), text=token.text, subwords=token.subwords)] @Registry.register('lambo_tokenizer') diff --git a/combo/data/tokenizers/spacy_tokenizer.py b/combo/data/tokenizers/spacy_tokenizer.py deleted file mode 100644 index b884aae278b3891d4bdce0dab6c07b0f2323edf6..0000000000000000000000000000000000000000 --- a/combo/data/tokenizers/spacy_tokenizer.py +++ /dev/null @@ -1,163 +0,0 @@ -""" -Adapted from AllenNLP -https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/spacy_tokenizer.py -""" - -from typing import List, Optional - -import 
spacy -from spacy.tokens import Doc - -from combo.config import Registry -from combo.config.from_parameters import register_arguments -from combo.data.tokenizers.token import Token -from combo.data.tokenizers.tokenizer import Tokenizer -from combo.utils.spacy import get_spacy_model - - -@Registry.register('spacy_tokenizer') -class SpacyTokenizer(Tokenizer): - """ - A `Tokenizer` that uses spaCy's tokenizer. It's fast and reasonable - this is the - recommended `Tokenizer`. By default it will return allennlp Tokens, - which are small, efficient NamedTuples (and are serializable). If you want - to keep the original spaCy tokens, pass keep_spacy_tokens=True. Note that we leave one particular piece of - post-processing for later: the decision of whether or not to lowercase the token. This is for - two reasons: (1) if you want to make two different casing decisions for whatever reason, you - won't have to run the tokenizer twice, and more importantly (2) if you want to lowercase words - for your word embedding, but retain capitalization in a character-level representation, we need - to retain the capitalization here. - Registered as a `Tokenizer` with name "spacy", which is currently the default. - # Parameters - language : `str`, optional, (default=`"en_core_web_sm"`) - Spacy model name. - pos_tags : `bool`, optional, (default=`False`) - If `True`, performs POS tagging with spacy model on the tokens. - Generally used in conjunction with :class:`~allennlp.data.token_indexers.pos_tag_indexer.PosTagIndexer`. - parse : `bool`, optional, (default=`False`) - If `True`, performs dependency parsing with spacy model on the tokens. - Generally used in conjunction with :class:`~allennlp.data.token_indexers.pos_tag_indexer.DepLabelIndexer`. - ner : `bool`, optional, (default=`False`) - If `True`, performs dependency parsing with spacy model on the tokens. - Generally used in conjunction with :class:`~allennlp.data.token_indexers.ner_tag_indexer.NerTagIndexer`. - keep_spacy_tokens : `bool`, optional, (default=`False`) - If `True`, will preserve spacy token objects, We copy spacy tokens into our own class by default instead - because spacy Cython Tokens can't be pickled. - split_on_spaces : `bool`, optional, (default=`False`) - If `True`, will split by spaces without performing tokenization. - Used when your data is already tokenized, but you want to perform pos, ner or parsing on the tokens. - start_tokens : `Optional[List[str]]`, optional, (default=`None`) - If given, these tokens will be added to the beginning of every string we tokenize. - end_tokens : `Optional[List[str]]`, optional, (default=`None`) - If given, these tokens will be added to the end of every string we tokenize. 
- """ - - @register_arguments - def __init__( - self, - language: str = "en_core_web_sm", - pos_tags: bool = True, - parse: bool = False, - ner: bool = False, - keep_spacy_tokens: bool = False, - split_on_spaces: bool = False, - start_tokens: Optional[List[str]] = None, - end_tokens: Optional[List[str]] = None, - ) -> None: - # Save these for use later in the _to_params method - self._language = language - self._pos_tags = pos_tags - self._parse = parse - self._ner = ner - self._split_on_spaces = split_on_spaces - - self.spacy = get_spacy_model(self._language, self._pos_tags, self._parse, self._ner) - - if self._split_on_spaces: - self.spacy.tokenizer = _WhitespaceSpacyTokenizer(self.spacy.vocab) - - self._keep_spacy_tokens = keep_spacy_tokens - self._start_tokens = start_tokens or [] - # We reverse the tokens here because we're going to insert them with `insert(0)` later; - # this makes sure they show up in the right order. - self._start_tokens.reverse() - self._is_version_3 = spacy.__version__ >= "3.0" - self._end_tokens = end_tokens or [] - - def _sanitize(self, tokens: List[spacy.tokens.Token]) -> List[Token]: - """ - Converts spaCy tokens to allennlp tokens. Is a no-op if - keep_spacy_tokens is True - """ - # self.text = text - # self.lemma = lemma - # self.upostag = upostag - # self.xpostag = xpostag - # self.entity_type = entity_type - # self.feats = feats - # self.head = head - # self.deprel = deprel - # self.deps = deps - # self.misc = misc - # self.semrel = semrel - - # TODO: add morph from SpaCy? - if not self._keep_spacy_tokens: - tokens = [ - Token( - idx=(token.idx, token.idx + len(token.text)), - text=token.text, - lemma=token.lemma_, - upostag=token.pos_, - xpostag=token.tag_, - deprel=token.dep_, - entity_type=token.ent_type_ - ) - for token in tokens - ] - for start_token in self._start_tokens: - tokens.insert(0, Token(idx=0, text=start_token)) - for end_token in self._end_tokens: - tokens.append(Token(idx=-1, text=end_token)) - return tokens - - def batch_tokenize(self, texts: List[str]) -> List[List[Token]]: - if self._is_version_3: - return [ - self._sanitize(_remove_spaces(tokens)) - for tokens in self.spacy.pipe(texts, n_process=-1) - ] - else: - return [ - self._sanitize(_remove_spaces(tokens)) - for tokens in self.spacy.pipe(texts, n_threads=-1) - ] - - def tokenize(self, text: str) -> List[Token]: - # This works because our Token class matches spacy's. - return self._sanitize(_remove_spaces(self.spacy(text))) - - -class _WhitespaceSpacyTokenizer: - """ - Spacy doesn't assume that text is tokenised. Sometimes this - is annoying, like when you have gold data which is pre-tokenised, - but Spacy's tokenisation doesn't match the gold. This can be used - as follows: - nlp = spacy.load("en_core_web_md") - # hack to replace tokenizer with a whitespace tokenizer - nlp.tokenizer = _WhitespaceSpacyTokenizer(nlp.vocabulary) - ... use nlp("here is some text") as normal. 
- """ - - def __init__(self, vocab): - self.vocab = vocab - - def __call__(self, text): - words = text.split(" ") - spaces = [True] * len(words) - return Doc(self.vocab, words=words, spaces=spaces) - - -def _remove_spaces(tokens: List[spacy.tokens.Token]) -> List[spacy.tokens.Token]: - return [token for token in tokens if not token.is_space] diff --git a/combo/modules/model.py b/combo/modules/model.py index 76e83be87dd3e38559319df44d26e45b757c8ff7..44e61652ac53c2eee08c9c6645919bb30c2e8985 100644 --- a/combo/modules/model.py +++ b/combo/modules/model.py @@ -512,7 +512,7 @@ class Model(Module, pl.LightningModule, FromParameters): If `vocabulary` is given, we will extend the loaded model's vocabulary using the passed vocabulary object (including calling `extend_embedder_vocab`, which extends embedding layers). """ - from modules.archival import load_archive # here to avoid circular imports + from combo.modules.archival import load_archive # here to avoid circular imports model = load_archive(archive_file).model if vocabulary: diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..a2c216db8698c3e42dbb41233daed6ddc76059f3 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,13 @@ +[aliases] +test=pytest + +[tox:tox] +envlist = py38 +skipsdist = True + +[testenv:pytest] +deps = pytest +commands = pytest + +[metadata] +description-file = README.md diff --git a/setup.py b/setup.py index 0aeb7ee93ea5f4322c43c22480996c971cb00d7d..5649e1d5b0f50dc68fded7f8afa84c89047d7a74 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ REQUIREMENTS = [ "pandas~=2.1.3", "pytest~=7.2.2", "transformers~=4.27.3", + "sacremoses~=0.0.53", "spacy~=3.3.1" ] diff --git a/tests/config/test_archive.py b/tests/config/test_archive.py index fa17b0dcb5b20bd5435539ee2e3181856415a777..91bfd925958acf293c062e5a1f155cfa695a4003 100644 --- a/tests/config/test_archive.py +++ b/tests/config/test_archive.py @@ -1,6 +1,7 @@ import os import unittest from tempfile import TemporaryDirectory +import importlib.resources from combo.combo_model import ComboModel from combo.data.vocabulary import Vocabulary @@ -8,10 +9,10 @@ from combo.default_model import default_model from combo.modules import archive TEMP_FILE_PATH = 'temp_serialization_dir' - +test_file_path_str = str(importlib.resources.files('tests').joinpath('test_fixtures')) def _test_vocabulary() -> Vocabulary: - return Vocabulary.from_files(os.path.normpath(os.path.join(os.getcwd(), '../fixtures/train_vocabulary')), + return Vocabulary.from_files(os.path.normpath(os.path.join(test_file_path_str, 'train_vocabulary')), oov_token='_', padding_token='__PAD__') diff --git a/tests/config/test_configuration.py b/tests/config/test_configuration.py index 123bfbc5d677ecba7b48168a6448b81ba6ab6861..4c70a2654cddf32a1e69e096265a25f6af68cbbb 100644 --- a/tests/config/test_configuration.py +++ b/tests/config/test_configuration.py @@ -1,13 +1,14 @@ import unittest import os +import importlib.resources from combo.config import Registry from combo.config.from_parameters import override_parameters from combo.data import WhitespaceTokenizer, UniversalDependenciesDatasetReader, Vocabulary from combo.data.token_indexers.token_characters_indexer import TokenCharactersIndexer - -VOCABULARY_DIR = os.path.normpath(os.path.join(os.path.dirname(__file__), '../fixtures/small_vocabulary')) +test_file_path_str = str(importlib.resources.files('tests').joinpath('test_fixtures')) +VOCABULARY_DIR = os.path.normpath(os.path.join(test_file_path_str, 'small_vocabulary')) class 
ConfigurationTest(unittest.TestCase): @@ -68,13 +69,16 @@ class ConfigurationTest(unittest.TestCase): { 'type': 'from_files_vocabulary', 'parameters': { - 'directory': VOCABULARY_DIR + 'directory': 'small_vocabulary', + 'oov_token': '@@UNKNOWN@@', + 'padding_token': '@@PADDING@@' } }) def test_serialize_and_load_non_base_constructor(self): vocab = Vocabulary.from_files(VOCABULARY_DIR) serialized_vocab = vocab.serialize() + serialized_vocab['parameters']['directory'] = VOCABULARY_DIR clz, constructor = Registry.resolve(serialized_vocab['type']) reconstructed_vocab = clz.from_parameters(serialized_vocab['parameters'], constructor) self.assertEqual(type(reconstructed_vocab), Vocabulary) diff --git a/tests/data/data_readers/test_text_classification_json_reader.py b/tests/data/data_readers/test_text_classification_json_reader.py index 07bb8db2bc15e6093cda57e28f0b66e622c7ea8d..7a6f40a6ced9f578d1195b2ef4bd5d5b9dfdb155 100644 --- a/tests/data/data_readers/test_text_classification_json_reader.py +++ b/tests/data/data_readers/test_text_classification_json_reader.py @@ -2,8 +2,7 @@ import unittest import os from combo.data.dataset_readers import TextClassificationJSONReader -from combo.data.fields import LabelField, TextField, ListField -from combo.data.tokenizers import SpacySentenceSplitter +from combo.data.fields import LabelField, ListField class TextClassificationJSONReaderTest(unittest.TestCase): @@ -22,18 +21,8 @@ class TextClassificationJSONReaderTest(unittest.TestCase): self.assertIsInstance(tokens[1].fields.get('label'), LabelField) self.assertEqual(tokens[1].fields.get('label').label, 'label2') - def test_read_two_examples_tokens_without_sentence_splitting(self): - reader = TextClassificationJSONReader() - tokens = [token for token in reader.read(os.path.join(os.path.dirname(__file__), 'text_classification_json_reader.json'))] - self.assertEqual(len(tokens[0].fields.items()), 2) - self.assertIsInstance(tokens[0].fields.get('tokens'), TextField) - self.assertEqual(len(tokens[0].fields.get('tokens').tokens), 13) - self.assertEqual(len(tokens[1].fields.items()), 2) - self.assertIsInstance(tokens[1].fields.get('tokens'), TextField) - self.assertEqual(len(tokens[1].fields.get('tokens').tokens), 6) - def test_read_two_examples_tokens_with_sentence_splitting(self): - reader = TextClassificationJSONReader(sentence_segmenter=SpacySentenceSplitter()) + reader = TextClassificationJSONReader() tokens = [token for token in reader.read(os.path.join(os.path.dirname(__file__), 'text_classification_json_reader.json'))] self.assertEqual(len(tokens[0].fields.items()), 2) self.assertIsInstance(tokens[0].fields.get('tokens'), ListField) diff --git a/tests/data/data_readers/test_universal_dependencies_dataset_reader.py b/tests/data/data_readers/test_universal_dependencies_dataset_reader.py index b3292ee0a915a2a11e19cfb472f084fbc18b3985..de3e963882674f638433c39b60246f20b4dba297 100644 --- a/tests/data/data_readers/test_universal_dependencies_dataset_reader.py +++ b/tests/data/data_readers/test_universal_dependencies_dataset_reader.py @@ -1,7 +1,7 @@ import unittest import os -from combo.data import UniversalDependenciesDatasetReader, LamboTokenizer +from combo.data import UniversalDependenciesDatasetReader class UniversalDependenciesDatasetReaderTest(unittest.TestCase): diff --git a/tests/data/token_indexers/test_pretrained_transformer_fixed_mismatched_indexer.py b/tests/data/token_indexers/test_pretrained_transformer_fixed_mismatched_indexer.py index 
d2e3e050caa5b7c7edd69a2cf9c909e7412b0cb3..1d2df930932defbcf8575adf2ee0a2b3c648b657 100644 --- a/tests/data/token_indexers/test_pretrained_transformer_fixed_mismatched_indexer.py +++ b/tests/data/token_indexers/test_pretrained_transformer_fixed_mismatched_indexer.py @@ -1,10 +1,12 @@ import unittest import os +import importlib.resources from combo.data.tokenizers import Token from combo.data.token_indexers import PretrainedTransformerFixedMismatchedIndexer from combo.data.vocabulary import Vocabulary +test_file_path_str = str(importlib.resources.files('tests').joinpath('test_fixtures')) class TokenFeatsIndexerTest(unittest.TestCase): def setUp(self) -> None: @@ -12,7 +14,7 @@ class TokenFeatsIndexerTest(unittest.TestCase): self.short_indexer = PretrainedTransformerFixedMismatchedIndexer("allegro/herbert-base-cased", max_length=3) self.vocabulary = Vocabulary.from_files( - os.path.join(os.getcwd(), '../../fixtures/train_vocabulary'), + os.path.join(test_file_path_str, 'train_vocabulary'), oov_token='_', padding_token='__PAD__' ) diff --git a/tests/data/token_indexers/test_single_id_token_indexer.py b/tests/data/token_indexers/test_single_id_token_indexer.py index 65ea3d7c4d37ebd01f39e50938ad46fb8e30797d..4723105b94a7bda7176d8e89379e56a3c858e632 100644 --- a/tests/data/token_indexers/test_single_id_token_indexer.py +++ b/tests/data/token_indexers/test_single_id_token_indexer.py @@ -1,15 +1,17 @@ import os import unittest +import importlib.resources from combo.data import SingleIdTokenIndexer, Token from combo.data.vocabulary import Vocabulary +test_file_path_str = str(importlib.resources.files('tests').joinpath('test_fixtures')) -class VocabularyTest(unittest.TestCase): +class SingleIDTokenIndexerTest(unittest.TestCase): def setUp(self): self.vocabulary = Vocabulary.from_files( - os.path.join(os.getcwd(), '../../fixtures/train_vocabulary'), + os.path.join(test_file_path_str, 'train_vocabulary'), oov_token='_', padding_token='__PAD__' ) diff --git a/tests/data/token_indexers/test_token_characters_indexer.py b/tests/data/token_indexers/test_token_characters_indexer.py index 13a7f44c1e299eac3bc2df2b7e4841ce83d976d4..b2dedb30852f133af0612f057d6295875c62a67b 100644 --- a/tests/data/token_indexers/test_token_characters_indexer.py +++ b/tests/data/token_indexers/test_token_characters_indexer.py @@ -1,16 +1,18 @@ import unittest import os +import importlib.resources from combo.data.tokenizers import Token from combo.data.token_indexers import TokenCharactersIndexer from combo.data.vocabulary import Vocabulary +test_file_path_str = str(importlib.resources.files('tests').joinpath('test_fixtures')) class TokenCharactersIndexerTest(unittest.TestCase): def setUp(self): self.vocabulary = Vocabulary.from_files( - os.path.normpath(os.path.join(os.getcwd(), '../../fixtures/train_vocabulary')), + os.path.normpath(os.path.join(test_file_path_str, 'train_vocabulary')), oov_token='_', padding_token='__PAD__' ) diff --git a/tests/data/token_indexers/test_token_features_indexer.py b/tests/data/token_indexers/test_token_features_indexer.py index 200112c96beb865bd0065397888e1d02c9c512b4..58b94147e27060b2ac21580daae178101ba294ae 100644 --- a/tests/data/token_indexers/test_token_features_indexer.py +++ b/tests/data/token_indexers/test_token_features_indexer.py @@ -1,16 +1,18 @@ import unittest import os +import importlib.resources from combo.data.tokenizers import Token from combo.data.token_indexers import TokenFeatsIndexer from combo.data.vocabulary import Vocabulary +test_file_path_str = 
str(importlib.resources.files('tests').joinpath('test_fixtures')) class TokenFeatsIndexerTest(unittest.TestCase): def setUp(self): self.vocabulary = Vocabulary.from_files( - os.path.join(os.getcwd(), '../../fixtures/train_vocabulary'), + os.path.join(test_file_path_str, 'train_vocabulary'), oov_token='_', padding_token='__PAD__' ) diff --git a/tests/data/tokenizers/test_lambo_tokenizer.py b/tests/data/tokenizers/test_lambo_tokenizer.py index 08e18d4f3199290a566080b9de144c241a59e09b..45906381806f00a1d4d04f33d3775773257901b7 100644 --- a/tests/data/tokenizers/test_lambo_tokenizer.py +++ b/tests/data/tokenizers/test_lambo_tokenizer.py @@ -9,8 +9,8 @@ class LamboTokenizerTest(unittest.TestCase): self.lambo_tokenizer = LamboTokenizer() def test_tokenize_sentence(self): - tokens = self.lambo_tokenizer.tokenize('Hello cats. I love you') - self.assertListEqual([t.text for t in tokens], + sentences = self.lambo_tokenizer.tokenize('Hello cats. I love you') + self.assertListEqual([t.text for t in sentences[0] + sentences[1]], ['Hello', 'cats', '.', 'I', 'love', 'you']) def test_segment_text(self): @@ -29,13 +29,16 @@ class LamboTokenizerTest(unittest.TestCase): [['I', 'do', 'n\'t', 'want', 'a', 'pizza', '.']]) def test_segment_text_with_multiwords_without_splitting(self): - tokens = self.lambo_tokenizer.segment('I don\'t want a pizza.', split_subwords=False) - self.assertListEqual(tokens, - [['I', 'don\'t', 'want', 'a', 'pizza', '.']]) - - def test_tokenize_sentence_with_multiword(self): - tokens = self.lambo_tokenizer.tokenize('I don\'t like apples.') - self.assertListEqual([t.text for t in tokens], - ['I', 'don\'t', 'like', 'apples', '.']) - self.assertListEqual([t.subwords for t in tokens], - [[], ['do', 'n\'t'], [], [], []]) + tokens = self.lambo_tokenizer.tokenize('I don\'t want a pizza.', split_subwords=False) + self.assertListEqual([t.text for t in tokens[0]], + ['I', 'don\'t', 'want', 'a', 'pizza', '.']) + self.assertListEqual([t.subwords for t in tokens[0]], + [[], ['do', 'n\'t'], [], [], [], []]) + + def test_segment_text_with_multiwords_with_splitting(self): + tokens = self.lambo_tokenizer.tokenize('I don\'t want a pizza.', split_subwords=True) + self.assertListEqual([t.text for t in tokens[0]], + ['I', 'do', 'n\'t', 'want', 'a', 'pizza', '.']) + self.assertListEqual([t.multiword for t in tokens[0]], + [None, ('don\'t', (2, 3)), ('don\'t', (2, 3)), None, None, None, None]) + diff --git a/tests/data/tokenizers/test_spacy_tokenizer.py b/tests/data/tokenizers/test_spacy_tokenizer.py deleted file mode 100644 index e0e1c6eacbe8071f2db592d93b8b2bd91109e51f..0000000000000000000000000000000000000000 --- a/tests/data/tokenizers/test_spacy_tokenizer.py +++ /dev/null @@ -1,19 +0,0 @@ -import unittest - -from combo.data import SpacyTokenizer - - -class SpacyTokenizerTest(unittest.TestCase): - - def setUp(self) -> None: - self.spacy_tokenizer = SpacyTokenizer() - - def test_tokenize_sentence(self): - tokens = self.spacy_tokenizer.tokenize('Hello cats. 
I love you') - self.assertListEqual([t.text for t in tokens], - ['Hello', 'cats', '.', 'I', 'love', 'you']) - - def test_tokenize_empty_sentence(self): - tokens = self.spacy_tokenizer.tokenize('') - self.assertEqual(len(tokens), 0) - diff --git a/tests/fixtures/PDBUD_train.conllu b/tests/test_fixtures/PDBUD_train.conllu similarity index 100% rename from tests/fixtures/PDBUD_train.conllu rename to tests/test_fixtures/PDBUD_train.conllu diff --git a/tests/fixtures/small_vocabulary/animals.txt b/tests/test_fixtures/small_vocabulary/animals.txt similarity index 100% rename from tests/fixtures/small_vocabulary/animals.txt rename to tests/test_fixtures/small_vocabulary/animals.txt diff --git a/tests/fixtures/small_vocabulary/non_padded_namespaces.txt b/tests/test_fixtures/small_vocabulary/non_padded_namespaces.txt similarity index 100% rename from tests/fixtures/small_vocabulary/non_padded_namespaces.txt rename to tests/test_fixtures/small_vocabulary/non_padded_namespaces.txt diff --git a/tests/fixtures/small_vocabulary/slices.json b/tests/test_fixtures/small_vocabulary/slices.json similarity index 100% rename from tests/fixtures/small_vocabulary/slices.json rename to tests/test_fixtures/small_vocabulary/slices.json diff --git a/tests/fixtures/train_vocabulary/slices.json b/tests/test_fixtures/train_vocabulary/slices.json similarity index 100% rename from tests/fixtures/train_vocabulary/slices.json rename to tests/test_fixtures/train_vocabulary/slices.json
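
For reference, a minimal usage sketch of the sentence-level tokenization API exercised by the updated tests above. This is an illustration only, not part of the patch; it assumes the combo package (with its default LAMBO model available) is installed, and the JSON path is a placeholder.

# Sketch of the post-change API: LamboTokenizer.tokenize() returns one list of
# Token objects per sentence, and TextClassificationJSONReader always produces a
# ListField of per-sentence TextFields under "tokens".
from combo.data import LamboTokenizer
from combo.data.dataset_readers import TextClassificationJSONReader

tokenizer = LamboTokenizer()

# Each element of `sentences` is a list of Token objects for one sentence.
sentences = tokenizer.tokenize("Hello cats. I love you")
print([[token.text for token in sentence] for sentence in sentences])
# e.g. [['Hello', 'cats', '.'], ['I', 'love', 'you']]

# Multiword tokens can be kept whole (subwords recorded on the token) or split.
kept = tokenizer.tokenize("I don't want a pizza.", split_subwords=False)
print([token.subwords for token in kept[0]])
# [[], ['do', "n't"], [], [], [], []]

split = tokenizer.tokenize("I don't want a pizza.", split_subwords=True)
print([token.text for token in split[0]])
# ['I', 'do', "n't", 'want', 'a', 'pizza', '.']

# The reader no longer accepts a sentence_segmenter argument; sentence splitting
# now comes from the tokenizer itself.
reader = TextClassificationJSONReader()
for instance in reader.read("data/text_classification.json"):  # placeholder path
    print(instance.fields["tokens"], instance.fields.get("label"))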