diff --git a/combo/common/util.py b/combo/common/util.py
index dfb2e7c9bd67ea79a5bb79d7aacb4fe746f3af41..7cb5c8880228fc6e7f13707510d4d61f4fe7e08b 100644
--- a/combo/common/util.py
+++ b/combo/common/util.py
@@ -47,7 +47,7 @@ def sanitize(x: Any) -> Any:
     can be serialized into JSON.
     """
     # Import here to avoid circular references
-    from combo.data.tokenizers import TokenizerToken
+    from combo.data.tokenizers import Token
 
     if isinstance(x, (str, float, int, bool)):
         # x is already serializable
@@ -67,7 +67,7 @@ def sanitize(x: Any) -> Any:
     elif isinstance(x, numpy.bool_):
         # Numpy bool_ need to be converted to python bool.
         return bool(x)
-    elif isinstance(x, (spacy.tokens.Token, TokenizerToken)):
+    elif isinstance(x, (spacy.tokens.Token, Token)):
         # Tokens get sanitized to just their text.
         return x.text
     elif isinstance(x, (list, tuple, set)):
diff --git a/combo/data/__init__.py b/combo/data/__init__.py
index 3b1bb5afbc217f04e3e7f2b97a98dae94efd71ee..8f6789be731460f535b1aad3d5aa2eeee362ab00 100644
--- a/combo/data/__init__.py
+++ b/combo/data/__init__.py
@@ -3,7 +3,7 @@ from .vocabulary import Vocabulary
 from .samplers import TokenCountBatchSampler
 from .instance import Instance
 from .token_indexers import (SingleIdTokenIndexer, TokenIndexer, TokenFeatsIndexer)
-from .tokenizers import (Tokenizer, TokenizerToken, CharacterTokenizer, PretrainedTransformerTokenizer,
+from .tokenizers import (Tokenizer, Token, CharacterTokenizer, PretrainedTransformerTokenizer,
                          SpacyTokenizer, WhitespaceTokenizer)
 from .dataset_readers import (ConllDatasetReader, DatasetReader, TextClassificationJSONReader,
                               UniversalDependenciesDatasetReader)
diff --git a/combo/data/api.py b/combo/data/api.py
index fe1cf99944f1328723518ca888a786b924af86e3..1129985d5bf08ba6b6bf19ed577c75f685ebff7f 100644
--- a/combo/data/api.py
+++ b/combo/data/api.py
@@ -3,6 +3,7 @@ import dataclasses
 import json
 from dataclasses import dataclass, field
 from typing import Optional, List, Dict, Any, Union, Tuple
+from combo.data.tokenizers import Token
 
 import conllu
 from overrides import overrides
@@ -10,21 +11,21 @@ from overrides import overrides
 
 # Moze NER moglby uzywac tej 11tej kolumny?
 @dataclass
-class Token:
-    id: Optional[Union[int, Tuple]] = None # czemu tuple? multiwordy?
+class OldToken:
+    id: Optional[Union[int, Tuple]] = None  # czemu tuple? multiwordy?
     token: Optional[str] = None
     lemma: Optional[str] = None
     upostag: Optional[str] = None
     xpostag: Optional[str] = None
     feats: Optional[str] = None
-    head: Optional[int] = None # Identyfikator innego tokena, ktory jest nadrzednikiem, drzewo zaleznosciowe
+    head: Optional[int] = None  # Identyfikator innego tokena, ktory jest nadrzednikiem, drzewo zaleznosciowe
     deprel: Optional[str] = None
     deps: Optional[str] = None
-    misc: Optional[str] = None # wszystko, najczesciej czy jest spacja (np. po "spi" w "spi." nie m spacji)
+    misc: Optional[str] = None  # wszystko, najczesciej czy jest spacja (np. po "spi" w "spi." nie m spacji)
     # nie predykujemy tego, to jest robione na poziomie tokenizera
     # czasem wpisuja sie tam tez dodatkowe informacje, np. teksty z transliteracjami
     # to jest w formacie conllu
-    semrel: Optional[str] = None # w conllu w formacie 10kolumnowym tego nie ma
+    semrel: Optional[str] = None  # w conllu w formacie 10kolumnowym tego nie ma
     # ale sa pomysly, zeby semantyke podawac jako kolejna kolumne
     # moze ja zostawmy
    # np. jesli mamy okoliczniki, to deprel to "adjunct", np. 
"w lozeczku" mamy okolicznik, diff --git a/combo/data/dataset.py b/combo/data/dataset.py index f5876b0e5a073b95849130947b612ab23f1b1208..653fbde2c2a1245bfc18533e422ee5f1a411c0cc 100644 --- a/combo/data/dataset.py +++ b/combo/data/dataset.py @@ -2,14 +2,14 @@ import logging from dataclasses import dataclass from typing import Optional -from combo.data import TokenizerToken +from combo.data import Token logger = logging.getLogger(__name__) @dataclass(init=False, repr=False) -class _Token(TokenizerToken): - __slots__ = TokenizerToken.__slots__ + ['feats_'] +class _Token(Token): + __slots__ = Token.__slots__ + ['feats_'] feats_: Optional[str] diff --git a/combo/data/dataset_readers/conll.py b/combo/data/dataset_readers/conll.py index 2b97890c5ea95d305099a93f3466ec76bbe38548..60a3d29e168488b065150d6c71a55c828793f3f4 100644 --- a/combo/data/dataset_readers/conll.py +++ b/combo/data/dataset_readers/conll.py @@ -9,7 +9,7 @@ import logging from combo.utils import ConfigurationError from .dataset_reader import DatasetReader, PathOrStr -from combo.data.token_indexers.token_indexer import TokenIndexer, TokenizerToken +from combo.data.token_indexers.token_indexer import TokenIndexer, Token from combo.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer from .dataset_utils.span_utils import to_bioul from .. import Instance @@ -144,13 +144,13 @@ class ConllDatasetReader(DatasetReader): fields = [list(field) for field in zip(*fields)] tokens_, pos_tags, chunk_tags, ner_tags = fields # TextField requires `Token` objects - tokens = [TokenizerToken(token) for token in tokens_] + tokens = [Token(token) for token in tokens_] yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags) def text_to_instance( # type: ignore self, - tokens: List[TokenizerToken], + tokens: List[Token], pos_tags: List[str] = None, chunk_tags: List[str] = None, ner_tags: List[str] = None, diff --git a/combo/data/dataset_readers/dataset_utils/span_utils.py b/combo/data/dataset_readers/dataset_utils/span_utils.py index 541a6f7fb2f8a45e44a148fb50c8581505889d7e..4d25add8437a38b6669df821d7b2e3ecd0883e1c 100644 --- a/combo/data/dataset_readers/dataset_utils/span_utils.py +++ b/combo/data/dataset_readers/dataset_utils/span_utils.py @@ -6,13 +6,13 @@ https://github.com/allenai/allennlp/blob/80fb6061e568cb9d6ab5d45b661e86eb61b92c8 from typing import Callable, List, Set, Tuple, TypeVar, Optional import warnings -from combo.data.tokenizers.tokenizer import TokenizerToken +from combo.data.tokenizers.tokenizer import Token from combo.utils import ConfigurationError, InvalidTagSequence TypedSpan = Tuple[int, Tuple[int, int]] TypedStringSpan = Tuple[str, Tuple[int, int]] -T = TypeVar("T", str, TokenizerToken) +T = TypeVar("T", str, Token) def bio_tags_to_spans( diff --git a/combo/data/dataset_readers/universal_dependencies_dataset_reader.py b/combo/data/dataset_readers/universal_dependencies_dataset_reader.py index ce17177d07431e6847fa5c0296f078bfbd9b2f11..4c7d0866ac089e667caeb5cc3fbe3dc6cce3c79e 100644 --- a/combo/data/dataset_readers/universal_dependencies_dataset_reader.py +++ b/combo/data/dataset_readers/universal_dependencies_dataset_reader.py @@ -11,7 +11,7 @@ import torch from overrides import overrides from combo import data -from combo.data import Vocabulary, fields, Instance, TokenizerToken +from combo.data import Vocabulary, fields, Instance, Token from combo.data.dataset import _Token from combo.data.dataset_readers.dataset_reader import DatasetReader from combo.data.fields import Field @@ -138,7 +138,7 @@ 
class UniversalDependenciesDatasetReader(DatasetReader): if target_name != "sent": target_values = [t[target_name] for t in tree_tokens] if target_name == "lemma": - target_values = [TokenizerToken(v) for v in target_values] + target_values = [Token(v) for v in target_values] fields_[target_name] = TextField(target_values, self.lemma_indexers) elif target_name == "feats": target_values = self._feat_values(tree_tokens) diff --git a/combo/data/fields/text_field.py b/combo/data/fields/text_field.py index 5f52eba6043f4c5a2b00142bdaff475e65883a2b..c4de2733c1338c684e1592d86a5345d87028b111 100644 --- a/combo/data/fields/text_field.py +++ b/combo/data/fields/text_field.py @@ -22,7 +22,7 @@ import torch from combo.data import Vocabulary from combo.data.fields.sequence_field import SequenceField from combo.data.token_indexers import TokenIndexer, IndexedTokenList -from combo.data.tokenizers import TokenizerToken +from combo.data.tokenizers import Token from combo.utils import ConfigurationError TextFieldTensors = Dict[str, Dict[str, torch.Tensor]] @@ -70,13 +70,13 @@ class TextField(SequenceField[TextFieldTensors]): __slots__ = ["tokens", "_token_indexers", "_indexed_tokens"] def __init__( - self, tokens: List[TokenizerToken], token_indexers: Optional[Dict[str, TokenIndexer]] = None + self, tokens: List[Token], token_indexers: Optional[Dict[str, TokenIndexer]] = None ) -> None: self.tokens = tokens self._token_indexers = token_indexers self._indexed_tokens: Optional[Dict[str, IndexedTokenList]] = None - if not all(isinstance(x, (TokenizerToken, SpacyToken)) for x in tokens): + if not all(isinstance(x, (Token, SpacyToken)) for x in tokens): raise ConfigurationError( "TextFields must be passed Tokens. " "Found: {} with types {}.".format(tokens, [type(x) for x in tokens]) @@ -192,10 +192,10 @@ class TextField(SequenceField[TextFieldTensors]): return f"TextField of length {self.sequence_length()} with text: \n {formatted_text}" # Sequence[Token] methods - def __iter__(self) -> Iterator[TokenizerToken]: + def __iter__(self) -> Iterator[Token]: return iter(self.tokens) - def __getitem__(self, idx: int) -> TokenizerToken: + def __getitem__(self, idx: int) -> Token: return self.tokens[idx] def __len__(self) -> int: diff --git a/combo/data/token_indexers/pretrained_transformer_fixed_mismatched_indexer.py b/combo/data/token_indexers/pretrained_transformer_fixed_mismatched_indexer.py index 90b3ff62724d9e7301c1fcec95f38cd1bbebb03e..6366a1e43808df3c79b17d5a1643396f4d3c3c46 100644 --- a/combo/data/token_indexers/pretrained_transformer_fixed_mismatched_indexer.py +++ b/combo/data/token_indexers/pretrained_transformer_fixed_mismatched_indexer.py @@ -8,7 +8,7 @@ from typing import Optional, Dict, Any, List, Tuple from overrides import overrides from combo.data import Vocabulary -from combo.data.tokenizers import TokenizerToken +from combo.data.tokenizers import Token from combo.data.token_indexers import IndexedTokenList from combo.data.token_indexers.pretrained_transformer_indexer import PretrainedTransformerIndexer from combo.data.token_indexers.pretrained_transformer_mismatched_indexer import PretrainedTransformerMismatchedIndexer @@ -102,8 +102,8 @@ class PretrainedTransformerTokenizer(PretrainedTransformerTokenizer): def _intra_word_tokenize( self, string_tokens: List[str] - ) -> Tuple[List[TokenizerToken], List[Optional[Tuple[int, int]]]]: - tokens: List[TokenizerToken] = [] + ) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]: + tokens: List[Token] = [] offsets: List[Optional[Tuple[int, int]]] = [] for 
token_string in string_tokens: wordpieces = self.tokenizer.encode_plus( @@ -118,7 +118,7 @@ class PretrainedTransformerTokenizer(PretrainedTransformerTokenizer): if len(wp_ids) > 0: offsets.append((len(tokens), len(tokens) + len(wp_ids) - 1)) tokens.extend( - TokenizerToken(text=wp_text, text_id=wp_id) + Token(text=wp_text, text_id=wp_id) for wp_id, wp_text in zip(wp_ids, self.tokenizer.convert_ids_to_tokens(wp_ids)) ) else: diff --git a/combo/data/token_indexers/pretrained_transformer_indexer.py b/combo/data/token_indexers/pretrained_transformer_indexer.py index 6af8347812ee8e7231f5045a3c2aed7aa63710ac..1c5302c4cc5cd4b2169ace7204b51cb1adef1e51 100644 --- a/combo/data/token_indexers/pretrained_transformer_indexer.py +++ b/combo/data/token_indexers/pretrained_transformer_indexer.py @@ -8,7 +8,7 @@ import logging import torch from combo.data import Vocabulary -from combo.data.tokenizers import TokenizerToken +from combo.data.tokenizers import Token from combo.data.token_indexers import TokenIndexer, IndexedTokenList from combo.data.tokenizers.pretrained_transformer_tokenizer import PretrainedTransformerTokenizer from combo.utils import pad_sequence_to_length @@ -84,11 +84,11 @@ class PretrainedTransformerIndexer(TokenIndexer): self._added_to_vocabulary = True - def count_vocab_items(self, token: TokenizerToken, counter: Dict[str, Dict[str, int]]): + def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]): # If we only use pretrained models, we don't need to do anything here. pass - def tokens_to_indices(self, tokens: List[TokenizerToken], vocabulary: Vocabulary) -> IndexedTokenList: + def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList: self._add_encoding_to_vocabulary_if_needed(vocabulary) indices, type_ids = self._extract_token_and_type_ids(tokens) @@ -103,14 +103,14 @@ class PretrainedTransformerIndexer(TokenIndexer): def indices_to_tokens( self, indexed_tokens: IndexedTokenList, vocabulary: Vocabulary - ) -> List[TokenizerToken]: + ) -> List[Token]: self._add_encoding_to_vocabulary_if_needed(vocabulary) token_ids = indexed_tokens["token_ids"] type_ids = indexed_tokens.get("type_ids") return [ - TokenizerToken( + Token( text=vocabulary.get_token_from_index(token_ids[i], self._namespace), text_id=token_ids[i], type_id=type_ids[i] if type_ids is not None else None, @@ -118,7 +118,7 @@ class PretrainedTransformerIndexer(TokenIndexer): for i in range(len(token_ids)) ] - def _extract_token_and_type_ids(self, tokens: List[TokenizerToken]) -> Tuple[List[int], List[int]]: + def _extract_token_and_type_ids(self, tokens: List[Token]) -> Tuple[List[int], List[int]]: """ Roughly equivalent to `zip(*[(token.text_id, token.type_id) for token in tokens])`, with some checks. 
diff --git a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py index 6b5043b5310b8c2eda0095f677ef2d7ec41276c5..f5105177e8c95d0138a3575053a600a08bb1725d 100644 --- a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py +++ b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py @@ -9,7 +9,7 @@ import logging import torch from combo.data import Vocabulary -from combo.data.tokenizers import TokenizerToken +from combo.data.tokenizers import Token from combo.data.token_indexers import TokenIndexer, IndexedTokenList from combo.data.token_indexers.pretrained_transformer_indexer import PretrainedTransformerIndexer from combo.utils import pad_sequence_to_length @@ -67,10 +67,10 @@ class PretrainedTransformerMismatchedIndexer(TokenIndexer): self._num_added_start_tokens = self._matched_indexer._num_added_start_tokens self._num_added_end_tokens = self._matched_indexer._num_added_end_tokens - def count_vocab_items(self, token: TokenizerToken, counter: Dict[str, Dict[str, int]]): + def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]): return self._matched_indexer.count_vocab_items(token, counter) - def tokens_to_indices(self, tokens: List[TokenizerToken], vocabulary: Vocabulary) -> IndexedTokenList: + def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList: self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary) wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize( diff --git a/combo/data/token_indexers/single_id_token_indexer.py b/combo/data/token_indexers/single_id_token_indexer.py index ed03cb22c692ce557b0cce5591c140cb31eef81a..576ae8a6bff933d0d441b550622425625b1a9256 100644 --- a/combo/data/token_indexers/single_id_token_indexer.py +++ b/combo/data/token_indexers/single_id_token_indexer.py @@ -7,7 +7,7 @@ from typing import Dict, List, Optional, Any import itertools from combo.data import Vocabulary -from combo.data.tokenizers import TokenizerToken +from combo.data.tokenizers import Token from combo.data.token_indexers import TokenIndexer, IndexedTokenList _DEFAULT_VALUE = "THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING" @@ -59,12 +59,12 @@ class SingleIdTokenIndexer(TokenIndexer): self.namespace = namespace self.lowercase_tokens = lowercase_tokens - self._start_tokens = [TokenizerToken(st) for st in (start_tokens or [])] - self._end_tokens = [TokenizerToken(et) for et in (end_tokens or [])] + self._start_tokens = [Token(st) for st in (start_tokens or [])] + self._end_tokens = [Token(et) for et in (end_tokens or [])] self._feature_name = feature_name self._default_value = default_value - def count_vocab_items(self, token: TokenizerToken, counter: Dict[str, Dict[str, int]]): + def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]): if self.namespace is not None: text = self._get_feature_value(token) if self.lowercase_tokens: @@ -72,7 +72,7 @@ class SingleIdTokenIndexer(TokenIndexer): counter[self.namespace][text] += 1 def tokens_to_indices( - self, tokens: List[TokenizerToken], vocabulary: Vocabulary + self, tokens: List[Token], vocabulary: Vocabulary ) -> Dict[str, List[int]]: indices: List[int] = [] @@ -91,7 +91,7 @@ class SingleIdTokenIndexer(TokenIndexer): def get_empty_token_list(self) -> IndexedTokenList: return {"tokens": []} - def _get_feature_value(self, token: TokenizerToken) -> str: + def _get_feature_value(self, token: Token) -> str: text = 
getattr(token, self._feature_name) if text is None: if self._default_value is not _DEFAULT_VALUE: diff --git a/combo/data/token_indexers/token_characters_indexer.py b/combo/data/token_indexers/token_characters_indexer.py index 2b62e331e80d37a9f4a300a5904bce92a410023e..b788875b6947de368e14eb0e589bb5e27e81e8a8 100644 --- a/combo/data/token_indexers/token_characters_indexer.py +++ b/combo/data/token_indexers/token_characters_indexer.py @@ -12,7 +12,7 @@ import torch from combo.data import Vocabulary from combo.data.token_indexers import TokenIndexer, IndexedTokenList -from combo.data.tokenizers import TokenizerToken, CharacterTokenizer +from combo.data.tokenizers import Token, CharacterTokenizer from combo.utils import ConfigurationError, pad_sequence_to_length @@ -66,11 +66,11 @@ class TokenCharactersIndexer(TokenIndexer): self._namespace = namespace self._character_tokenizer = character_tokenizer - self._start_tokens = [TokenizerToken(st) for st in (start_tokens or [])] - self._end_tokens = [TokenizerToken(et) for et in (end_tokens or [])] + self._start_tokens = [Token(st) for st in (start_tokens or [])] + self._end_tokens = [Token(et) for et in (end_tokens or [])] @overrides - def count_vocab_items(self, token: TokenizerToken, counter: Dict[str, Dict[str, int]]): + def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]): if token.text is None: raise ConfigurationError("TokenCharactersIndexer needs a tokenizer that retains text") for character in self._character_tokenizer.tokenize(token.text): @@ -81,7 +81,7 @@ class TokenCharactersIndexer(TokenIndexer): @overrides def tokens_to_indices( - self, tokens: List[TokenizerToken], vocabulary: Vocabulary + self, tokens: List[Token], vocabulary: Vocabulary ) -> Dict[str, List[List[int]]]: indices: List[List[int]] = [] for token in itertools.chain(self._start_tokens, tokens, self._end_tokens): diff --git a/combo/data/token_indexers/token_features_indexer.py b/combo/data/token_indexers/token_features_indexer.py index b3ad468bf3ea8aae5e2b3581edc7f80c533d9223..c0c1f21915f7bef134e0f5ad5458cfe876fe1e63 100644 --- a/combo/data/token_indexers/token_features_indexer.py +++ b/combo/data/token_indexers/token_features_indexer.py @@ -10,7 +10,7 @@ import torch from overrides import overrides from combo.data import Vocabulary -from combo.data.tokenizers.tokenizer import TokenizerToken +from combo.data.tokenizers.tokenizer import Token from combo.data.token_indexers.token_indexer import TokenIndexer, IndexedTokenList from combo.utils import pad_sequence_to_length @@ -28,13 +28,13 @@ class TokenFeatsIndexer(TokenIndexer, ABC): self._feature_name = feature_name @overrides - def count_vocab_items(self, token: TokenizerToken, counter: Dict[str, Dict[str, int]]): + def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]): feats = self._feat_values(token) for feat in feats: counter[self.namespace][feat] += 1 @overrides - def tokens_to_indices(self, tokens: List[TokenizerToken], vocabulary: Vocabulary) -> IndexedTokenList: + def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList: indices: List[List[int]] = [] vocab_size = vocabulary.get_vocab_size(self.namespace) for token in tokens: diff --git a/combo/data/token_indexers/token_indexer.py b/combo/data/token_indexers/token_indexer.py index 0e2921cbd473f08a4eae9c485b26b2498cfae662..31835daf3a8aa97242369466b2625a69db3e7efd 100644 --- a/combo/data/token_indexers/token_indexer.py +++ b/combo/data/token_indexers/token_indexer.py @@ -7,7 +7,7 @@ 
from typing import Any, Dict, List import torch -from combo.data.tokenizers.tokenizer import TokenizerToken +from combo.data.tokenizers.tokenizer import Token from combo.data.vocabulary import Vocabulary from combo.utils import pad_sequence_to_length @@ -43,7 +43,7 @@ class TokenIndexer: def __init__(self, token_min_padding_length: int = 0) -> None: self._token_min_padding_length: int = token_min_padding_length - def count_vocab_items(self, token: TokenizerToken, counter: Dict[str, Dict[str, int]]): + def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]): """ The :class:`Vocabulary` needs to assign indices to whatever strings we see in the training data (possibly doing some frequency filtering and using an OOV, or out of vocabulary, @@ -54,7 +54,7 @@ class TokenIndexer: """ raise NotImplementedError - def tokens_to_indices(self, tokens: List[TokenizerToken], vocabulary: Vocabulary) -> IndexedTokenList: + def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList: """ Takes a list of tokens and converts them to an `IndexedTokenList`. This could be just an ID for each token from the vocabulary. @@ -67,7 +67,7 @@ class TokenIndexer: def indices_to_tokens( self, indexed_tokens: IndexedTokenList, vocabulary: Vocabulary - ) -> List[TokenizerToken]: + ) -> List[Token]: """ Inverse operations of tokens_to_indices. Takes an `IndexedTokenList` and converts it back into a list of tokens. diff --git a/combo/data/tokenizers/__init__.py b/combo/data/tokenizers/__init__.py index a92183e98aa51d3b233d11e1fd6591407e8b70bd..1e404f0140924882d32914b8064cabe61f1ecd17 100644 --- a/combo/data/tokenizers/__init__.py +++ b/combo/data/tokenizers/__init__.py @@ -1,4 +1,4 @@ -from .tokenizer import Tokenizer, TokenizerToken +from .tokenizer import Tokenizer, Token from .character_tokenizer import CharacterTokenizer from .pretrained_transformer_tokenizer import PretrainedTransformerTokenizer from .spacy_tokenizer import SpacyTokenizer diff --git a/combo/data/tokenizers/character_tokenizer.py b/combo/data/tokenizers/character_tokenizer.py index f46c0f6724638b6c0cbf99cf946f7dc1e4155169..0a3066529b6913ea2fe65485589b4e199bfd8899 100644 --- a/combo/data/tokenizers/character_tokenizer.py +++ b/combo/data/tokenizers/character_tokenizer.py @@ -5,7 +5,7 @@ https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/character from typing import List, Union, Dict, Any -from combo.data.tokenizers import Tokenizer, TokenizerToken +from combo.data.tokenizers import Tokenizer, Token class CharacterTokenizer(Tokenizer): @@ -48,26 +48,26 @@ class CharacterTokenizer(Tokenizer): self._start_tokens.reverse() self._end_tokens = end_tokens or [] - def tokenize(self, text: str) -> List[TokenizerToken]: + def tokenize(self, text: str) -> List[Token]: if self._lowercase_characters: text = text.lower() if self._byte_encoding is not None: # We add 1 here so that we can still use 0 for masking, no matter what bytes we get out # of this. 
- tokens = [TokenizerToken(text_id=c + 1) for c in text.encode(self._byte_encoding)] + tokens = [Token(text_id=c + 1) for c in text.encode(self._byte_encoding)] else: - tokens = [TokenizerToken(t) for t in list(text)] + tokens = [Token(t) for t in list(text)] for start_token in self._start_tokens: if isinstance(start_token, int): - token = TokenizerToken(text_id=start_token, idx=0) + token = Token(text_id=start_token, id=0) else: - token = TokenizerToken(text=start_token, idx=0) + token = Token(text=start_token, id=0) tokens.insert(0, token) for end_token in self._end_tokens: if isinstance(end_token, int): - token = TokenizerToken(text_id=end_token, idx=0) + token = Token(text_id=end_token, id=0) else: - token = TokenizerToken(text=end_token, idx=0) + token = Token(text=end_token, id=0) tokens.append(token) return tokens diff --git a/combo/data/tokenizers/pretrained_transformer_tokenizer.py b/combo/data/tokenizers/pretrained_transformer_tokenizer.py index 60102f7369a50b871066060a8fbbec7c28e2a686..6e423e3a5d3776456808536c851a219dbcffd904 100644 --- a/combo/data/tokenizers/pretrained_transformer_tokenizer.py +++ b/combo/data/tokenizers/pretrained_transformer_tokenizer.py @@ -14,7 +14,7 @@ import dill from transformers import PreTrainedTokenizer, AutoTokenizer -from combo.data.tokenizers import Tokenizer, TokenizerToken +from combo.data.tokenizers import Tokenizer, Token from combo.utils import sanitize_wordpiece logger = logging.getLogger(__name__) @@ -180,7 +180,7 @@ class PretrainedTransformerTokenizer(Tokenizer): self.sequence_pair_second_token_type_id = token_type_id continue - token = TokenizerToken( + token = Token( tokenizer_with_special_tokens.convert_ids_to_tokens(token_id), text_id=token_id, type_id=token_type_id, @@ -226,7 +226,7 @@ class PretrainedTransformerTokenizer(Tokenizer): self.single_sequence_token_type_id = token_type_id continue - token = TokenizerToken( + token = Token( tokenizer_with_special_tokens.convert_ids_to_tokens(token_id), text_id=token_id, type_id=token_type_id, @@ -250,7 +250,7 @@ class PretrainedTransformerTokenizer(Tokenizer): detokenized = " ".join(tokenized) return "a" in detokenized - def tokenize(self, text: str) -> List[TokenizerToken]: + def tokenize(self, text: str) -> List[Token]: """ This method only handles a single sentence (or sequence) of text. 
""" @@ -299,7 +299,7 @@ class PretrainedTransformerTokenizer(Tokenizer): start, end = offsets tokens.append( - TokenizerToken( + Token( text=self.tokenizer.convert_ids_to_tokens(token_id, skip_special_tokens=False), text_id=token_id, type_id=token_type_id, @@ -373,8 +373,8 @@ class PretrainedTransformerTokenizer(Tokenizer): def _intra_word_tokenize( self, string_tokens: List[str] - ) -> Tuple[List[TokenizerToken], List[Optional[Tuple[int, int]]]]: - tokens: List[TokenizerToken] = [] + ) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]: + tokens: List[Token] = [] offsets: List[Optional[Tuple[int, int]]] = [] for token_string in string_tokens: wordpieces = self.tokenizer.encode_plus( @@ -389,7 +389,7 @@ class PretrainedTransformerTokenizer(Tokenizer): if len(wp_ids) > 0: offsets.append((len(tokens), len(tokens) + len(wp_ids) - 1)) tokens.extend( - TokenizerToken(text=wp_text, text_id=wp_id) + Token(text=wp_text, text_id=wp_id) for wp_id, wp_text in zip(wp_ids, self.tokenizer.convert_ids_to_tokens(wp_ids)) ) else: @@ -407,7 +407,7 @@ class PretrainedTransformerTokenizer(Tokenizer): def intra_word_tokenize( self, string_tokens: List[str] - ) -> Tuple[List[TokenizerToken], List[Optional[Tuple[int, int]]]]: + ) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]: """ Tokenizes each word into wordpieces separately and returns the wordpiece IDs. Also calculates offsets such that tokens[offsets[i][0]:offsets[i][1] + 1] @@ -421,7 +421,7 @@ class PretrainedTransformerTokenizer(Tokenizer): def intra_word_tokenize_sentence_pair( self, string_tokens_a: List[str], string_tokens_b: List[str] - ) -> Tuple[List[TokenizerToken], List[Optional[Tuple[int, int]]], List[Optional[Tuple[int, int]]]]: + ) -> Tuple[List[Token], List[Optional[Tuple[int, int]]], List[Optional[Tuple[int, int]]]]: """ Tokenizes each word into wordpieces separately and returns the wordpiece IDs. 
Also calculates offsets such that wordpieces[offsets[i][0]:offsets[i][1] + 1] @@ -444,9 +444,9 @@ class PretrainedTransformerTokenizer(Tokenizer): return tokens_a, offsets_a, offsets_b def add_special_tokens( - self, tokens1: List[TokenizerToken], tokens2: Optional[List[TokenizerToken]] = None - ) -> List[TokenizerToken]: - def with_new_type_id(tokens: List[TokenizerToken], type_id: int) -> List[TokenizerToken]: + self, tokens1: List[Token], tokens2: Optional[List[Token]] = None + ) -> List[Token]: + def with_new_type_id(tokens: List[Token], type_id: int) -> List[Token]: return [dataclasses.replace(t, type_id=type_id) for t in tokens] # Make sure we don't change the input parameters diff --git a/combo/data/tokenizers/spacy_tokenizer.py b/combo/data/tokenizers/spacy_tokenizer.py index 38c0f61dc52509440ca446278b6f04bef754ee0a..70164fdf61c7dd509db4811f952aed61dc06bb18 100644 --- a/combo/data/tokenizers/spacy_tokenizer.py +++ b/combo/data/tokenizers/spacy_tokenizer.py @@ -5,11 +5,10 @@ https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/spacy_tok from typing import List, Optional - import spacy from spacy.tokens import Doc -from combo.data.tokenizers.tokenizer import Tokenizer, TokenizerToken +from combo.data.tokenizers.tokenizer import Tokenizer, Token from combo.utils.spacy import get_spacy_model @@ -50,15 +49,15 @@ class SpacyTokenizer(Tokenizer): """ def __init__( - self, - language: str = "en_core_web_sm", - pos_tags: bool = True, - parse: bool = False, - ner: bool = False, - keep_spacy_tokens: bool = False, - split_on_spaces: bool = False, - start_tokens: Optional[List[str]] = None, - end_tokens: Optional[List[str]] = None, + self, + language: str = "en_core_web_sm", + pos_tags: bool = True, + parse: bool = False, + ner: bool = False, + keep_spacy_tokens: bool = False, + split_on_spaces: bool = False, + start_tokens: Optional[List[str]] = None, + end_tokens: Optional[List[str]] = None, ) -> None: # Save these for use later in the _to_params method self._language = language @@ -80,32 +79,44 @@ class SpacyTokenizer(Tokenizer): self._is_version_3 = spacy.__version__ >= "3.0" self._end_tokens = end_tokens or [] - def _sanitize(self, tokens: List[spacy.tokens.Token]) -> List[TokenizerToken]: + def _sanitize(self, tokens: List[spacy.tokens.Token]) -> List[Token]: """ Converts spaCy tokens to allennlp tokens. Is a no-op if keep_spacy_tokens is True """ + # self.text = text + # self.lemma = lemma + # self.upostag = upostag + # self.xpostag = xpostag + # self.entity_type = entity_type + # self.feats = feats + # self.head = head + # self.deprel = deprel + # self.deps = deps + # self.misc = misc + # self.semrel = semrel + + # TODO: add morph from SpaCy? 
if not self._keep_spacy_tokens: tokens = [ - TokenizerToken( - token.text, - token.idx, - token.idx + len(token.text), - token.lemma_, - token.pos_, - token.tag_, - token.dep_, - token.ent_type_, + Token( + id=(token.idx, token.idx + len(token.text)), + text=token.text, + lemma=token.lemma_, + upostag=token.pos_, + xpostag=token.tag_, + deprel=token.dep_, + entity_type=token.ent_type_ ) for token in tokens ] for start_token in self._start_tokens: - tokens.insert(0, TokenizerToken(start_token, 0)) + tokens.insert(0, Token(id=0, text=start_token)) for end_token in self._end_tokens: - tokens.append(TokenizerToken(end_token, -1)) + tokens.append(Token(id=-1, text=end_token)) return tokens - def batch_tokenize(self, texts: List[str]) -> List[List[TokenizerToken]]: + def batch_tokenize(self, texts: List[str]) -> List[List[Token]]: if self._is_version_3: return [ self._sanitize(_remove_spaces(tokens)) @@ -117,7 +128,7 @@ class SpacyTokenizer(Tokenizer): for tokens in self.spacy.pipe(texts, n_threads=-1) ] - def tokenize(self, text: str) -> List[TokenizerToken]: + def tokenize(self, text: str) -> List[Token]: # This works because our Token class matches spacy's. return self._sanitize(_remove_spaces(self.spacy(text))) diff --git a/combo/data/tokenizers/tokenizer.py b/combo/data/tokenizers/tokenizer.py index f3969f7da1fcc379504a5b3b0900dbe5a3766308..d1b8192678bef276171d8315a9b9c6e9eeb7659b 100644 --- a/combo/data/tokenizers/tokenizer.py +++ b/combo/data/tokenizers/tokenizer.py @@ -4,102 +4,95 @@ https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/token_cla https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/tokenizer.py """ -from typing import List, Optional +from typing import Any, Dict, List, Optional, Tuple, Union import logging -from dataclasses import dataclass +from dataclasses import dataclass, field logger = logging.getLogger(__name__) -@dataclass(init=False, repr=False) -class TokenizerToken: - """ - A simple token representation, keeping track of the token's text, offset in the passage it was - taken from, POS tag, dependency relation, and similar information. These fields match spacy's - exactly, so we can just use a spacy token for this. - # Parameters - text : `str`, optional - The original text represented by this token. - idx : `int`, optional - The character offset of this token into the tokenized passage. - idx_end : `int`, optional - The character offset one past the last character in the tokenized passage. - lemma_ : `str`, optional - The lemma of this token. - pos_ : `str`, optional - The coarse-grained part of speech of this token. - tag_ : `str`, optional - The fine-grained part of speech of this token. - dep_ : `str`, optional - The dependency relation for this token. - ent_type_ : `str`, optional - The entity type (i.e., the NER tag) for this token. - text_id : `int`, optional - If your tokenizer returns integers instead of strings (e.g., because you're doing byte - encoding, or some hash-based embedding), set this with the integer. If this is set, we - will bypass the vocabulary when indexing this token, regardless of whether `text` is also - set. You can `also` set `text` with the original text, if you want, so that you can - still use a character-level representation in addition to a hash-based word embedding. 
-    type_id : `int`, optional
-        Token type id used by some pretrained language models like original BERT
-    The other fields on `Token` follow the fields on spacy's `Token` object; this is one we
-    added, similar to spacy's `lex_id`.
-    """
+def _assert_none_or_type(value: Any, type_to_check: type) -> bool:
+    return value is None or isinstance(value, type_to_check)
+
+def _assert_tuple_of_types(value: Any, types: List[type]) -> bool:
+    return value is None or (isinstance(value, tuple) and len(value) == len(types) and all(
+        [isinstance(v, t) for v, t in zip(value, types)]))
+
+
+@dataclass(init=False, repr=False)
+class Token:
 
     __slots__ = [
         "text",
-        "idx",
-        "idx_end",
-        "lemma_",
-        "pos_",
-        "tag_",
-        "dep_",
-        "ent_type_",
-        "text_id",
-        "type_id",
+        "id",
+        "lemma",
+        "upostag",
+        "xpostag",
+        "entity_type",
+        "feats",
+        "head",
+        "deprel",
+        "deps",
+        "misc",
+        "semrel",
+        "embeddings",
+        "text_id"
     ]
-    # Defining the `__slots__` of this class is an optimization that dramatically reduces
-    # the size in memory of a `Token` instance. The downside of using `__slots__`
-    # with a dataclass is that you can't assign default values at the class level,
-    # which is why we need a custom `__init__` function that provides the default values.
     text: Optional[str]
-    idx: Optional[int]
-    idx_end: Optional[int]
-    lemma_: Optional[str]
-    pos_: Optional[str]
-    tag_: Optional[str]
-    dep_: Optional[str]
-    ent_type_: Optional[str]
+    id: Optional[Union[int, Tuple]]
+    lemma: Optional[str]
+    upostag: Optional[str]  # Coarse-grained part of speech? pos_?
+    xpostag: Optional[str]  # Fine-grained part of speech? tag_ ?
+    entity_type: Optional[str]
+    feats: Optional[str]
+    head: Optional[int]
+    deprel: Optional[str]  # dep_ ?
+    deps: Optional[str]
+    misc: Optional[str]
+    semrel: Optional[str]
+    embeddings: Dict[str, List[float]]
     text_id: Optional[int]
-    type_id: Optional[int]
-
-    def __init__(
-        self,
-        text: str = None,
-        idx: int = None,
-        idx_end: int = None,
-        lemma_: str = None,
-        pos_: str = None,
-        tag_: str = None,
-        dep_: str = None,
-        ent_type_: str = None,
-        text_id: int = None,
-        type_id: int = None,
-    ) -> None:
-        assert text is None or isinstance(
-            text, str
-        )  # Some very hard to debug errors happen when this is not true.
+
+    def __init__(self,
+                 text: str = None,
+                 id: Union[int, Tuple] = None,
+                 lemma: str = None,
+                 upostag: str = None,
+                 xpostag: str = None,
+                 entity_type: str = None,
+                 feats: str = None,
+                 head: int = None,
+                 deprel: str = None,
+                 deps: str = None,
+                 misc: str = None,
+                 semrel: str = None,
+                 embeddings: Dict[str, List[float]] = None,
+                 text_id: int = None) -> None:
+        assert _assert_none_or_type(id, int) or _assert_tuple_of_types(id, [int, int])
+        # Some very hard to debug errors happen when text is not a string.
+        assert _assert_none_or_type(text, str)
+
         self.text = text
-        self.idx = idx
-        self.idx_end = idx_end
-        self.lemma_ = lemma_
-        self.pos_ = pos_
-        self.tag_ = tag_
-        self.dep_ = dep_
-        self.ent_type_ = ent_type_
+        self.id = id
+        self.lemma = lemma
+        self.upostag = upostag
+        self.xpostag = xpostag
+        self.entity_type = entity_type
+        self.feats = feats
+        self.head = head
+        self.deprel = deprel
+        self.deps = deps
+        self.misc = misc
+        self.semrel = semrel
+
+        if embeddings is None:
+            # No embeddings computed yet; keep an empty mapping.
+            self.embeddings = {}
+        else:
+            self.embeddings = embeddings
+
         self.text_id = text_id
-        self.type_id = type_id
 
     def __str__(self):
         return self.text
@@ -107,29 +100,23 @@ class TokenizerToken:
     def __repr__(self):
         return self.__str__()
 
-    def ensure_text(self) -> str:
-        """
-        Return the `text` field, raising an exception if it's `None`.
-        """
-        if self.text is None:
-            raise ValueError("Unexpected null text for token")
-        else:
-            return self.text
-
-
-def show_token(token: TokenizerToken) -> str:
-    return (
-        f"{token.text} "
-        f"(idx: {token.idx}) "
-        f"(idx_end: {token.idx_end}) "
-        f"(lemma: {token.lemma_}) "
-        f"(pos: {token.pos_}) "
-        f"(tag: {token.tag_}) "
-        f"(dep: {token.dep_}) "
-        f"(ent_type: {token.ent_type_}) "
-        f"(text_id: {token.text_id}) "
-        f"(type_id: {token.type_id}) "
-    )
+    def __show__(self):
+        return (
+            f"{self.text} "
+            f"(id: {self.id}) "
+            f"(lemma: {self.lemma}) "
+            f"(upostag: {self.upostag}) "
+            f"(xpostag: {self.xpostag}) "
+            f"(entity_type: {self.entity_type}) "
+            f"(feats: {self.feats}) "
+            f"(head: {self.head}) "
+            f"(deprel: {self.deprel}) "
+            f"(deps: {self.deps}) "
+            f"(misc: {self.misc}) "
+            f"(semrel: {self.semrel}) "
+            f"(embeddings: {self.embeddings}) "
+            f"(text_id: {self.text_id})"
+        )
 
 
 class Tokenizer:
@@ -148,7 +135,7 @@ class Tokenizer:
 
     default_implementation = "spacy"
 
-    def batch_tokenize(self, texts: List[str]) -> List[List[TokenizerToken]]:
+    def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
         """
         Batches together tokenization of several texts, in case that is faster for particular
         tokenizers.
@@ -157,7 +144,7 @@ class Tokenizer:
         """
         return [self.tokenize(text) for text in texts]
 
-    def tokenize(self, text: str) -> List[TokenizerToken]:
+    def tokenize(self, text: str) -> List[Token]:
         """
         Actually implements splitting words into tokens.
         # Returns
@@ -166,8 +153,8 @@ class Tokenizer:
         raise NotImplementedError
 
     def add_special_tokens(
-        self, tokens1: List[TokenizerToken], tokens2: Optional[List[TokenizerToken]] = None
-    ) -> List[TokenizerToken]:
+        self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
+    ) -> List[Token]:
         """
         Adds special tokens to tokenized text. These are tokens like [CLS] or [SEP].
         Not all tokenizers do this. The default is to just return the tokens unchanged.
diff --git a/combo/data/tokenizers/whitespace_tokenizer.py b/combo/data/tokenizers/whitespace_tokenizer.py
index dfaff11220f6ebcb4cfdffefc4dd61a254322428..4177df5a3d3dbddd5aef1b9d17f98c7c4a8896f4 100644
--- a/combo/data/tokenizers/whitespace_tokenizer.py
+++ b/combo/data/tokenizers/whitespace_tokenizer.py
@@ -1,9 +1,9 @@
 from typing import List, Dict, Any
 
-from combo.data.tokenizers.tokenizer import TokenizerToken
+from combo.data.tokenizers.tokenizer import Token, Tokenizer
 
 
-class WhitespaceTokenizer(TokenizerToken):
+class WhitespaceTokenizer(Tokenizer):
     """
     A `Tokenizer` that assumes you've already done your own tokenization somehow and have
     separated the tokens by spaces. We just split the input string on whitespace and return the
@@ -13,8 +13,8 @@ class WhitespaceTokenizer(TokenizerToken):
    Registered as a `Tokenizer` with name "whitespace" and "just_spaces". 
""" - def tokenize(self, text: str) -> List[TokenizerToken]: - return [TokenizerToken(t) for t in text.split()] + def tokenize(self, text: str) -> List[Token]: + return [Token(t) for t in text.split()] def _to_params(self) -> Dict[str, Any]: return {"type": "whitespace"} diff --git a/tests/data/fields/test_samplers.py b/tests/data/fields/test_samplers.py index 325ca29c50a99c68db86cd3c3fc6f36f524c9197..1f220e153c2db42d9e81245225194aa85414a36d 100644 --- a/tests/data/fields/test_samplers.py +++ b/tests/data/fields/test_samplers.py @@ -3,7 +3,7 @@ import unittest from combo.data import TokenCountBatchSampler, Instance from combo.data.fields.text_field import TextField -from combo.data.tokenizers import TokenizerToken +from combo.data.tokenizers import Token class TokenCountBatchSamplerTest(unittest.TestCase): @@ -12,7 +12,7 @@ class TokenCountBatchSamplerTest(unittest.TestCase): self.dataset = [] self.sentences = ["First sentence makes full batch.", "Short", "This ends first batch"] for sentence in self.sentences: - tokens = [TokenizerToken(t) + tokens = [Token(t) for t in sentence.split()] text_field = TextField(tokens, {}) self.dataset.append(Instance({"sentence": text_field})) diff --git a/tests/data/tokenizers/test_character_tokenizer.py b/tests/data/tokenizers/test_character_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..dac5f77642447f4a1a969d768331a7fa2c749af3 --- /dev/null +++ b/tests/data/tokenizers/test_character_tokenizer.py @@ -0,0 +1,34 @@ +import unittest + +from combo.data import CharacterTokenizer + + +class CharacterTokenizerText(unittest.TestCase): + + def setUp(self) -> None: + self.character_tokenizer = CharacterTokenizer() + + def test_tokenize_sentence(self): + tokens = self.character_tokenizer.tokenize('I love you!') + self.assertListEqual([t.text for t in tokens], + ['I', ' ', 'l', 'o', 'v', 'e', ' ', 'y', 'o', 'u', '!']) + + def test_tokenize_sentence_with_start_tokens(self): + tokenizer_w_start_tokens = CharacterTokenizer(start_tokens=['@']) + tokens = tokenizer_w_start_tokens.tokenize('Hi! Hello.') + self.assertListEqual([t.text for t in tokens], + ['@', 'H', 'i', '!', ' ', 'H', 'e', 'l', 'l', 'o', '.']) + self.assertEqual(tokens[0].id, 0) + self.assertTrue([t.id > 0 for t in tokens if t.id is not None]) + + def test_tokenize_sentence_with_end_tokens(self): + tokenizer_w_end_tokens = CharacterTokenizer(end_tokens=['#']) + tokens = tokenizer_w_end_tokens.tokenize('Hi! Hello.') + self.assertListEqual([t.text for t in tokens], + ['H', 'i', '!', ' ', 'H', 'e', 'l', 'l', 'o', '.', '#']) + self.assertEqual(tokens[-1].id, 0) + self.assertTrue([t.id > 0 for t in tokens if t.id is not None]) + + def test_tokenize_empty_sentence(self): + tokens = self.character_tokenizer.tokenize('') + self.assertEqual(len(tokens), 0) diff --git a/tests/data/tokenizers/test_spacy_tokenizer.py b/tests/data/tokenizers/test_spacy_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..67aaead386e7d3b8714a000f81f69cdc32d2eea1 --- /dev/null +++ b/tests/data/tokenizers/test_spacy_tokenizer.py @@ -0,0 +1,25 @@ +import unittest + +from combo.data import SpacyTokenizer + + +class SpacyTokenizerTest(unittest.TestCase): + + def setUp(self) -> None: + self.spacy_tokenizer = SpacyTokenizer() + + def test_tokenize_sentence(self): + tokens = self.spacy_tokenizer.tokenize('Hello cats. 
I love you') + self.assertListEqual([t.text for t in tokens], + ['Hello', 'cats', '.', 'I', 'love', 'you']) + + def test_tokenize_empty_sentence(self): + tokens = self.spacy_tokenizer.tokenize('') + self.assertEqual(len(tokens), 0) + + # def test_batch_tokenize_sentence(self): + # tokens = self.spacy_tokenizer.batch_tokenize(['First sentence!', 'This is the second sentence.']) + # self.assertListEqual([t.text for t in tokens[0]], + # ['First', 'sentence', '!']) + # self.assertListEqual([t.text for t in tokens[1]], + # ['This', 'is', 'the', 'second', 'sentence', '.'])
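
A minimal usage sketch of the renamed API (illustrative only, not part of the patch); it assumes the combo package is importable and the default en_core_web_sm spaCy model is installed:

# Exercises the Token class and SpacyTokenizer exported from combo.data after this change.
from combo.data import SpacyTokenizer, Token

tokenizer = SpacyTokenizer()
for token in tokenizer.tokenize("Hello cats. I love you"):
    # The old spaCy-style attributes (lemma_, pos_, tag_, dep_) are now
    # lemma, upostag, xpostag and deprel on the unified Token class.
    print(token.text, token.lemma, token.upostag, token.deprel)

# Tokens can also be constructed directly, e.g. in tests or dataset readers.
manual = Token(text="cats", lemma="cat", upostag="NOUN", feats="Number=Plur")
print(manual, manual.text_id)  # __str__ returns the surface form; text_id stays None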