diff --git a/combo/data/tokenizers/__init__.py b/combo/data/tokenizers/__init__.py
index 71d32e6df96485d7367cd661246e306831801917..cadae7e755774a9a9ded2c6e7ef03786f042c2ed 100644
--- a/combo/data/tokenizers/__init__.py
+++ b/combo/data/tokenizers/__init__.py
@@ -1,2 +1,2 @@
-from .tokenizer import Tokenizer
+from .tokenizer import Tokenizer, TokenizerToken
 from .character_tokenizer import CharacterTokenizer
diff --git a/combo/data/tokenizers/character_tokenizer.py b/combo/data/tokenizers/character_tokenizer.py
index 5302e7b8d4ee7c508a6286091310a11eb02af749..f46c0f6724638b6c0cbf99cf946f7dc1e4155169 100644
--- a/combo/data/tokenizers/character_tokenizer.py
+++ b/combo/data/tokenizers/character_tokenizer.py
@@ -1,11 +1,86 @@
-from typing import List
+"""
+Adapted from AllenNLP
+https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/character_tokenizer.py
+"""
 
-from combo.data import Token
-from combo.data.tokenizers import Tokenizer
-from overrides import override
+from typing import List, Union, Dict, Any
+
+from combo.data.tokenizers import Tokenizer, TokenizerToken
 
 
 class CharacterTokenizer(Tokenizer):
-    @override
-    def tokenize(self, text: str) -> List[Token]:
-        return [Token(c) for c in list(text)]
+    """
+    A `CharacterTokenizer` splits strings into character tokens.
+    Registered as a `Tokenizer` with name "character".
+    # Parameters
+    byte_encoding : `str`, optional (default=`None`)
+        If not `None`, we will use this encoding to encode the string as bytes, and use the byte
+        sequence as characters, instead of the unicode characters in the python string. E.g., the
+        character 'á' would be a single token if this option is `None`, but it would be two
+        tokens if this option is set to `"utf-8"`.
+        If this is not `None`, `tokenize` will return a `List[int]` instead of a
+        `List[str]`, and we will bypass the vocabulary in the `TokenIndexer`.
+    lowercase_characters : `bool`, optional (default=`False`)
+        If `True`, we will lowercase all of the characters in the text before doing any other
+        operation. You probably do not want to do this, as character vocabularies are generally
+        not very large to begin with, but it's an option if you really want it.
+    start_tokens : `List[str]`, optional
+        If given, these tokens will be added to the beginning of every string we tokenize. If
+        using byte encoding, this should actually be a `List[int]`, not a `List[str]`.
+    end_tokens : `List[str]`, optional
+        If given, these tokens will be added to the end of every string we tokenize. If using byte
+        encoding, this should actually be a `List[int]`, not a `List[str]`.
+    """
+
+    def __init__(
+        self,
+        byte_encoding: str = None,
+        lowercase_characters: bool = False,
+        start_tokens: List[Union[str, int]] = None,
+        end_tokens: List[Union[str, int]] = None,
+    ) -> None:
+        # TODO(brendanr): Add length truncation.
+        self._byte_encoding = byte_encoding
+        self._lowercase_characters = lowercase_characters
+        self._start_tokens = start_tokens or []
+        # We reverse the tokens here because we're going to insert them with `insert(0)` later;
+        # this makes sure they show up in the right order.
+        self._start_tokens.reverse()
+        self._end_tokens = end_tokens or []
+
+    def tokenize(self, text: str) -> List[TokenizerToken]:
+        if self._lowercase_characters:
+            text = text.lower()
+        if self._byte_encoding is not None:
+            # We add 1 here so that we can still use 0 for masking, no matter what bytes we get out
+            # of this.
+            tokens = [TokenizerToken(text_id=c + 1) for c in text.encode(self._byte_encoding)]
+        else:
+            tokens = [TokenizerToken(t) for t in list(text)]
+        for start_token in self._start_tokens:
+            if isinstance(start_token, int):
+                token = TokenizerToken(text_id=start_token, idx=0)
+            else:
+                token = TokenizerToken(text=start_token, idx=0)
+            tokens.insert(0, token)
+        for end_token in self._end_tokens:
+            if isinstance(end_token, int):
+                token = TokenizerToken(text_id=end_token, idx=0)
+            else:
+                token = TokenizerToken(text=end_token, idx=0)
+            tokens.append(token)
+        return tokens
+
+    def __eq__(self, other) -> bool:
+        if isinstance(self, other.__class__):
+            return self.__dict__ == other.__dict__
+        return NotImplemented
+
+    def _to_params(self) -> Dict[str, Any]:
+        return {
+            "type": "character",
+            "byte_encoding": self._byte_encoding,
+            "lowercase_characters": self._lowercase_characters,
+            "start_tokens": self._start_tokens,
+            "end_tokens": self._end_tokens,
+        }
diff --git a/combo/data/tokenizers/pretrained_transformer_tokenizer.py b/combo/data/tokenizers/pretrained_transformer_tokenizer.py
index f9e2ba84808eac52a6e44b068b35c02a45b64843..60102f7369a50b871066060a8fbbec7c28e2a686 100644
--- a/combo/data/tokenizers/pretrained_transformer_tokenizer.py
+++ b/combo/data/tokenizers/pretrained_transformer_tokenizer.py
@@ -14,8 +14,7 @@
 import dill
 from transformers import PreTrainedTokenizer, AutoTokenizer
 
-from combo.data import Token
-from combo.data.tokenizers import Tokenizer
+from combo.data.tokenizers import Tokenizer, TokenizerToken
 from combo.utils import sanitize_wordpiece
 
 logger = logging.getLogger(__name__)
@@ -181,7 +180,7 @@ class PretrainedTransformerTokenizer(Tokenizer):
                 self.sequence_pair_second_token_type_id = token_type_id
                 continue
 
-            token = Token(
+            token = TokenizerToken(
                 tokenizer_with_special_tokens.convert_ids_to_tokens(token_id),
                 text_id=token_id,
                 type_id=token_type_id,
@@ -227,7 +226,7 @@ class PretrainedTransformerTokenizer(Tokenizer):
                 self.single_sequence_token_type_id = token_type_id
                 continue
 
-            token = Token(
+            token = TokenizerToken(
                 tokenizer_with_special_tokens.convert_ids_to_tokens(token_id),
                 text_id=token_id,
                 type_id=token_type_id,
@@ -251,7 +250,7 @@ class PretrainedTransformerTokenizer(Tokenizer):
         detokenized = " ".join(tokenized)
         return "a" in detokenized
 
-    def tokenize(self, text: str) -> List[Token]:
+    def tokenize(self, text: str) -> List[TokenizerToken]:
         """
         This method only handles a single sentence (or sequence) of text.
""" @@ -300,7 +299,7 @@ class PretrainedTransformerTokenizer(Tokenizer): start, end = offsets tokens.append( - Token( + TokenizerToken( text=self.tokenizer.convert_ids_to_tokens(token_id, skip_special_tokens=False), text_id=token_id, type_id=token_type_id, @@ -374,8 +373,8 @@ class PretrainedTransformerTokenizer(Tokenizer): def _intra_word_tokenize( self, string_tokens: List[str] - ) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]: - tokens: List[Token] = [] + ) -> Tuple[List[TokenizerToken], List[Optional[Tuple[int, int]]]]: + tokens: List[TokenizerToken] = [] offsets: List[Optional[Tuple[int, int]]] = [] for token_string in string_tokens: wordpieces = self.tokenizer.encode_plus( @@ -390,7 +389,7 @@ class PretrainedTransformerTokenizer(Tokenizer): if len(wp_ids) > 0: offsets.append((len(tokens), len(tokens) + len(wp_ids) - 1)) tokens.extend( - Token(text=wp_text, text_id=wp_id) + TokenizerToken(text=wp_text, text_id=wp_id) for wp_id, wp_text in zip(wp_ids, self.tokenizer.convert_ids_to_tokens(wp_ids)) ) else: @@ -408,7 +407,7 @@ class PretrainedTransformerTokenizer(Tokenizer): def intra_word_tokenize( self, string_tokens: List[str] - ) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]: + ) -> Tuple[List[TokenizerToken], List[Optional[Tuple[int, int]]]]: """ Tokenizes each word into wordpieces separately and returns the wordpiece IDs. Also calculates offsets such that tokens[offsets[i][0]:offsets[i][1] + 1] @@ -422,7 +421,7 @@ class PretrainedTransformerTokenizer(Tokenizer): def intra_word_tokenize_sentence_pair( self, string_tokens_a: List[str], string_tokens_b: List[str] - ) -> Tuple[List[Token], List[Optional[Tuple[int, int]]], List[Optional[Tuple[int, int]]]]: + ) -> Tuple[List[TokenizerToken], List[Optional[Tuple[int, int]]], List[Optional[Tuple[int, int]]]]: """ Tokenizes each word into wordpieces separately and returns the wordpiece IDs. Also calculates offsets such that wordpieces[offsets[i][0]:offsets[i][1] + 1] @@ -445,9 +444,9 @@ class PretrainedTransformerTokenizer(Tokenizer): return tokens_a, offsets_a, offsets_b def add_special_tokens( - self, tokens1: List[Token], tokens2: Optional[List[Token]] = None - ) -> List[Token]: - def with_new_type_id(tokens: List[Token], type_id: int) -> List[Token]: + self, tokens1: List[TokenizerToken], tokens2: Optional[List[TokenizerToken]] = None + ) -> List[TokenizerToken]: + def with_new_type_id(tokens: List[TokenizerToken], type_id: int) -> List[TokenizerToken]: return [dataclasses.replace(t, type_id=type_id) for t in tokens] # Make sure we don't change the input parameters diff --git a/combo/data/tokenizers/tokenizer.py b/combo/data/tokenizers/tokenizer.py index b9bd901410c0585f64f65372f9aa51b29ff92a80..f3969f7da1fcc379504a5b3b0900dbe5a3766308 100644 --- a/combo/data/tokenizers/tokenizer.py +++ b/combo/data/tokenizers/tokenizer.py @@ -1,15 +1,196 @@ -from typing import List +""" +Adapted from AllenNLP +https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/token_class.py +https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/tokenizer.py +""" + +from typing import List, Optional import logging +from dataclasses import dataclass -from combo.data import Token +logger = logging.getLogger(__name__) -logger = logging.getLogger(__name__) +@dataclass(init=False, repr=False) +class TokenizerToken: + """ + A simple token representation, keeping track of the token's text, offset in the passage it was + taken from, POS tag, dependency relation, and similar information. 
These fields match spacy's + exactly, so we can just use a spacy token for this. + # Parameters + text : `str`, optional + The original text represented by this token. + idx : `int`, optional + The character offset of this token into the tokenized passage. + idx_end : `int`, optional + The character offset one past the last character in the tokenized passage. + lemma_ : `str`, optional + The lemma of this token. + pos_ : `str`, optional + The coarse-grained part of speech of this token. + tag_ : `str`, optional + The fine-grained part of speech of this token. + dep_ : `str`, optional + The dependency relation for this token. + ent_type_ : `str`, optional + The entity type (i.e., the NER tag) for this token. + text_id : `int`, optional + If your tokenizer returns integers instead of strings (e.g., because you're doing byte + encoding, or some hash-based embedding), set this with the integer. If this is set, we + will bypass the vocabulary when indexing this token, regardless of whether `text` is also + set. You can `also` set `text` with the original text, if you want, so that you can + still use a character-level representation in addition to a hash-based word embedding. + type_id : `int`, optional + Token type id used by some pretrained language models like original BERT + The other fields on `Token` follow the fields on spacy's `Token` object; this is one we + added, similar to spacy's `lex_id`. + """ + + __slots__ = [ + "text", + "idx", + "idx_end", + "lemma_", + "pos_", + "tag_", + "dep_", + "ent_type_", + "text_id", + "type_id", + ] + # Defining the `__slots__` of this class is an optimization that dramatically reduces + # the size in memory of a `Token` instance. The downside of using `__slots__` + # with a dataclass is that you can't assign default values at the class level, + # which is why we need a custom `__init__` function that provides the default values. + + text: Optional[str] + idx: Optional[int] + idx_end: Optional[int] + lemma_: Optional[str] + pos_: Optional[str] + tag_: Optional[str] + dep_: Optional[str] + ent_type_: Optional[str] + text_id: Optional[int] + type_id: Optional[int] + + def __init__( + self, + text: str = None, + idx: int = None, + idx_end: int = None, + lemma_: str = None, + pos_: str = None, + tag_: str = None, + dep_: str = None, + ent_type_: str = None, + text_id: int = None, + type_id: int = None, + ) -> None: + assert text is None or isinstance( + text, str + ) # Some very hard to debug errors happen when this is not true. + self.text = text + self.idx = idx + self.idx_end = idx_end + self.lemma_ = lemma_ + self.pos_ = pos_ + self.tag_ = tag_ + self.dep_ = dep_ + self.ent_type_ = ent_type_ + self.text_id = text_id + self.type_id = type_id + + def __str__(self): + return self.text + + def __repr__(self): + return self.__str__() + + def ensure_text(self) -> str: + """ + Return the `text` field, raising an exception if it's `None`. + """ + if self.text is None: + raise ValueError("Unexpected null text for token") + else: + return self.text + + +def show_token(token: TokenizerToken) -> str: + return ( + f"{token.text} " + f"(idx: {token.idx}) " + f"(idx_end: {token.idx_end}) " + f"(lemma: {token.lemma_}) " + f"(pos: {token.pos_}) " + f"(tag: {token.tag_}) " + f"(dep: {token.dep_}) " + f"(ent_type: {token.ent_type_}) " + f"(text_id: {token.text_id}) " + f"(type_id: {token.type_id}) " + ) class Tokenizer: - def tokenize(self, text: str) -> List[Token]: - raise NotImplementedError + """ + A `Tokenizer` splits strings of text into tokens. 
Typically, this either splits text into + word tokens or character tokens, and those are the two tokenizer subclasses we have implemented + here, though you could imagine wanting to do other kinds of tokenization for structured or + other inputs. + See the parameters to, e.g., :class:`~.SpacyTokenizer`, or whichever tokenizer + you want to use. + If the base input to your model is words, you should use a :class:`~.SpacyTokenizer`, even if + you also want to have a character-level encoder to get an additional vector for each word + token. Splitting word tokens into character arrays is handled separately, in the + :class:`..token_representations.TokenRepresentation` class. + """ - def batch_tokenize(self, texts: List[str]) -> List[List[Token]]: + default_implementation = "spacy" + + def batch_tokenize(self, texts: List[str]) -> List[List[TokenizerToken]]: + """ + Batches together tokenization of several texts, in case that is faster for particular + tokenizers. + By default we just do this without batching. Override this in your tokenizer if you have a + good way of doing batched computation. + """ return [self.tokenize(text) for text in texts] + + def tokenize(self, text: str) -> List[TokenizerToken]: + """ + Actually implements splitting words into tokens. + # Returns + tokens : `List[Token]` + """ + raise NotImplementedError + + def add_special_tokens( + self, tokens1: List[TokenizerToken], tokens2: Optional[List[TokenizerToken]] = None + ) -> List[TokenizerToken]: + """ + Adds special tokens to tokenized text. These are tokens like [CLS] or [SEP]. + Not all tokenizers do this. The default is to just return the tokens unchanged. + # Parameters + tokens1 : `List[Token]` + The list of tokens to add special tokens to. + tokens2 : `Optional[List[Token]]` + An optional second list of tokens. This will be concatenated with `tokens1`. Special tokens will be + added as appropriate. + # Returns + tokens : `List[Token]` + The combined list of tokens, with special tokens added. + """ + return tokens1 + (tokens2 or []) + + def num_special_tokens_for_sequence(self) -> int: + """ + Returns the number of special tokens added for a single sequence. + """ + return 0 + + def num_special_tokens_for_pair(self) -> int: + """ + Returns the number of special tokens added for a pair of sequences. + """ + return 0
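Usage sketch for the classes introduced above (illustrative only): it assumes `combo.data.tokenizers` re-exports `CharacterTokenizer` and `TokenizerToken` as in the updated `__init__.py`, and the expected outputs in the comments follow from the `tokenize` implementation in this patch; the 259/260 special-token ids are arbitrary example values.

    from combo.data.tokenizers import CharacterTokenizer, TokenizerToken

    # Plain character tokenization: one TokenizerToken per unicode character,
    # with optional start/end markers wrapped around the sequence.
    tokenizer = CharacterTokenizer(start_tokens=["<s>"], end_tokens=["</s>"])
    tokens = tokenizer.tokenize("abc")
    assert all(isinstance(t, TokenizerToken) for t in tokens)
    print([t.text for t in tokens])          # ['<s>', 'a', 'b', 'c', '</s>']

    # Byte-level tokenization: each token carries text_id = byte value + 1,
    # so that 0 stays free for masking; 'á' is two UTF-8 bytes (0xC3, 0xA1).
    byte_tokenizer = CharacterTokenizer(byte_encoding="utf-8", start_tokens=[259], end_tokens=[260])
    byte_tokens = byte_tokenizer.tokenize("á")
    print([t.text_id for t in byte_tokens])  # [259, 196, 162, 260]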