Commit b5f16ddf authored by Maja Jabłońska, committed by Martyna Wiącek

Add AllenNLP's Token

parent 07d074d7
1 merge request: !46 Merge COMBO 3.0 into master
-from .tokenizer import Tokenizer
+from .tokenizer import Tokenizer, TokenizerToken
from .character_tokenizer import CharacterTokenizer
-from typing import List
+"""
+Adapted from AllenNLP
+https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/character_tokenizer.py
+"""
-from combo.data import Token
-from combo.data.tokenizers import Tokenizer
-from overrides import override
+from typing import List, Union, Dict, Any
+from combo.data.tokenizers import Tokenizer, TokenizerToken

class CharacterTokenizer(Tokenizer):
-    @override
-    def tokenize(self, text: str) -> List[Token]:
-        return [Token(c) for c in list(text)]
"""
A `CharacterTokenizer` splits strings into character tokens.
Registered as a `Tokenizer` with name "character".
# Parameters
byte_encoding : `str`, optional (default=`None`)
If not `None`, we will use this encoding to encode the string as bytes, and use the byte
sequence as characters, instead of the unicode characters in the python string. E.g., the
character 'á' would be a single token if this option is `None`, but it would be two
tokens if this option is set to `"utf-8"`.
If this is not `None`, `tokenize` will return a `List[int]` instead of a
`List[str]`, and we will bypass the vocabulary in the `TokenIndexer`.
lowercase_characters : `bool`, optional (default=`False`)
If `True`, we will lowercase all of the characters in the text before doing any other
operation. You probably do not want to do this, as character vocabularies are generally
not very large to begin with, but it's an option if you really want it.
start_tokens : `List[str]`, optional
If given, these tokens will be added to the beginning of every string we tokenize. If
using byte encoding, this should actually be a `List[int]`, not a `List[str]`.
end_tokens : `List[str]`, optional
If given, these tokens will be added to the end of every string we tokenize. If using byte
encoding, this should actually be a `List[int]`, not a `List[str]`.
"""
def __init__(
self,
byte_encoding: str = None,
lowercase_characters: bool = False,
start_tokens: List[Union[str, int]] = None,
end_tokens: List[Union[str, int]] = None,
) -> None:
# TODO(brendanr): Add length truncation.
self._byte_encoding = byte_encoding
self._lowercase_characters = lowercase_characters
self._start_tokens = start_tokens or []
# We reverse the tokens here because we're going to insert them with `insert(0)` later;
# this makes sure they show up in the right order.
self._start_tokens.reverse()
self._end_tokens = end_tokens or []
def tokenize(self, text: str) -> List[TokenizerToken]:
if self._lowercase_characters:
text = text.lower()
if self._byte_encoding is not None:
# We add 1 here so that we can still use 0 for masking, no matter what bytes we get out
# of this.
tokens = [TokenizerToken(text_id=c + 1) for c in text.encode(self._byte_encoding)]
else:
tokens = [TokenizerToken(t) for t in list(text)]
for start_token in self._start_tokens:
if isinstance(start_token, int):
token = TokenizerToken(text_id=start_token, idx=0)
else:
token = TokenizerToken(text=start_token, idx=0)
tokens.insert(0, token)
for end_token in self._end_tokens:
if isinstance(end_token, int):
token = TokenizerToken(text_id=end_token, idx=0)
else:
token = TokenizerToken(text=end_token, idx=0)
tokens.append(token)
return tokens
def __eq__(self, other) -> bool:
if isinstance(self, other.__class__):
return self.__dict__ == other.__dict__
return NotImplemented
def _to_params(self) -> Dict[str, Any]:
return {
"type": "character",
"byte_encoding": self._byte_encoding,
"lowercase_characters": self._lowercase_characters,
"start_tokens": self._start_tokens,
"end_tokens": self._end_tokens,
}
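For reference, a small usage sketch of the class above. It is not part of the commit; it assumes `CharacterTokenizer` is exported from `combo.data.tokenizers` (as the `__init__` change above suggests) and that the package is installed.

# Illustrative sketch only, not part of this commit. Assumes the import path below.
from combo.data.tokenizers import CharacterTokenizer

# Plain character tokenization: one TokenizerToken per unicode character,
# with optional start/end markers.
tokenizer = CharacterTokenizer(start_tokens=["<s>"], end_tokens=["</s>"])
tokens = tokenizer.tokenize("Hi á")
print([t.text for t in tokens])  # ['<s>', 'H', 'i', ' ', 'á', '</s>']

# Byte-level tokenization: tokens carry text_id = byte value + 1 instead of text,
# so 'á' becomes two tokens under UTF-8 (0xC3 + 1, 0xA1 + 1).
byte_tokenizer = CharacterTokenizer(byte_encoding="utf-8")
print([t.text_id for t in byte_tokenizer.tokenize("á")])  # [196, 162]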
@@ -14,8 +14,7 @@ import dill
from transformers import PreTrainedTokenizer, AutoTokenizer
-from combo.data import Token
-from combo.data.tokenizers import Tokenizer
+from combo.data.tokenizers import Tokenizer, TokenizerToken
from combo.utils import sanitize_wordpiece
logger = logging.getLogger(__name__)
@@ -181,7 +180,7 @@ class PretrainedTransformerTokenizer(Tokenizer):
self.sequence_pair_second_token_type_id = token_type_id
continue
-token = Token(
+token = TokenizerToken(
tokenizer_with_special_tokens.convert_ids_to_tokens(token_id),
text_id=token_id,
type_id=token_type_id,
@@ -227,7 +226,7 @@ class PretrainedTransformerTokenizer(Tokenizer):
self.single_sequence_token_type_id = token_type_id
continue
-token = Token(
+token = TokenizerToken(
tokenizer_with_special_tokens.convert_ids_to_tokens(token_id),
text_id=token_id,
type_id=token_type_id,
@@ -251,7 +250,7 @@ class PretrainedTransformerTokenizer(Tokenizer):
detokenized = " ".join(tokenized)
return "a" in detokenized
-def tokenize(self, text: str) -> List[Token]:
+def tokenize(self, text: str) -> List[TokenizerToken]:
"""
This method only handles a single sentence (or sequence) of text.
"""
@@ -300,7 +299,7 @@ class PretrainedTransformerTokenizer(Tokenizer):
start, end = offsets
tokens.append(
-Token(
+TokenizerToken(
text=self.tokenizer.convert_ids_to_tokens(token_id, skip_special_tokens=False),
text_id=token_id,
type_id=token_type_id,
@@ -374,8 +373,8 @@ class PretrainedTransformerTokenizer(Tokenizer):
def _intra_word_tokenize(
self, string_tokens: List[str]
-) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]:
-tokens: List[Token] = []
+) -> Tuple[List[TokenizerToken], List[Optional[Tuple[int, int]]]]:
+tokens: List[TokenizerToken] = []
offsets: List[Optional[Tuple[int, int]]] = []
for token_string in string_tokens:
wordpieces = self.tokenizer.encode_plus(
@@ -390,7 +389,7 @@ class PretrainedTransformerTokenizer(Tokenizer):
if len(wp_ids) > 0:
offsets.append((len(tokens), len(tokens) + len(wp_ids) - 1))
tokens.extend(
-Token(text=wp_text, text_id=wp_id)
+TokenizerToken(text=wp_text, text_id=wp_id)
for wp_id, wp_text in zip(wp_ids, self.tokenizer.convert_ids_to_tokens(wp_ids))
)
else:
@@ -408,7 +407,7 @@ class PretrainedTransformerTokenizer(Tokenizer):
def intra_word_tokenize(
self, string_tokens: List[str]
-) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]:
+) -> Tuple[List[TokenizerToken], List[Optional[Tuple[int, int]]]]:
"""
Tokenizes each word into wordpieces separately and returns the wordpiece IDs.
Also calculates offsets such that tokens[offsets[i][0]:offsets[i][1] + 1]
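As a reading aid (not part of the commit), the offsets contract described above can be pictured as follows; the wordpiece strings are purely hypothetical and depend on the underlying Hugging Face tokenizer.

# Hypothetical illustration of the offsets returned by intra_word_tokenize.
# Suppose intra_word_tokenize(["unaffable", "cat"]) produced these wordpieces:
#   tokens  = ["un", "##aff", "##able", "cat"]   (as TokenizerToken objects)
#   offsets = [(0, 2), (3, 3)]
# Then tokens[offsets[i][0]:offsets[i][1] + 1] gives the wordpieces of word i:
#   tokens[0:3] -> ["un", "##aff", "##able"]   # word 0, "unaffable"
#   tokens[3:4] -> ["cat"]                     # word 1, "cat"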
@@ -422,7 +421,7 @@ class PretrainedTransformerTokenizer(Tokenizer):
def intra_word_tokenize_sentence_pair(
self, string_tokens_a: List[str], string_tokens_b: List[str]
-) -> Tuple[List[Token], List[Optional[Tuple[int, int]]], List[Optional[Tuple[int, int]]]]:
+) -> Tuple[List[TokenizerToken], List[Optional[Tuple[int, int]]], List[Optional[Tuple[int, int]]]]:
"""
Tokenizes each word into wordpieces separately and returns the wordpiece IDs.
Also calculates offsets such that wordpieces[offsets[i][0]:offsets[i][1] + 1]
@@ -445,9 +444,9 @@ class PretrainedTransformerTokenizer(Tokenizer):
return tokens_a, offsets_a, offsets_b
def add_special_tokens(
-self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
-) -> List[Token]:
-def with_new_type_id(tokens: List[Token], type_id: int) -> List[Token]:
+self, tokens1: List[TokenizerToken], tokens2: Optional[List[TokenizerToken]] = None
+) -> List[TokenizerToken]:
+def with_new_type_id(tokens: List[TokenizerToken], type_id: int) -> List[TokenizerToken]:
return [dataclasses.replace(t, type_id=type_id) for t in tokens]
# Make sure we don't change the input parameters
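A short sketch (not from the commit) of the `dataclasses.replace` call that `with_new_type_id` above relies on: it copies a `TokenizerToken` with selected fields overridden, so the input tokens are never mutated.

# Illustrative sketch only, not part of the commit.
import dataclasses

from combo.data.tokenizers import TokenizerToken  # import path as used in this commit

t = TokenizerToken(text="hello", text_id=7, type_id=0)
t2 = dataclasses.replace(t, type_id=1)  # copy with a new token type id
print(t.type_id, t2.type_id)  # 0 1 -- the original token is unchanged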
-from typing import List
+"""
+Adapted from AllenNLP
+https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/token_class.py
+https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/tokenizer.py
+"""
+from typing import List, Optional
import logging
+from dataclasses import dataclass
-from combo.data import Token

logger = logging.getLogger(__name__)
@dataclass(init=False, repr=False)
class TokenizerToken:
"""
A simple token representation, keeping track of the token's text, offset in the passage it was
taken from, POS tag, dependency relation, and similar information. These fields match spacy's
exactly, so we can just use a spacy token for this.
# Parameters
text : `str`, optional
The original text represented by this token.
idx : `int`, optional
The character offset of this token into the tokenized passage.
idx_end : `int`, optional
The character offset one past the last character in the tokenized passage.
lemma_ : `str`, optional
The lemma of this token.
pos_ : `str`, optional
The coarse-grained part of speech of this token.
tag_ : `str`, optional
The fine-grained part of speech of this token.
dep_ : `str`, optional
The dependency relation for this token.
ent_type_ : `str`, optional
The entity type (i.e., the NER tag) for this token.
text_id : `int`, optional
If your tokenizer returns integers instead of strings (e.g., because you're doing byte
encoding, or some hash-based embedding), set this with the integer. If this is set, we
will bypass the vocabulary when indexing this token, regardless of whether `text` is also
set. You can `also` set `text` with the original text, if you want, so that you can
still use a character-level representation in addition to a hash-based word embedding.
type_id : `int`, optional
Token type id used by some pretrained language models like original BERT
The other fields on `Token` follow the fields on spacy's `Token` object; this is one we
added, similar to spacy's `lex_id`.
"""
    __slots__ = [
        "text",
        "idx",
        "idx_end",
        "lemma_",
        "pos_",
        "tag_",
        "dep_",
        "ent_type_",
        "text_id",
        "type_id",
    ]
    # Defining the `__slots__` of this class is an optimization that dramatically reduces
    # the size in memory of a `Token` instance. The downside of using `__slots__`
    # with a dataclass is that you can't assign default values at the class level,
    # which is why we need a custom `__init__` function that provides the default values.

    text: Optional[str]
    idx: Optional[int]
    idx_end: Optional[int]
    lemma_: Optional[str]
    pos_: Optional[str]
    tag_: Optional[str]
    dep_: Optional[str]
    ent_type_: Optional[str]
    text_id: Optional[int]
    type_id: Optional[int]
    def __init__(
        self,
        text: str = None,
        idx: int = None,
        idx_end: int = None,
        lemma_: str = None,
        pos_: str = None,
        tag_: str = None,
        dep_: str = None,
        ent_type_: str = None,
        text_id: int = None,
        type_id: int = None,
    ) -> None:
        assert text is None or isinstance(
            text, str
        )  # Some very hard to debug errors happen when this is not true.
        self.text = text
        self.idx = idx
        self.idx_end = idx_end
        self.lemma_ = lemma_
        self.pos_ = pos_
        self.tag_ = tag_
        self.dep_ = dep_
        self.ent_type_ = ent_type_
        self.text_id = text_id
        self.type_id = type_id

    def __str__(self):
        return self.text

    def __repr__(self):
        return self.__str__()

    def ensure_text(self) -> str:
        """
        Return the `text` field, raising an exception if it's `None`.
        """
        if self.text is None:
            raise ValueError("Unexpected null text for token")
        else:
            return self.text
def show_token(token: TokenizerToken) -> str:
    return (
        f"{token.text} "
        f"(idx: {token.idx}) "
        f"(idx_end: {token.idx_end}) "
        f"(lemma: {token.lemma_}) "
        f"(pos: {token.pos_}) "
        f"(tag: {token.tag_}) "
        f"(dep: {token.dep_}) "
        f"(ent_type: {token.ent_type_}) "
        f"(text_id: {token.text_id}) "
        f"(type_id: {token.type_id}) "
    )
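A brief usage sketch of `TokenizerToken` and `show_token` above (illustration only, not part of the commit):

# Illustration only, not part of the commit.
token = TokenizerToken(text="dogs", idx=10, idx_end=14, lemma_="dog", pos_="NOUN")

print(token)                # "dogs" -- __str__ and __repr__ return the text
print(token.ensure_text())  # "dogs"; would raise ValueError if text were None
print(show_token(token))    # one-line dump of every field, Nones included

# Tokens built from integer ids only (e.g. byte encoding) may have no text at all:
id_only = TokenizerToken(text_id=98)
# id_only.ensure_text()  # would raise ValueError("Unexpected null text for token")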
class Tokenizer:
-    def tokenize(self, text: str) -> List[Token]:
-        raise NotImplementedError
    """
    A `Tokenizer` splits strings of text into tokens. Typically, this either splits text into
    word tokens or character tokens, and those are the two tokenizer subclasses we have implemented
    here, though you could imagine wanting to do other kinds of tokenization for structured or
    other inputs.

    See the parameters to, e.g., :class:`~.SpacyTokenizer`, or whichever tokenizer
    you want to use.

    If the base input to your model is words, you should use a :class:`~.SpacyTokenizer`, even if
    you also want to have a character-level encoder to get an additional vector for each word
    token. Splitting word tokens into character arrays is handled separately, in the
    :class:`..token_representations.TokenRepresentation` class.
    """

-    def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
    default_implementation = "spacy"

    def batch_tokenize(self, texts: List[str]) -> List[List[TokenizerToken]]:
        """
        Batches together tokenization of several texts, in case that is faster for particular
        tokenizers.

        By default we just do this without batching. Override this in your tokenizer if you have a
        good way of doing batched computation.
        """
        return [self.tokenize(text) for text in texts]

    def tokenize(self, text: str) -> List[TokenizerToken]:
        """
        Actually implements splitting words into tokens.

        # Returns

        tokens : `List[Token]`
        """
        raise NotImplementedError

    def add_special_tokens(
        self, tokens1: List[TokenizerToken], tokens2: Optional[List[TokenizerToken]] = None
    ) -> List[TokenizerToken]:
        """
        Adds special tokens to tokenized text. These are tokens like [CLS] or [SEP].

        Not all tokenizers do this. The default is to just return the tokens unchanged.

        # Parameters

        tokens1 : `List[Token]`
            The list of tokens to add special tokens to.
        tokens2 : `Optional[List[Token]]`
            An optional second list of tokens. This will be concatenated with `tokens1`. Special tokens will be
            added as appropriate.

        # Returns

        tokens : `List[Token]`
            The combined list of tokens, with special tokens added.
        """
        return tokens1 + (tokens2 or [])

    def num_special_tokens_for_sequence(self) -> int:
        """
        Returns the number of special tokens added for a single sequence.
        """
        return 0

    def num_special_tokens_for_pair(self) -> int:
        """
        Returns the number of special tokens added for a pair of sequences.
        """
        return 0
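To make the interface concrete, here is a minimal hypothetical subclass (not part of the commit or the repository); it shows which method a new tokenizer has to provide and which defaults it inherits.

# Hypothetical example subclass, not part of the commit.
class WhitespaceTokenizer(Tokenizer):
    """Splits text on whitespace; inherits batch_tokenize and the special-token defaults."""

    def tokenize(self, text: str) -> List[TokenizerToken]:
        tokens = []
        search_from = 0
        for word in text.split():
            start = text.index(word, search_from)  # character offset of this word
            end = start + len(word)                # one past the last character
            tokens.append(TokenizerToken(text=word, idx=start, idx_end=end))
            search_from = end
        return tokens

# WhitespaceTokenizer().tokenize("a cat")                  -> tokens "a", "cat" with offsets
# WhitespaceTokenizer().batch_tokenize(["a", "b c"])       -> uses the default per-text loop above
# WhitespaceTokenizer().num_special_tokens_for_sequence()  -> 0 (default)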