Commit 0bdbb324 authored by Maja Jabłońska

Add Spacy tokenizer adapted from AllenNLP

parent 918301af
1 merge request: !46 Merge COMBO 3.0 into master
from .tokenizer import Tokenizer, TokenizerToken
from .character_tokenizer import CharacterTokenizer
from .pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
from .spacy_tokenizer import SpacyTokenizer
"""
Adapted from AllenNLP
https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/spacy_tokenizer.py
"""
from typing import List, Optional
import spacy
from spacy.tokens import Doc
from combo.data.tokenizers.tokenizer import Tokenizer, TokenizerToken
from combo.utils.spacy import get_spacy_model
class SpacyTokenizer(Tokenizer):
"""
A `Tokenizer` that uses spaCy's tokenizer. It's fast and reasonable - this is the
recommended `Tokenizer`. By default it will return `TokenizerToken` objects,
which are small, efficient, and serializable. If you want
to keep the original spaCy tokens, pass keep_spacy_tokens=True. Note that we leave one particular piece of
post-processing for later: the decision of whether or not to lowercase the token. This is for
two reasons: (1) if you want to make two different casing decisions for whatever reason, you
won't have to run the tokenizer twice, and more importantly (2) if you want to lowercase words
for your word embedding, but retain capitalization in a character-level representation, we need
to retain the capitalization here.
Registered as a `Tokenizer` with name "spacy", which is currently the default.
# Parameters
language : `str`, optional, (default=`"en_core_web_sm"`)
Spacy model name.
pos_tags : `bool`, optional, (default=`True`)
If `True`, performs POS tagging with spacy model on the tokens.
Generally used in conjunction with :class:`~allennlp.data.token_indexers.pos_tag_indexer.PosTagIndexer`.
parse : `bool`, optional, (default=`False`)
If `True`, performs dependency parsing with spacy model on the tokens.
Generally used in conjunction with :class:`~allennlp.data.token_indexers.dep_label_indexer.DepLabelIndexer`.
ner : `bool`, optional, (default=`False`)
If `True`, performs named entity recognition with spacy model on the tokens.
Generally used in conjunction with :class:`~allennlp.data.token_indexers.ner_tag_indexer.NerTagIndexer`.
keep_spacy_tokens : `bool`, optional, (default=`False`)
If `True`, will preserve spacy token objects. We copy spacy tokens into our own class by default instead
because spacy Cython Tokens can't be pickled.
split_on_spaces : `bool`, optional, (default=`False`)
If `True`, will split by spaces without performing tokenization.
Used when your data is already tokenized, but you want to perform pos, ner or parsing on the tokens.
start_tokens : `Optional[List[str]]`, optional, (default=`None`)
If given, these tokens will be added to the beginning of every string we tokenize.
end_tokens : `Optional[List[str]]`, optional, (default=`None`)
If given, these tokens will be added to the end of every string we tokenize.
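# Example
A minimal usage sketch (illustrative only; it assumes the `en_core_web_sm` model is
available and that `TokenizerToken` exposes a `text` attribute, as AllenNLP's `Token` does):
tokenizer = SpacyTokenizer(language="en_core_web_sm", pos_tags=True)
tokens = tokenizer.tokenize("COMBO parses sentences.")
texts = [token.text for token in tokens]  # e.g. ["COMBO", "parses", "sentences", "."]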
"""
def __init__(
self,
language: str = "en_core_web_sm",
pos_tags: bool = True,
parse: bool = False,
ner: bool = False,
keep_spacy_tokens: bool = False,
split_on_spaces: bool = False,
start_tokens: Optional[List[str]] = None,
end_tokens: Optional[List[str]] = None,
) -> None:
# Save these for use later in the _to_params method
self._language = language
self._pos_tags = pos_tags
self._parse = parse
self._ner = ner
self._split_on_spaces = split_on_spaces
self.spacy = get_spacy_model(self._language, self._pos_tags, self._parse, self._ner)
if self._split_on_spaces:
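# Replace spaCy's own tokenizer with a plain whitespace splitter so that
# already-tokenized input is only split on spaces.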
self.spacy.tokenizer = _WhitespaceSpacyTokenizer(self.spacy.vocab)
self._keep_spacy_tokens = keep_spacy_tokens
self._start_tokens = start_tokens or []
# We reverse the tokens here because we're going to insert them with `insert(0)` later;
# this makes sure they show up in the right order.
self._start_tokens.reverse()
self._is_version_3 = spacy.__version__ >= "3.0"
self._end_tokens = end_tokens or []
def _sanitize(self, tokens: List[spacy.tokens.Token]) -> List[TokenizerToken]:
"""
Converts spaCy tokens to `TokenizerToken` objects. Is a no-op if
`keep_spacy_tokens` is `True` (start and end tokens are still added either way).
"""
if not self._keep_spacy_tokens:
tokens = [
TokenizerToken(
token.text,
token.idx,
token.idx + len(token.text),
token.lemma_,
token.pos_,
token.tag_,
token.dep_,
token.ent_type_,
)
for token in tokens
]
for start_token in self._start_tokens:
tokens.insert(0, TokenizerToken(start_token, 0))
for end_token in self._end_tokens:
tokens.append(TokenizerToken(end_token, -1))
return tokens
def batch_tokenize(self, texts: List[str]) -> List[List[TokenizerToken]]:
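# spaCy 3.x replaced the `n_threads` keyword of `Language.pipe` with `n_process`,
# so the argument name is chosen based on the installed spaCy version.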
if self._is_version_3:
return [
self._sanitize(_remove_spaces(tokens))
for tokens in self.spacy.pipe(texts, n_process=-1)
]
else:
return [
self._sanitize(_remove_spaces(tokens))
for tokens in self.spacy.pipe(texts, n_threads=-1)
]
def tokenize(self, text: str) -> List[TokenizerToken]:
# This works because our Token class matches spacy's.
return self._sanitize(_remove_spaces(self.spacy(text)))
def _to_params(self):
return {
"type": "spacy",
"language": self._language,
"pos_tags": self._pos_tags,
"parse": self._parse,
"ner": self._ner,
"keep_spacy_tokens": self._keep_spacy_tokens,
"split_on_spaces": self._split_on_spaces,
"start_tokens": self._start_tokens,
"end_tokens": self._end_tokens,
}
class _WhitespaceSpacyTokenizer:
"""
Spacy doesn't assume that text is tokenised. Sometimes this
is annoying, like when you have gold data which is pre-tokenised,
but Spacy's tokenisation doesn't match the gold. This can be used
as follows:
nlp = spacy.load("en_core_web_md")
# hack to replace tokenizer with a whitespace tokenizer
nlp.tokenizer = _WhitespaceSpacyTokenizer(nlp.vocab)
... use nlp("here is some text") as normal.
"""
def __init__(self, vocab):
self.vocab = vocab
def __call__(self, text):
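# Split on single spaces and build a Doc directly, bypassing spaCy's tokenizer;
# `spaces` marks each word as being followed by a space.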
words = text.split(" ")
spaces = [True] * len(words)
return Doc(self.vocab, words=words, spaces=spaces)
def _remove_spaces(tokens: List[spacy.tokens.Token]) -> List[spacy.tokens.Token]:
return [token for token in tokens if not token.is_space]
"""
Adapted from AllenNLP
https://github.com/allenai/allennlp/blob/80fb6061e568cb9d6ab5d45b661e86eb61b92c82/allennlp/common/util.py#L261
"""
import logging
from typing import Dict, Tuple
import spacy
from spacy.cli.download import download as spacy_download
from spacy.language import Language as SpacyModelType
logger = logging.getLogger(__name__)
LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool], SpacyModelType] = {}
def get_spacy_model(
spacy_model_name: str, pos_tags: bool = True, parse: bool = False, ner: bool = False
) -> SpacyModelType:
"""
In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
keyed by the options we used to create the spacy model, so any particular configuration only
gets loaded once.
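For example (an illustrative sketch), two calls with the same arguments return the same cached object:
get_spacy_model("en_core_web_sm") is get_spacy_model("en_core_web_sm")  # True once the first call has loaded it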
"""
options = (spacy_model_name, pos_tags, parse, ner)
if options not in LOADED_SPACY_MODELS:
disable = ["vectors", "textcat"]
if not pos_tags:
disable.append("tagger")
if not parse:
disable.append("parser")
if not ner:
disable.append("ner")
try:
spacy_model = spacy.load(spacy_model_name, disable=disable)
except OSError:
logger.warning(
f"Spacy models '{spacy_model_name}' not found. Downloading and installing."
)
spacy_download(spacy_model_name)
# Import the downloaded model module directly and load from there
spacy_model_module = __import__(spacy_model_name)
spacy_model = spacy_model_module.load(disable=disable) # type: ignore
LOADED_SPACY_MODELS[options] = spacy_model
return LOADED_SPACY_MODELS[options]