diff --git a/combo/data/tokenizers/__init__.py b/combo/data/tokenizers/__init__.py
index 04e3c6eb9e11f73721445addebe9586e7e626cc0..a92183e98aa51d3b233d11e1fd6591407e8b70bd 100644
--- a/combo/data/tokenizers/__init__.py
+++ b/combo/data/tokenizers/__init__.py
@@ -2,4 +2,5 @@ from .tokenizer import Tokenizer, TokenizerToken
 from .character_tokenizer import CharacterTokenizer
 from .pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
 from .spacy_tokenizer import SpacyTokenizer
+from .sentence_splitter import SentenceSplitter, SpacySentenceSplitter
 from .whitespace_tokenizer import WhitespaceTokenizer
diff --git a/combo/data/tokenizers/sentence_splitter.py b/combo/data/tokenizers/sentence_splitter.py
new file mode 100644
index 0000000000000000000000000000000000000000..30051135a0746425314a878d01cf094058e501e8
--- /dev/null
+++ b/combo/data/tokenizers/sentence_splitter.py
@@ -0,0 +1,81 @@
+"""
+Adapted from AllenNLP
+https://github.com/allenai/allennlp/blob/80fb6061e568cb9d6ab5d45b661e86eb61b92c82/allennlp/data/tokenizers/sentence_splitter.py
+"""
+from typing import List, Dict, Any
+
+import spacy
+
+from combo.utils.spacy import get_spacy_model
+
+
+class SentenceSplitter:
+    """
+    A `SentenceSplitter` splits strings into sentences.
+    """
+
+    default_implementation = "spacy"
+
+    def split_sentences(self, text: str) -> List[str]:
+        """
+        Splits a `text` :class:`str` paragraph into a list of :class:`str`, where each is a sentence.
+        """
+        raise NotImplementedError
+
+    def batch_split_sentences(self, texts: List[str]) -> List[List[str]]:
+        """
+        Default implementation is to just iterate over the texts and call `split_sentences`.
+        """
+        return [self.split_sentences(text) for text in texts]
+
+
+@SentenceSplitter.register("spacy")
+class SpacySentenceSplitter(SentenceSplitter):
+    """
+    A `SentenceSplitter` that uses spaCy's built-in sentence boundary detection.
+    Spacy's default sentence splitter uses a dependency parse to detect sentence boundaries, so
+    it is slow, but accurate.
+    Another option is to use rule-based sentence boundary detection. It's fast and has a small memory footprint,
+    since it uses punctuation to detect sentence boundaries. This can be activated with the `rule_based` flag.
+    By default, `SpacySentenceSplitter` calls the default spacy boundary detector.
+    Registered as a `SentenceSplitter` with name "spacy".
+    """
+
+    def __init__(self, language: str = "en_core_web_sm", rule_based: bool = False) -> None:
+        self._language = language
+        self._rule_based = rule_based
+
+        # we need spacy's dependency parser if we're not using rule-based sentence boundary detection.
+        self.spacy = get_spacy_model(self._language, parse=not self._rule_based, ner=False)
+        self._is_version_3 = spacy.__version__ >= "3.0"
+        if rule_based:
+            # we use `sentencizer`, a built-in spacy module for rule-based sentence boundary detection.
+            # depending on the spacy version, it could be called 'sentencizer' or 'sbd'
+            sbd_name = "sbd" if spacy.__version__ < "2.1" else "sentencizer"
+            if not self.spacy.has_pipe(sbd_name):
+                if self._is_version_3:
+                    self.spacy.add_pipe(sbd_name)
+                else:
+                    sbd = self.spacy.create_pipe(sbd_name)
+                    self.spacy.add_pipe(sbd)
+
+    def split_sentences(self, text: str) -> List[str]:
+        if self._is_version_3:
+            return [sent.text.strip() for sent in self.spacy(text).sents]
+        else:
+            return [sent.string.strip() for sent in self.spacy(text).sents]
+
+    def batch_split_sentences(self, texts: List[str]) -> List[List[str]]:
+        """
+        This method lets you take advantage of spacy's batch processing.
+        """
+        if self._is_version_3:
+            return [
+                [sentence.text.strip() for sentence in doc.sents] for doc in self.spacy.pipe(texts)
+            ]
+        return [
+            [sentence.string.strip() for sentence in doc.sents] for doc in self.spacy.pipe(texts)
+        ]
+
+    def _to_params(self) -> Dict[str, Any]:
+        return {"type": "spacy", "language": self._language, "rule_based": self._rule_based}
\ No newline at end of file
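Below is a minimal usage sketch of the splitter introduced by this diff. It is illustrative only and not part of the change: it assumes the new module imports cleanly via the export added to `__init__.py`, that `combo.utils.spacy.get_spacy_model` can load the requested model, and that the `en_core_web_sm` spaCy model is installed locally. The sample text and expected output are hypothetical.

# Minimal usage sketch (illustrative, not part of the diff).
# Assumes the `en_core_web_sm` spaCy model is installed.
from combo.data.tokenizers import SpacySentenceSplitter

# Rule-based splitting: relies on spaCy's `sentencizer` pipe, which detects
# boundaries from punctuation instead of a dependency parse (faster, lighter).
splitter = SpacySentenceSplitter(language="en_core_web_sm", rule_based=True)

print(splitter.split_sentences("COMBO parses sentences. It also tags and lemmatizes tokens."))
# Expected: ['COMBO parses sentences.', 'It also tags and lemmatizes tokens.']

# Batch splitting goes through spaCy's `pipe`, which is more efficient
# than calling `split_sentences` once per paragraph.
paragraphs = ["First paragraph. Two sentences here.", "Second paragraph."]
print(splitter.batch_split_sentences(paragraphs))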