Commit 26afe8e7 authored by Maja Jablonska

Add SentenceSplitter

parent 497db05f
Merge request !46: Merge COMBO 3.0 into master
__init__.py:

@@ -2,4 +2,5 @@ from .tokenizer import Tokenizer, TokenizerToken
 from .character_tokenizer import CharacterTokenizer
 from .pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
 from .spacy_tokenizer import SpacyTokenizer
+from .sentence_splitter import SentenceSplitter, SpacySentenceSplitter
 from .whitespace_tokenizer import WhitespaceTokenizer
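With the export in place, the splitter becomes importable from the tokenizers package. A minimal sketch, assuming the package resolves as combo.data.tokenizers (the full path is not shown in this diff):

from combo.data.tokenizers import SentenceSplitter, SpacySentenceSplitter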
"""
Adapted from AllenNLP
https://github.com/allenai/allennlp/blob/80fb6061e568cb9d6ab5d45b661e86eb61b92c82/allennlp/data/tokenizers/sentence_splitter.py
"""
from typing import List, Dict, Any
import spacy
from combo.utils.spacy import get_spacy_model
class SentenceSplitter:
"""
A `SentenceSplitter` splits strings into sentences.
"""
default_implementation = "spacy"
def split_sentences(self, text: str) -> List[str]:
"""
Splits a `text` :class:`str` paragraph into a list of :class:`str`, where each is a sentence.
"""
raise NotImplementedError
def batch_split_sentences(self, texts: List[str]) -> List[List[str]]:
"""
Default implementation is to just iterate over the texts and call `split_sentences`.
"""
return [self.split_sentences(text) for text in texts]
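Because the base class only requires `split_sentences`, alternative splitters are easy to plug in. A minimal sketch of a custom subclass (illustrative only, not part of this commit; the class name and regex are assumptions) that splits on sentence-final punctuation:

import re


class RegexSentenceSplitter(SentenceSplitter):
    """Illustrative only: splits on '.', '!' or '?' followed by whitespace."""

    def split_sentences(self, text: str) -> List[str]:
        # The lookbehind keeps the punctuation attached to the preceding sentence.
        return [s for s in re.split(r"(?<=[.!?])\s+", text.strip()) if s]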
# NOTE: `register` is supplied by AllenNLP's `Registrable` base class in the original
# code; the plain base class above needs an equivalent classmethod for this decorator
# to resolve.
@SentenceSplitter.register("spacy")
class SpacySentenceSplitter(SentenceSplitter):
    """
    A `SentenceSplitter` that uses spaCy's built-in sentence boundary detection.

    spaCy's default sentence splitter uses a dependency parse to detect sentence
    boundaries, so it is slow but accurate.

    Another option is rule-based sentence boundary detection, which is fast and has a
    small memory footprint, since it uses only punctuation to detect sentence
    boundaries. It can be activated with the `rule_based` flag.

    By default, `SpacySentenceSplitter` calls the default spaCy boundary detector.

    Registered as a `SentenceSplitter` with name "spacy".
    """

    def __init__(self, language: str = "en_core_web_sm", rule_based: bool = False) -> None:
        self._language = language
        self._rule_based = rule_based
        # We need spaCy's dependency parser if we're not using rule-based sentence
        # boundary detection.
        self.spacy = get_spacy_model(self._language, parse=not self._rule_based, ner=False)
        # String comparison, as in the AllenNLP original.
        self._is_version_3 = spacy.__version__ >= "3.0"
        if rule_based:
            # We use `sentencizer`, a built-in spaCy pipeline component for rule-based
            # sentence boundary detection. Depending on the spaCy version, it is called
            # 'sentencizer' or 'sbd'.
            sbd_name = "sbd" if spacy.__version__ < "2.1" else "sentencizer"
            if not self.spacy.has_pipe(sbd_name):
                if self._is_version_3:
                    # spaCy >= 3 adds built-in components by name.
                    self.spacy.add_pipe(sbd_name)
                else:
                    # spaCy 2.x requires creating the component first.
                    sbd = self.spacy.create_pipe(sbd_name)
                    self.spacy.add_pipe(sbd)
    def split_sentences(self, text: str) -> List[str]:
        if self._is_version_3:
            return [sent.text.strip() for sent in self.spacy(text).sents]
        else:
            # `Span.string` was removed in spaCy 3; it is only valid on 2.x.
            return [sent.string.strip() for sent in self.spacy(text).sents]

    def batch_split_sentences(self, texts: List[str]) -> List[List[str]]:
        """
        This method lets you take advantage of spaCy's batch processing.
        """
        if self._is_version_3:
            return [
                [sentence.text.strip() for sentence in doc.sents] for doc in self.spacy.pipe(texts)
            ]
        return [
            [sentence.string.strip() for sentence in doc.sents] for doc in self.spacy.pipe(texts)
        ]

    def _to_params(self) -> Dict[str, Any]:
        return {"type": "spacy", "language": self._language, "rule_based": self._rule_based}