From c7c67b8e18ede85b069ed64ad383b0c4e9da306e Mon Sep 17 00:00:00 2001
From: Maja Jablonska <majajjablonska@gmail.com>
Date: Thu, 15 Feb 2024 22:21:04 +1100
Subject: [PATCH] Remove unnecessary spacy dependency

---
 combo/data/fields/text_field.py               |  2 -
 combo/data/tokenizers/__init__.py             |  1 -
 combo/data/tokenizers/lambo_tokenizer.py      | 34 ++++----
 combo/data/tokenizers/sentence_splitter.py    | 79 -------------------
 combo/main.py                                 |  8 +-
 requirements.txt                              |  3 -
 setup.py                                      |  1 -
 tests/data/tokenizers/test_lambo_tokenizer.py |  6 +-
 8 files changed, 24 insertions(+), 110 deletions(-)
 delete mode 100644 combo/data/tokenizers/sentence_splitter.py

diff --git a/combo/data/fields/text_field.py b/combo/data/fields/text_field.py
index da42252..ddd1c01 100644
--- a/combo/data/fields/text_field.py
+++ b/combo/data/fields/text_field.py
@@ -10,8 +10,6 @@ from copy import deepcopy
 from typing import Dict, List, Optional, Iterator
 import textwrap
 
-
-from spacy.tokens import Token as SpacyToken
 import torch
 
 # There are two levels of dictionaries here: the top level is for the *key*, which aligns
diff --git a/combo/data/tokenizers/__init__.py b/combo/data/tokenizers/__init__.py
index 4deeda9..5486da8 100644
--- a/combo/data/tokenizers/__init__.py
+++ b/combo/data/tokenizers/__init__.py
@@ -1,6 +1,5 @@
 from .tokenizer import Tokenizer, Token
 from .character_tokenizer import CharacterTokenizer
 from .pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
-from .sentence_splitter import SentenceSplitter, SpacySentenceSplitter
 from .whitespace_tokenizer import WhitespaceTokenizer
 from .lambo_tokenizer import LamboTokenizer
diff --git a/combo/data/tokenizers/lambo_tokenizer.py b/combo/data/tokenizers/lambo_tokenizer.py
index abb4e33..c187233 100644
--- a/combo/data/tokenizers/lambo_tokenizer.py
+++ b/combo/data/tokenizers/lambo_tokenizer.py
@@ -25,8 +25,8 @@ def _reset_idx():
 
 
 def _sentence_tokens(token: Token,
-                     split_subwords: Optional[bool] = None) -> List[Token]:
-    if split_subwords and len(token.subwords) > 0:
+                     split_multiwords: Optional[bool] = None) -> List[Token]:
+    if split_multiwords and len(token.subwords) > 0:
         subword_idxs = [next(_token_idx()) for _ in range(len(token.subwords))]
         multiword = (token.text, (subword_idxs[0], subword_idxs[-1]))
         tokens = [Token(idx=s_idx, text=subword, multiword=multiword) for (s_idx, subword)
@@ -43,24 +43,24 @@ class LamboTokenizer(Tokenizer):
             self,
             language: str = "English",
             default_split_level: str = DEFAULT_SPLIT_LEVEL,
-            default_split_subwords: bool = True
+            default_split_multiwords: bool = True
     ):
         self._language = language
         self.__tokenizer = Lambo.get(language)
         self.__default_split_level = default_split_level.upper()
 
-        self.__default_split_subwords = default_split_subwords
+        self.__default_split_multiwords = default_split_multiwords
 
     def tokenize(self,
                  text: str,
                  split_level: Optional[str] = None,
-                 split_subwords: Optional[bool] = None,
+                 split_multiwords: Optional[bool] = None,
                  multiwords: Optional[bool] = None) -> List[List[Token]]:
         """
         Simple tokenization - ignoring the sentence splits
         :param text:
         :param split_level: split on turns, sentences, or no splitting (return one list of tokens)
-        :param split_subwords: split subwords into separate tokens (e.g. can't into ca, n't)
+        :param split_multiwords: split multiword tokens into their subwords (e.g. can't into ca, n't)
         :return:
         """
         _reset_idx()
@@ -68,7 +68,7 @@ class LamboTokenizer(Tokenizer):
         tokens = []
 
         split_level = split_level if split_level is not None else self.__default_split_level
-        split_subwords = split_subwords if split_subwords is not None else self.__default_split_subwords
+        split_multiwords = split_multiwords if split_multiwords is not None else self.__default_split_multiwords
 
         if split_level.upper() == "TURN":
             for turn in document.turns:
@@ -76,7 +76,7 @@ class LamboTokenizer(Tokenizer):
                 for sentence in turn.sentences:
                     _reset_idx()
                     for token in sentence.tokens:
-                        sentence_tokens.extend(_sentence_tokens(token, split_subwords))
+                        sentence_tokens.extend(_sentence_tokens(token, split_multiwords))
                 tokens.append(sentence_tokens)
         elif split_level.upper() == "SENTENCE":
             for turn in document.turns:
@@ -84,7 +84,7 @@ class LamboTokenizer(Tokenizer):
                     _reset_idx()
                     sentence_tokens = []
                     for token in sentence.tokens:
-                        if len(token.subwords) > 0 and split_subwords:
+                        if len(token.subwords) > 0 and split_multiwords:
                             # @TODO this is a very dirty fix for Lambo model's shortcomings
                             # I noticed that for longer words with multiwords it tends to remove the last letter in the last multiword
                             # so this is a quick workaround to fix it
@@ -93,14 +93,14 @@ class LamboTokenizer(Tokenizer):
                             if "".join(token.subwords) != token.text:
                                 fixed_subwords = fix_subwords(token)
                                 token.subwords = fixed_subwords
-                        sentence_tokens.extend(_sentence_tokens(token, split_subwords))
+                        sentence_tokens.extend(_sentence_tokens(token, split_multiwords))
                     tokens.append(sentence_tokens)
         else:
             for turn in document.turns:
                 for sentence in turn.sentences:
                     _reset_idx()
                     for token in sentence.tokens:
-                        tokens.extend(_sentence_tokens(token, split_subwords))
+                        tokens.extend(_sentence_tokens(token, split_multiwords))
             tokens = [tokens]
 
         return tokens
@@ -108,17 +108,17 @@ class LamboTokenizer(Tokenizer):
     def segment(self,
                 text: str,
                 turns: Optional[bool] = None,
-                split_subwords: Optional[bool] = None) -> List[List[str]]:
+                split_multiwords: Optional[bool] = None) -> List[List[str]]:
         """
         Full segmentation - segment into sentences and return a list of strings.
         :param text:
         :param turns: segment into sentences by splitting on sentences or on turns. Default: sentences.
-        :param split_subwords: split subwords into separate tokens (e.g. can't into ca, n't)
+        :param split_multiwords: split multiword tokens into their subwords (e.g. can't into ca, n't)
         :return:
         """
 
         turns = turns if turns is not None else self.__default_split_level.upper() == "TURNS"
-        split_subwords = split_subwords if split_subwords is not None else self.__default_split_subwords
+        split_multiwords = split_multiwords if split_multiwords is not None else self.__default_split_multiwords
 
         document = self.__tokenizer.segment(text)
         sentences = []
@@ -132,7 +132,7 @@ class LamboTokenizer(Tokenizer):
                 if not turns:
                     sentence_tokens = []
                 for token in sentence.tokens:
-                    if len(token.subwords) > 0 and split_subwords:
+                    if len(token.subwords) > 0 and split_multiwords:
                         # @TODO this is a very dirty fix for Lambo model's shortcomings
                         # I noticed that for longer words with multiwords it tends to remove the last letter in the last multiword
                         # so this is a quick workaround to fix it
@@ -141,9 +141,9 @@ class LamboTokenizer(Tokenizer):
                         if "".join(token.subwords) != token.text:
                             fixed_subwords = fix_subwords(token)
                             token.subwords = fixed_subwords
-                        # sentence_tokens.extend(_sentence_tokens(token, split_subwords))
+                        # sentence_tokens.extend(_sentence_tokens(token, split_multiwords))
                     # else:
-                    sentence_tokens.extend(_sentence_tokens(token, split_subwords))
+                    sentence_tokens.extend(_sentence_tokens(token, split_multiwords))
                 if not turns:
                     sentences.append(sentence_tokens)
             if turns:
diff --git a/combo/data/tokenizers/sentence_splitter.py b/combo/data/tokenizers/sentence_splitter.py
deleted file mode 100644
index 250b01e..0000000
--- a/combo/data/tokenizers/sentence_splitter.py
+++ /dev/null
@@ -1,79 +0,0 @@
-"""
-Adapted from AllenNLP
-https://github.com/allenai/allennlp/blob/80fb6061e568cb9d6ab5d45b661e86eb61b92c82/allennlp/data/tokenizers/sentence_splitter.py
-"""
-from typing import List
-
-import spacy
-
-from combo.config import Registry
-from combo.config.from_parameters import register_arguments, FromParameters
-from combo.utils.spacy import get_spacy_model
-
-
-class SentenceSplitter(FromParameters):
-    """
-    A `SentenceSplitter` splits strings into sentences.
-    """
-
-    def split_sentences(self, text: str) -> List[str]:
-        """
-        Splits a `text` :class:`str` paragraph into a list of :class:`str`, where each is a sentence.
-        """
-        raise NotImplementedError
-
-    def batch_split_sentences(self, texts: List[str]) -> List[List[str]]:
-        """
-        Default implementation is to just iterate over the texts and call `split_sentences`.
-        """
-        return [self.split_sentences(text) for text in texts]
-
-
-@Registry.register('spacy_sentence_splitter')
-class SpacySentenceSplitter(SentenceSplitter):
-    """
-    A `SentenceSplitter` that uses spaCy's built-in sentence boundary detection.
-    Spacy's default sentence splitter uses a dependency parse to detect sentence boundaries, so
-    it is slow, but accurate.
-    Another option is to use rule-based sentence boundary detection. It's fast and has a small memory footprint,
-    since it uses punctuation to detect sentence boundaries. This can be activated with the `rule_based` flag.
-    By default, `SpacySentenceSplitter` calls the default spacy boundary detector.
-    Registered as a `SentenceSplitter` with name "spacy".
-    """
-
-    @register_arguments
-    def __init__(self, language: str = "en_core_web_sm", rule_based: bool = False) -> None:
-        self._language = language
-        self._rule_based = rule_based
-
-        # we need spacy's dependency parser if we're not using rule-based sentence boundary detection.
-        self.spacy = get_spacy_model(self._language, parse=not self._rule_based, ner=False)
-        self._is_version_3 = spacy.__version__ >= "3.0"
-        if rule_based:
-            # we use `sentencizer`, a built-in spacy module for rule-based sentence boundary detection.
-            # depending on the spacy version, it could be called 'sentencizer' or 'sbd'
-            sbd_name = "sbd" if spacy.__version__ < "2.1" else "sentencizer"
-            if not self.spacy.has_pipe(sbd_name):
-                if self._is_version_3:
-                    self.spacy.add_pipe(sbd_name)
-                else:
-                    sbd = self.spacy.create_pipe(sbd_name)
-                    self.spacy.add_pipe(sbd)
-
-    def split_sentences(self, text: str) -> List[str]:
-        if self._is_version_3:
-            return [sent.text.strip() for sent in self.spacy(text).sents]
-        else:
-            return [sent.string.strip() for sent in self.spacy(text).sents]
-
-    def batch_split_sentences(self, texts: List[str]) -> List[List[str]]:
-        """
-        This method lets you take advantage of spacy's batch processing.
-        """
-        if self._is_version_3:
-            return [
-                [sentence.text.strip() for sentence in doc.sents] for doc in self.spacy.pipe(texts)
-            ]
-        return [
-            [sentence.string.strip() for sentence in doc.sents] for doc in self.spacy.pipe(texts)
-        ]
diff --git a/combo/main.py b/combo/main.py
index 79e9820..e44feda 100755
--- a/combo/main.py
+++ b/combo/main.py
@@ -84,7 +84,7 @@ flags.DEFINE_list(name="datasets_for_vocabulary", default=["train"],
                   help="")
 flags.DEFINE_boolean(name="turns", default=False,
                      help="Segment into sentences on sentence break or on turn break.")
-flags.DEFINE_boolean(name="split_subwords", default=False,
+flags.DEFINE_boolean(name="split_multiwords", default=False,
                      help="Split subwords (e.g. don\'t = do, n\'t) into separate tokens.")
 flags.DEFINE_boolean(name="transformer_encoder", default=False, help="Use transformer encoder.")
 
@@ -160,7 +160,7 @@ def get_defaults(dataset_reader: Optional[DatasetReader],
         dataset_reader = default_ud_dataset_reader(FLAGS.pretrained_transformer_name,
                                                    tokenizer=LamboTokenizer(FLAGS.tokenizer_language,
                                                     default_split_level="TURNS" if FLAGS.turns else "SENTENCES",
-                                                    default_split_subwords=FLAGS.split_subwords)
+                                                    default_split_multiwords=FLAGS.split_multiwords)
                                                    )
 
     if not training_data_loader:
@@ -412,7 +412,7 @@ def run(_):
             dataset_reader = default_ud_dataset_reader(FLAGS.pretrained_transformer_name,
                                                         tokenizer=LamboTokenizer(tokenizer_language,
                                                         default_split_level="TURNS" if FLAGS.turns else "SENTENCES",
-                                                        default_split_subwords=FLAGS.split_subwords)
+                                                        default_split_multiwords=FLAGS.split_multiwords)
                                                        )
 
         predictor = COMBO(model, dataset_reader)
@@ -454,7 +454,7 @@ def run(_):
                 with open(FLAGS.input_file, "r", encoding='utf-8') as file:
                     input_sentences = tokenizer.segment(file.read(),
                                                         turns=FLAGS.turns,
-                                                        split_subwords=FLAGS.split_subwords)
+                                                        split_multiwords=FLAGS.split_multiwords)
                 predictions = predictor.predict(input_sentences)
                 with open(FLAGS.output_file, "w") as file:
                     for prediction in tqdm(predictions):
diff --git a/requirements.txt b/requirements.txt
index ba612dd..1dd7adb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,8 +6,6 @@ conllutils~=1.1.4
 dill~=0.3.6
 importlib-resources~=5.12.0
 h5py~=3.9.0
--i https://pypi.clarin-pl.eu/
-lambo==2.1.0
 overrides~=7.3.1
 torch~=2.0.0
 torchtext~=0.15.1
@@ -20,5 +18,4 @@ pandas~=2.1.3
 pytest~=7.2.2
 transformers~=4.27.3
 sacremoses~=0.0.53
-spacy~=3.3.1
 urllib3==1.26.6
\ No newline at end of file
diff --git a/setup.py b/setup.py
index bd002c4..28a9570 100644
--- a/setup.py
+++ b/setup.py
@@ -24,7 +24,6 @@ REQUIREMENTS = [
     "pytest~=7.2.2",
     "transformers~=4.27.3",
     "sacremoses~=0.0.53",
-    "spacy~=3.3.1",
     "urllib3==1.26.6"
 ]
 
diff --git a/tests/data/tokenizers/test_lambo_tokenizer.py b/tests/data/tokenizers/test_lambo_tokenizer.py
index 4590638..5726c2c 100644
--- a/tests/data/tokenizers/test_lambo_tokenizer.py
+++ b/tests/data/tokenizers/test_lambo_tokenizer.py
@@ -24,19 +24,19 @@ class LamboTokenizerTest(unittest.TestCase):
                              [['Hello', 'cats', '.', 'I', 'love', 'you', '.'], ['Hi', '.']])
 
     def test_segment_text_with_multiwords(self):
-        tokens = self.lambo_tokenizer.segment('I don\'t want a pizza.', split_subwords=True)
+        tokens = self.lambo_tokenizer.segment('I don\'t want a pizza.', split_multiwords=True)
         self.assertListEqual(tokens,
                              [['I', 'do', 'n\'t', 'want', 'a', 'pizza', '.']])
 
     def test_segment_text_with_multiwords_without_splitting(self):
-        tokens = self.lambo_tokenizer.tokenize('I don\'t want a pizza.', split_subwords=False)
+        tokens = self.lambo_tokenizer.tokenize('I don\'t want a pizza.', split_multiwords=False)
         self.assertListEqual([t.text for t in tokens[0]],
                              ['I', 'don\'t', 'want', 'a', 'pizza', '.'])
         self.assertListEqual([t.subwords for t in tokens[0]],
                              [[], ['do', 'n\'t'], [], [], [], []])
 
     def test_segment_text_with_multiwords_with_splitting(self):
-        tokens = self.lambo_tokenizer.tokenize('I don\'t want a pizza.', split_subwords=True)
+        tokens = self.lambo_tokenizer.tokenize('I don\'t want a pizza.', split_multiwords=True)
         self.assertListEqual([t.text for t in tokens[0]],
                              ['I', 'do', 'n\'t', 'want', 'a', 'pizza', '.'])
         self.assertListEqual([t.multiword for t in tokens[0]],
-- 
GitLab