From c7c67b8e18ede85b069ed64ad383b0c4e9da306e Mon Sep 17 00:00:00 2001 From: Maja Jablonska <majajjablonska@gmail.com> Date: Thu, 15 Feb 2024 22:21:04 +1100 Subject: [PATCH] Remove unnecessary spacy dependence --- combo/data/fields/text_field.py | 2 - combo/data/tokenizers/__init__.py | 1 - combo/data/tokenizers/lambo_tokenizer.py | 34 ++++---- combo/data/tokenizers/sentence_splitter.py | 79 ------------------- combo/main.py | 8 +- requirements.txt | 3 - setup.py | 1 - tests/data/tokenizers/test_lambo_tokenizer.py | 6 +- 8 files changed, 24 insertions(+), 110 deletions(-) delete mode 100644 combo/data/tokenizers/sentence_splitter.py diff --git a/combo/data/fields/text_field.py b/combo/data/fields/text_field.py index da42252..ddd1c01 100644 --- a/combo/data/fields/text_field.py +++ b/combo/data/fields/text_field.py @@ -10,8 +10,6 @@ from copy import deepcopy from typing import Dict, List, Optional, Iterator import textwrap - -from spacy.tokens import Token as SpacyToken import torch # There are two levels of dictionaries here: the top level is for the *key*, which aligns diff --git a/combo/data/tokenizers/__init__.py b/combo/data/tokenizers/__init__.py index 4deeda9..5486da8 100644 --- a/combo/data/tokenizers/__init__.py +++ b/combo/data/tokenizers/__init__.py @@ -1,6 +1,5 @@ from .tokenizer import Tokenizer, Token from .character_tokenizer import CharacterTokenizer from .pretrained_transformer_tokenizer import PretrainedTransformerTokenizer -from .sentence_splitter import SentenceSplitter, SpacySentenceSplitter from .whitespace_tokenizer import WhitespaceTokenizer from .lambo_tokenizer import LamboTokenizer diff --git a/combo/data/tokenizers/lambo_tokenizer.py b/combo/data/tokenizers/lambo_tokenizer.py index abb4e33..c187233 100644 --- a/combo/data/tokenizers/lambo_tokenizer.py +++ b/combo/data/tokenizers/lambo_tokenizer.py @@ -25,8 +25,8 @@ def _reset_idx(): def _sentence_tokens(token: Token, - split_subwords: Optional[bool] = None) -> List[Token]: - if split_subwords and len(token.subwords) > 0: + split_multiwords: Optional[bool] = None) -> List[Token]: + if split_multiwords and len(token.subwords) > 0: subword_idxs = [next(_token_idx()) for _ in range(len(token.subwords))] multiword = (token.text, (subword_idxs[0], subword_idxs[-1])) tokens = [Token(idx=s_idx, text=subword, multiword=multiword) for (s_idx, subword) @@ -43,24 +43,24 @@ class LamboTokenizer(Tokenizer): self, language: str = "English", default_split_level: str = DEFAULT_SPLIT_LEVEL, - default_split_subwords: bool = True + default_split_multiwords: bool = True ): self._language = language self.__tokenizer = Lambo.get(language) self.__default_split_level = default_split_level.upper() - self.__default_split_subwords = default_split_subwords + self.__default_split_multiwords = default_split_multiwords def tokenize(self, text: str, split_level: Optional[str] = None, - split_subwords: Optional[bool] = None, + split_multiwords: Optional[bool] = None, multiwords: Optional[bool] = None) -> List[List[Token]]: """ Simple tokenization - ignoring the sentence splits :param text: :param split_level: split on turns, sentences, or no splitting (return one list of tokens) - :param split_subwords: split subwords into separate tokens (e.g. can't into ca, n't) + :param split_multiwords: split subwords into separate tokens (e.g. can't into ca, n't) :return: """ _reset_idx() @@ -68,7 +68,7 @@ class LamboTokenizer(Tokenizer): tokens = [] split_level = split_level if split_level is not None else self.__default_split_level - split_subwords = split_subwords if split_subwords is not None else self.__default_split_subwords + split_multiwords = split_multiwords if split_multiwords is not None else self.__default_split_multiwords if split_level.upper() == "TURN": for turn in document.turns: @@ -76,7 +76,7 @@ class LamboTokenizer(Tokenizer): for sentence in turn.sentences: _reset_idx() for token in sentence.tokens: - sentence_tokens.extend(_sentence_tokens(token, split_subwords)) + sentence_tokens.extend(_sentence_tokens(token, split_multiwords)) tokens.append(sentence_tokens) elif split_level.upper() == "SENTENCE": for turn in document.turns: @@ -84,7 +84,7 @@ class LamboTokenizer(Tokenizer): _reset_idx() sentence_tokens = [] for token in sentence.tokens: - if len(token.subwords) > 0 and split_subwords: + if len(token.subwords) > 0 and split_multiwords: # @TODO this is a very dirty fix for Lambo model's shortcomings # I noticed that for longer words with multiwords it tends to remove the last letter in the last multiword # so this is a quick workaround to fix it @@ -93,14 +93,14 @@ class LamboTokenizer(Tokenizer): if "".join(token.subwords) != token.text: fixed_subwords = fix_subwords(token) token.subwords = fixed_subwords - sentence_tokens.extend(_sentence_tokens(token, split_subwords)) + sentence_tokens.extend(_sentence_tokens(token, split_multiwords)) tokens.append(sentence_tokens) else: for turn in document.turns: for sentence in turn.sentences: _reset_idx() for token in sentence.tokens: - tokens.extend(_sentence_tokens(token, split_subwords)) + tokens.extend(_sentence_tokens(token, split_multiwords)) tokens = [tokens] return tokens @@ -108,17 +108,17 @@ class LamboTokenizer(Tokenizer): def segment(self, text: str, turns: Optional[bool] = None, - split_subwords: Optional[bool] = None) -> List[List[str]]: + split_multiwords: Optional[bool] = None) -> List[List[str]]: """ Full segmentation - segment into sentences and return a list of strings. :param text: :param turns: segment into sentences by splitting on sentences or on turns. Default: sentences. - :param split_subwords: split subwords into separate tokens (e.g. can't into ca, n't) + :param split_multiwords: split subwords into separate tokens (e.g. can't into ca, n't) :return: """ turns = turns if turns is not None else self.__default_split_level.upper() == "TURNS" - split_subwords = split_subwords if split_subwords is not None else self.__default_split_subwords + split_multiwords = split_multiwords if split_multiwords is not None else self.__default_split_multiwords document = self.__tokenizer.segment(text) sentences = [] @@ -132,7 +132,7 @@ class LamboTokenizer(Tokenizer): if not turns: sentence_tokens = [] for token in sentence.tokens: - if len(token.subwords) > 0 and split_subwords: + if len(token.subwords) > 0 and split_multiwords: # @TODO this is a very dirty fix for Lambo model's shortcomings # I noticed that for longer words with multiwords it tends to remove the last letter in the last multiword # so this is a quick workaround to fix it @@ -141,9 +141,9 @@ class LamboTokenizer(Tokenizer): if "".join(token.subwords) != token.text: fixed_subwords = fix_subwords(token) token.subwords = fixed_subwords - # sentence_tokens.extend(_sentence_tokens(token, split_subwords)) + # sentence_tokens.extend(_sentence_tokens(token, split_multiwords)) # else: - sentence_tokens.extend(_sentence_tokens(token, split_subwords)) + sentence_tokens.extend(_sentence_tokens(token, split_multiwords)) if not turns: sentences.append(sentence_tokens) if turns: diff --git a/combo/data/tokenizers/sentence_splitter.py b/combo/data/tokenizers/sentence_splitter.py deleted file mode 100644 index 250b01e..0000000 --- a/combo/data/tokenizers/sentence_splitter.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -Adapted from AllenNLP -https://github.com/allenai/allennlp/blob/80fb6061e568cb9d6ab5d45b661e86eb61b92c82/allennlp/data/tokenizers/sentence_splitter.py -""" -from typing import List - -import spacy - -from combo.config import Registry -from combo.config.from_parameters import register_arguments, FromParameters -from combo.utils.spacy import get_spacy_model - - -class SentenceSplitter(FromParameters): - """ - A `SentenceSplitter` splits strings into sentences. - """ - - def split_sentences(self, text: str) -> List[str]: - """ - Splits a `text` :class:`str` paragraph into a list of :class:`str`, where each is a sentence. - """ - raise NotImplementedError - - def batch_split_sentences(self, texts: List[str]) -> List[List[str]]: - """ - Default implementation is to just iterate over the texts and call `split_sentences`. - """ - return [self.split_sentences(text) for text in texts] - - -@Registry.register('spacy_sentence_splitter') -class SpacySentenceSplitter(SentenceSplitter): - """ - A `SentenceSplitter` that uses spaCy's built-in sentence boundary detection. - Spacy's default sentence splitter uses a dependency parse to detect sentence boundaries, so - it is slow, but accurate. - Another option is to use rule-based sentence boundary detection. It's fast and has a small memory footprint, - since it uses punctuation to detect sentence boundaries. This can be activated with the `rule_based` flag. - By default, `SpacySentenceSplitter` calls the default spacy boundary detector. - Registered as a `SentenceSplitter` with name "spacy". - """ - - @register_arguments - def __init__(self, language: str = "en_core_web_sm", rule_based: bool = False) -> None: - self._language = language - self._rule_based = rule_based - - # we need spacy's dependency parser if we're not using rule-based sentence boundary detection. - self.spacy = get_spacy_model(self._language, parse=not self._rule_based, ner=False) - self._is_version_3 = spacy.__version__ >= "3.0" - if rule_based: - # we use `sentencizer`, a built-in spacy module for rule-based sentence boundary detection. - # depending on the spacy version, it could be called 'sentencizer' or 'sbd' - sbd_name = "sbd" if spacy.__version__ < "2.1" else "sentencizer" - if not self.spacy.has_pipe(sbd_name): - if self._is_version_3: - self.spacy.add_pipe(sbd_name) - else: - sbd = self.spacy.create_pipe(sbd_name) - self.spacy.add_pipe(sbd) - - def split_sentences(self, text: str) -> List[str]: - if self._is_version_3: - return [sent.text.strip() for sent in self.spacy(text).sents] - else: - return [sent.string.strip() for sent in self.spacy(text).sents] - - def batch_split_sentences(self, texts: List[str]) -> List[List[str]]: - """ - This method lets you take advantage of spacy's batch processing. - """ - if self._is_version_3: - return [ - [sentence.text.strip() for sentence in doc.sents] for doc in self.spacy.pipe(texts) - ] - return [ - [sentence.string.strip() for sentence in doc.sents] for doc in self.spacy.pipe(texts) - ] diff --git a/combo/main.py b/combo/main.py index 79e9820..e44feda 100755 --- a/combo/main.py +++ b/combo/main.py @@ -84,7 +84,7 @@ flags.DEFINE_list(name="datasets_for_vocabulary", default=["train"], help="") flags.DEFINE_boolean(name="turns", default=False, help="Segment into sentences on sentence break or on turn break.") -flags.DEFINE_boolean(name="split_subwords", default=False, +flags.DEFINE_boolean(name="split_multiwords", default=False, help="Split subwords (e.g. don\'t = do, n\'t) into separate tokens.") flags.DEFINE_boolean(name="transformer_encoder", default=False, help="Use transformer encoder.") @@ -160,7 +160,7 @@ def get_defaults(dataset_reader: Optional[DatasetReader], dataset_reader = default_ud_dataset_reader(FLAGS.pretrained_transformer_name, tokenizer=LamboTokenizer(FLAGS.tokenizer_language, default_split_level="TURNS" if FLAGS.turns else "SENTENCES", - default_split_subwords=FLAGS.split_subwords) + default_split_multiwords=FLAGS.split_multiwords) ) if not training_data_loader: @@ -412,7 +412,7 @@ def run(_): dataset_reader = default_ud_dataset_reader(FLAGS.pretrained_transformer_name, tokenizer=LamboTokenizer(tokenizer_language, default_split_level="TURNS" if FLAGS.turns else "SENTENCES", - default_split_subwords=FLAGS.split_subwords) + default_split_multiwords=FLAGS.split_multiwords) ) predictor = COMBO(model, dataset_reader) @@ -454,7 +454,7 @@ def run(_): with open(FLAGS.input_file, "r", encoding='utf-8') as file: input_sentences = tokenizer.segment(file.read(), turns=FLAGS.turns, - split_subwords=FLAGS.split_subwords) + split_multiwords=FLAGS.split_multiwords) predictions = predictor.predict(input_sentences) with open(FLAGS.output_file, "w") as file: for prediction in tqdm(predictions): diff --git a/requirements.txt b/requirements.txt index ba612dd..1dd7adb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,8 +6,6 @@ conllutils~=1.1.4 dill~=0.3.6 importlib-resources~=5.12.0 h5py~=3.9.0 --i https://pypi.clarin-pl.eu/ -lambo==2.1.0 overrides~=7.3.1 torch~=2.0.0 torchtext~=0.15.1 @@ -20,5 +18,4 @@ pandas~=2.1.3 pytest~=7.2.2 transformers~=4.27.3 sacremoses~=0.0.53 -spacy~=3.3.1 urllib3==1.26.6 \ No newline at end of file diff --git a/setup.py b/setup.py index bd002c4..28a9570 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,6 @@ REQUIREMENTS = [ "pytest~=7.2.2", "transformers~=4.27.3", "sacremoses~=0.0.53", - "spacy~=3.3.1", "urllib3==1.26.6" ] diff --git a/tests/data/tokenizers/test_lambo_tokenizer.py b/tests/data/tokenizers/test_lambo_tokenizer.py index 4590638..5726c2c 100644 --- a/tests/data/tokenizers/test_lambo_tokenizer.py +++ b/tests/data/tokenizers/test_lambo_tokenizer.py @@ -24,19 +24,19 @@ class LamboTokenizerTest(unittest.TestCase): [['Hello', 'cats', '.', 'I', 'love', 'you', '.'], ['Hi', '.']]) def test_segment_text_with_multiwords(self): - tokens = self.lambo_tokenizer.segment('I don\'t want a pizza.', split_subwords=True) + tokens = self.lambo_tokenizer.segment('I don\'t want a pizza.', split_multiwords=True) self.assertListEqual(tokens, [['I', 'do', 'n\'t', 'want', 'a', 'pizza', '.']]) def test_segment_text_with_multiwords_without_splitting(self): - tokens = self.lambo_tokenizer.tokenize('I don\'t want a pizza.', split_subwords=False) + tokens = self.lambo_tokenizer.tokenize('I don\'t want a pizza.', split_multiwords=False) self.assertListEqual([t.text for t in tokens[0]], ['I', 'don\'t', 'want', 'a', 'pizza', '.']) self.assertListEqual([t.subwords for t in tokens[0]], [[], ['do', 'n\'t'], [], [], [], []]) def test_segment_text_with_multiwords_with_splitting(self): - tokens = self.lambo_tokenizer.tokenize('I don\'t want a pizza.', split_subwords=True) + tokens = self.lambo_tokenizer.tokenize('I don\'t want a pizza.', split_multiwords=True) self.assertListEqual([t.text for t in tokens[0]], ['I', 'do', 'n\'t', 'want', 'a', 'pizza', '.']) self.assertListEqual([t.multiword for t in tokens[0]], -- GitLab