From dbf99b170da220152c2f3d507b78da70559c46e9 Mon Sep 17 00:00:00 2001
From: Maja Jablonska <majajjablonska@gmail.com>
Date: Tue, 16 Jan 2024 10:37:00 +0100
Subject: [PATCH] Add optional subwords splitting

---
 combo/combo_model.py                          |  1 -
 combo/data/tokenizers/lambo_tokenizer.py      | 15 +++++++++++----
 combo/default_model.py                        |  9 +++++++--
 combo/main.py                                 | 15 +++++++++++----
 combo/models/encoder.py                       |  2 +-
 combo/nn/utils.py                             |  7 ++++++-
 docs/Training.md                              |  6 ++++--
 tests/data/tokenizers/test_lambo_tokenizer.py | 16 ++++++++++++++--
 8 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/combo/combo_model.py b/combo/combo_model.py
index 3ab90c8..b87eb1c 100644
--- a/combo/combo_model.py
+++ b/combo/combo_model.py
@@ -173,7 +173,6 @@ class ComboModel(Model, FromParameters):
         mapped_gold_labels = []
         for _, cat_indices in self.morphological_feat.slices.items():
             try:
-                print("Feats dimensions: ", feats.shape, "cat_indices shape", len(cat_indices), "min and max", min(cat_indices), max(cat_indices))
                 mapped_gold_labels.append(feats[:, :, cat_indices].argmax(dim=-1))
             except TypeError:
                 raise ConfigurationError('Feats is None - if no feats are provided, the morphological_feat property should be set to None.')
diff --git a/combo/data/tokenizers/lambo_tokenizer.py b/combo/data/tokenizers/lambo_tokenizer.py
index c11b605..6842958 100644
--- a/combo/data/tokenizers/lambo_tokenizer.py
+++ b/combo/data/tokenizers/lambo_tokenizer.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 
 from lambo.segmenter.lambo import Lambo
 
@@ -13,10 +13,14 @@ class LamboTokenizer(Tokenizer):
     @register_arguments
     def __init__(
             self,
-            language: str = "English"
+            language: str = "English",
+            default_turns: bool = False,
+            default_split_subwords: bool = False
     ):
         self._language = language
         self.__tokenizer = Lambo.get(language)
+        self.__default_turns = default_turns
+        self.__default_split_subwords = default_split_subwords
 
     def tokenize(self, text: str) -> List[Token]:
         """
@@ -34,7 +38,7 @@ class LamboTokenizer(Tokenizer):
         return tokens
 
-    def segment(self, text: str, turns: bool = False) -> List[List[str]]:
+    def segment(self, text: str, turns: Optional[bool] = None, split_subwords: Optional[bool] = None) -> List[List[str]]:
         """
         Full segmentation - segment into sentences.
 
         :param text:
@@ -42,6 +46,9 @@
         :return:
         """
 
+        turns = turns or self.__default_turns
+        split_subwords = split_subwords or self.__default_split_subwords
+
         document = self.__tokenizer.segment(text)
         sentences = []
         sentence_tokens = []
@@ -53,7 +60,7 @@ class LamboTokenizer(Tokenizer):
                 if not turns:
                     sentence_tokens = []
                 for token in sentence.tokens:
-                    if len(token.subwords) > 0:
+                    if len(token.subwords) > 0 and split_subwords:
                         sentence_tokens.extend([s for s in token.subwords])
                     else:
                         sentence_tokens.append(token.text)
diff --git a/combo/default_model.py b/combo/default_model.py
index b3fe5db..d074e62 100644
--- a/combo/default_model.py
+++ b/combo/default_model.py
@@ -1,6 +1,7 @@
 """
 A default model and classes, mainly for testing the package
 """
+from typing import Optional
 
 from combo.data import DatasetReader
 from combo.data.batch import Batch
@@ -23,6 +24,7 @@ from combo.modules import FeedForwardPredictor
 from combo.nn.base import Linear
 from combo.nn.regularizers.regularizers import L2Regularizer
 from combo.nn import RegularizerApplicator
+from combo.data import Tokenizer, LamboTokenizer
 
 
 def default_character_indexer(namespace=None,
@@ -42,7 +44,9 @@
         )
 
 
-def default_ud_dataset_reader(pretrained_transformer_name: str) -> UniversalDependenciesDatasetReader:
+def default_ud_dataset_reader(pretrained_transformer_name: str,
+                              tokenizer: Optional[Tokenizer] = None) -> UniversalDependenciesDatasetReader:
+    tokenizer = tokenizer or LamboTokenizer()
     return UniversalDependenciesDatasetReader(
         features=["token", "char"],
         lemma_indexers={
@@ -63,7 +67,8 @@ def default_ud_dataset_reader(pretrained_transformer_name: str) -> UniversalDepe
             namespace="xpostag"
         )
     },
-    use_sem=False
+    use_sem=False,
+    tokenizer=tokenizer
 )
 
 
diff --git a/combo/main.py b/combo/main.py
index 5fe926f..4f0a846 100755
--- a/combo/main.py
+++ b/combo/main.py
@@ -63,7 +63,7 @@ flags.DEFINE_integer(name="batch_size", default=256,
                      help="Batch size")
 flags.DEFINE_integer(name="batches_per_epoch", default=16,
                      help="Number of batches per epoch")
-flags.DEFINE_string(name="pretrained_transformer_name", default="",
+flags.DEFINE_string(name="pretrained_transformer_name", default="bert-base-cased",
                     help="Pretrained transformer model name (see transformers from HuggingFace library for list of "
                          "available models) for transformers based embeddings.")
 flags.DEFINE_list(name="features", default=["token", "char"],
@@ -82,6 +84,8 @@ flags.DEFINE_list(name="datasets_for_vocabulary", default=["train"],
                   help="")
 flags.DEFINE_boolean(name="turns", default=False,
                      help="Segment into sentences on sentence break or on turn break.")
+flags.DEFINE_boolean(name="split_subwords", default=False,
+                     help="Split subwords (e.g. don\'t = do, n\'t) into separate tokens.")
 
 # Finetune after training flags
 flags.DEFINE_string(name="finetuning_training_data_path", default="",
@@ -396,8 +398,11 @@ def run(_):
         if not dataset_reader:
             logger.info("No dataset reader in the configuration or archive file - using a default UD dataset reader",
                         prefix=prefix)
-            dataset_reader = default_ud_dataset_reader(FLAGS.pretrained_transformer_name)
-            dataset_reader.tokenizer = LamboTokenizer(tokenizer_language)
+            dataset_reader = default_ud_dataset_reader(FLAGS.pretrained_transformer_name,
+                                                       LamboTokenizer(tokenizer_language,
+                                                                      default_turns=FLAGS.turns,
+                                                                      default_split_subwords=FLAGS.split_subwords)
+                                                       )
 
         predictor = COMBO(model, dataset_reader)
 
@@ -436,7 +441,9 @@ def run(_):
             else:
                 tokenizer = LamboTokenizer(tokenizer_language)
                 with open(FLAGS.input_file, "r", encoding='utf-8') as file:
-                    input_sentences = tokenizer.segment(file.read(), turns=FLAGS.turns)
+                    input_sentences = tokenizer.segment(file.read(),
+                                                        turns=FLAGS.turns,
+                                                        split_subwords=FLAGS.split_subwords)
             predictions = predictor.predict(input_sentences)
             with open(FLAGS.output_file, "w") as file:
                 for prediction in tqdm(predictions):
diff --git a/combo/models/encoder.py b/combo/models/encoder.py
index 3f22f15..8ed5aff 100644
--- a/combo/models/encoder.py
+++ b/combo/models/encoder.py
@@ -225,7 +225,7 @@ class ComboStackedBidirectionalLSTM(StackedBidirectionalLstm, FromParameters):
 
         return output_sequence, (state_fwd, state_bwd)
 
-@Registry.register( 'combo_encoder')
+@Registry.register('combo_encoder')
 class ComboEncoder(Seq2SeqEncoder, FromParameters):
     """COMBO encoder (https://www.aclweb.org/anthology/K18-2004.pdf).
 
diff --git a/combo/nn/utils.py b/combo/nn/utils.py
index 4333be0..fbb17df 100644
--- a/combo/nn/utils.py
+++ b/combo/nn/utils.py
@@ -14,7 +14,12 @@ StateDictType = Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"]
 def masked_cross_entropy(pred: torch.Tensor, true: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
     pred = pred + (mask.float().unsqueeze(-1) + 1e-45).log()
-    return F.cross_entropy(pred, true, reduction="none") * mask
+    try:
+        return F.cross_entropy(pred, true, reduction="none") * mask
+    except Exception as e:
+        print("pred shape", pred.shape, "true shape", true.shape, "mask shape", mask.shape)
+        print(F.cross_entropy(pred, true, reduction="none").shape)
+        raise e
 
 
 def tiny_value_of_dtype(dtype: torch.dtype):
     """
diff --git a/docs/Training.md b/docs/Training.md
index d4f724b..65bf1df 100644
--- a/docs/Training.md
+++ b/docs/Training.md
@@ -5,7 +5,8 @@ Basic command:
 ```bash
 python combo/main.py --mode train \
        --training_data_path your_training_path \
-       --validation_data_path your_validation_path
+       --validation_data_path your_validation_path \
+       --serialization_dir directory_to_store_model_output_in
 ```
 
 Options:
@@ -16,7 +17,8 @@ python combo/main.py --helpfull
 
 ## Examples
 
-For clarity, the training and validation data paths are omitted.
+For clarity, the training and validation data paths, as well as serialization directory path,
+are omitted.
 
 Train on multiple accelerators (default: train on all available ones)
 ```bash
diff --git a/tests/data/tokenizers/test_lambo_tokenizer.py b/tests/data/tokenizers/test_lambo_tokenizer.py
index 2945a82..08e18d4 100644
--- a/tests/data/tokenizers/test_lambo_tokenizer.py
+++ b/tests/data/tokenizers/test_lambo_tokenizer.py
@@ -15,11 +15,23 @@ class LamboTokenizerTest(unittest.TestCase):
 
     def test_segment_text(self):
         tokens = self.lambo_tokenizer.segment('Hello cats. I love you.\n\nHi.')
-        self.assertListEqual(tokens, [['Hello', 'cats', '.'], ['I', 'love', 'you', '.'], ['Hi', '.']])
+        self.assertListEqual(tokens,
+                             [['Hello', 'cats', '.'], ['I', 'love', 'you', '.'], ['Hi', '.']])
 
     def test_segment_text_with_turns(self):
         tokens = self.lambo_tokenizer.segment('Hello cats. I love you.\n\nHi.', turns=True)
-        self.assertListEqual(tokens, [['Hello', 'cats', '.', 'I', 'love', 'you', '.'], ['Hi', '.']])
+        self.assertListEqual(tokens,
+                             [['Hello', 'cats', '.', 'I', 'love', 'you', '.'], ['Hi', '.']])
+
+    def test_segment_text_with_multiwords(self):
+        tokens = self.lambo_tokenizer.segment('I don\'t want a pizza.', split_subwords=True)
+        self.assertListEqual(tokens,
+                             [['I', 'do', 'n\'t', 'want', 'a', 'pizza', '.']])
+
+    def test_segment_text_with_multiwords_without_splitting(self):
+        tokens = self.lambo_tokenizer.segment('I don\'t want a pizza.', split_subwords=False)
+        self.assertListEqual(tokens,
+                             [['I', 'don\'t', 'want', 'a', 'pizza', '.']])
 
     def test_tokenize_sentence_with_multiword(self):
         tokens = self.lambo_tokenizer.tokenize('I don\'t like apples.')
--
GitLab
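
Usage note (not part of the patch): the sketch below exercises the `split_subwords` option introduced above, using only the API this change adds (`LamboTokenizer(language, default_turns=..., default_split_subwords=...)` and `segment(text, turns=..., split_subwords=...)`); the expected outputs mirror the new tests in `test_lambo_tokenizer.py`. At prediction time the same behaviour is exposed through the new `--split_subwords` flag alongside the existing `--turns` flag in `combo/main.py`.

```python
from combo.data import LamboTokenizer  # import path as used in default_model.py above

# Per-call override: split multiword tokens (e.g. "don't") into their subwords.
tokenizer = LamboTokenizer("English")
print(tokenizer.segment("I don't want a pizza.", split_subwords=True))
# [['I', 'do', "n't", 'want', 'a', 'pizza', '.']]

# Or set it as the instance default, so callers that never pass the argument
# (e.g. the default UD dataset reader) still get subword splitting.
tokenizer = LamboTokenizer("English", default_turns=False, default_split_subwords=True)
print(tokenizer.segment("I don't want a pizza."))
# [['I', 'do', "n't", 'want', 'a', 'pizza', '.']]
```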