Commit 18627a23 authored by Maja Jablonska

Add LAMBO as a tokenizer

parent 82d70415
1 merge request: !46 Merge COMBO 3.0 into master
......@@ -4,6 +4,6 @@ from .samplers import TokenCountBatchSampler
from .instance import Instance
from .token_indexers import (SingleIdTokenIndexer, TokenIndexer, TokenFeatsIndexer)
from .tokenizers import (Tokenizer, Token, CharacterTokenizer, PretrainedTransformerTokenizer,
SpacyTokenizer, WhitespaceTokenizer)
SpacyTokenizer, WhitespaceTokenizer, LamboTokenizer)
from .dataset_readers import (ConllDatasetReader, DatasetReader,
TextClassificationJSONReader, UniversalDependenciesDatasetReader)
"""
Adapted from AllenNLP
https://github.com/allenai/allennlp/blob/main/allennlp/data/data_loaders/data_loader.py
"""
from typing import Dict, Union, Iterator
import torch
from combo.data import Vocabulary, Instance
TensorDict = Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]
"""
`TensorDict` is the type we use for batches.
"""
class DataLoader:
    """
    A `DataLoader` is responsible for generating batches of instances from a
    [`DatasetReader`](/api/data/dataset_readers/dataset_reader/#datasetreader),
    or another source of data.

    This is purely an abstract base class. All concrete subclasses must provide
    implementations of the following methods:

    - [`__iter__()`](#__iter__) that creates an iterable of `TensorDict`s,
    - [`iter_instances()`](#iter_instances) that creates an iterable of `Instance`s,
    - [`index_with()`](#index_with) that should index the data with a vocabulary, and
    - [`set_target_device()`](#set_target_device), which updates the device that batch
      tensors should be put on when they are generated in `__iter__()`.

    Additionally, this class should also implement `__len__()` when possible.

    The default implementation is
    [`MultiProcessDataLoader`](../multiprocess_data_loader/#multiprocessdataloader).
    """

    default_implementation = "multiprocess"

    def __len__(self) -> int:
        raise TypeError

    def __iter__(self) -> Iterator[TensorDict]:
        raise NotImplementedError

    def iter_instances(self) -> Iterator[Instance]:
        raise NotImplementedError

    def index_with(self, vocab: Vocabulary) -> None:
        raise NotImplementedError

    def set_target_device(self, device: torch.device) -> None:
        raise NotImplementedError
"""
Adapted from AllenNLP
https://github.com/allenai/allennlp/blob/main/allennlp/data/data_loaders/simple_data_loader.py
"""
import math
import random
from typing import Optional, List, Iterator
import torch
from allennlp.common.util import lazy_groups_of
from allennlp.common.tqdm import Tqdm
from allennlp.data.data_loaders.data_collator import DefaultDataCollator
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.vocabulary import Vocabulary
import allennlp.nn.util as nn_util
from combo.data.dataset_loaders.dataset_loader import DataLoader, TensorDict
class SimpleDataLoader(DataLoader):
    """
    A very simple `DataLoader` that is mostly used for testing.
    """

    def __init__(
        self,
        instances: List[Instance],
        batch_size: int,
        *,
        shuffle: bool = False,
        batches_per_epoch: Optional[int] = None,
        vocab: Optional[Vocabulary] = None,
    ) -> None:
        self.instances = instances
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.batches_per_epoch = batches_per_epoch
        self.vocab = vocab
        self.cuda_device: Optional[torch.device] = None
        self._batch_generator: Optional[Iterator[TensorDict]] = None
        self.collate_fn = DefaultDataCollator()

    def __len__(self) -> int:
        if self.batches_per_epoch is not None:
            return self.batches_per_epoch
        return math.ceil(len(self.instances) / self.batch_size)

    def __iter__(self) -> Iterator[TensorDict]:
        if self.batches_per_epoch is None:
            yield from self._iter_batches()
        else:
            if self._batch_generator is None:
                self._batch_generator = self._iter_batches()
            for i in range(self.batches_per_epoch):
                try:
                    yield next(self._batch_generator)
                except StopIteration:  # data_generator is exhausted
                    self._batch_generator = self._iter_batches()  # so refresh it
                    yield next(self._batch_generator)

    def _iter_batches(self) -> Iterator[TensorDict]:
        if self.shuffle:
            random.shuffle(self.instances)
        for batch in lazy_groups_of(self.iter_instances(), self.batch_size):
            tensor_dict = self.collate_fn(batch)
            if self.cuda_device is not None:
                tensor_dict = nn_util.move_to_device(tensor_dict, self.cuda_device)
            yield tensor_dict

    def iter_instances(self) -> Iterator[Instance]:
        for instance in self.instances:
            if self.vocab is not None:
                instance.index_fields(self.vocab)
            yield instance

    def index_with(self, vocab: Vocabulary) -> None:
        self.vocab = vocab
        for instance in self.instances:
            instance.index_fields(self.vocab)

    def set_target_device(self, device: torch.device) -> None:
        self.cuda_device = device

    @classmethod
    def from_dataset_reader(
        cls,
        reader: DatasetReader,
        data_path: str,
        batch_size: int,
        shuffle: bool = False,
        batches_per_epoch: Optional[int] = None,
        quiet: bool = False,
    ) -> "SimpleDataLoader":
        instance_iter = reader.read(data_path)
        if not quiet:
            instance_iter = Tqdm.tqdm(instance_iter, desc="loading instances")
        instances = list(instance_iter)
        return cls(instances, batch_size, shuffle=shuffle, batches_per_epoch=batches_per_epoch)
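For orientation, a minimal usage sketch of the loader above. `my_reader` and the data path are placeholders for any DatasetReader and input file, the SimpleDataLoader module path is assumed from the import style used here, and `Vocabulary.from_instances` is assumed to keep its AllenNLP signature:

import torch

from combo.data import Vocabulary
from combo.data.dataset_loaders.simple_data_loader import SimpleDataLoader  # assumed module path

# `my_reader` stands in for any DatasetReader yielding Instances; the path is a placeholder.
loader = SimpleDataLoader.from_dataset_reader(my_reader, "train.conllu", batch_size=32, shuffle=True)
loader.index_with(Vocabulary.from_instances(loader.iter_instances()))  # assumed AllenNLP-style constructor
loader.set_target_device(torch.device("cpu"))

for batch in loader:  # each batch is a TensorDict of padded tensors
    print(batch)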
......@@ -4,3 +4,4 @@ from .pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
from .spacy_tokenizer import SpacyTokenizer
from .sentence_splitter import SentenceSplitter, SpacySentenceSplitter
from .whitespace_tokenizer import WhitespaceTokenizer
from .lambo_tokenizer import LamboTokenizer
from typing import List, Dict, Any
from combo.data.tokenizers.token import Token
from combo.data.tokenizers.tokenizer import Tokenizer
from lambo.segmenter.lambo import Lambo
class LamboTokenizer(Tokenizer):
    def __init__(
        self,
        language: str = "English"
    ):
        self._language = language
        self.__tokenizer = Lambo.get(language)

    def tokenize(self, text: str) -> List[Token]:
        """
        Simple tokenization: return a flat list of tokens, ignoring sentence splits.

        :param text: raw text to tokenize
        :return: a list of Tokens (multiword tokens carry their subwords)
        """
        document = self.__tokenizer.segment(text)
        tokens = []
        for turn in document.turns:
            for sentence in turn.sentences:
                for token in sentence.tokens:
                    tokens.append(Token(token.text, subwords=token.subwords))
        return tokens

    def segment(self, text: str) -> List[List[str]]:
        """
        Full segmentation: split the text into sentences, each a list of token texts.

        :param text: raw text to segment
        :return: a list of sentences, each a list of token strings
        """
        document = self.__tokenizer.segment(text)
        sentences = []
        for turn in document.turns:
            for sentence in turn.sentences:
                sentences.append([t.text for t in sentence.tokens])
        return sentences

    def _to_params(self) -> Dict[str, Any]:
        return {
            "type": "lambo",
            "language": self._language
        }
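A brief usage sketch of the new tokenizer. It assumes `Lambo.get` loads (and may first download) the pretrained LAMBO segmentation model for the requested language; the expected outputs mirror the LamboTokenizerTest cases at the end of this commit:

from combo.data import LamboTokenizer

tokenizer = LamboTokenizer("English")

tokens = tokenizer.tokenize("I don't like apples.")
print([t.text for t in tokens])       # ['I', "don't", 'like', 'apples', '.']
print([t.subwords for t in tokens])   # [[], ['do', "n't"], [], [], []]

# segment() keeps sentence boundaries instead of flattening them.
sentences = tokenizer.segment("Hello cats. I love you")
print(sentences)                      # expected: [['Hello', 'cats', '.'], ['I', 'love', 'you']]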
......@@ -28,6 +28,7 @@ class Token:
"deprel",
"deps",
"misc",
"subwords",
"semrel",
"embeddings",
"text_id"
......@@ -44,6 +45,7 @@ class Token:
deprel: Optional[str] # dep_ ?
deps: Optional[str]
misc: Optional[str]
subwords: Optional[List[str]]
semrel: Optional[str]
embeddings: Dict[str, List[float]]
text_id: Optional[int]
......@@ -60,6 +62,7 @@ class Token:
deprel: str = None,
deps: str = None,
misc: str = None,
subwords: List[str] = None,
semrel: str = None,
embeddings: Dict[str, List[float]] = None,
text_id: int = None) -> None:
......@@ -76,6 +79,7 @@ class Token:
self.deprel = deprel
self.deps = deps
self.misc = misc
self.subwords = subwords if subwords else []
self.semrel = semrel
if embeddings is None:
......@@ -105,6 +109,7 @@ class Token:
f"(deprel: {self.deprel}) "
f"(deps: {self.deps}) "
f"(misc: {self.misc}) "
f"(subwords: {','.join(self.subwords)})"
f"(semrel: {self.semrel}) "
f"(embeddings: {self.embeddings}) "
f"(text_id: {self.text_id})"
......@@ -9,6 +9,7 @@ importlib-resources~=5.12.0
overrides~=7.3.1
torch~=2.0.0
torchtext~=0.15.1
lambo~=2.0.0
numpy~=1.24.1
pytorch-lightning~=2.0.01
requests~=2.28.2
import unittest
from combo.data import ConllDatasetReader
from torch.utils.data import DataLoader
class ConllDatasetReaderTest(unittest.TestCase):
......@@ -9,6 +10,11 @@ class ConllDatasetReaderTest(unittest.TestCase):
tokens = [token for token in reader('conll_test_file.txt')]
self.assertEqual(len(tokens), 6)
def test_read_all_tokens_data_loader(self):
reader = ConllDatasetReader(coding_scheme='IOB2')
loader = DataLoader(reader('conll_test_file.txt'), batch_size=16)
print(next(iter(loader)))
def test_tokenize_correct_tokens(self):
reader = ConllDatasetReader(coding_scheme='IOB2')
token = next(iter(reader('conll_test_file.txt')))
import unittest

from combo.data import LamboTokenizer


class LamboTokenizerTest(unittest.TestCase):

    def setUp(self) -> None:
        self.lambo_tokenizer = LamboTokenizer()

    def test_tokenize_sentence(self):
        tokens = self.lambo_tokenizer.tokenize('Hello cats. I love you')
        self.assertListEqual([t.text for t in tokens],
                             ['Hello', 'cats', '.', 'I', 'love', 'you'])

    def test_tokenize_sentence_with_multiword(self):
        tokens = self.lambo_tokenizer.tokenize('I don\'t like apples.')
        self.assertListEqual([t.text for t in tokens],
                             ['I', 'don\'t', 'like', 'apples', '.'])
        self.assertListEqual([t.subwords for t in tokens],
                             [[], ['do', 'n\'t'], [], [], []])