Make TokenCountBatchSampler.get_num_batches count the batches produced by
get_batch_indices.

The removed body visited instances in ascending order of the token count of
their "sentence" field and closed a batch whenever the running word count
exceeded self._word_batch_size. A trailing batch that never crossed that
threshold was not appended to the list, so it was not included in the
returned count. The new body counts the items yielded by
self.get_batch_indices(instances) instead of repeating the loop.

NOTE(review): get_batch_indices is defined outside this hunk; confirm that
it emits the final partial batch the way callers of get_num_batches expect.

diff --git a/combo/data/samplers/samplers.py b/combo/data/samplers/samplers.py
index dcb83ee4725fe177e21b39e8a803783086fce5a5..a1754985f115d95995f48a2f99d5a4485d5faa40 100644
--- a/combo/data/samplers/samplers.py
+++ b/combo/data/samplers/samplers.py
@@ -35,17 +35,4 @@ class TokenCountBatchSampler(allen_data.BatchSampler):
         return batches
 
     def get_num_batches(self, instances: Sequence[data.Instance]) -> int:
-        dataset = list(instances)
-        batches = []
-        batch = []
-        words_count = 0
-        lengths = [len(instance.fields["sentence"].tokens) for instance in dataset]
-        argsorted_lengths = np.argsort(lengths)
-        for idx in argsorted_lengths:
-            words_count += lengths[idx]
-            batch.append(idx)
-            if words_count > self._word_batch_size:
-                batches.append(batch)
-                words_count = 0
-                batch = []
-        return len(batches)
+        return sum(1 for _ in self.get_batch_indices(instances))