"""Load a Hugging Face dataset split, print its first rows and count all rows."""
import argparse
import itertools
from typing import Optional

import datasets


def iterate_dataset(dataset_path: str, dataset_name: Optional[str], cache_dir: str, split_name: str, items: int):
    """Print the first ``items`` rows of a dataset split, then count every row.

    Args:
        dataset_path: Path or hub identifier handed to ``datasets.load_dataset``.
        dataset_name: Optional configuration name; ``None`` or an empty string
            means "no configuration".
        cache_dir: Directory ``datasets`` should use as its cache.
        split_name: Key of the split to read from the loaded dataset dict.
        items: Number of leading rows to print (negative values print nothing).

    The total is obtained by iterating the whole split on purpose, so the call
    also exercises reading every row.
    """
    # cache_dir must go by keyword: the third positional parameter of
    # load_dataset is data_dir, not cache_dir.
    dataset_dict = datasets.load_dataset(dataset_path, dataset_name or None, cache_dir=cache_dir)
    dataset = dataset_dict[split_name]

    # Slicing a Dataset (dataset[:n]) yields a dict of columns, so iterating it
    # would print column names; islice walks actual rows instead.
    for row in itertools.islice(dataset, max(items, 0)):
        print(row)

    counter = sum(1 for _ in dataset)
    print(f'all_items {counter}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_path", required=True)
    parser.add_argument("--dataset_name", default=None)
    parser.add_argument("--cache_dir", default=None)
    parser.add_argument("--split_name", required=True)
    parser.add_argument("--items", type=int, required=True)
    args = parser.parse_args()
    iterate_dataset(args.dataset_path, args.dataset_name, args.cache_dir, args.split_name, args.items)