"""Load one split of a Hugging Face dataset, print its first rows and count all rows.

Usage:
    python new_datasets/iterate_hf_dataset.py --dataset_path PATH --split_name SPLIT
        [--dataset_name NAME] [--cache_dir DIR] [--items N]
"""
import argparse
from itertools import islice
from typing import Iterable, List, Optional


def preview_and_count(dataset: Iterable, items: int) -> int:
    """Print the first ``items`` examples of ``dataset`` and count all of them.

    Args:
        dataset: A re-iterable collection of examples (it is traversed twice:
            once for the preview and once for the count).
        items: Number of leading examples to print; must be non-negative.

    Returns:
        The total number of examples seen while iterating ``dataset``.

    Raises:
        ValueError: If ``items`` is negative (raised by ``itertools.islice``).
    """
    # islice yields whole examples one by one.  Slicing a Hugging Face Dataset
    # (``dataset[:items]``) returns a dict of columns instead, so iterating the
    # slice would only print the column names.
    for example in islice(dataset, items):
        print(example)

    # Walk the whole dataset instead of calling len() so that every example is
    # actually read, exactly as the original counting loop did.
    total = sum(1 for _ in dataset)
    print(f'all_items {total}')
    return total


def iterate_dataset(
    dataset_path: str,
    dataset_name: Optional[str],
    cache_dir: Optional[str],
    split_name: str,
    items: int,
) -> None:
    """Load a dataset, print the first ``items`` rows of a split and its size.

    Args:
        dataset_path: Path or hub identifier passed to ``datasets.load_dataset``.
        dataset_name: Optional configuration name; ``None`` or ``""`` means
            "no configuration".
        cache_dir: Directory used by ``datasets`` for caching, or ``None`` to
            let the library choose.
        split_name: Key of the split to read from the loaded dataset dict.
        items: Number of leading examples to print.
    """
    # Imported here rather than at module level so that argument parsing and
    # ``--help`` work without importing the third-party library first.
    import datasets

    dataset_dict = datasets.load_dataset(
        dataset_path,
        # ``or None`` covers both a missing (None) and an empty ("") name.
        dataset_name or None,
        # Must be a keyword: the third positional parameter is ``data_dir``.
        cache_dir=cache_dir,
    )
    preview_and_count(dataset_dict[split_name], items)


def non_negative_int(value: str) -> int:
    """Parse ``value`` as an int >= 0 for argparse, rejecting anything else."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"expected an integer, got {value!r}") from None
    if number < 0:
        raise argparse.ArgumentTypeError(f"expected a non-negative integer, got {number}")
    return number


def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--dataset_path", required=True)
    parser.add_argument("--dataset_name", default=None)
    parser.add_argument("--cache_dir", default=None)
    parser.add_argument("--split_name", required=True)
    parser.add_argument("--items", type=non_negative_int, default=5)
    return parser


def main(argv: Optional[List[str]] = None) -> None:
    """Parse ``argv`` (defaults to ``sys.argv[1:]``) and run the iteration."""
    args = build_arg_parser().parse_args(argv)
    iterate_dataset(
        args.dataset_path,
        args.dataset_name,
        args.cache_dir,
        args.split_name,
        args.items,
    )


if __name__ == '__main__':
    main()