def process_dataset(self, dataset: Dataset):
    """Process every record of *dataset* in order, logging progress to stdout.

    For each item, a line containing the current local timestamp (ISO 8601),
    a 1-based position counter, and the item itself is printed, and then the
    item is handed to ``self.process_record``.

    Args:
        dataset: Iterable of records; each yielded item is passed unchanged
            to ``process_record``.
    """
    # enumerate(start=1) replaces the manually initialised and incremented
    # counter while producing the same 1-based numbering.
    for counter, it in enumerate(dataset, start=1):
        # NOTE(review): the whole record is interpolated into the message;
        # presumably records can be large (e.g. raw audio) — confirm that
        # printing them in full is intended.
        print(datetime.datetime.now().isoformat(), f'process_dataset item {counter} {it}')
        self.process_record(it)