From 3329cb6dddfd26aab4681a65e159641cbc3d4795 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com>
Date: Wed, 11 Jan 2023 23:44:11 +0100
Subject: [PATCH] Add new experiment processing

---
 new_experiment/pipeline/import_datasets.py  | 2 +-
 new_experiment/utils/hf_dataset_importer.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/new_experiment/pipeline/import_datasets.py b/new_experiment/pipeline/import_datasets.py
index 1633088..d1497ce 100644
--- a/new_experiment/pipeline/import_datasets.py
+++ b/new_experiment/pipeline/import_datasets.py
@@ -16,7 +16,7 @@ def import_fleurs_dataset(dataset_lang: str, experiment_dataset_name: str):
             get_experiment_repository(experiment_dataset_name),
             get_minio_audio_record_repository(),
             experiment_dataset_name,
-        ), load_dataset('google/fleurs', dataset_lang))
+        ), load_dataset('google/fleurs', dataset_lang)['test'])
 
 
 def import_minds14_dataset(dataset_lang: str, experiment_dataset_name: str):
diff --git a/new_experiment/utils/hf_dataset_importer.py b/new_experiment/utils/hf_dataset_importer.py
index 94db715..c47df17 100644
--- a/new_experiment/utils/hf_dataset_importer.py
+++ b/new_experiment/utils/hf_dataset_importer.py
@@ -1,3 +1,4 @@
+import datetime
 from abc import ABC, abstractmethod
 from hashlib import sha1
 from pathlib import Path
@@ -41,8 +42,11 @@ class HfDatasetImporter(ABC):
         pass
 
     def process_dataset(self, dataset: Dataset):
+        counter = 1
         for it in dataset:
+            print(datetime.datetime.now().isoformat(), f'process_dataset item {counter} {it}')
             self.process_record(it)
+            counter += 1
 
     def process_record(self, record: Dict[str, Any]):
         record_id = self.get_record_id(record)
-- 
GitLab