From 3329cb6dddfd26aab4681a65e159641cbc3d4795 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com>
Date: Wed, 11 Jan 2023 23:44:11 +0100
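Subject: [PATCH] Add new experiment processing

Import only the 'test' split of the google/fleurs dataset instead of
passing the whole object returned by load_dataset to the importer, and
print a timestamped progress line (running item counter plus the record)
for every item handled by HfDatasetImporter.process_dataset.
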
---
 new_experiment/pipeline/import_datasets.py  | 2 +-
 new_experiment/utils/hf_dataset_importer.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/new_experiment/pipeline/import_datasets.py b/new_experiment/pipeline/import_datasets.py
index 1633088..d1497ce 100644
--- a/new_experiment/pipeline/import_datasets.py
+++ b/new_experiment/pipeline/import_datasets.py
@@ -16,7 +16,7 @@ def import_fleurs_dataset(dataset_lang: str, experiment_dataset_name: str):
         get_experiment_repository(experiment_dataset_name),
         get_minio_audio_record_repository(),
         experiment_dataset_name,
-    ), load_dataset('google/fleurs', dataset_lang))
+    ), load_dataset('google/fleurs', dataset_lang)['test'])
 
 
 def import_minds14_dataset(dataset_lang: str, experiment_dataset_name: str):
diff --git a/new_experiment/utils/hf_dataset_importer.py b/new_experiment/utils/hf_dataset_importer.py
index 94db715..c47df17 100644
--- a/new_experiment/utils/hf_dataset_importer.py
+++ b/new_experiment/utils/hf_dataset_importer.py
@@ -1,3 +1,4 @@
+import datetime
 from abc import ABC, abstractmethod
 from hashlib import sha1
 from pathlib import Path
@@ -41,8 +42,11 @@ class HfDatasetImporter(ABC):
         pass
 
     def process_dataset(self, dataset: Dataset):
+        counter = 1
         for it in dataset:
+            print(datetime.datetime.now().isoformat(), f'process_dataset item {counter} {it}')
             self.process_record(it)
+            counter += 1
 
     def process_record(self, record: Dict[str, Any]):
         record_id = self.get_record_id(record)
-- 
GitLab