Skip to content
Snippets Groups Projects
Commit f9aed5c1 authored by Marcin Wątroba's avatar Marcin Wątroba
Browse files

download_dataset command

parent 89b3540c
No related merge requests found
from pathlib import Path
from typing import Dict, Any, List
from minio import Minio
from nltk import RegexpTokenizer
from new_datasets.import_datasets.upload_audio import process_numpy_array_to_md5_hash
from new_experiment.utils.hf_dataset_importer import HfDatasetImporter
from new_experiment.utils.minio_audio_record_repository import MinioAudioRecordRepository
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
class CommonVoiceDatasetImporter(HfDatasetImporter):
    """Adapts Mozilla Common Voice records to the HfDatasetImporter interface.

    Each accessor receives one Hugging Face dataset record (a dict) and
    extracts the field the import pipeline needs.
    """

    # Built once at class-definition time; the original constructed a fresh
    # RegexpTokenizer on every get_words() call.
    _WORD_TOKENIZER = RegexpTokenizer(r'\w+')

    def __init__(self, experiment_repository: ExperimentRepository,
                 minio_audio_record_repository: MinioAudioRecordRepository, dataset_name: str):
        """Forward all dependencies unchanged to the base importer."""
        super().__init__(experiment_repository, minio_audio_record_repository, dataset_name)

    def get_words(self, record: Dict[str, Any]) -> List[str]:
        """Tokenize the record's sentence into word-like chunks (runs of \\w+)."""
        return self._WORD_TOKENIZER.tokenize(record['sentence'])

    def get_raw_transcription(self, record: Dict[str, Any]) -> str:
        """Return the record's raw, untokenized sentence text."""
        return record['sentence']

    def get_audio_file(self, record: Dict[str, Any]) -> Path:
        """Return the record's audio file path field."""
        return record['path']

    def get_record_id(self, record: Dict[str, Any]) -> str:
        """Derive a stable record id from the MD5 hash of the decoded audio samples."""
        return process_numpy_array_to_md5_hash(record['audio']['array'])
import json
from pathlib import Path
from typing import Any, List
from nltk import RegexpTokenizer
from new_experiment.new_dependency_provider import get_experiment_repository
from new_experiment.pipeline.dataset_importer.import_datasets import import_common_voice_dataset
from new_experiment.utils.property_helper import PropertyHelper
from sziszapangma.model.model_creators import create_new_word
if __name__ == '__main__':
    # Import the Common Voice test split for every supported language;
    # the experiment dataset name follows the '<lang>_common_voice' pattern.
    for lang in ('nl', 'fr', 'de', 'it', 'pl', 'es', 'en'):
        import_common_voice_dataset(lang, f'{lang}_common_voice')
@@ -2,6 +2,7 @@ from datasets import Dataset, load_dataset
from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_client, \
    get_minio_audio_record_repository
from new_experiment.pipeline.dataset_importer.common_voice_dataset_importer import CommonVoiceDatasetImporter
from new_experiment.pipeline.dataset_importer.fleurs_dataset_importer import FleursDatasetImporter
from new_experiment.pipeline.dataset_importer.minds14_dataset_importer import Minds14DatasetImporter
from new_experiment.utils.hf_dataset_importer import HfDatasetImporter
@@ -33,3 +34,11 @@ def import_voxpopuli_dataset(dataset_lang: str, experiment_dataset_name: str):
        get_minio_audio_record_repository(),
        experiment_dataset_name,
    ), load_dataset('facebook/voxpopuli', dataset_lang)['test'])
def import_common_voice_dataset(dataset_lang: str, experiment_dataset_name: str):
    """Import the Common Voice 11.0 test split for *dataset_lang* under *experiment_dataset_name*."""
    importer = CommonVoiceDatasetImporter(
        get_experiment_repository(experiment_dataset_name),
        get_minio_audio_record_repository(),
        experiment_dataset_name,
    )
    dataset = load_dataset("mozilla-foundation/common_voice_11_0", dataset_lang, cache_dir='/mnt/disk2/huggingface')
    import_single_dataset(importer, dataset['test'])
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment