diff --git a/new_experiment/pipeline/dataset_importer/common_voice_dataset_importer.py b/new_experiment/pipeline/dataset_importer/common_voice_dataset_importer.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ded78f35496b367f6228549605862f5d0f79ff
--- /dev/null
+++ b/new_experiment/pipeline/dataset_importer/common_voice_dataset_importer.py
@@ -0,0 +1,30 @@
+from pathlib import Path
+from typing import Dict, Any, List
+
+from minio import Minio
+from nltk import RegexpTokenizer
+
+from new_datasets.import_datasets.upload_audio import process_numpy_array_to_md5_hash
+from new_experiment.utils.hf_dataset_importer import HfDatasetImporter
+from new_experiment.utils.minio_audio_record_repository import MinioAudioRecordRepository
+from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
+
+
+class CommonVoiceDatasetImporter(HfDatasetImporter):
+
+    def __init__(self, experiment_repository: ExperimentRepository,
+                 minio_audio_record_repository: MinioAudioRecordRepository, dataset_name: str):
+        super().__init__(experiment_repository, minio_audio_record_repository, dataset_name)
+
+    def get_words(self, record: Dict[str, Any]) -> List[str]:
+        tokenizer = RegexpTokenizer(r'\w+')
+        return tokenizer.tokenize(record['sentence'])
+
+    def get_raw_transcription(self, record: Dict[str, Any]) -> str:
+        return record['sentence']
+
+    def get_audio_file(self, record: Dict[str, Any]) -> Path:
+        return record['path']
+
+    def get_record_id(self, record: Dict[str, Any]) -> str:
+        return process_numpy_array_to_md5_hash(record['audio']['array'])
diff --git a/new_experiment/pipeline/dataset_importer/import_common_voice.py b/new_experiment/pipeline/dataset_importer/import_common_voice.py
new file mode 100644
index 0000000000000000000000000000000000000000..611529e6fdb8cb3383dc995ced7050dc5a36eda8
--- /dev/null
+++ b/new_experiment/pipeline/dataset_importer/import_common_voice.py
@@ -0,0 +1,19 @@
+import json
+from pathlib import Path
+from typing import Any, List
+
+from nltk import RegexpTokenizer
+
+from new_experiment.new_dependency_provider import get_experiment_repository
+from new_experiment.pipeline.dataset_importer.import_datasets import import_common_voice_dataset
+from new_experiment.utils.property_helper import PropertyHelper
+from sziszapangma.model.model_creators import create_new_word
+
+if __name__ == '__main__':
+    import_common_voice_dataset('nl', 'nl_common_voice')
+    import_common_voice_dataset('fr', 'fr_common_voice')
+    import_common_voice_dataset('de', 'de_common_voice')
+    import_common_voice_dataset('it', 'it_common_voice')
+    import_common_voice_dataset('pl', 'pl_common_voice')
+    import_common_voice_dataset('es', 'es_common_voice')
+    import_common_voice_dataset('en', 'en_common_voice')
diff --git a/new_experiment/pipeline/dataset_importer/import_datasets.py b/new_experiment/pipeline/dataset_importer/import_datasets.py
index a16bf91022687feafdbd397e6de604203a3730ea..307d73f779041114a7a280b9907e13261764ed7c 100644
--- a/new_experiment/pipeline/dataset_importer/import_datasets.py
+++ b/new_experiment/pipeline/dataset_importer/import_datasets.py
@@ -2,6 +2,7 @@ from datasets import Dataset, load_dataset
 
 from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_client, \
     get_minio_audio_record_repository
+from new_experiment.pipeline.dataset_importer.common_voice_dataset_importer import CommonVoiceDatasetImporter
 from new_experiment.pipeline.dataset_importer.fleurs_dataset_importer import FleursDatasetImporter
 from new_experiment.pipeline.dataset_importer.minds14_dataset_importer import Minds14DatasetImporter
 from new_experiment.utils.hf_dataset_importer import HfDatasetImporter
@@ -33,3 +34,11 @@ def import_voxpopuli_dataset(dataset_lang: str, experiment_dataset_name: str):
         get_minio_audio_record_repository(),
         experiment_dataset_name,
     ), load_dataset('facebook/voxpopuli', dataset_lang)['test'])
+
+
+def import_common_voice_dataset(dataset_lang: str, experiment_dataset_name: str):
+    import_single_dataset(CommonVoiceDatasetImporter(
+        get_experiment_repository(experiment_dataset_name),
+        get_minio_audio_record_repository(),
+        experiment_dataset_name,
+    ), load_dataset("mozilla-foundation/common_voice_11_0", dataset_lang, cache_dir='/mnt/disk2/huggingface')['test'])
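For context, CommonVoiceDatasetImporter only overrides the per-record accessor hooks of HfDatasetImporter; the base class itself is not part of this patch. The sketch below is a minimal, hypothetical reconstruction of that interface, inferred solely from the constructor call and the four methods overridden above: the attribute names, docstrings, and the assumption that the base class is abstract are not taken from the repository.

from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List


class HfDatasetImporter(ABC):
    """Hypothetical sketch of the importer interface subclassed in this patch."""

    def __init__(self, experiment_repository, minio_audio_record_repository, dataset_name: str):
        # Dependencies presumably stored for the import loop driven by import_single_dataset.
        self._experiment_repository = experiment_repository
        self._minio_audio_record_repository = minio_audio_record_repository
        self._dataset_name = dataset_name

    @abstractmethod
    def get_record_id(self, record: Dict[str, Any]) -> str:
        """Stable identifier for one HF record (here: MD5 hash of the audio array)."""

    @abstractmethod
    def get_raw_transcription(self, record: Dict[str, Any]) -> str:
        """Reference transcription exactly as stored in the dataset ('sentence' for Common Voice)."""

    @abstractmethod
    def get_words(self, record: Dict[str, Any]) -> List[str]:
        """Tokenised reference transcription (Common Voice uses an NLTK RegexpTokenizer on \\w+)."""

    @abstractmethod
    def get_audio_file(self, record: Dict[str, Any]) -> Path:
        """Location of the audio file backing the record ('path' for Common Voice)."""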