Skip to content
Snippets Groups Projects
Commit f9aed5c1 authored by Marcin Wątroba
Browse files

download_dataset command

parent 89b3540c
No related branches found
No related tags found
No related merge requests found
from pathlib import Path
from typing import Dict, Any, List
from minio import Minio
from nltk import RegexpTokenizer
from new_datasets.import_datasets.upload_audio import process_numpy_array_to_md5_hash
from new_experiment.utils.hf_dataset_importer import HfDatasetImporter
from new_experiment.utils.minio_audio_record_repository import MinioAudioRecordRepository
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
class CommonVoiceDatasetImporter(HfDatasetImporter):
    """Importer that adapts Common Voice records to the ``HfDatasetImporter`` hooks.

    Every hook receives a single dataset record (a ``dict``) and reads the
    ``'sentence'``, ``'path'`` and ``'audio'`` -> ``'array'`` entries from it.
    """

    # The pattern never changes, so one tokenizer is shared by all instances
    # instead of being rebuilt for every record.
    _WORD_TOKENIZER = RegexpTokenizer(r'\w+')

    def __init__(self, experiment_repository: ExperimentRepository,
                 minio_audio_record_repository: MinioAudioRecordRepository, dataset_name: str):
        """Forward the repositories and the dataset name to the base importer."""
        super().__init__(experiment_repository, minio_audio_record_repository, dataset_name)

    def get_words(self, record: Dict[str, Any]) -> List[str]:
        """Return the ``\\w+`` tokens of the record's ``'sentence'`` text.

        Punctuation and whitespace are dropped because only runs of word
        characters match the pattern.
        """
        return self._WORD_TOKENIZER.tokenize(record['sentence'])

    def get_raw_transcription(self, record: Dict[str, Any]) -> str:
        """Return the record's ``'sentence'`` value unchanged."""
        return record['sentence']

    def get_audio_file(self, record: Dict[str, Any]) -> Path:
        """Return the record's ``'path'`` value as a ``Path``.

        The value is wrapped in ``Path`` so the declared return type holds
        even when the record stores the location as a plain string.
        """
        return Path(record['path'])

    def get_record_id(self, record: Dict[str, Any]) -> str:
        """Return the record id computed from the audio sample array.

        The id is whatever ``process_numpy_array_to_md5_hash`` yields for
        ``record['audio']['array']``.
        """
        return process_numpy_array_to_md5_hash(record['audio']['array'])
import json
from pathlib import Path
from typing import Any, List
from nltk import RegexpTokenizer
from new_experiment.new_dependency_provider import get_experiment_repository
from new_experiment.pipeline.dataset_importer.import_datasets import import_common_voice_dataset
from new_experiment.utils.property_helper import PropertyHelper
from sziszapangma.model.model_creators import create_new_word
if __name__ == '__main__':
    # (language code, experiment dataset name) pairs, imported in this order.
    common_voice_imports = (
        ('nl', 'nl_common_voice'),
        ('fr', 'fr_common_voice'),
        ('de', 'de_common_voice'),
        ('it', 'it_common_voice'),
        ('pl', 'pl_common_voice'),
        ('es', 'es_common_voice'),
        ('en', 'en_common_voice'),
    )
    for language_code, experiment_name in common_voice_imports:
        import_common_voice_dataset(language_code, experiment_name)
......@@ -2,6 +2,7 @@ from datasets import Dataset, load_dataset
from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_client, \
get_minio_audio_record_repository
from new_experiment.pipeline.dataset_importer.common_voice_dataset_importer import CommonVoiceDatasetImporter
from new_experiment.pipeline.dataset_importer.fleurs_dataset_importer import FleursDatasetImporter
from new_experiment.pipeline.dataset_importer.minds14_dataset_importer import Minds14DatasetImporter
from new_experiment.utils.hf_dataset_importer import HfDatasetImporter
......@@ -33,3 +34,11 @@ def import_voxpopuli_dataset(dataset_lang: str, experiment_dataset_name: str):
get_minio_audio_record_repository(),
experiment_dataset_name,
), load_dataset('facebook/voxpopuli', dataset_lang)['test'])
def import_common_voice_dataset(dataset_lang: str, experiment_dataset_name: str,
                                cache_dir: str = '/mnt/disk2/huggingface'):
    """Import the ``'test'`` split of Common Voice 11.0 for one language.

    Args:
        dataset_lang: Configuration name passed to ``load_dataset`` (the
            callers in this project pass language codes such as ``'pl'``).
        experiment_dataset_name: Name used both to obtain the experiment
            repository and as the importer's dataset name.
        cache_dir: Directory handed to ``load_dataset`` as its cache
            location. Defaults to the previously hard-coded path so existing
            callers behave exactly as before; override it on machines where
            that path does not exist.
    """
    # Build the importer before loading the dataset, matching the original
    # evaluation order of the nested call.
    importer = CommonVoiceDatasetImporter(
        get_experiment_repository(experiment_dataset_name),
        get_minio_audio_record_repository(),
        experiment_dataset_name,
    )
    dataset = load_dataset("mozilla-foundation/common_voice_11_0", dataset_lang, cache_dir=cache_dir)
    import_single_dataset(importer, dataset['test'])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment