From 584babb2a0ee71fea69894eb4e68fdbc8685376a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com>
Date: Sun, 15 Jan 2023 19:15:39 +0100
Subject: [PATCH] download_dataset command

---
 new_datasets/add_asr_to_queue.py             | 10 ++++------
 new_datasets/import_datasets/upload_audio.py | 12 ++++++------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/new_datasets/add_asr_to_queue.py b/new_datasets/add_asr_to_queue.py
index ab67863..8b3da42 100644
--- a/new_datasets/add_asr_to_queue.py
+++ b/new_datasets/add_asr_to_queue.py
@@ -8,7 +8,9 @@ from pika.adapters.blocking_connection import BlockingChannel
 
 
 def get_all_datasets() -> List[str]:
-    return [it.object_name[:-1] for it in get_minio_client().list_objects('dataset-audio', '')]
+    # return [it.object_name[:-1] for it in get_minio_client().list_objects('dataset-audio', '')]
+    language_codes = ['nl', 'fr', 'de', 'it', 'pl', 'es', 'en']
+    return [f'{it}_common_voice' for it in language_codes]
 
 
 def get_dataset_items_id(dataset_name: str) -> List[str]:
@@ -32,11 +34,7 @@ def add_to_queue(dataset: str, asr_name: str, item_id: str, channel: BlockingCha
 
 
 def add_whisper(channel: BlockingChannel):
-    whisper_asr_variant_list = [
-        # 'tiny',
-        # 'base',
-        'small', 'medium', 'large-v2'
-    ]
+    whisper_asr_variant_list = ['tiny', 'base', 'small', 'medium', 'large-v2']
     for whisper_variant in whisper_asr_variant_list:
         asr_name = f'whisper_{whisper_variant}'
         for dataset in get_all_datasets():
diff --git a/new_datasets/import_datasets/upload_audio.py b/new_datasets/import_datasets/upload_audio.py
index 0a2e08e..83fad98 100644
--- a/new_datasets/import_datasets/upload_audio.py
+++ b/new_datasets/import_datasets/upload_audio.py
@@ -86,13 +86,13 @@ def upload_minds():
 
 
 def upload_common_voice():
-    upload_single_commonvoice('nl', 'nl_common_voice')
-    upload_single_commonvoice('fr', 'fr_common_voice')
-    upload_single_commonvoice('de', 'de_common_voice')
-    upload_single_commonvoice('it', 'it_common_voice')
+    # upload_single_commonvoice('nl', 'nl_common_voice')
+    # upload_single_commonvoice('fr', 'fr_common_voice')
+    # upload_single_commonvoice('de', 'de_common_voice')
+    # upload_single_commonvoice('it', 'it_common_voice')
     upload_single_commonvoice('pl', 'pl_common_voice')
-    upload_single_commonvoice('es', 'es_common_voice')
-    upload_single_commonvoice('en', 'en_common_voice')
+    # upload_single_commonvoice('es', 'es_common_voice')
+    # upload_single_commonvoice('en', 'en_common_voice')
 
 
 if __name__ == '__main__':
-- 
GitLab