diff --git a/new_datasets/add_asr_to_queue.py b/new_datasets/add_asr_to_queue.py index ab678639fd1d963478fd4338c1e8942c2e23edd5..8b3da423038b7aa6b1ba658944b05875b78eca4d 100644 --- a/new_datasets/add_asr_to_queue.py +++ b/new_datasets/add_asr_to_queue.py @@ -8,7 +8,9 @@ from pika.adapters.blocking_connection import BlockingChannel def get_all_datasets() -> List[str]: - return [it.object_name[:-1] for it in get_minio_client().list_objects('dataset-audio', '')] + # return [it.object_name[:-1] for it in get_minio_client().list_objects('dataset-audio', '')] + language_codes = ['nl', 'fr', 'de', 'it', 'pl', 'es', 'en'] + return [f'{it}_common_voice' for it in language_codes] def get_dataset_items_id(dataset_name: str) -> List[str]: @@ -32,11 +34,7 @@ def add_to_queue(dataset: str, asr_name: str, item_id: str, channel: BlockingCha def add_whisper(channel: BlockingChannel): - whisper_asr_variant_list = [ - # 'tiny', - # 'base', - 'small', 'medium', 'large-v2' - ] + whisper_asr_variant_list = ['tiny', 'base', 'small', 'medium', 'large-v2'] for whisper_variant in whisper_asr_variant_list: asr_name = f'whisper_{whisper_variant}' for dataset in get_all_datasets(): diff --git a/new_datasets/import_datasets/upload_audio.py b/new_datasets/import_datasets/upload_audio.py index 0a2e08ebbdbaf81f6063e7c0ecb89721401110bd..83fad986c9455c1c4141723e395990be27218f98 100644 --- a/new_datasets/import_datasets/upload_audio.py +++ b/new_datasets/import_datasets/upload_audio.py @@ -86,13 +86,13 @@ def upload_minds(): def upload_common_voice(): - upload_single_commonvoice('nl', 'nl_common_voice') - upload_single_commonvoice('fr', 'fr_common_voice') - upload_single_commonvoice('de', 'de_common_voice') - upload_single_commonvoice('it', 'it_common_voice') + # upload_single_commonvoice('nl', 'nl_common_voice') + # upload_single_commonvoice('fr', 'fr_common_voice') + # upload_single_commonvoice('de', 'de_common_voice') + # upload_single_commonvoice('it', 'it_common_voice') upload_single_commonvoice('pl', 'pl_common_voice') - upload_single_commonvoice('es', 'es_common_voice') - upload_single_commonvoice('en', 'en_common_voice') + # upload_single_commonvoice('es', 'es_common_voice') + # upload_single_commonvoice('en', 'en_common_voice') if __name__ == '__main__':