From 01293fe612422cbc142b56ff513825c35f532ba0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com>
Date: Sat, 14 Jan 2023 00:29:09 +0100
Subject: [PATCH] Change facebook wav2vec2 model

---
 new_experiment/add_to_queue_pipeline.py      | 31 ++++++++++---------
 .../pipeline/pipeline_process_asr.py         |  6 ++--
 .../pipeline_process_spacy_dep_tag_wer.py    |  6 +---
 .../pipeline_process_spacy_ner_wer.py        |  6 +---
 .../pipeline_process_spacy_pos_wer.py        |  3 +-
 .../pipeline_process_wikineural_ner_wer.py   |  1 -
 .../pipeline_process_word_classic_wer.py     |  2 +-
 .../pipeline_process_word_embedding_wer.py   |  8 +----
 .../{new_worker.py => worker_asr.py}         |  0
 .../{worker.py => worker_pipeline.py}        |  0
 10 files changed, 24 insertions(+), 39 deletions(-)
 rename new_experiment/{new_worker.py => worker_asr.py} (100%)
 rename new_experiment/{worker.py => worker_pipeline.py} (100%)

diff --git a/new_experiment/add_to_queue_pipeline.py b/new_experiment/add_to_queue_pipeline.py
index b929cf0..9b9728b 100644
--- a/new_experiment/add_to_queue_pipeline.py
+++ b/new_experiment/add_to_queue_pipeline.py
@@ -49,30 +49,31 @@ def add_whisper(channel: BlockingChannel):
         asr_name = f'whisper_{whisper_variant}'
         for dataset_name in get_all_datasets_with_language():
             for command in COMMANDS:
-                add_to_queue(dataset_name, asr_name, command, channel)
+                add_to_queue(dataset_name, asr_name, command, channel, 'asr_benchmark_experiments')
 
 
 def get_hf_facebook_wav2vec2_model_by_language_code(language_code: str) -> str:
     return {
-        'nl': 'facebook_wav2vec2_large_xlsr_53_dutch',
-        'en': 'facebook_wav2vec2_xls_r_300m',
-        'fr': 'facebook_wav2vec2_large_xlsr_53_french',
-        'de': 'facebook_wav2vec2_large_xlsr_53_german',
-        'it': 'facebook_wav2vec2_large_xlsr_53_italian',
-        'pl': 'facebook_wav2vec2_large_xlsr_53_polish',
-        'es': 'facebook_wav2vec2_large_xlsr_53_spanish'
+        # 'nl': 'facebook_wav2vec2_large_xlsr_53_dutch',
+        'en': 'facebook_wav2vec2_large_960h_lv60_self',
+        # 'fr': 'facebook_wav2vec2_large_xlsr_53_french',
+        # 'de': 'facebook_wav2vec2_large_xlsr_53_german',
+        # 'it': 'facebook_wav2vec2_large_xlsr_53_italian',
+        # 'pl': 'facebook_wav2vec2_large_xlsr_53_polish',
+        # 'es': 'facebook_wav2vec2_large_xlsr_53_spanish'
     }[language_code]
 
 
 def add_facebook_hf_wav2vec2_asr(channel: BlockingChannel):
     for dataset_name in get_all_datasets_with_language():
-        add_to_queue(
-            dataset_name,
-            get_hf_facebook_wav2vec2_model_by_language_code(dataset_name[:2]),
-            'hf_facebook_wav2vec2_asr',
-            channel,
-            'hf_facebook_wav2vec2_asr'
-        )
+        if dataset_name.startswith('en'):
+            add_to_queue(
+                dataset_name,
+                get_hf_facebook_wav2vec2_model_by_language_code(dataset_name[:2]),
+                'hf_facebook_wav2vec2_asr',
+                channel,
+                'hf_facebook_wav2vec2_asr'
+            )
 
 
 def main():
diff --git a/new_experiment/pipeline/pipeline_process_asr.py b/new_experiment/pipeline/pipeline_process_asr.py
index c883a3d..1860df6 100644
--- a/new_experiment/pipeline/pipeline_process_asr.py
+++ b/new_experiment/pipeline/pipeline_process_asr.py
@@ -10,8 +10,8 @@ from sziszapangma.integration.task.asr_task import AsrTask
 def get_asr_processor(asr_name: str) -> AsrProcessor:
     if asr_name == 'facebook_wav2vec2_large_xlsr_53_dutch':
         return Wav2Vec2AsrProcessor('facebook/wav2vec2-large-xlsr-53-dutch')
-    if asr_name == 'facebook_wav2vec2_xls_r_300m':
-        return Wav2Vec2AsrProcessor('facebook/wav2vec2-xls-r-300m')
+    if asr_name == 'facebook_wav2vec2_large_960h_lv60_self':
+        return Wav2Vec2AsrProcessor('facebook/wav2vec2-large-960h-lv60-self')
     if asr_name == 'facebook_wav2vec2_large_xlsr_53_french':
         return Wav2Vec2AsrProcessor('facebook/wav2vec2-large-xlsr-53-french')
     if asr_name == 'facebook_wav2vec2_large_xlsr_53_german':
@@ -34,7 +34,7 @@ def run_hf_facebook_wav2vec2_asr_task(dataset_name: str, asr_name: str):
         AsrTask(
             asr_property_name=PropertyHelper.asr_result(asr_name),
             task_name=f'AsrTask___{dataset_name}___{asr_name}',
-            require_update=True,
+            require_update=False,
             asr_processor=get_asr_processor(asr_name),
             record_path_provider=record_provider
         )
diff --git a/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py b/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py
index 8e2d568..c66a8f3 100644
--- a/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py
+++ b/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py
@@ -1,7 +1,3 @@
-import argparse
-
-from experiment.const_pipeline_names import GOLD_TRANSCRIPT
-from experiment.experiment_dependency_provider import get_record_provider, get_repository
 from experiment.sentence_wer_processor.spacy_pos_sentence_dep_tag_processor import SpacyDepTagSentenceWerProcessor
 from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository
 from new_experiment.utils.get_spacy_model_name import get_spacy_model_name
@@ -25,7 +21,7 @@ def run_spacy_dep_tag_wer_pipeline(dataset_name: str, asr_name: str):
                 alignment_property_name=PropertyHelper.dep_tag_alignment(asr_name, model_name),
                 wer_property_name=PropertyHelper.dep_tag_metrics(asr_name, model_name),
                 task_name=f'SpacyDepTagSentenceWerProcessor___{dataset_name}___{asr_name}',
-                require_update=True
+                require_update=False
             )
         ],
         experiment_repository=repository,
diff --git a/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py b/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py
index c74e419..9cdf4b0 100644
--- a/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py
+++ b/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py
@@ -1,7 +1,3 @@
-import argparse
-
-from experiment.const_pipeline_names import GOLD_TRANSCRIPT
-from experiment.experiment_dependency_provider import get_record_provider, get_repository
 from experiment.sentence_wer_processor.spacy_ner_sentence_wer_processor import SpacyNerSentenceWerProcessor
 from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository
 from new_experiment.utils.get_spacy_model_name import get_spacy_model_name
@@ -25,7 +21,7 @@ def run_spacy_ner_wer_pipeline(dataset_name: str, asr_name: str):
                 alignment_property_name=PropertyHelper.ner_alignment(asr_name, model_name),
                 wer_property_name=PropertyHelper.ner_metrics(asr_name, model_name),
                 task_name=f'SpacyNerSentenceWerProcessor___{dataset_name}___{asr_name}',
-                require_update=True
+                require_update=False
             )
         ],
         experiment_repository=repository,
diff --git a/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py b/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py
index 5959dc9..1af0b65 100644
--- a/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py
+++ b/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py
@@ -1,4 +1,3 @@
-from experiment.experiment_dependency_provider import get_record_provider, get_repository
 from experiment.sentence_wer_processor.spacy_pos_sentence_wer_processor import SpacyPosSentenceWerProcessor
 from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository
 from new_experiment.utils.get_spacy_model_name import get_spacy_model_name
@@ -22,7 +21,7 @@ def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str):
                 alignment_property_name=PropertyHelper.pos_alignment(asr_name, model_name),
                 wer_property_name=PropertyHelper.pos_metrics(asr_name, model_name),
                 task_name=f'SpacyPosSentenceWerProcessor___{dataset_name}___{asr_name}',
-                require_update=True
+                require_update=False
             )
         ],
         experiment_repository=repository,
diff --git a/new_experiment/pipeline/pipeline_process_wikineural_ner_wer.py b/new_experiment/pipeline/pipeline_process_wikineural_ner_wer.py
index 761abb4..3675fc9 100644
--- a/new_experiment/pipeline/pipeline_process_wikineural_ner_wer.py
+++ b/new_experiment/pipeline/pipeline_process_wikineural_ner_wer.py
@@ -1,4 +1,3 @@
-from experiment.experiment_dependency_provider import get_record_provider, get_repository
 from experiment.sentence_wer_processor.wikineural_multilingual_ner_transformers_wer_processor_base import \
     WikineuralMultilingualNerTransformersWerProcessorBase
 from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository
diff --git a/new_experiment/pipeline/pipeline_process_word_classic_wer.py b/new_experiment/pipeline/pipeline_process_word_classic_wer.py
index 231a696..212e3db 100644
--- a/new_experiment/pipeline/pipeline_process_word_classic_wer.py
+++ b/new_experiment/pipeline/pipeline_process_word_classic_wer.py
@@ -15,7 +15,7 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str):
                 asr_property_name=PropertyHelper.asr_result(asr_name),
                 gold_transcript_property_name=PropertyHelper.get_gold_transcript_words(),
                 metrics_property_name=PropertyHelper.word_wer_classic_metrics(asr_name),
-                require_update=True,
+                require_update=False,
                 alignment_property_name=PropertyHelper.word_wer_classic_alignment(asr_name)
             ),
         ],
diff --git a/new_experiment/pipeline/pipeline_process_word_embedding_wer.py b/new_experiment/pipeline/pipeline_process_word_embedding_wer.py
index 8f942f0..9026787 100644
--- a/new_experiment/pipeline/pipeline_process_word_embedding_wer.py
+++ b/new_experiment/pipeline/pipeline_process_word_embedding_wer.py
@@ -1,14 +1,8 @@
-import argparse
-
-from experiment.const_pipeline_names import GOLD_TRANSCRIPT
-from experiment.experiment_dependency_provider import get_record_provider, get_repository
 from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository
 from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper
 from new_experiment.utils.property_helper import PropertyHelper
 from sziszapangma.core.transformer.fasttext_embedding_transformer import FasttextEmbeddingTransformer
-from sziszapangma.core.transformer.web_embedding_transformer import WebEmbeddingTransformer
 from sziszapangma.integration.experiment_manager import ExperimentManager
-from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
 from sziszapangma.integration.task.embedding_wer_metrics_task import EmbeddingWerMetricsTask
 
 
@@ -22,7 +16,7 @@ def run_word_wer_embedding_pipeline(dataset_name: str, asr_name: str):
                 asr_property_name=PropertyHelper.asr_result(asr_name),
                 gold_transcript_property_name=PropertyHelper.get_gold_transcript_words(),
                 metrics_property_name=PropertyHelper.word_wer_embeddings_metrics(asr_name),
-                require_update=True,
+                require_update=False,
                 embedding_transformer=FasttextEmbeddingTransformer(dataset_name[:2]),
                 alignment_property_name=PropertyHelper.word_wer_embeddings_alignment(asr_name)
             )
diff --git a/new_experiment/new_worker.py b/new_experiment/worker_asr.py
similarity index 100%
rename from new_experiment/new_worker.py
rename to new_experiment/worker_asr.py
diff --git a/new_experiment/worker.py b/new_experiment/worker_pipeline.py
similarity index 100%
rename from new_experiment/worker.py
rename to new_experiment/worker_pipeline.py
--
GitLab
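
Note (not part of the patch): the English-only ASR path now points at the Hugging Face checkpoint 'facebook/wav2vec2-large-960h-lv60-self'. The sketch below is only an approximation of what the repository's Wav2Vec2AsrProcessor wrapper has to do with that checkpoint, written against the public transformers and torchaudio APIs; the function name transcribe and the constant MODEL_ID are illustrative and do not appear in the repository, and the real processor's interface and pre/post-processing may differ.

# Illustrative sketch only, assuming plain transformers + torchaudio usage
# of the checkpoint referenced in the patch; not code from the repository.
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

MODEL_ID = 'facebook/wav2vec2-large-960h-lv60-self'  # model id taken from the patch


def transcribe(audio_path: str) -> str:
    processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
    waveform, sample_rate = torchaudio.load(audio_path)
    waveform = waveform.mean(dim=0)  # downmix to mono
    if sample_rate != 16_000:  # wav2vec2 expects 16 kHz input
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16_000)
    inputs = processor(waveform, sampling_rate=16_000, return_tensors='pt')
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)  # greedy CTC decoding
    return processor.batch_decode(predicted_ids)[0]

This LibriSpeech-finetuned checkpoint typically emits uppercase text without punctuation, which is why the benchmark normalizes transcripts before computing WER-style metrics.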