diff --git a/new_experiment/add_to_queue_pipeline.py b/new_experiment/add_to_queue_pipeline.py index b929cf08e5121cdcb5a2ef6fb2c5459aa6fd5bd9..9b9728bfc29bde6482869d3efb1c91a82a9165d0 100644 --- a/new_experiment/add_to_queue_pipeline.py +++ b/new_experiment/add_to_queue_pipeline.py @@ -49,30 +49,31 @@ def add_whisper(channel: BlockingChannel): asr_name = f'whisper_{whisper_variant}' for dataset_name in get_all_datasets_with_language(): for command in COMMANDS: - add_to_queue(dataset_name, asr_name, command, channel) + add_to_queue(dataset_name, asr_name, command, channel, 'asr_benchmark_experiments') def get_hf_facebook_wav2vec2_model_by_language_code(language_code: str) -> str: return { - 'nl': 'facebook_wav2vec2_large_xlsr_53_dutch', - 'en': 'facebook_wav2vec2_xls_r_300m', - 'fr': 'facebook_wav2vec2_large_xlsr_53_french', - 'de': 'facebook_wav2vec2_large_xlsr_53_german', - 'it': 'facebook_wav2vec2_large_xlsr_53_italian', - 'pl': 'facebook_wav2vec2_large_xlsr_53_polish', - 'es': 'facebook_wav2vec2_large_xlsr_53_spanish' + # 'nl': 'facebook_wav2vec2_large_xlsr_53_dutch', + 'en': 'facebook_wav2vec2_large_960h_lv60_self', + # 'fr': 'facebook_wav2vec2_large_xlsr_53_french', + # 'de': 'facebook_wav2vec2_large_xlsr_53_german', + # 'it': 'facebook_wav2vec2_large_xlsr_53_italian', + # 'pl': 'facebook_wav2vec2_large_xlsr_53_polish', + # 'es': 'facebook_wav2vec2_large_xlsr_53_spanish' }[language_code] def add_facebook_hf_wav2vec2_asr(channel: BlockingChannel): for dataset_name in get_all_datasets_with_language(): - add_to_queue( - dataset_name, - get_hf_facebook_wav2vec2_model_by_language_code(dataset_name[:2]), - 'hf_facebook_wav2vec2_asr', - channel, - 'hf_facebook_wav2vec2_asr' - ) + if dataset_name.startswith('en'): + add_to_queue( + dataset_name, + get_hf_facebook_wav2vec2_model_by_language_code(dataset_name[:2]), + 'hf_facebook_wav2vec2_asr', + channel, + 'hf_facebook_wav2vec2_asr' + ) def main(): diff --git a/new_experiment/pipeline/pipeline_process_asr.py b/new_experiment/pipeline/pipeline_process_asr.py index c883a3d759cca85dd0003b3e9159ff723cefd412..1860df6435f6e22e427f879b32b491979e7be5f8 100644 --- a/new_experiment/pipeline/pipeline_process_asr.py +++ b/new_experiment/pipeline/pipeline_process_asr.py @@ -10,8 +10,8 @@ from sziszapangma.integration.task.asr_task import AsrTask def get_asr_processor(asr_name: str) -> AsrProcessor: if asr_name == 'facebook_wav2vec2_large_xlsr_53_dutch': return Wav2Vec2AsrProcessor('facebook/wav2vec2-large-xlsr-53-dutch') - if asr_name == 'facebook_wav2vec2_xls_r_300m': - return Wav2Vec2AsrProcessor('facebook/wav2vec2-xls-r-300m') + if asr_name == 'facebook_wav2vec2_large_960h_lv60_self': + return Wav2Vec2AsrProcessor('facebook/wav2vec2-large-960h-lv60-self') if asr_name == 'facebook_wav2vec2_large_xlsr_53_french': return Wav2Vec2AsrProcessor('facebook/wav2vec2-large-xlsr-53-french') if asr_name == 'facebook_wav2vec2_large_xlsr_53_german': @@ -34,7 +34,7 @@ def run_hf_facebook_wav2vec2_asr_task(dataset_name: str, asr_name: str): AsrTask( asr_property_name=PropertyHelper.asr_result(asr_name), task_name=f'AsrTask___{dataset_name}___{asr_name}', - require_update=True, + require_update=False, asr_processor=get_asr_processor(asr_name), record_path_provider=record_provider ) diff --git a/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py b/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py index 8e2d5684d35878ec89a33a868af6734af6c3e215..c66a8f393188b41b8292be92335436aa4c682564 100644 --- a/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py +++ b/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py @@ -1,7 +1,3 @@ -import argparse - -from experiment.const_pipeline_names import GOLD_TRANSCRIPT -from experiment.experiment_dependency_provider import get_record_provider, get_repository from experiment.sentence_wer_processor.spacy_pos_sentence_dep_tag_processor import SpacyDepTagSentenceWerProcessor from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository from new_experiment.utils.get_spacy_model_name import get_spacy_model_name @@ -25,7 +21,7 @@ def run_spacy_dep_tag_wer_pipeline(dataset_name: str, asr_name: str): alignment_property_name=PropertyHelper.dep_tag_alignment(asr_name, model_name), wer_property_name=PropertyHelper.dep_tag_metrics(asr_name, model_name), task_name=f'SpacyDepTagSentenceWerProcessor___{dataset_name}___{asr_name}', - require_update=True + require_update=False ) ], experiment_repository=repository, diff --git a/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py b/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py index c74e41960e8123e9d6fb2f374eab57e0e083a2c2..9cdf4b02410bed11155a71d84b18a0bcf746ba0a 100644 --- a/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py +++ b/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py @@ -1,7 +1,3 @@ -import argparse - -from experiment.const_pipeline_names import GOLD_TRANSCRIPT -from experiment.experiment_dependency_provider import get_record_provider, get_repository from experiment.sentence_wer_processor.spacy_ner_sentence_wer_processor import SpacyNerSentenceWerProcessor from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository from new_experiment.utils.get_spacy_model_name import get_spacy_model_name @@ -25,7 +21,7 @@ def run_spacy_ner_wer_pipeline(dataset_name: str, asr_name: str): alignment_property_name=PropertyHelper.ner_alignment(asr_name, model_name), wer_property_name=PropertyHelper.ner_metrics(asr_name, model_name), task_name=f'SpacyNerSentenceWerProcessor___{dataset_name}___{asr_name}', - require_update=True + require_update=False ) ], experiment_repository=repository, diff --git a/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py b/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py index 5959dc97a59cc9e3d345aaa86da7d67ba4d499ac..1af0b659d10866a46aadacd3f628839e3ba1ab4f 100644 --- a/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py +++ b/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py @@ -1,4 +1,3 @@ -from experiment.experiment_dependency_provider import get_record_provider, get_repository from experiment.sentence_wer_processor.spacy_pos_sentence_wer_processor import SpacyPosSentenceWerProcessor from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository from new_experiment.utils.get_spacy_model_name import get_spacy_model_name @@ -22,7 +21,7 @@ def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str): alignment_property_name=PropertyHelper.pos_alignment(asr_name, model_name), wer_property_name=PropertyHelper.pos_metrics(asr_name, model_name), task_name=f'SpacyPosSentenceWerProcessor___{dataset_name}___{asr_name}', - require_update=True + require_update=False ) ], experiment_repository=repository, diff --git a/new_experiment/pipeline/pipeline_process_wikineural_ner_wer.py b/new_experiment/pipeline/pipeline_process_wikineural_ner_wer.py index 761abb4315150b7fdd9e47c4f531250b038c4507..3675fc95d3e93e6053ee2df352e79f7ffb1ca1c4 100644 --- a/new_experiment/pipeline/pipeline_process_wikineural_ner_wer.py +++ b/new_experiment/pipeline/pipeline_process_wikineural_ner_wer.py @@ -1,4 +1,3 @@ -from experiment.experiment_dependency_provider import get_record_provider, get_repository from experiment.sentence_wer_processor.wikineural_multilingual_ner_transformers_wer_processor_base import \ WikineuralMultilingualNerTransformersWerProcessorBase from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository diff --git a/new_experiment/pipeline/pipeline_process_word_classic_wer.py b/new_experiment/pipeline/pipeline_process_word_classic_wer.py index 231a696ae4190ab8c261e6718f56148e98f480ec..212e3db6eb95eb802a7d95134374e3c79dc0f1e5 100644 --- a/new_experiment/pipeline/pipeline_process_word_classic_wer.py +++ b/new_experiment/pipeline/pipeline_process_word_classic_wer.py @@ -15,7 +15,7 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str): asr_property_name=PropertyHelper.asr_result(asr_name), gold_transcript_property_name=PropertyHelper.get_gold_transcript_words(), metrics_property_name=PropertyHelper.word_wer_classic_metrics(asr_name), - require_update=True, + require_update=False, alignment_property_name=PropertyHelper.word_wer_classic_alignment(asr_name) ), ], diff --git a/new_experiment/pipeline/pipeline_process_word_embedding_wer.py b/new_experiment/pipeline/pipeline_process_word_embedding_wer.py index 8f942f0c32c33e1386753100bfb83b0660f93772..90267877bc05a2d152f4f4041c62ccb160ef2ad8 100644 --- a/new_experiment/pipeline/pipeline_process_word_embedding_wer.py +++ b/new_experiment/pipeline/pipeline_process_word_embedding_wer.py @@ -1,14 +1,8 @@ -import argparse - -from experiment.const_pipeline_names import GOLD_TRANSCRIPT -from experiment.experiment_dependency_provider import get_record_provider, get_repository from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper from new_experiment.utils.property_helper import PropertyHelper from sziszapangma.core.transformer.fasttext_embedding_transformer import FasttextEmbeddingTransformer -from sziszapangma.core.transformer.web_embedding_transformer import WebEmbeddingTransformer from sziszapangma.integration.experiment_manager import ExperimentManager -from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask from sziszapangma.integration.task.embedding_wer_metrics_task import EmbeddingWerMetricsTask @@ -22,7 +16,7 @@ def run_word_wer_embedding_pipeline(dataset_name: str, asr_name: str): asr_property_name=PropertyHelper.asr_result(asr_name), gold_transcript_property_name=PropertyHelper.get_gold_transcript_words(), metrics_property_name=PropertyHelper.word_wer_embeddings_metrics(asr_name), - require_update=True, + require_update=False, embedding_transformer=FasttextEmbeddingTransformer(dataset_name[:2]), alignment_property_name=PropertyHelper.word_wer_embeddings_alignment(asr_name) ) diff --git a/new_experiment/new_worker.py b/new_experiment/worker_asr.py similarity index 100% rename from new_experiment/new_worker.py rename to new_experiment/worker_asr.py diff --git a/new_experiment/worker.py b/new_experiment/worker_pipeline.py similarity index 100% rename from new_experiment/worker.py rename to new_experiment/worker_pipeline.py