from new_experiment.new_dependency_provider import (
    get_experiment_repository,
    get_minio_audio_record_repository,
)
from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper
from new_experiment.utils.property_helper import PropertyHelper
from sziszapangma.core.transformer.fasttext_embedding_transformer import FasttextEmbeddingTransformer
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.task.embedding_wer_metrics_task import EmbeddingWerMetricsTask


def run_word_wer_embedding_pipeline(dataset_name: str, asr_name: str):
    """Run the embedding-based WER metrics task for one dataset / ASR pair.

    Builds an ``ExperimentManager`` holding a single
    ``EmbeddingWerMetricsTask`` and calls its ``process()`` method.

    Args:
        dataset_name: Name of the dataset; used to obtain the experiment
            repository, to build the record iterator and as part of the
            task name. Its first two characters are handed to
            ``FasttextEmbeddingTransformer``.
        asr_name: Name of the ASR system; used as part of the task name and
            to derive every ASR-specific property name via ``PropertyHelper``.
    """
    experiment_repository = get_experiment_repository(dataset_name)

    record_ids = LoadedRemoteDatasetHelper(
        experiment_repository,
        get_minio_audio_record_repository(),
        dataset_name,
    )

    # NOTE(review): the two-character prefix is presumably a language code
    # (e.g. 'de' for 'de_minds14') -- confirm against FasttextEmbeddingTransformer.
    metrics_task = EmbeddingWerMetricsTask(
        task_name=f'EmbeddingWerMetricsTask___{dataset_name}___{asr_name}',
        asr_property_name=PropertyHelper.asr_result(asr_name),
        gold_transcript_property_name=PropertyHelper.get_gold_transcript_words(),
        require_update=True,
        embedding_transformer=FasttextEmbeddingTransformer(dataset_name[:2]),
        embeddings_alignment_property_name=PropertyHelper.word_wer_embeddings_alignment(asr_name),
        embeddings_metrics_property_name=PropertyHelper.word_wer_embeddings_metrics(asr_name),
        soft_alignment_property_name=PropertyHelper.word_wer_soft_alignment(asr_name),
        soft_metrics_property_name=PropertyHelper.word_wer_soft_metrics(asr_name),
    )

    manager = ExperimentManager(
        record_id_iterator=record_ids,
        processing_tasks=[metrics_task],
        experiment_repository=experiment_repository,
    )
    manager.process()


if __name__ == '__main__':
    run_word_wer_embedding_pipeline('de_minds14', 'whisper_tiny')