From 06fa75094654c397344e8fef6615e2d1390ee35f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com>
Date: Thu, 12 Jan 2023 11:24:26 +0100
Subject: [PATCH] Add stats

---
Review notes (free-text area after the three-dash line; not part of the diff):
- get_stats_for_classic_wer and get_stats_for_soft_wer now compute `ret` with
  an empty-list guard, but both still end with `return sum(vals) / len(vals)`.
  With no float values that line raises ZeroDivisionError, so the new guard
  only affects the printed value; returning `ret` would make it effective.
- get_stats_for_embedding_wer gets no guard in this patch and divides by
  len(vals) unconditionally.
- COMMANDS is assigned in the __main__ block but not referenced by any line
  this patch adds.

 call_experiment_stats.py | 54 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/call_experiment_stats.py b/call_experiment_stats.py
index c894b69..60bc573 100644
--- a/call_experiment_stats.py
+++ b/call_experiment_stats.py
@@ -1,10 +1,11 @@
 from new_experiment.new_dependency_provider import get_experiment_repository
+from new_experiment.utils.get_spacy_model_name import get_spacy_model_name
+from new_experiment.utils.property_helper import PropertyHelper
 
 
 def get_stats_for(dataset_name: str, property_name: str) -> float:
     repo = get_experiment_repository(dataset_name)
     vals = [repo.get_property_for_key(it, property_name) for it in repo.get_all_record_ids_for_property(property_name)]
-    print(vals)
     vals = [it for it in vals if isinstance(it, float)]
     ret = 0.0
     if len(vals) == 0:
@@ -12,6 +13,7 @@ def get_stats_for(dataset_name: str, property_name: str) -> float:
     else:
         ret = sum(vals) / len(vals)
     print(dataset_name, property_name, ret)
+    return ret
 
 
 def get_stats_for_classic_wer(dataset_name: str, property_name: str) -> float:
@@ -19,6 +21,12 @@ def get_stats_for_classic_wer(dataset_name: str, property_name: str) -> float:
     vals = [repo.get_property_for_key(it, property_name) for it in repo.get_all_record_ids_for_property(property_name)]
     vals = [it['classic_wer'] for it in vals if 'classic_wer' in it]
     vals = [it for it in vals if isinstance(it, float)]
+    ret = 0.0
+    if len(vals) == 0:
+        ret = -1
+    else:
+        ret = sum(vals) / len(vals)
+    print(dataset_name, property_name, ret)
     return sum(vals) / len(vals)
 
 
@@ -27,6 +35,12 @@ def get_stats_for_soft_wer(dataset_name: str, property_name: str) -> float:
     vals = [repo.get_property_for_key(it, property_name) for it in repo.get_all_record_ids_for_property(property_name)]
     vals = [it['soft_wer'] for it in vals if 'soft_wer' in it]
     vals = [it for it in vals if isinstance(it, float)]
+    ret = 0.0
+    if len(vals) == 0:
+        ret = -1
+    else:
+        ret = sum(vals) / len(vals)
+    print(dataset_name, property_name, ret)
     return sum(vals) / len(vals)
 
 
@@ -36,3 +50,41 @@ def get_stats_for_embedding_wer(dataset_name: str, property_name: str) -> float:
     vals = [it['embedding_wer'] for it in vals if 'embedding_wer' in it]
     vals = [it for it in vals if isinstance(it, float)]
     return sum(vals) / len(vals)
+
+
+if __name__ == '__main__':
+    COMMANDS = ['run_word_wer_classic_pipeline', 'run_word_wer_embedding_pipeline', 'run_spacy_dep_tag_wer_pipeline',
+                'run_spacy_ner_wer_pipeline', 'run_spacy_pos_wer_pipeline']
+    LANGUAGES = ['nl', 'fr', 'de', 'it', 'pl', 'es', 'en']
+    WHISPER_ASR_MODEL = ['tiny', 'base', 'small', 'medium', 'large-v2']
+    DATASETS = ['google_fleurs', 'minds14', 'voxpopuli']
+    FULL_DATASET_NAMES = []
+    for itt in LANGUAGES:
+        for it in DATASETS:
+            FULL_DATASET_NAMES.append(f'{itt}_{it}')
+
+    FULL_LANGUAGE_MODELS = [f'whisper_{it}' for it in WHISPER_ASR_MODEL]
+
+    for dataset in FULL_DATASET_NAMES:
+        for model in FULL_LANGUAGE_MODELS:
+            get_stats_for(dataset, PropertyHelper.ner_metrics(model, get_spacy_model_name(dataset[:2])))
+
+    for dataset in FULL_DATASET_NAMES:
+        for model in FULL_LANGUAGE_MODELS:
+            get_stats_for(dataset, PropertyHelper.pos_metrics(model, get_spacy_model_name(dataset[:2])))
+
+    for dataset in FULL_DATASET_NAMES:
+        for model in FULL_LANGUAGE_MODELS:
+            get_stats_for(dataset, PropertyHelper.dep_tag_metrics(model, get_spacy_model_name(dataset[:2])))
+
+    for dataset in FULL_DATASET_NAMES:
+        for model in FULL_LANGUAGE_MODELS:
+            get_stats_for_classic_wer(dataset, PropertyHelper.word_wer_classic_metrics(model))
+
+    for dataset in FULL_DATASET_NAMES:
+        for model in FULL_LANGUAGE_MODELS:
+            get_stats_for_soft_wer(dataset, PropertyHelper.word_wer_embeddings_metrics(model))
+
+    for dataset in FULL_DATASET_NAMES:
+        for model in FULL_LANGUAGE_MODELS:
+            get_stats_for_embedding_wer(dataset, PropertyHelper.word_wer_embeddings_metrics(model))
-- 
GitLab