# "Newer" / "Older": stray page-navigation labels (apparently source-viewer residue); not part of the module.
from new_experiment.new_dependency_provider import get_experiment_repository
from new_experiment.utils.get_spacy_model_name import get_spacy_model_name
from new_experiment.utils.property_helper import PropertyHelper
def get_stats_for(dataset_name: str, property_name: str) -> float:
    """Print and return the mean of a property's in-range float values.

    Every value stored under ``property_name`` in the experiment repository
    of ``dataset_name`` is loaded; only ``float`` instances strictly between
    -2 and 10 are averaged.

    Args:
        dataset_name: Dataset whose experiment repository is queried.
        property_name: Repository property holding one value per record.

    Returns:
        The arithmetic mean of the retained values, or ``-1`` when no value
        passes the filter.
    """
    repo = get_experiment_repository(dataset_name)
    all_vals = repo.get_all_values_from_property(property_name)
    # Non-float entries (ints, strings, None, ...) and values outside the
    # open interval (-2, 10) are skipped rather than averaged.
    vals = [
        value
        for value in (all_vals[record_id] for record_id in all_vals.keys())
        if isinstance(value, float) and -2 < value < 10
    ]
    ret = sum(vals) / len(vals) if vals else -1
    print(dataset_name, property_name, ret)
    # Hand the mean back as the annotation promises, so callers can use the
    # number instead of only reading it from stdout.
    return ret
def get_stats_for_classic_wer(dataset_name: str, property_name: str) -> float:
    """Print and return the mean ``'classic_wer'`` score of a property.

    Every value stored under ``property_name`` in the experiment repository
    of ``dataset_name`` is loaded. Values that contain a ``'classic_wer'``
    key contribute that entry; of those, only ``float`` instances strictly
    between -2 and 10 are averaged.

    Args:
        dataset_name: Dataset whose experiment repository is queried.
        property_name: Repository property holding one metrics container
            per record (presumably a dict; confirm against the repository).

    Returns:
        The arithmetic mean of the retained scores, or ``-1`` when no score
        passes the filter.
    """
    repo = get_experiment_repository(dataset_name)
    all_vals = repo.get_all_values_from_property(property_name)
    # Records without a 'classic_wer' entry are ignored entirely.
    scores = [
        metrics['classic_wer']
        for metrics in (all_vals[record_id] for record_id in all_vals.keys())
        if 'classic_wer' in metrics
    ]
    # Non-float scores and values outside the open interval (-2, 10) are
    # skipped rather than averaged.
    vals = [score for score in scores if isinstance(score, float) and -2 < score < 10]
    ret = sum(vals) / len(vals) if vals else -1
    print(dataset_name, property_name, ret)
    # Hand the mean back as the annotation promises, matching the soft and
    # embedding variants of this helper.
    return ret
def get_stats_for_soft_wer(dataset_name: str, property_name: str) -> float:
    """Print and return the mean ``'soft_wer'`` score of a property.

    Loads every value stored under ``property_name`` for ``dataset_name``,
    takes the ``'soft_wer'`` entry of each value that has one, and averages
    those entries that are ``float`` instances strictly between -2 and 10.
    The printed label is ``property_name`` with ``'_soft'`` appended.

    Returns:
        The arithmetic mean of the retained scores, or ``-1`` when none is
        retained.
    """
    repository = get_experiment_repository(dataset_name)
    per_record = repository.get_all_values_from_property(property_name)

    # Fetch every stored value first, then filter in a second pass.
    stored = [per_record[record_id] for record_id in per_record.keys()]

    soft_scores = []
    for metrics in stored:
        if 'soft_wer' not in metrics:
            continue
        soft_scores.append(metrics['soft_wer'])

    usable = []
    for score in soft_scores:
        if not isinstance(score, float):
            continue
        if 10 > score > -2:
            usable.append(score)

    if usable:
        mean = sum(usable) / len(usable)
    else:
        mean = -1
    print(dataset_name, property_name + '_soft', mean)
    return mean
def get_stats_for_embedding_wer(dataset_name: str, property_name: str) -> float:
    """Print and return the mean ``'embedding_wer'`` score of a property.

    Looks up ``property_name`` for each record id the repository reports
    for that property, takes the ``'embedding_wer'`` entry of each value
    that has one, and averages the entries that are ``float`` instances.
    No range filter is applied here. The printed label is ``property_name``
    with ``'_emb'`` appended.

    Returns:
        The arithmetic mean of the retained scores, or ``-1`` when none is
        retained.
    """
    repository = get_experiment_repository(dataset_name)
    record_ids = repository.get_all_record_ids_for_property(property_name)

    # One repository lookup per record id; all lookups happen before filtering.
    stored = [repository.get_property_for_key(record_id, property_name) for record_id in record_ids]

    embedding_scores = []
    for metrics in stored:
        if 'embedding_wer' in metrics:
            embedding_scores.append(metrics['embedding_wer'])

    usable = [score for score in embedding_scores if isinstance(score, float)]

    if usable:
        mean = sum(usable) / len(usable)
    else:
        mean = -1
    print(dataset_name, property_name + '_emb', mean)
    return mean
# 58-95: stray line-number gutter text (apparently source-viewer residue); not program code.
if __name__ == '__main__':
    # Pipeline names; nothing in this script block reads the list.
    COMMANDS = [
        'run_word_wer_classic_pipeline',
        'run_word_wer_embedding_pipeline',
        'run_spacy_dep_tag_wer_pipeline',
        'run_spacy_ner_wer_pipeline',
        'run_spacy_pos_wer_pipeline',
    ]
    LANGUAGES = ['nl', 'fr', 'de', 'it', 'pl', 'es', 'en']
    WHISPER_ASR_MODEL = ['tiny', 'base', 'small', 'medium', 'large-v2']
    DATASETS = ['google_fleurs', 'minds14', 'voxpopuli']

    # Language-major order: every corpus for 'nl' first, then 'fr', and so on.
    FULL_DATASET_NAMES = [
        f'{language}_{corpus}'
        for language in LANGUAGES
        for corpus in DATASETS
    ]
    FULL_LANGUAGE_MODELS = [f'whisper_{size}' for size in WHISPER_ASR_MODEL]

    def _spacy_model_for(full_dataset_name):
        # The first two characters of a full dataset name are its language prefix.
        return get_spacy_model_name(full_dataset_name[:2])

    # Each entry pairs a reporting function with a builder for the property
    # name it should read. Entries run in this order, and each one sweeps
    # all datasets (outer loop) and all ASR models (inner loop) before the
    # next entry starts.
    REPORTS = [
        (get_stats_for,
         lambda ds, asr: PropertyHelper.ner_metrics(asr, _spacy_model_for(ds))),
        (get_stats_for,
         lambda ds, asr: PropertyHelper.pos_metrics(asr, _spacy_model_for(ds))),
        (get_stats_for,
         lambda ds, asr: PropertyHelper.dep_tag_metrics(asr, _spacy_model_for(ds))),
        (get_stats_for_classic_wer,
         lambda ds, asr: PropertyHelper.word_wer_classic_metrics(asr)),
        (get_stats_for_soft_wer,
         lambda ds, asr: PropertyHelper.word_wer_embeddings_metrics(asr)),
        (get_stats_for_embedding_wer,
         lambda ds, asr: PropertyHelper.word_wer_embeddings_metrics(asr)),
    ]

    for report, property_name_for in REPORTS:
        for dataset in FULL_DATASET_NAMES:
            for model in FULL_LANGUAGE_MODELS:
                report(dataset, property_name_for(dataset, model))