From a87187dbaba8b841128972edd9a2cb7b175ce88f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com> Date: Thu, 12 Jan 2023 14:46:58 +0100 Subject: [PATCH] Add results processing --- call_experiment_stats.py | 37 +- metrics.ipynb | 1050 ++++++++++++++++- .../repository/experiment_repository.py | 6 +- .../repository/mongo_experiment_repository.py | 5 +- 4 files changed, 1050 insertions(+), 48 deletions(-) diff --git a/call_experiment_stats.py b/call_experiment_stats.py index 60bc573..c93ad1c 100644 --- a/call_experiment_stats.py +++ b/call_experiment_stats.py @@ -5,9 +5,9 @@ from new_experiment.utils.property_helper import PropertyHelper def get_stats_for(dataset_name: str, property_name: str) -> float: repo = get_experiment_repository(dataset_name) - vals = [repo.get_property_for_key(it, property_name) for it in repo.get_all_record_ids_for_property(property_name)] - vals = [it for it in vals if isinstance(it, float)] - ret = 0.0 + all_vals = repo.get_all_values_from_property(property_name) + vals = [all_vals[record_id] for record_id in all_vals.keys()] + vals = [ittt for ittt in vals if isinstance(ittt, float) and 10 > ittt > -2] if len(vals) == 0: ret = -1 else: @@ -18,38 +18,43 @@ def get_stats_for(dataset_name: str, property_name: str) -> float: def get_stats_for_classic_wer(dataset_name: str, property_name: str) -> float: repo = get_experiment_repository(dataset_name) - vals = [repo.get_property_for_key(it, property_name) for it in repo.get_all_record_ids_for_property(property_name)] - vals = [it['classic_wer'] for it in vals if 'classic_wer' in it] - vals = [it for it in vals if isinstance(it, float)] - ret = 0.0 + all_vals = repo.get_all_values_from_property(property_name) + vals = [all_vals[record_id] for record_id in all_vals.keys()] + vals = [ittt['classic_wer'] for ittt in vals if 'classic_wer' in ittt] + vals = [ittt for ittt in vals if isinstance(ittt, float) and 10 > ittt > -2] if len(vals) == 0: ret = -1 else: ret = sum(vals) / len(vals) print(dataset_name, property_name, ret) - return sum(vals) / len(vals) + return ret def get_stats_for_soft_wer(dataset_name: str, property_name: str) -> float: repo = get_experiment_repository(dataset_name) - vals = [repo.get_property_for_key(it, property_name) for it in repo.get_all_record_ids_for_property(property_name)] - vals = [it['soft_wer'] for it in vals if 'soft_wer' in it] - vals = [it for it in vals if isinstance(it, float)] - ret = 0.0 + all_vals = repo.get_all_values_from_property(property_name) + vals = [all_vals[record_id] for record_id in all_vals.keys()] + vals = [ittt['soft_wer'] for ittt in vals if 'soft_wer' in ittt] + vals = [ittt for ittt in vals if isinstance(ittt, float) and 10 > ittt > -2] if len(vals) == 0: ret = -1 else: ret = sum(vals) / len(vals) - print(dataset_name, property_name, ret) - return sum(vals) / len(vals) + print(dataset_name, property_name + '_soft', ret) + return ret def get_stats_for_embedding_wer(dataset_name: str, property_name: str) -> float: repo = get_experiment_repository(dataset_name) vals = [repo.get_property_for_key(it, property_name) for it in repo.get_all_record_ids_for_property(property_name)] vals = [it['embedding_wer'] for it in vals if 'embedding_wer' in it] - vals = [it for it in vals if isinstance(it, float)] - return sum(vals) / len(vals) + vals = [ittt for ittt in vals if isinstance(ittt, float)] + if len(vals) == 0: + ret = -1 + else: + ret = sum(vals) / len(vals) + print(dataset_name, property_name + '_emb', ret) + return ret if __name__ == '__main__': diff --git a/metrics.ipynb b/metrics.ipynb index 7a79fe7..0341b3c 100644 --- a/metrics.ipynb +++ b/metrics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "id": "955a0385-29fb-47dc-b012-729e49570594", "metadata": {}, "outputs": [], @@ -12,12 +12,14 @@ "from call_experiment_stats import *\n", "\n", "from new_experiment.utils.property_helper import PropertyHelper\n", - "from new_experiment.utils.get_spacy_model_name import get_spacy_model_name" + "from new_experiment.utils.get_spacy_model_name import get_spacy_model_name\n", + "import pandas as pd\n", + "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "id": "9f5e44a6-f211-4b61-8cb4-5636c7672c6a", "metadata": {}, "outputs": [], @@ -31,50 +33,1038 @@ "for itt in LANGUAGES:\n", " for it in DATASETS:\n", " FULL_DATASET_NAMES.append(f'{itt}_{it}')\n", - " \n", + "\n", "FULL_LANGUAGE_MODELS = [f'whisper_{it}' for it in WHISPER_ASR_MODEL]" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 18, + "id": "d2465ceb-7439-4fa5-adf8-e95d7e6106b9", + "metadata": {}, + "outputs": [], + "source": [ + "0vals = dict()\n", + "with open('metrics.log', 'r') as reader:\n", + " lines = reader.read().splitlines(keepends=False)\n", + " for line in lines:\n", + " # print(line)\n", + " words = line.split()\n", + " key = f'{words[0]}_{words[1]}'\n", + " # print(key)\n", + " vals[key] = float(words[2])\n", + "# vals" + ] + }, + { + "cell_type": "code", + "execution_count": 19, "id": "22d84451-b7e3-4dba-9758-068dae23ace4", "metadata": {}, + "outputs": [], + "source": [ + "spacy_ner = [\n", + " [vals.get(f'{dataset}_{PropertyHelper.ner_metrics(model, get_spacy_model_name(dataset[:2]))}', -1.0) for model in FULL_LANGUAGE_MODELS]\n", + " for dataset in FULL_DATASET_NAMES\n", + "]\n", + "spacy_pos = [\n", + " [vals.get(f'{dataset}_{PropertyHelper.pos_metrics(model, get_spacy_model_name(dataset[:2]))}', -1.0) for model in FULL_LANGUAGE_MODELS]\n", + " for dataset in FULL_DATASET_NAMES\n", + "]\n", + "spacy_dep = [\n", + " [vals.get(f'{dataset}_{PropertyHelper.pos_metrics(model, get_spacy_model_name(dataset[:2]))}', -1.0) for model in FULL_LANGUAGE_MODELS]\n", + " for dataset in FULL_DATASET_NAMES\n", + "]\n", + "word_wer_classic_metrics = [\n", + " [vals.get(f'{dataset}_{PropertyHelper.word_wer_classic_metrics(model)}', -1.0) for model in FULL_LANGUAGE_MODELS]\n", + " for dataset in FULL_DATASET_NAMES\n", + "]\n", + "\n", + "# for dataset in FULL_DATASET_NAMES:\n", + "# for model in FULL_LANGUAGE_MODELS:\n", + "# get_stats_for_classic_wer(dataset, PropertyHelper.word_wer_classic_metrics(model))\n", + "\n", + "# for dataset in FULL_DATASET_NAMES:\n", + "# for model in FULL_LANGUAGE_MODELS:\n", + "# get_stats_for_soft_wer(dataset, PropertyHelper.word_wer_embeddings_metrics(model))\n", + "\n", + "# for dataset in FULL_DATASET_NAMES:\n", + "# for model in FULL_LANGUAGE_MODELS:\n", + "# get_stats_for_embedding_wer(dataset, PropertyHelper.word_wer_embeddings_metrics(model))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "45fd851c-644f-48e6-b711-5bd312404b8b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tiny</th>\n", + " <th>base</th>\n", + " <th>small</th>\n", + " <th>medium</th>\n", + " <th>large-v2</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>nl_google_fleurs</th>\n", + " <td>0.316124</td>\n", + " <td>0.230845</td>\n", + " <td>0.186936</td>\n", + " <td>0.170150</td>\n", + " <td>0.165057</td>\n", + " </tr>\n", + " <tr>\n", + " <th>nl_minds14</th>\n", + " <td>0.463084</td>\n", + " <td>0.409993</td>\n", + " <td>0.360934</td>\n", + " <td>0.331613</td>\n", + " <td>0.324172</td>\n", + " </tr>\n", + " <tr>\n", + " <th>nl_voxpopuli</th>\n", + " <td>0.215158</td>\n", + " <td>0.178716</td>\n", + " <td>0.132960</td>\n", + " <td>0.118042</td>\n", + " <td>0.139958</td>\n", + " </tr>\n", + " <tr>\n", + " <th>fr_google_fleurs</th>\n", + " <td>0.264291</td>\n", + " <td>0.193436</td>\n", + " <td>0.177302</td>\n", + " <td>0.147464</td>\n", + " <td>0.141276</td>\n", + " </tr>\n", + " <tr>\n", + " <th>fr_minds14</th>\n", + " <td>0.466860</td>\n", + " <td>0.468822</td>\n", + " <td>0.471754</td>\n", + " <td>0.444854</td>\n", + " <td>0.485090</td>\n", + " </tr>\n", + " <tr>\n", + " <th>fr_voxpopuli</th>\n", + " <td>0.161386</td>\n", + " <td>0.131144</td>\n", + " <td>0.113097</td>\n", + " <td>0.099114</td>\n", + " <td>0.111776</td>\n", + " </tr>\n", + " <tr>\n", + " <th>de_google_fleurs</th>\n", + " <td>0.316175</td>\n", + " <td>0.257454</td>\n", + " <td>0.234163</td>\n", + " <td>0.239750</td>\n", + " <td>0.236715</td>\n", + " </tr>\n", + " <tr>\n", + " <th>de_minds14</th>\n", + " <td>0.435681</td>\n", + " <td>0.425712</td>\n", + " <td>0.412896</td>\n", + " <td>0.398617</td>\n", + " <td>0.398762</td>\n", + " </tr>\n", + " <tr>\n", + " <th>de_voxpopuli</th>\n", + " <td>0.200245</td>\n", + " <td>0.155502</td>\n", + " <td>0.133251</td>\n", + " <td>0.116949</td>\n", + " <td>0.156371</td>\n", + " </tr>\n", + " <tr>\n", + " <th>it_google_fleurs</th>\n", + " <td>0.206301</td>\n", + " <td>0.172527</td>\n", + " <td>0.161195</td>\n", + " <td>0.156655</td>\n", + " <td>0.160677</td>\n", + " </tr>\n", + " <tr>\n", + " <th>it_minds14</th>\n", + " <td>0.487493</td>\n", + " <td>0.448874</td>\n", + " <td>0.432679</td>\n", + " <td>0.416035</td>\n", + " <td>0.392705</td>\n", + " </tr>\n", + " <tr>\n", + " <th>it_voxpopuli</th>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pl_google_fleurs</th>\n", + " <td>0.334936</td>\n", + " <td>0.273025</td>\n", + " <td>0.227662</td>\n", + " <td>0.210962</td>\n", + " <td>0.209027</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pl_minds14</th>\n", + " <td>0.657194</td>\n", + " <td>0.591588</td>\n", + " <td>0.487344</td>\n", + " <td>0.474013</td>\n", + " <td>0.487891</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pl_voxpopuli</th>\n", + " <td>0.203548</td>\n", + " <td>0.158526</td>\n", + " <td>0.126280</td>\n", + " <td>0.110784</td>\n", + " <td>0.117780</td>\n", + " </tr>\n", + " <tr>\n", + " <th>es_google_fleurs</th>\n", + " <td>0.187607</td>\n", + " <td>0.159873</td>\n", + " <td>0.147104</td>\n", + " <td>0.155210</td>\n", + " <td>0.154657</td>\n", + " </tr>\n", + " <tr>\n", + " <th>es_minds14</th>\n", + " <td>0.721295</td>\n", + " <td>0.670363</td>\n", + " <td>0.666278</td>\n", + " <td>0.673058</td>\n", + " <td>0.680341</td>\n", + " </tr>\n", + " <tr>\n", + " <th>es_voxpopuli</th>\n", + " <td>0.133805</td>\n", + " <td>0.116222</td>\n", + " <td>0.119882</td>\n", + " <td>0.106610</td>\n", + " <td>0.122036</td>\n", + " </tr>\n", + " <tr>\n", + " <th>en_google_fleurs</th>\n", + " <td>0.217843</td>\n", + " <td>0.188810</td>\n", + " <td>0.186407</td>\n", + " <td>0.183656</td>\n", + " <td>0.184568</td>\n", + " </tr>\n", + " <tr>\n", + " <th>en_minds14</th>\n", + " <td>0.562068</td>\n", + " <td>0.566999</td>\n", + " <td>0.580369</td>\n", + " <td>0.583945</td>\n", + " <td>0.578079</td>\n", + " </tr>\n", + " <tr>\n", + " <th>en_voxpopuli</th>\n", + " <td>0.224980</td>\n", + " <td>0.203959</td>\n", + " <td>0.210278</td>\n", + " <td>0.322688</td>\n", + " <td>0.280877</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tiny base small medium large-v2\n", + "nl_google_fleurs 0.316124 0.230845 0.186936 0.170150 0.165057\n", + "nl_minds14 0.463084 0.409993 0.360934 0.331613 0.324172\n", + "nl_voxpopuli 0.215158 0.178716 0.132960 0.118042 0.139958\n", + "fr_google_fleurs 0.264291 0.193436 0.177302 0.147464 0.141276\n", + "fr_minds14 0.466860 0.468822 0.471754 0.444854 0.485090\n", + "fr_voxpopuli 0.161386 0.131144 0.113097 0.099114 0.111776\n", + "de_google_fleurs 0.316175 0.257454 0.234163 0.239750 0.236715\n", + "de_minds14 0.435681 0.425712 0.412896 0.398617 0.398762\n", + "de_voxpopuli 0.200245 0.155502 0.133251 0.116949 0.156371\n", + "it_google_fleurs 0.206301 0.172527 0.161195 0.156655 0.160677\n", + "it_minds14 0.487493 0.448874 0.432679 0.416035 0.392705\n", + "it_voxpopuli -1.000000 -1.000000 -1.000000 -1.000000 -1.000000\n", + "pl_google_fleurs 0.334936 0.273025 0.227662 0.210962 0.209027\n", + "pl_minds14 0.657194 0.591588 0.487344 0.474013 0.487891\n", + "pl_voxpopuli 0.203548 0.158526 0.126280 0.110784 0.117780\n", + "es_google_fleurs 0.187607 0.159873 0.147104 0.155210 0.154657\n", + "es_minds14 0.721295 0.670363 0.666278 0.673058 0.680341\n", + "es_voxpopuli 0.133805 0.116222 0.119882 0.106610 0.122036\n", + "en_google_fleurs 0.217843 0.188810 0.186407 0.183656 0.184568\n", + "en_minds14 0.562068 0.566999 0.580369 0.583945 0.578079\n", + "en_voxpopuli 0.224980 0.203959 0.210278 0.322688 0.280877" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(spacy_ner, columns=WHISPER_ASR_MODEL, index=FULL_DATASET_NAMES)\n", + "# NER" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "6466877e-e744-4cb1-8d4f-f818e1d3ee7d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tiny</th>\n", + " <th>base</th>\n", + " <th>small</th>\n", + " <th>medium</th>\n", + " <th>large-v2</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>nl_google_fleurs</th>\n", + " <td>0.582916</td>\n", + " <td>0.427364</td>\n", + " <td>0.279190</td>\n", + " <td>0.229402</td>\n", + " <td>0.212373</td>\n", + " </tr>\n", + " <tr>\n", + " <th>nl_minds14</th>\n", + " <td>0.888989</td>\n", + " <td>0.702107</td>\n", + " <td>0.511865</td>\n", + " <td>0.440081</td>\n", + " <td>0.415821</td>\n", + " </tr>\n", + " <tr>\n", + " <th>nl_voxpopuli</th>\n", + " <td>0.451950</td>\n", + " <td>0.350228</td>\n", + " <td>0.233061</td>\n", + " <td>0.188461</td>\n", + " <td>0.208664</td>\n", + " </tr>\n", + " <tr>\n", + " <th>fr_google_fleurs</th>\n", + " <td>0.468415</td>\n", + " <td>0.338927</td>\n", + " <td>0.260157</td>\n", + " <td>0.207241</td>\n", + " <td>0.194587</td>\n", + " </tr>\n", + " <tr>\n", + " <th>fr_minds14</th>\n", + " <td>0.700735</td>\n", + " <td>0.619382</td>\n", + " <td>0.567487</td>\n", + " <td>0.513574</td>\n", + " <td>0.552826</td>\n", + " </tr>\n", + " <tr>\n", + " <th>fr_voxpopuli</th>\n", + " <td>0.310661</td>\n", + " <td>0.235596</td>\n", + " <td>0.180943</td>\n", + " <td>0.153288</td>\n", + " <td>0.159867</td>\n", + " </tr>\n", + " <tr>\n", + " <th>de_google_fleurs</th>\n", + " <td>0.449640</td>\n", + " <td>0.344001</td>\n", + " <td>0.282088</td>\n", + " <td>0.275634</td>\n", + " <td>0.264093</td>\n", + " </tr>\n", + " <tr>\n", + " <th>de_minds14</th>\n", + " <td>0.608813</td>\n", + " <td>0.529599</td>\n", + " <td>0.472205</td>\n", + " <td>0.443094</td>\n", + " <td>0.441656</td>\n", + " </tr>\n", + " <tr>\n", + " <th>de_voxpopuli</th>\n", + " <td>0.347653</td>\n", + " <td>0.248060</td>\n", + " <td>0.198001</td>\n", + " <td>0.168237</td>\n", + " <td>0.205059</td>\n", + " </tr>\n", + " <tr>\n", + " <th>it_google_fleurs</th>\n", + " <td>0.364700</td>\n", + " <td>0.269092</td>\n", + " <td>0.218361</td>\n", + " <td>0.189632</td>\n", + " <td>0.189108</td>\n", + " </tr>\n", + " <tr>\n", + " <th>it_minds14</th>\n", + " <td>0.735663</td>\n", + " <td>0.597724</td>\n", + " <td>0.500377</td>\n", + " <td>0.438344</td>\n", + " <td>0.417785</td>\n", + " </tr>\n", + " <tr>\n", + " <th>it_voxpopuli</th>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pl_google_fleurs</th>\n", + " <td>0.594285</td>\n", + " <td>0.452570</td>\n", + " <td>0.318702</td>\n", + " <td>0.276475</td>\n", + " <td>0.261194</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pl_minds14</th>\n", + " <td>0.988993</td>\n", + " <td>0.853431</td>\n", + " <td>0.653693</td>\n", + " <td>0.585884</td>\n", + " <td>0.597468</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pl_voxpopuli</th>\n", + " <td>0.374544</td>\n", + " <td>0.277290</td>\n", + " <td>0.198685</td>\n", + " <td>0.164524</td>\n", + " <td>0.161887</td>\n", + " </tr>\n", + " <tr>\n", + " <th>es_google_fleurs</th>\n", + " <td>0.284499</td>\n", + " <td>0.224748</td>\n", + " <td>0.187365</td>\n", + " <td>0.189561</td>\n", + " <td>0.184028</td>\n", + " </tr>\n", + " <tr>\n", + " <th>es_minds14</th>\n", + " <td>0.880992</td>\n", + " <td>0.747677</td>\n", + " <td>0.695294</td>\n", + " <td>0.690749</td>\n", + " <td>0.697884</td>\n", + " </tr>\n", + " <tr>\n", + " <th>es_voxpopuli</th>\n", + " <td>0.252463</td>\n", + " <td>0.206225</td>\n", + " <td>0.229706</td>\n", + " <td>0.195846</td>\n", + " <td>0.231587</td>\n", + " </tr>\n", + " <tr>\n", + " <th>en_google_fleurs</th>\n", + " <td>0.295853</td>\n", + " <td>0.250928</td>\n", + " <td>0.224483</td>\n", + " <td>0.218855</td>\n", + " <td>0.218479</td>\n", + " </tr>\n", + " <tr>\n", + " <th>en_minds14</th>\n", + " <td>0.634351</td>\n", + " <td>0.623962</td>\n", + " <td>0.626942</td>\n", + " <td>0.626588</td>\n", + " <td>0.620953</td>\n", + " </tr>\n", + " <tr>\n", + " <th>en_voxpopuli</th>\n", + " <td>0.345836</td>\n", + " <td>0.319493</td>\n", + " <td>0.319060</td>\n", + " <td>0.466410</td>\n", + " <td>0.408949</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tiny base small medium large-v2\n", + "nl_google_fleurs 0.582916 0.427364 0.279190 0.229402 0.212373\n", + "nl_minds14 0.888989 0.702107 0.511865 0.440081 0.415821\n", + "nl_voxpopuli 0.451950 0.350228 0.233061 0.188461 0.208664\n", + "fr_google_fleurs 0.468415 0.338927 0.260157 0.207241 0.194587\n", + "fr_minds14 0.700735 0.619382 0.567487 0.513574 0.552826\n", + "fr_voxpopuli 0.310661 0.235596 0.180943 0.153288 0.159867\n", + "de_google_fleurs 0.449640 0.344001 0.282088 0.275634 0.264093\n", + "de_minds14 0.608813 0.529599 0.472205 0.443094 0.441656\n", + "de_voxpopuli 0.347653 0.248060 0.198001 0.168237 0.205059\n", + "it_google_fleurs 0.364700 0.269092 0.218361 0.189632 0.189108\n", + "it_minds14 0.735663 0.597724 0.500377 0.438344 0.417785\n", + "it_voxpopuli -1.000000 -1.000000 -1.000000 -1.000000 -1.000000\n", + "pl_google_fleurs 0.594285 0.452570 0.318702 0.276475 0.261194\n", + "pl_minds14 0.988993 0.853431 0.653693 0.585884 0.597468\n", + "pl_voxpopuli 0.374544 0.277290 0.198685 0.164524 0.161887\n", + "es_google_fleurs 0.284499 0.224748 0.187365 0.189561 0.184028\n", + "es_minds14 0.880992 0.747677 0.695294 0.690749 0.697884\n", + "es_voxpopuli 0.252463 0.206225 0.229706 0.195846 0.231587\n", + "en_google_fleurs 0.295853 0.250928 0.224483 0.218855 0.218479\n", + "en_minds14 0.634351 0.623962 0.626942 0.626588 0.620953\n", + "en_voxpopuli 0.345836 0.319493 0.319060 0.466410 0.408949" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(spacy_pos, columns=WHISPER_ASR_MODEL, index=FULL_DATASET_NAMES)\n", + "# POS" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "77567361-b730-49f0-ab68-19ad335df1b1", + "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0.1875, 0.0, 0.3125, 0.2962962962962963, 0.2857142857142857, 0.2608695652173913, 0.29411764705882354, 0.43137254901960786, 0.45454545454545453, 0.2608695652173913, 0.25, 0.17647058823529413, 0.21666666666666667, 0.3076923076923077, 0.38461538461538464, 0.625, 0.1111111111111111, 0.3684210526315789, 0.15384615384615385, 0.16666666666666666, 0.2777777777777778, 0.17142857142857143, 0.12121212121212122, 0.14285714285714285, 0.35, 0.05, 0.125, 0.2857142857142857, 0.0, 0.22727272727272727, 0.47058823529411764, 0.7142857142857143, 0.3333333333333333, 4.461538461538462, 0.13043478260869565, 0.09090909090909091, 0.24, 0.21739130434782608, 0.3333333333333333, 0.20689655172413793, 0.25925925925925924, 0.35294117647058826, 0.27586206896551724, 0.4166666666666667, 0.29411764705882354, 0.3333333333333333, 0.17647058823529413, 0.25, 0.05263157894736842, 0.3333333333333333, 0.6, 0.1875, 0.36363636363636365, 0.10810810810810811, 0.17142857142857143, 0.1, 0.3793103448275862, 0.2413793103448276, 0.34782608695652173, 0.34782608695652173, 0.2608695652173913, 0.21875, 0.125, 0.18181818181818182, 0.1875, 0.15789473684210525, 0.19230769230769232, 0.35294117647058826, 0.15789473684210525, 0.4230769230769231, 0.058823529411764705, 0.12121212121212122, 0.23076923076923078, 0.375, 0.23076923076923078, 0.20689655172413793, 0.18181818181818182, 0.22580645161290322, 0.8, 0.2857142857142857, 0.5454545454545454, 0.35714285714285715, 0.09090909090909091, 0.2857142857142857, 0.15384615384615385, 0.2692307692307692, 0.46153846153846156, 0.2777777777777778, 0.5384615384615384, 0.4375, 0.4, 0.09090909090909091, 1.0, 0.5238095238095238, 0.23809523809523808, 0.2608695652173913, 0.15, 0.5555555555555556, 0.14285714285714285, 0.38095238095238093, 1.6666666666666667, 0.3333333333333333, 0.7083333333333334, 0.48, 0.1935483870967742, 0.2222222222222222, 0.4, 0.08333333333333333, 0.2857142857142857, 0.15, 0.35294117647058826, 0.14814814814814814, 0.4444444444444444, 0.1111111111111111, 0.2857142857142857, 0.14285714285714285, 0.47058823529411764, 0.38095238095238093, 0.38095238095238093, 0.13043478260869565, 0.17857142857142858, 0.17391304347826086, 0.3333333333333333, 0.4117647058823529, 0.7857142857142857, 0.2727272727272727, 0.37037037037037035, 0.15789473684210525, 0.1875, 0.2777777777777778, 0.3076923076923077, 0.2903225806451613, 0.16666666666666666, 0.38461538461538464, 0.45, 0.35, 0.25806451612903225, 0.21428571428571427, 0.11764705882352941, 0.6666666666666666, 0.1, 0.13636363636363635, 0.20833333333333334, 0.3888888888888889, 1.0555555555555556, 0.1875, 0.7083333333333334, 0.5555555555555556, 0.3023255813953488, 0.1111111111111111, 0.5555555555555556, 0.21428571428571427, 0.6, 0.3235294117647059, 0.5789473684210527, 0.3333333333333333, 0.18181818181818182, 0.32, 0.2777777777777778, 0.4444444444444444, 0.2631578947368421, 0.5238095238095238, 0.23529411764705882, 0.05263157894736842, 0.92, 0.47058823529411764, 0.23076923076923078, 0.2727272727272727, 0.5263157894736842, 0.22727272727272727, 0.34615384615384615, 0.4, 0.6666666666666666, 0.2, 0.09090909090909091, 0.2, 0.21739130434782608, 0.21212121212121213, 0.047619047619047616, 0.24, 0.29411764705882354, 0.34615384615384615, 0.17857142857142858, 0.0, 0.3076923076923077, 0.14285714285714285, 0.038461538461538464, 0.2857142857142857, 0.2857142857142857, 0.22727272727272727, 0.25, 0.13333333333333333, 0.4444444444444444, 0.21951219512195122, 0.17391304347826086, 0.6296296296296297, 0.3333333333333333, 0.14814814814814814, 0.20833333333333334, 0.2222222222222222, 0.32, 0.06451612903225806, 0.07692307692307693, 0.29310344827586204, 0.11764705882352941, 0.10526315789473684, 0.4375, 0.3125, 0.14814814814814814, 0.2727272727272727, 0.46153846153846156, 0.20833333333333334, 0.125, 0.14285714285714285, 0.4666666666666667, 1.3, 0.4583333333333333, 0.13043478260869565, 0.17391304347826086, 0.3157894736842105, 0.17857142857142858, 0.4, 0.3157894736842105, 0.6363636363636364, 0.12195121951219512, 0.05, 0.2916666666666667, 0.24324324324324326, 0.3333333333333333, 0.21739130434782608, 0.38461538461538464, 0.15789473684210525, 0.15, 0.09523809523809523, 0.2777777777777778, 0.21212121212121213, 0.07692307692307693, 0.1111111111111111, 0.0625, 0.3793103448275862, 0.29411764705882354, 0.4090909090909091, 0.4444444444444444, 0.7777777777777778, 0.3076923076923077, 0.2777777777777778, 0.1724137931034483, 0.26666666666666666, 0.13333333333333333, 0.45454545454545453, 0.4375]\n", - "nl_google_fleurs whisper_tiny__nl_core_news_lg__ner_metrics 0.3161237339690157\n", - "[0.25, 0.07692307692307693, 0.125, 0.2962962962962963, 0.23809523809523808, 0.13043478260869565, 0.23529411764705882, 0.2549019607843137, 0.2727272727272727, 0.4583333333333333, 0.17391304347826086, 0.3, 0.35294117647058826, 0.19230769230769232, 0.38461538461538464, 0.25, 0.2222222222222222, 0.3157894736842105, 0.15384615384615385, 0.08333333333333333, 0.2777777777777778, 0.08571428571428572, 0.15151515151515152, 0.14285714285714285, 0.2, 0.15, 0.041666666666666664, 0.21428571428571427, 0.125, 0.2727272727272727, 0.058823529411764705, 0.35714285714285715, 0.06666666666666667, 0.23076923076923078, 0.13043478260869565, 0.09090909090909091, 0.12, 0.21739130434782608, 0.5238095238095238, 0.13793103448275862, 0.07407407407407407, 0.29411764705882354, 0.27586206896551724, 0.4166666666666667, 0.16666666666666666, 0.11764705882352941, 0.11764705882352941, 0.16666666666666666, 0.05263157894736842, 0.3333333333333333, 0.6, 0.46875, 0.2727272727272727, 0.13513513513513514, 0.17142857142857143, 0.27586206896551724, 0.15, 0.20689655172413793, 0.34782608695652173, 0.15625, 0.21739130434782608, 0.17391304347826086, 0.125, 0.13636363636363635, 0.125, 0.29411764705882354, 0.3684210526315789, 0.15384615384615385, 0.10526315789473684, 0.058823529411764705, 0.34615384615384615, 0.030303030303030304, 0.1935483870967742, 0.2916666666666667, 0.3076923076923077, 0.13793103448275862, 0.06060606060606061, 0.20512820512820512, 0.4666666666666667, 0.14285714285714285, 0.18181818181818182, 0.42857142857142855, 0.15384615384615385, 0.17857142857142858, 0.18181818181818182, 0.23076923076923078, 0.38461538461538464, 0.3333333333333333, 0.23076923076923078, 1.625, 0.4, 0.09090909090909091, 0.23809523809523808, 0.3333333333333333, 0.3333333333333333, 0.2, 0.08695652173913043, 0.3333333333333333, 0.10714285714285714, 0.2857142857142857, 0.0, 0.13333333333333333, 0.4166666666666667, 0.24, 0.06451612903225806, 0.3888888888888889, 0.1, 0.16666666666666666, 0.1111111111111111, 0.2, 0.29411764705882354, 0.14285714285714285, 0.3333333333333333, 0.1111111111111111, 0.23809523809523808, 0.09523809523809523, 0.11764705882352941, 0.14285714285714285, 0.2857142857142857, 0.08695652173913043, 0.17857142857142858, 0.2857142857142857, 0.08695652173913043, 0.35294117647058826, 0.14285714285714285, 0.36363636363636365, 0.05263157894736842, 0.0625, 0.25, 0.25925925925925924, 0.23076923076923078, 0.2777777777777778, 0.12903225806451613, 0.23076923076923078, 0.1, 0.2, 0.0967741935483871, 2.5, 0.17647058823529413, 0.5833333333333334, 0.0, 0.0, 0.08333333333333333, 0.1111111111111111, 0.2222222222222222, 0.25, 0.625, 0.4444444444444444, 0.13953488372093023, 0.2222222222222222, 0.3333333333333333, 0.14285714285714285, 0.4666666666666667, 0.23529411764705882, 0.7368421052631579, 0.0, 0.2777777777777778, 0.13333333333333333, 0.1111111111111111, 0.12, 0.19047619047619047, 0.05263157894736842, 0.23529411764705882, 0.05263157894736842, 0.24, 0.38235294117647056, 0.19230769230769232, 0.22727272727272727, 0.42105263157894735, 0.36363636363636365, 0.07692307692307693, 0.2, 0.25, 0.1, 0.18181818181818182, 0.2, 0.043478260869565216, 0.23809523809523808, 0.06060606060606061, 0.32, 0.17647058823529413, 0.38461538461538464, 0.17857142857142858, 0.19230769230769232, 0.3076923076923077, 0.047619047619047616, 0.038461538461538464, 0.38095238095238093, 0.14285714285714285, 0.13636363636363635, 0.06666666666666667, 0.10714285714285714, 0.3333333333333333, 0.12195121951219512, 0.043478260869565216, 0.6296296296296297, 0.2777777777777778, 0.14814814814814814, 0.08333333333333333, 0.1111111111111111, 0.2, 0.16129032258064516, 0.07692307692307693, 0.3103448275862069, 0.14705882352941177, 0.15789473684210525, 0.3125, 0.25, 0.2222222222222222, 0.18181818181818182, 0.125, 0.10256410256410256, 0.3333333333333333, 0.16666666666666666, 0.4666666666666667, 0.1, 0.17391304347826086, 0.4583333333333333, 0.08695652173913043, 0.2631578947368421, 0.17857142857142858, 0.26666666666666666, 0.42105263157894735, 0.8181818181818182, 0.4, 0.07317073170731707, 0.2916666666666667, 0.16216216216216217, 0.20833333333333334, 0.043478260869565216, 0.23076923076923078, 0.10526315789473684, 0.2, 0.2222222222222222, 0.14285714285714285, 0.12121212121212122, 0.11538461538461539, 0.1388888888888889, 0.25, 0.5172413793103449, 0.29411764705882354, 0.13636363636363635, 0.2222222222222222, 0.15384615384615385, 0.4074074074074074, 0.2777777777777778, 0.13793103448275862, 0.13333333333333333, 0.13333333333333333, 0.45454545454545453, 0.125]\n", - "nl_google_fleurs whisper_base__nl_core_news_lg__ner_metrics 0.23084502550941563\n" - ] - }, + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tiny</th>\n", + " <th>base</th>\n", + " <th>small</th>\n", + " <th>medium</th>\n", + " <th>large-v2</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>nl_google_fleurs</th>\n", + " <td>0.582916</td>\n", + " <td>0.427364</td>\n", + " <td>0.279190</td>\n", + " <td>0.229402</td>\n", + " <td>0.212373</td>\n", + " </tr>\n", + " <tr>\n", + " <th>nl_minds14</th>\n", + " <td>0.888989</td>\n", + " <td>0.702107</td>\n", + " <td>0.511865</td>\n", + " <td>0.440081</td>\n", + " <td>0.415821</td>\n", + " </tr>\n", + " <tr>\n", + " <th>nl_voxpopuli</th>\n", + " <td>0.451950</td>\n", + " <td>0.350228</td>\n", + " <td>0.233061</td>\n", + " <td>0.188461</td>\n", + " <td>0.208664</td>\n", + " </tr>\n", + " <tr>\n", + " <th>fr_google_fleurs</th>\n", + " <td>0.468415</td>\n", + " <td>0.338927</td>\n", + " <td>0.260157</td>\n", + " <td>0.207241</td>\n", + " <td>0.194587</td>\n", + " </tr>\n", + " <tr>\n", + " <th>fr_minds14</th>\n", + " <td>0.700735</td>\n", + " <td>0.619382</td>\n", + " <td>0.567487</td>\n", + " <td>0.513574</td>\n", + " <td>0.552826</td>\n", + " </tr>\n", + " <tr>\n", + " <th>fr_voxpopuli</th>\n", + " <td>0.310661</td>\n", + " <td>0.235596</td>\n", + " <td>0.180943</td>\n", + " <td>0.153288</td>\n", + " <td>0.159867</td>\n", + " </tr>\n", + " <tr>\n", + " <th>de_google_fleurs</th>\n", + " <td>0.449640</td>\n", + " <td>0.344001</td>\n", + " <td>0.282088</td>\n", + " <td>0.275634</td>\n", + " <td>0.264093</td>\n", + " </tr>\n", + " <tr>\n", + " <th>de_minds14</th>\n", + " <td>0.608813</td>\n", + " <td>0.529599</td>\n", + " <td>0.472205</td>\n", + " <td>0.443094</td>\n", + " <td>0.441656</td>\n", + " </tr>\n", + " <tr>\n", + " <th>de_voxpopuli</th>\n", + " <td>0.347653</td>\n", + " <td>0.248060</td>\n", + " <td>0.198001</td>\n", + " <td>0.168237</td>\n", + " <td>0.205059</td>\n", + " </tr>\n", + " <tr>\n", + " <th>it_google_fleurs</th>\n", + " <td>0.364700</td>\n", + " <td>0.269092</td>\n", + " <td>0.218361</td>\n", + " <td>0.189632</td>\n", + " <td>0.189108</td>\n", + " </tr>\n", + " <tr>\n", + " <th>it_minds14</th>\n", + " <td>0.735663</td>\n", + " <td>0.597724</td>\n", + " <td>0.500377</td>\n", + " <td>0.438344</td>\n", + " <td>0.417785</td>\n", + " </tr>\n", + " <tr>\n", + " <th>it_voxpopuli</th>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pl_google_fleurs</th>\n", + " <td>0.594285</td>\n", + " <td>0.452570</td>\n", + " <td>0.318702</td>\n", + " <td>0.276475</td>\n", + " <td>0.261194</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pl_minds14</th>\n", + " <td>0.988993</td>\n", + " <td>0.853431</td>\n", + " <td>0.653693</td>\n", + " <td>0.585884</td>\n", + " <td>0.597468</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pl_voxpopuli</th>\n", + " <td>0.374544</td>\n", + " <td>0.277290</td>\n", + " <td>0.198685</td>\n", + " <td>0.164524</td>\n", + " <td>0.161887</td>\n", + " </tr>\n", + " <tr>\n", + " <th>es_google_fleurs</th>\n", + " <td>0.284499</td>\n", + " <td>0.224748</td>\n", + " <td>0.187365</td>\n", + " <td>0.189561</td>\n", + " <td>0.184028</td>\n", + " </tr>\n", + " <tr>\n", + " <th>es_minds14</th>\n", + " <td>0.880992</td>\n", + " <td>0.747677</td>\n", + " <td>0.695294</td>\n", + " <td>0.690749</td>\n", + " <td>0.697884</td>\n", + " </tr>\n", + " <tr>\n", + " <th>es_voxpopuli</th>\n", + " <td>0.252463</td>\n", + " <td>0.206225</td>\n", + " <td>0.229706</td>\n", + " <td>0.195846</td>\n", + " <td>0.231587</td>\n", + " </tr>\n", + " <tr>\n", + " <th>en_google_fleurs</th>\n", + " <td>0.295853</td>\n", + " <td>0.250928</td>\n", + " <td>0.224483</td>\n", + " <td>0.218855</td>\n", + " <td>0.218479</td>\n", + " </tr>\n", + " <tr>\n", + " <th>en_minds14</th>\n", + " <td>0.634351</td>\n", + " <td>0.623962</td>\n", + " <td>0.626942</td>\n", + " <td>0.626588</td>\n", + " <td>0.620953</td>\n", + " </tr>\n", + " <tr>\n", + " <th>en_voxpopuli</th>\n", + " <td>0.345836</td>\n", + " <td>0.319493</td>\n", + " <td>0.319060</td>\n", + " <td>0.466410</td>\n", + " <td>0.408949</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tiny base small medium large-v2\n", + "nl_google_fleurs 0.582916 0.427364 0.279190 0.229402 0.212373\n", + "nl_minds14 0.888989 0.702107 0.511865 0.440081 0.415821\n", + "nl_voxpopuli 0.451950 0.350228 0.233061 0.188461 0.208664\n", + "fr_google_fleurs 0.468415 0.338927 0.260157 0.207241 0.194587\n", + "fr_minds14 0.700735 0.619382 0.567487 0.513574 0.552826\n", + "fr_voxpopuli 0.310661 0.235596 0.180943 0.153288 0.159867\n", + "de_google_fleurs 0.449640 0.344001 0.282088 0.275634 0.264093\n", + "de_minds14 0.608813 0.529599 0.472205 0.443094 0.441656\n", + "de_voxpopuli 0.347653 0.248060 0.198001 0.168237 0.205059\n", + "it_google_fleurs 0.364700 0.269092 0.218361 0.189632 0.189108\n", + "it_minds14 0.735663 0.597724 0.500377 0.438344 0.417785\n", + "it_voxpopuli -1.000000 -1.000000 -1.000000 -1.000000 -1.000000\n", + "pl_google_fleurs 0.594285 0.452570 0.318702 0.276475 0.261194\n", + "pl_minds14 0.988993 0.853431 0.653693 0.585884 0.597468\n", + "pl_voxpopuli 0.374544 0.277290 0.198685 0.164524 0.161887\n", + "es_google_fleurs 0.284499 0.224748 0.187365 0.189561 0.184028\n", + "es_minds14 0.880992 0.747677 0.695294 0.690749 0.697884\n", + "es_voxpopuli 0.252463 0.206225 0.229706 0.195846 0.231587\n", + "en_google_fleurs 0.295853 0.250928 0.224483 0.218855 0.218479\n", + "en_minds14 0.634351 0.623962 0.626942 0.626588 0.620953\n", + "en_voxpopuli 0.345836 0.319493 0.319060 0.466410 0.408949" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(spacy_dep, columns=WHISPER_ASR_MODEL, index=FULL_DATASET_NAMES)\n", + "# DEP" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "3dbfbb6e-c369-47fd-801c-6df211943dc1", + "metadata": {}, + "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "KeyboardInterrupt\n", - "\n" - ] + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tiny</th>\n", + " <th>base</th>\n", + " <th>small</th>\n", + " <th>medium</th>\n", + " <th>large-v2</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>nl_google_fleurs</th>\n", + " <td>0.708020</td>\n", + " <td>0.535692</td>\n", + " <td>0.365346</td>\n", + " <td>0.296100</td>\n", + " <td>0.261951</td>\n", + " </tr>\n", + " <tr>\n", + " <th>nl_minds14</th>\n", + " <td>0.897447</td>\n", + " <td>0.714498</td>\n", + " <td>0.503436</td>\n", + " <td>0.419083</td>\n", + " <td>0.389125</td>\n", + " </tr>\n", + " <tr>\n", + " <th>nl_voxpopuli</th>\n", + " <td>0.645715</td>\n", + " <td>0.526939</td>\n", + " <td>0.396940</td>\n", + " <td>0.345034</td>\n", + " <td>0.358023</td>\n", + " </tr>\n", + " <tr>\n", + " <th>fr_google_fleurs</th>\n", + " <td>0.600185</td>\n", + " <td>0.470808</td>\n", + " <td>0.378478</td>\n", + " <td>0.324236</td>\n", + " <td>0.309570</td>\n", + " </tr>\n", + " <tr>\n", + " <th>fr_minds14</th>\n", + " <td>0.805977</td>\n", + " <td>0.700773</td>\n", + " <td>0.642619</td>\n", + " <td>0.583323</td>\n", + " <td>0.616411</td>\n", + " </tr>\n", + " <tr>\n", + " <th>fr_voxpopuli</th>\n", + " <td>0.510623</td>\n", + " <td>0.440340</td>\n", + " <td>0.382961</td>\n", + " <td>0.359633</td>\n", + " <td>0.365811</td>\n", + " </tr>\n", + " <tr>\n", + " <th>de_google_fleurs</th>\n", + " <td>0.651989</td>\n", + " <td>0.551766</td>\n", + " <td>0.506944</td>\n", + " <td>0.478476</td>\n", + " <td>0.469045</td>\n", + " </tr>\n", + " <tr>\n", + " <th>de_minds14</th>\n", + " <td>0.659890</td>\n", + " <td>0.554437</td>\n", + " <td>0.474513</td>\n", + " <td>0.429274</td>\n", + " <td>0.425134</td>\n", + " </tr>\n", + " <tr>\n", + " <th>de_voxpopuli</th>\n", + " <td>0.645898</td>\n", + " <td>0.558876</td>\n", + " <td>0.518976</td>\n", + " <td>0.488194</td>\n", + " <td>0.525581</td>\n", + " </tr>\n", + " <tr>\n", + " <th>it_google_fleurs</th>\n", + " <td>0.465298</td>\n", + " <td>0.355877</td>\n", + " <td>0.287491</td>\n", + " <td>0.254384</td>\n", + " <td>0.251697</td>\n", + " </tr>\n", + " <tr>\n", + " <th>it_minds14</th>\n", + " <td>0.779429</td>\n", + " <td>0.621546</td>\n", + " <td>0.502670</td>\n", + " <td>0.437805</td>\n", + " <td>0.422781</td>\n", + " </tr>\n", + " <tr>\n", + " <th>it_voxpopuli</th>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>-1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pl_google_fleurs</th>\n", + " <td>0.705909</td>\n", + " <td>0.553073</td>\n", + " <td>0.384142</td>\n", + " <td>0.318203</td>\n", + " <td>0.298247</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pl_minds14</th>\n", + " <td>1.009390</td>\n", + " <td>0.860626</td>\n", + " <td>0.633766</td>\n", + " <td>0.572826</td>\n", + " <td>0.563293</td>\n", + " </tr>\n", + " <tr>\n", + " <th>pl_voxpopuli</th>\n", + " <td>0.588464</td>\n", + " <td>0.489265</td>\n", + " <td>0.380883</td>\n", + " <td>0.345623</td>\n", + " <td>0.349896</td>\n", + " </tr>\n", + " <tr>\n", + " <th>es_google_fleurs</th>\n", + " <td>0.333658</td>\n", + " <td>0.261352</td>\n", + " <td>0.213950</td>\n", + " <td>0.206351</td>\n", + " <td>0.202078</td>\n", + " </tr>\n", + " <tr>\n", + " <th>es_minds14</th>\n", + " <td>0.884689</td>\n", + " <td>0.740604</td>\n", + " <td>0.664831</td>\n", + " <td>0.656090</td>\n", + " <td>0.650328</td>\n", + " </tr>\n", + " <tr>\n", + " <th>es_voxpopuli</th>\n", + " <td>0.347112</td>\n", + " <td>0.294192</td>\n", + " <td>0.333500</td>\n", + " <td>0.295472</td>\n", + " <td>0.353273</td>\n", + " </tr>\n", + " <tr>\n", + " <th>en_google_fleurs</th>\n", + " <td>0.348152</td>\n", + " <td>0.307207</td>\n", + " <td>0.278857</td>\n", + " <td>0.268917</td>\n", + " <td>0.270208</td>\n", + " </tr>\n", + " <tr>\n", + " <th>en_minds14</th>\n", + " <td>0.588375</td>\n", + " <td>0.571845</td>\n", + " <td>0.566381</td>\n", + " <td>0.567538</td>\n", + " <td>0.562651</td>\n", + " </tr>\n", + " <tr>\n", + " <th>en_voxpopuli</th>\n", + " <td>0.475612</td>\n", + " <td>0.451586</td>\n", + " <td>0.453132</td>\n", + " <td>0.594546</td>\n", + " <td>0.549755</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tiny base small medium large-v2\n", + "nl_google_fleurs 0.708020 0.535692 0.365346 0.296100 0.261951\n", + "nl_minds14 0.897447 0.714498 0.503436 0.419083 0.389125\n", + "nl_voxpopuli 0.645715 0.526939 0.396940 0.345034 0.358023\n", + "fr_google_fleurs 0.600185 0.470808 0.378478 0.324236 0.309570\n", + "fr_minds14 0.805977 0.700773 0.642619 0.583323 0.616411\n", + "fr_voxpopuli 0.510623 0.440340 0.382961 0.359633 0.365811\n", + "de_google_fleurs 0.651989 0.551766 0.506944 0.478476 0.469045\n", + "de_minds14 0.659890 0.554437 0.474513 0.429274 0.425134\n", + "de_voxpopuli 0.645898 0.558876 0.518976 0.488194 0.525581\n", + "it_google_fleurs 0.465298 0.355877 0.287491 0.254384 0.251697\n", + "it_minds14 0.779429 0.621546 0.502670 0.437805 0.422781\n", + "it_voxpopuli -1.000000 -1.000000 -1.000000 -1.000000 -1.000000\n", + "pl_google_fleurs 0.705909 0.553073 0.384142 0.318203 0.298247\n", + "pl_minds14 1.009390 0.860626 0.633766 0.572826 0.563293\n", + "pl_voxpopuli 0.588464 0.489265 0.380883 0.345623 0.349896\n", + "es_google_fleurs 0.333658 0.261352 0.213950 0.206351 0.202078\n", + "es_minds14 0.884689 0.740604 0.664831 0.656090 0.650328\n", + "es_voxpopuli 0.347112 0.294192 0.333500 0.295472 0.353273\n", + "en_google_fleurs 0.348152 0.307207 0.278857 0.268917 0.270208\n", + "en_minds14 0.588375 0.571845 0.566381 0.567538 0.562651\n", + "en_voxpopuli 0.475612 0.451586 0.453132 0.594546 0.549755" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "[\n", - " [\n", - " get_stats_for(dataset, PropertyHelper.ner_metrics(model, get_spacy_model_name(dataset[:2])))\n", - " for model in FULL_LANGUAGE_MODELS\n", - " ]\n", - " for dataset in FULL_DATASET_NAMES\n", - "]" + "pd.DataFrame(word_wer_classic_metrics, columns=WHISPER_ASR_MODEL, index=FULL_DATASET_NAMES)\n", + "# word_wer_classic_metrics" ] }, { "cell_type": "code", "execution_count": null, - "id": "45fd851c-644f-48e6-b711-5bd312404b8b", + "id": "77a6e273-1f5e-4a2b-9568-66e53ba99c7b", "metadata": {}, "outputs": [], "source": [] @@ -82,7 +1072,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6466877e-e744-4cb1-8d4f-f818e1d3ee7d", + "id": "629318e6-8c00-413c-99d4-2b7ff559ac3f", "metadata": {}, "outputs": [], "source": [] diff --git a/sziszapangma/integration/repository/experiment_repository.py b/sziszapangma/integration/repository/experiment_repository.py index e50666d..61ddbb3 100644 --- a/sziszapangma/integration/repository/experiment_repository.py +++ b/sziszapangma/integration/repository/experiment_repository.py @@ -1,6 +1,6 @@ """Repository to manage results of asr experiment processing.""" from abc import ABC, abstractmethod -from typing import Any, Optional, Set +from typing import Any, Optional, Set, Dict class ExperimentRepository(ABC): @@ -37,3 +37,7 @@ class ExperimentRepository(ABC): @abstractmethod def get_all_properties(self) -> Set[str]: """Methods returns all possible properties.""" + + @abstractmethod + def get_all_values_from_property(self, property_name: str) -> Dict[str, Any]: + pass diff --git a/sziszapangma/integration/repository/mongo_experiment_repository.py b/sziszapangma/integration/repository/mongo_experiment_repository.py index 6c87a1d..98c2ef3 100644 --- a/sziszapangma/integration/repository/mongo_experiment_repository.py +++ b/sziszapangma/integration/repository/mongo_experiment_repository.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Set +from typing import Any, Optional, Set, Dict from pymongo import MongoClient from pymongo.database import Database @@ -58,3 +58,6 @@ class MongoExperimentRepository(ExperimentRepository): def get_all_properties(self) -> Set[str]: return set(self._get_database().list_collection_names()) + + def get_all_values_from_property(self, property_name: str) -> Dict[str, Any]: + return {record[ID]: record[VALUE] for record in self._get_database()[property_name].find()} -- GitLab