{ "cells": [ { "cell_type": "code", "execution_count": 24, "id": "955a0385-29fb-47dc-b012-729e49570594", "metadata": {}, "outputs": [], "source": [ "from new_experiment.utils.get_spacy_model_name import *\n", "\n", "from call_experiment_stats import *\n", "\n", "from new_experiment.utils.property_helper import PropertyHelper\n", "from new_experiment.utils.get_spacy_model_name import get_spacy_model_name\n", "from new_experiment.new_dependency_provider import get_experiment_repository\n", "from new_experiment.add_to_queue_pipeline import get_hf_facebook_wav2vec2_model_by_language_code\n", "import pandas as pd\n", "import numpy as np\n", "from typing import List" ] }, { "cell_type": "code", "execution_count": 2, "id": "3f1221d3-5f70-4441-af07-58fa176e31e9", "metadata": {}, "outputs": [], "source": [ "METRICS_FILE = 'metrics.txt'" ] }, { "cell_type": "code", "execution_count": 3, "id": "eda46e65-8079-40b9-9c4e-37fe74caec45", "metadata": {}, "outputs": [ { "ename": "ServerSelectionTimeoutError", "evalue": "192.168.0.124:27017: timed out, Timeout: 30s, Topology Description: <TopologyDescription id: 63caac355a13a212d6a8209f, topology_type: Unknown, servers: [<ServerDescription ('192.168.0.124', 27017) server_type: Unknown, rtt: None, error=NetworkTimeout('192.168.0.124:27017: timed out')>]>", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mServerSelectionTimeoutError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[3], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m metric_repository \u001b[38;5;241m=\u001b[39m get_experiment_repository(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmetric_stats\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(METRICS_FILE, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m writer:\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m dataset_property \u001b[38;5;129;01min\u001b[39;00m \u001b[43mmetric_repository\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_all_properties\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 4\u001b[0m values_dict \u001b[38;5;241m=\u001b[39m metric_repository\u001b[38;5;241m.\u001b[39mget_all_values_from_property(dataset_property)\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m value_key \u001b[38;5;129;01min\u001b[39;00m values_dict\u001b[38;5;241m.\u001b[39mkeys():\n", "File \u001b[0;32m~/Desktop/WUST/asr-benchmarks/sziszapangma/integration/repository/mongo_experiment_repository.py:60\u001b[0m, in \u001b[0;36mMongoExperimentRepository.get_all_properties\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_all_properties\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Set[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[0;32m---> 60\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mset\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_database\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlist_collection_names\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n", "File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/database.py:959\u001b[0m, in \u001b[0;36mDatabase.list_collection_names\u001b[0;34m(self, session, filter, comment, **kwargs)\u001b[0m\n\u001b[1;32m 956\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mfilter\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m (\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mfilter\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mfilter\u001b[39m):\n\u001b[1;32m 957\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnameOnly\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 959\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [result[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m result \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlist_collections\u001b[49m\u001b[43m(\u001b[49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msession\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m]\n", "File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/database.py:911\u001b[0m, in \u001b[0;36mDatabase.list_collections\u001b[0;34m(self, session, filter, comment, **kwargs)\u001b[0m\n\u001b[1;32m 906\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_cmd\u001b[39m(session, server, sock_info, read_preference):\n\u001b[1;32m 907\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_list_collections(\n\u001b[1;32m 908\u001b[0m sock_info, session, read_preference\u001b[38;5;241m=\u001b[39mread_preference, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[1;32m 909\u001b[0m )\n\u001b[0;32m--> 911\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__client\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_retryable_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_cmd\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mread_pref\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msession\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/_csot.py:105\u001b[0m, in \u001b[0;36mapply.<locals>.csot_wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _TimeoutContext(timeout):\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/mongo_client.py:1441\u001b[0m, in \u001b[0;36mMongoClient._retryable_read\u001b[0;34m(self, func, read_pref, session, address, retryable)\u001b[0m\n\u001b[1;32m 1439\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m last_error\n\u001b[1;32m 1440\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1441\u001b[0m server \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_select_server\u001b[49m\u001b[43m(\u001b[49m\u001b[43mread_pref\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msession\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maddress\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maddress\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1442\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_socket_from_server(read_pref, server, session) \u001b[38;5;28;01mas\u001b[39;00m (sock_info, read_pref):\n\u001b[1;32m 1443\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m retrying \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m retryable:\n\u001b[1;32m 1444\u001b[0m \u001b[38;5;66;03m# A retry is not possible because this server does\u001b[39;00m\n\u001b[1;32m 1445\u001b[0m \u001b[38;5;66;03m# not support retryable reads, raise the last error.\u001b[39;00m\n", "File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/mongo_client.py:1257\u001b[0m, in \u001b[0;36mMongoClient._select_server\u001b[0;34m(self, server_selector, session, address)\u001b[0m\n\u001b[1;32m 1255\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m AutoReconnect(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mserver \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m no longer available\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m address)\n\u001b[1;32m 1256\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1257\u001b[0m server \u001b[38;5;241m=\u001b[39m \u001b[43mtopology\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect_server\u001b[49m\u001b[43m(\u001b[49m\u001b[43mserver_selector\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1258\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m server\n\u001b[1;32m 1259\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m PyMongoError \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 1260\u001b[0m \u001b[38;5;66;03m# Server selection errors in a transaction are transient.\u001b[39;00m\n", "File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/topology.py:272\u001b[0m, in \u001b[0;36mTopology.select_server\u001b[0;34m(self, selector, server_selection_timeout, address)\u001b[0m\n\u001b[1;32m 270\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mselect_server\u001b[39m(\u001b[38;5;28mself\u001b[39m, selector, server_selection_timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, address\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 271\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Like select_servers, but choose a random server if several match.\"\"\"\u001b[39;00m\n\u001b[0;32m--> 272\u001b[0m server \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_select_server\u001b[49m\u001b[43m(\u001b[49m\u001b[43mselector\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mserver_selection_timeout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maddress\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _csot\u001b[38;5;241m.\u001b[39mget_timeout():\n\u001b[1;32m 274\u001b[0m _csot\u001b[38;5;241m.\u001b[39mset_rtt(server\u001b[38;5;241m.\u001b[39mdescription\u001b[38;5;241m.\u001b[39mround_trip_time)\n", "File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/topology.py:261\u001b[0m, in \u001b[0;36mTopology._select_server\u001b[0;34m(self, selector, server_selection_timeout, address)\u001b[0m\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_select_server\u001b[39m(\u001b[38;5;28mself\u001b[39m, selector, server_selection_timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, address\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m--> 261\u001b[0m servers \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect_servers\u001b[49m\u001b[43m(\u001b[49m\u001b[43mselector\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mserver_selection_timeout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maddress\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 262\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(servers) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 263\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m servers[\u001b[38;5;241m0\u001b[39m]\n", "File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/topology.py:223\u001b[0m, in \u001b[0;36mTopology.select_servers\u001b[0;34m(self, selector, server_selection_timeout, address)\u001b[0m\n\u001b[1;32m 220\u001b[0m server_timeout \u001b[38;5;241m=\u001b[39m server_selection_timeout\n\u001b[1;32m 222\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lock:\n\u001b[0;32m--> 223\u001b[0m server_descriptions \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_select_servers_loop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mselector\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mserver_timeout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maddress\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 225\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_server_by_address(sd\u001b[38;5;241m.\u001b[39maddress) \u001b[38;5;28;01mfor\u001b[39;00m sd \u001b[38;5;129;01min\u001b[39;00m server_descriptions]\n", "File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/topology.py:238\u001b[0m, in \u001b[0;36mTopology._select_servers_loop\u001b[0;34m(self, selector, timeout, address)\u001b[0m\n\u001b[1;32m 235\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m server_descriptions:\n\u001b[1;32m 236\u001b[0m \u001b[38;5;66;03m# No suitable servers.\u001b[39;00m\n\u001b[1;32m 237\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m now \u001b[38;5;241m>\u001b[39m end_time:\n\u001b[0;32m--> 238\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ServerSelectionTimeoutError(\n\u001b[1;32m 239\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m, Timeout: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124ms, Topology Description: \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 240\u001b[0m \u001b[38;5;241m%\u001b[39m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_error_message(selector), timeout, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdescription)\n\u001b[1;32m 241\u001b[0m )\n\u001b[1;32m 243\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_ensure_opened()\n\u001b[1;32m 244\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request_check_all()\n", "\u001b[0;31mServerSelectionTimeoutError\u001b[0m: 192.168.0.124:27017: timed out, Timeout: 30s, Topology Description: <TopologyDescription id: 63caac355a13a212d6a8209f, topology_type: Unknown, servers: [<ServerDescription ('192.168.0.124', 27017) server_type: Unknown, rtt: None, error=NetworkTimeout('192.168.0.124:27017: timed out')>]>" ] } ], "source": [ "metric_repository = get_experiment_repository('metric_stats')\n", "with open(METRICS_FILE, 'w') as writer:\n", " for dataset_property in metric_repository.get_all_properties():\n", " values_dict = metric_repository.get_all_values_from_property(dataset_property)\n", " for value_key in values_dict.keys():\n", " line = f'{dataset_property} {value_key} {values_dict[value_key]}'\n", " writer.write(f'{line}\\n')" ] }, { "cell_type": "code", "execution_count": 25, "id": "9f5e44a6-f211-4b61-8cb4-5636c7672c6a", "metadata": {}, "outputs": [], "source": [ "COMMANDS = ['run_word_wer_classic_pipeline', 'run_word_wer_embedding_pipeline', 'run_spacy_dep_tag_wer_pipeline',\n", " 'run_spacy_ner_wer_pipeline', 'run_spacy_pos_wer_pipeline']\n", "LANGUAGES = ['nl', 'fr', 'de', 'it', 'pl', 'es', 'en']\n", "WHISPER_ASR_MODEL = ['tiny', 'base', 'small', 'medium', 'large-v2']\n", "DATASETS = ['google_fleurs', 'minds14', 'voxpopuli']\n", "FULL_DATASET_NAMES = []\n", "for itt in LANGUAGES:\n", " for it in DATASETS:\n", " FULL_DATASET_NAMES.append(f'{itt}_{it}')\n", "\n", "FULL_LANGUAGE_MODELS = [f'whisper_{it}' for it in WHISPER_ASR_MODEL] + ['facebook_wav2vec2', 'nvidia_stt']" ] }, { "cell_type": "code", "execution_count": 26, "id": "d2465ceb-7439-4fa5-adf8-e95d7e6106b9", "metadata": {}, "outputs": [], "source": [ "vals = dict()\n", "with open(METRICS_FILE, 'r') as reader:\n", " lines = reader.read().splitlines(keepends=False)\n", " for line in lines:\n", " # print(line)\n", " words = line.split()\n", " key = f'{words[0]}_{words[1]}'\n", " vals[key] = float(words[2])" ] }, { "cell_type": "code", "execution_count": 27, "id": "e41b19d0-37cb-4810-896a-fa0f73dd86e0", "metadata": {}, "outputs": [], "source": [ "def get_model_for_dataset_name(dataset: str, model: str):\n", " language_code = dataset[:2]\n", " if model.startswith('whisper'):\n", " return model\n", " elif model.startswith('facebook_wav2vec2'):\n", " return get_hf_facebook_wav2vec2_model_by_language_code(language_code)\n", " elif model.startswith('nvidia_stt'):\n", " return f'nvidia_stt_{language_code}_conformer_transducer_large'\n", " else:\n", " raise Exception('asr name not found')" ] }, { "cell_type": "code", "execution_count": 28, "id": "22d84451-b7e3-4dba-9758-068dae23ace4", "metadata": {}, "outputs": [], "source": [ "spacy_ner = [\n", " [vals.get(f'{dataset}_{PropertyHelper.ner_metrics(get_model_for_dataset_name(dataset, model), get_spacy_model_name(dataset[:2]))}', -1.0) \n", " for model in FULL_LANGUAGE_MODELS]\n", " for dataset in FULL_DATASET_NAMES\n", "]\n", "spacy_pos = [\n", " [vals.get(f'{dataset}_{PropertyHelper.pos_metrics(get_model_for_dataset_name(dataset, model), get_spacy_model_name(dataset[:2]))}', -1.0) \n", " for model in FULL_LANGUAGE_MODELS]\n", " for dataset in FULL_DATASET_NAMES\n", "]\n", "spacy_dep = [\n", " [vals.get(f'{dataset}_{PropertyHelper.dep_tag_metrics(get_model_for_dataset_name(dataset, model), get_spacy_model_name(dataset[:2]))}', -1.0) \n", " for model in FULL_LANGUAGE_MODELS]\n", " for dataset in FULL_DATASET_NAMES\n", "]\n", "word_wer_classic_metrics = [\n", " [vals.get(f'{dataset}_{PropertyHelper.word_wer_classic_metrics(get_model_for_dataset_name(dataset, model))}', -1.0) for model in FULL_LANGUAGE_MODELS]\n", " for dataset in FULL_DATASET_NAMES\n", "]\n", "word_wer_soft_metrics = [\n", " [vals.get(f'{dataset}_{PropertyHelper.word_wer_soft_metrics(get_model_for_dataset_name(dataset, model))}', -1.0) for model in FULL_LANGUAGE_MODELS]\n", " for dataset in FULL_DATASET_NAMES\n", "]\n", "word_wer_embedding_metrics = [\n", " [vals.get(f'{dataset}_{PropertyHelper.word_wer_embeddings_metrics(get_model_for_dataset_name(dataset, model))}', -1.0) for model in FULL_LANGUAGE_MODELS]\n", " for dataset in FULL_DATASET_NAMES\n", "]\n", "flair_pos = [\n", " [vals.get(f'{dataset}_{PropertyHelper.pos_metrics(get_model_for_dataset_name(dataset, model), \"flair_upos_multi\")}', -1.0) for model in FULL_LANGUAGE_MODELS]\n", " for dataset in FULL_DATASET_NAMES\n", "]\n", "wikineural_ner = [\n", " [vals.get(f'{dataset}_{PropertyHelper.ner_metrics(get_model_for_dataset_name(dataset, model), \"wikineural\")}', -1.0) for model in FULL_LANGUAGE_MODELS]\n", " for dataset in FULL_DATASET_NAMES\n", "]" ] }, { "cell_type": "code", "execution_count": 32, "id": "0d9a4977-edce-4c8e-aebe-b76781901512", "metadata": {}, "outputs": [], "source": [ "def df_to_latex(df: pd.DataFrame, name: str) -> None:\n", " with pd.option_context(\"max_colwidth\", 1000):\n", " with open(name, 'w') as writer:\n", " writer.write(spacy_ner_df.to_latex())\n", "\n", "\n", "def summarize_df(arr: List[List[float]], name: str) -> pd.DataFrame:\n", " spacy_ner_df = pd.DataFrame(arr, columns=FULL_LANGUAGE_MODELS, index=FULL_DATASET_NAMES)\n", " spacy_ner_df.to_csv(f'results/{name}.csv')\n", " df_to_latex(spacy_ner, f'results/{name}.tex')\n", " return spacy_ner_df" ] }, { "cell_type": "code", "execution_count": 34, "id": "45fd851c-644f-48e6-b711-5bd312404b8b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/t8/4j9s5lbj1cbbn0xj92r0g31c0000gn/T/ipykernel_59977/2461695209.py:4: FutureWarning: In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.\n", " writer.write(spacy_ner_df.to_latex())\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>whisper_tiny</th>\n", " <th>whisper_base</th>\n", " <th>whisper_small</th>\n", " <th>whisper_medium</th>\n", " <th>whisper_large-v2</th>\n", " <th>facebook_wav2vec2</th>\n", " <th>nvidia_stt</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>nl_google_fleurs</th>\n", " <td>0.316124</td>\n", " <td>0.230845</td>\n", " <td>0.186936</td>\n", " <td>0.170150</td>\n", " <td>0.165057</td>\n", " <td>0.082781</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_minds14</th>\n", " <td>0.463084</td>\n", " <td>0.409993</td>\n", " <td>0.360934</td>\n", " <td>0.331613</td>\n", " <td>0.324172</td>\n", " <td>0.142155</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_voxpopuli</th>\n", " <td>0.215158</td>\n", " <td>0.178716</td>\n", " <td>0.132960</td>\n", " <td>0.118042</td>\n", " <td>0.139958</td>\n", " <td>0.200403</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>fr_google_fleurs</th>\n", " <td>0.264291</td>\n", " <td>0.193436</td>\n", " <td>0.177302</td>\n", " <td>0.147464</td>\n", " <td>0.141276</td>\n", " <td>0.083170</td>\n", " <td>0.053155</td>\n", " </tr>\n", " <tr>\n", " <th>fr_minds14</th>\n", " <td>0.466860</td>\n", " <td>0.468822</td>\n", " <td>0.471754</td>\n", " <td>0.444854</td>\n", " <td>0.485090</td>\n", " <td>0.220358</td>\n", " <td>0.189111</td>\n", " </tr>\n", " <tr>\n", " <th>fr_voxpopuli</th>\n", " <td>0.161386</td>\n", " <td>0.131144</td>\n", " <td>0.113097</td>\n", " <td>0.099114</td>\n", " <td>0.111776</td>\n", " <td>0.169564</td>\n", " <td>0.127958</td>\n", " </tr>\n", " <tr>\n", " <th>de_google_fleurs</th>\n", " <td>0.316175</td>\n", " <td>0.257454</td>\n", " <td>0.234163</td>\n", " <td>0.239750</td>\n", " <td>0.236715</td>\n", " <td>0.083423</td>\n", " <td>0.051673</td>\n", " </tr>\n", " <tr>\n", " <th>de_minds14</th>\n", " <td>0.435681</td>\n", " <td>0.425712</td>\n", " <td>0.412896</td>\n", " <td>0.398617</td>\n", " <td>0.398762</td>\n", " <td>0.183933</td>\n", " <td>0.146988</td>\n", " </tr>\n", " <tr>\n", " <th>de_voxpopuli</th>\n", " <td>0.200245</td>\n", " <td>0.155502</td>\n", " <td>0.133251</td>\n", " <td>0.116949</td>\n", " <td>0.156371</td>\n", " <td>0.242498</td>\n", " <td>0.168854</td>\n", " </tr>\n", " <tr>\n", " <th>it_google_fleurs</th>\n", " <td>0.206301</td>\n", " <td>0.172527</td>\n", " <td>0.161195</td>\n", " <td>0.156655</td>\n", " <td>0.160677</td>\n", " <td>0.067181</td>\n", " <td>0.039040</td>\n", " </tr>\n", " <tr>\n", " <th>it_minds14</th>\n", " <td>0.487493</td>\n", " <td>0.448874</td>\n", " <td>0.432679</td>\n", " <td>0.416035</td>\n", " <td>0.392705</td>\n", " <td>0.198809</td>\n", " <td>0.146235</td>\n", " </tr>\n", " <tr>\n", " <th>it_voxpopuli</th>\n", " <td>0.160365</td>\n", " <td>0.139461</td>\n", " <td>0.138966</td>\n", " <td>0.123130</td>\n", " <td>0.130691</td>\n", " <td>-1.000000</td>\n", " <td>0.153960</td>\n", " </tr>\n", " <tr>\n", " <th>pl_google_fleurs</th>\n", " <td>0.334936</td>\n", " <td>0.273025</td>\n", " <td>0.227662</td>\n", " <td>0.210962</td>\n", " <td>0.209027</td>\n", " <td>0.088157</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_minds14</th>\n", " <td>0.657194</td>\n", " <td>0.591588</td>\n", " <td>0.487344</td>\n", " <td>0.474013</td>\n", " <td>0.487891</td>\n", " <td>0.237692</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_voxpopuli</th>\n", " <td>0.203548</td>\n", " <td>0.158526</td>\n", " <td>0.126280</td>\n", " <td>0.110784</td>\n", " <td>0.117780</td>\n", " <td>0.184368</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>es_google_fleurs</th>\n", " <td>0.187607</td>\n", " <td>0.159873</td>\n", " <td>0.147104</td>\n", " <td>0.155210</td>\n", " <td>0.154657</td>\n", " <td>0.057830</td>\n", " <td>0.038903</td>\n", " </tr>\n", " <tr>\n", " <th>es_minds14</th>\n", " <td>0.721295</td>\n", " <td>0.670363</td>\n", " <td>0.666278</td>\n", " <td>0.673058</td>\n", " <td>0.680341</td>\n", " <td>0.411927</td>\n", " <td>0.342895</td>\n", " </tr>\n", " <tr>\n", " <th>es_voxpopuli</th>\n", " <td>0.133805</td>\n", " <td>0.116222</td>\n", " <td>0.119882</td>\n", " <td>0.106610</td>\n", " <td>0.122036</td>\n", " <td>0.148225</td>\n", " <td>0.128456</td>\n", " </tr>\n", " <tr>\n", " <th>en_google_fleurs</th>\n", " <td>0.217843</td>\n", " <td>0.188810</td>\n", " <td>0.186407</td>\n", " <td>0.183656</td>\n", " <td>0.184568</td>\n", " <td>0.180523</td>\n", " <td>0.071421</td>\n", " </tr>\n", " <tr>\n", " <th>en_minds14</th>\n", " <td>0.562068</td>\n", " <td>0.566999</td>\n", " <td>0.580369</td>\n", " <td>0.583945</td>\n", " <td>0.578079</td>\n", " <td>0.325304</td>\n", " <td>0.293083</td>\n", " </tr>\n", " <tr>\n", " <th>en_voxpopuli</th>\n", " <td>0.224980</td>\n", " <td>0.203959</td>\n", " <td>0.210278</td>\n", " <td>0.322688</td>\n", " <td>0.280877</td>\n", " <td>0.182708</td>\n", " <td>0.124416</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " whisper_tiny whisper_base whisper_small whisper_medium \\\n", "nl_google_fleurs 0.316124 0.230845 0.186936 0.170150 \n", "nl_minds14 0.463084 0.409993 0.360934 0.331613 \n", "nl_voxpopuli 0.215158 0.178716 0.132960 0.118042 \n", "fr_google_fleurs 0.264291 0.193436 0.177302 0.147464 \n", "fr_minds14 0.466860 0.468822 0.471754 0.444854 \n", "fr_voxpopuli 0.161386 0.131144 0.113097 0.099114 \n", "de_google_fleurs 0.316175 0.257454 0.234163 0.239750 \n", "de_minds14 0.435681 0.425712 0.412896 0.398617 \n", "de_voxpopuli 0.200245 0.155502 0.133251 0.116949 \n", "it_google_fleurs 0.206301 0.172527 0.161195 0.156655 \n", "it_minds14 0.487493 0.448874 0.432679 0.416035 \n", "it_voxpopuli 0.160365 0.139461 0.138966 0.123130 \n", "pl_google_fleurs 0.334936 0.273025 0.227662 0.210962 \n", "pl_minds14 0.657194 0.591588 0.487344 0.474013 \n", "pl_voxpopuli 0.203548 0.158526 0.126280 0.110784 \n", "es_google_fleurs 0.187607 0.159873 0.147104 0.155210 \n", "es_minds14 0.721295 0.670363 0.666278 0.673058 \n", "es_voxpopuli 0.133805 0.116222 0.119882 0.106610 \n", "en_google_fleurs 0.217843 0.188810 0.186407 0.183656 \n", "en_minds14 0.562068 0.566999 0.580369 0.583945 \n", "en_voxpopuli 0.224980 0.203959 0.210278 0.322688 \n", "\n", " whisper_large-v2 facebook_wav2vec2 nvidia_stt \n", "nl_google_fleurs 0.165057 0.082781 -1.000000 \n", "nl_minds14 0.324172 0.142155 -1.000000 \n", "nl_voxpopuli 0.139958 0.200403 -1.000000 \n", "fr_google_fleurs 0.141276 0.083170 0.053155 \n", "fr_minds14 0.485090 0.220358 0.189111 \n", "fr_voxpopuli 0.111776 0.169564 0.127958 \n", "de_google_fleurs 0.236715 0.083423 0.051673 \n", "de_minds14 0.398762 0.183933 0.146988 \n", "de_voxpopuli 0.156371 0.242498 0.168854 \n", "it_google_fleurs 0.160677 0.067181 0.039040 \n", "it_minds14 0.392705 0.198809 0.146235 \n", "it_voxpopuli 0.130691 -1.000000 0.153960 \n", "pl_google_fleurs 0.209027 0.088157 -1.000000 \n", "pl_minds14 0.487891 0.237692 -1.000000 \n", "pl_voxpopuli 0.117780 0.184368 -1.000000 \n", "es_google_fleurs 0.154657 0.057830 0.038903 \n", "es_minds14 0.680341 0.411927 0.342895 \n", "es_voxpopuli 0.122036 0.148225 0.128456 \n", "en_google_fleurs 0.184568 0.180523 0.071421 \n", "en_minds14 0.578079 0.325304 0.293083 \n", "en_voxpopuli 0.280877 0.182708 0.124416 " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summarize_df(spacy_ner, 'spacy_ner')" ] }, { "cell_type": "code", "execution_count": 20, "id": "6466877e-e744-4cb1-8d4f-f818e1d3ee7d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>whisper_tiny</th>\n", " <th>whisper_base</th>\n", " <th>whisper_small</th>\n", " <th>whisper_medium</th>\n", " <th>whisper_large-v2</th>\n", " <th>facebook_wav2vec2</th>\n", " <th>nvidia_stt</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>nl_google_fleurs</th>\n", " <td>0.582916</td>\n", " <td>0.427364</td>\n", " <td>0.279190</td>\n", " <td>0.229402</td>\n", " <td>0.212373</td>\n", " <td>0.160957</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_minds14</th>\n", " <td>0.888989</td>\n", " <td>0.702107</td>\n", " <td>0.511865</td>\n", " <td>0.440081</td>\n", " <td>0.415821</td>\n", " <td>0.298583</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_voxpopuli</th>\n", " <td>0.451950</td>\n", " <td>0.350228</td>\n", " <td>0.233061</td>\n", " <td>0.188461</td>\n", " <td>0.208664</td>\n", " <td>0.340656</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>fr_google_fleurs</th>\n", " <td>0.468415</td>\n", " <td>0.338927</td>\n", " <td>0.260157</td>\n", " <td>0.207241</td>\n", " <td>0.194587</td>\n", " <td>0.141560</td>\n", " <td>0.073667</td>\n", " </tr>\n", " <tr>\n", " <th>fr_minds14</th>\n", " <td>0.700735</td>\n", " <td>0.619382</td>\n", " <td>0.567487</td>\n", " <td>0.513574</td>\n", " <td>0.552826</td>\n", " <td>0.336656</td>\n", " <td>0.236770</td>\n", " </tr>\n", " <tr>\n", " <th>fr_voxpopuli</th>\n", " <td>0.310661</td>\n", " <td>0.235596</td>\n", " <td>0.180943</td>\n", " <td>0.153288</td>\n", " <td>0.159867</td>\n", " <td>0.245229</td>\n", " <td>0.164607</td>\n", " </tr>\n", " <tr>\n", " <th>de_google_fleurs</th>\n", " <td>0.449640</td>\n", " <td>0.344001</td>\n", " <td>0.282088</td>\n", " <td>0.275634</td>\n", " <td>0.264093</td>\n", " <td>0.094206</td>\n", " <td>0.053148</td>\n", " </tr>\n", " <tr>\n", " <th>de_minds14</th>\n", " <td>0.608813</td>\n", " <td>0.529599</td>\n", " <td>0.472205</td>\n", " <td>0.443094</td>\n", " <td>0.441656</td>\n", " <td>0.228980</td>\n", " <td>0.157855</td>\n", " </tr>\n", " <tr>\n", " <th>de_voxpopuli</th>\n", " <td>0.347653</td>\n", " <td>0.248060</td>\n", " <td>0.198001</td>\n", " <td>0.168237</td>\n", " <td>0.205059</td>\n", " <td>0.313704</td>\n", " <td>0.203633</td>\n", " </tr>\n", " <tr>\n", " <th>it_google_fleurs</th>\n", " <td>0.364700</td>\n", " <td>0.269092</td>\n", " <td>0.218361</td>\n", " <td>0.189632</td>\n", " <td>0.189108</td>\n", " <td>0.115212</td>\n", " <td>0.057875</td>\n", " </tr>\n", " <tr>\n", " <th>it_minds14</th>\n", " <td>0.735663</td>\n", " <td>0.597724</td>\n", " <td>0.500377</td>\n", " <td>0.438344</td>\n", " <td>0.417785</td>\n", " <td>0.285531</td>\n", " <td>0.153250</td>\n", " </tr>\n", " <tr>\n", " <th>it_voxpopuli</th>\n", " <td>0.401738</td>\n", " <td>0.332257</td>\n", " <td>0.278988</td>\n", " <td>0.245468</td>\n", " <td>0.247638</td>\n", " <td>-1.000000</td>\n", " <td>0.236106</td>\n", " </tr>\n", " <tr>\n", " <th>pl_google_fleurs</th>\n", " <td>0.594285</td>\n", " <td>0.452570</td>\n", " <td>0.318702</td>\n", " <td>0.276475</td>\n", " <td>0.261194</td>\n", " <td>0.184994</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_minds14</th>\n", " <td>0.988993</td>\n", " <td>0.853431</td>\n", " <td>0.653693</td>\n", " <td>0.585884</td>\n", " <td>0.597468</td>\n", " <td>0.454939</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_voxpopuli</th>\n", " <td>0.374544</td>\n", " <td>0.277290</td>\n", " <td>0.198685</td>\n", " <td>0.164524</td>\n", " <td>0.161887</td>\n", " <td>0.309752</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>es_google_fleurs</th>\n", " <td>0.284499</td>\n", " <td>0.224748</td>\n", " <td>0.187365</td>\n", " <td>0.189561</td>\n", " <td>0.184028</td>\n", " <td>0.096476</td>\n", " <td>0.051401</td>\n", " </tr>\n", " <tr>\n", " <th>es_minds14</th>\n", " <td>0.880992</td>\n", " <td>0.747677</td>\n", " <td>0.695294</td>\n", " <td>0.690749</td>\n", " <td>0.697884</td>\n", " <td>0.508818</td>\n", " <td>0.384215</td>\n", " </tr>\n", " <tr>\n", " <th>es_voxpopuli</th>\n", " <td>0.252463</td>\n", " <td>0.206225</td>\n", " <td>0.229706</td>\n", " <td>0.195846</td>\n", " <td>0.231587</td>\n", " <td>0.230351</td>\n", " <td>0.173987</td>\n", " </tr>\n", " <tr>\n", " <th>en_google_fleurs</th>\n", " <td>0.295853</td>\n", " <td>0.250928</td>\n", " <td>0.224483</td>\n", " <td>0.218855</td>\n", " <td>0.218479</td>\n", " <td>0.367414</td>\n", " <td>0.078904</td>\n", " </tr>\n", " <tr>\n", " <th>en_minds14</th>\n", " <td>0.634351</td>\n", " <td>0.623962</td>\n", " <td>0.626942</td>\n", " <td>0.626588</td>\n", " <td>0.620953</td>\n", " <td>0.584547</td>\n", " <td>0.329282</td>\n", " </tr>\n", " <tr>\n", " <th>en_voxpopuli</th>\n", " <td>0.345836</td>\n", " <td>0.319493</td>\n", " <td>0.319060</td>\n", " <td>0.466410</td>\n", " <td>0.408949</td>\n", " <td>0.377100</td>\n", " <td>0.160883</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " whisper_tiny whisper_base whisper_small whisper_medium \\\n", "nl_google_fleurs 0.582916 0.427364 0.279190 0.229402 \n", "nl_minds14 0.888989 0.702107 0.511865 0.440081 \n", "nl_voxpopuli 0.451950 0.350228 0.233061 0.188461 \n", "fr_google_fleurs 0.468415 0.338927 0.260157 0.207241 \n", "fr_minds14 0.700735 0.619382 0.567487 0.513574 \n", "fr_voxpopuli 0.310661 0.235596 0.180943 0.153288 \n", "de_google_fleurs 0.449640 0.344001 0.282088 0.275634 \n", "de_minds14 0.608813 0.529599 0.472205 0.443094 \n", "de_voxpopuli 0.347653 0.248060 0.198001 0.168237 \n", "it_google_fleurs 0.364700 0.269092 0.218361 0.189632 \n", "it_minds14 0.735663 0.597724 0.500377 0.438344 \n", "it_voxpopuli 0.401738 0.332257 0.278988 0.245468 \n", "pl_google_fleurs 0.594285 0.452570 0.318702 0.276475 \n", "pl_minds14 0.988993 0.853431 0.653693 0.585884 \n", "pl_voxpopuli 0.374544 0.277290 0.198685 0.164524 \n", "es_google_fleurs 0.284499 0.224748 0.187365 0.189561 \n", "es_minds14 0.880992 0.747677 0.695294 0.690749 \n", "es_voxpopuli 0.252463 0.206225 0.229706 0.195846 \n", "en_google_fleurs 0.295853 0.250928 0.224483 0.218855 \n", "en_minds14 0.634351 0.623962 0.626942 0.626588 \n", "en_voxpopuli 0.345836 0.319493 0.319060 0.466410 \n", "\n", " whisper_large-v2 facebook_wav2vec2 nvidia_stt \n", "nl_google_fleurs 0.212373 0.160957 -1.000000 \n", "nl_minds14 0.415821 0.298583 -1.000000 \n", "nl_voxpopuli 0.208664 0.340656 -1.000000 \n", "fr_google_fleurs 0.194587 0.141560 0.073667 \n", "fr_minds14 0.552826 0.336656 0.236770 \n", "fr_voxpopuli 0.159867 0.245229 0.164607 \n", "de_google_fleurs 0.264093 0.094206 0.053148 \n", "de_minds14 0.441656 0.228980 0.157855 \n", "de_voxpopuli 0.205059 0.313704 0.203633 \n", "it_google_fleurs 0.189108 0.115212 0.057875 \n", "it_minds14 0.417785 0.285531 0.153250 \n", "it_voxpopuli 0.247638 -1.000000 0.236106 \n", "pl_google_fleurs 0.261194 0.184994 -1.000000 \n", "pl_minds14 0.597468 0.454939 -1.000000 \n", "pl_voxpopuli 0.161887 0.309752 -1.000000 \n", "es_google_fleurs 0.184028 0.096476 0.051401 \n", "es_minds14 0.697884 0.508818 0.384215 \n", "es_voxpopuli 0.231587 0.230351 0.173987 \n", "en_google_fleurs 0.218479 0.367414 0.078904 \n", "en_minds14 0.620953 0.584547 0.329282 \n", "en_voxpopuli 0.408949 0.377100 0.160883 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summarize_df(spacy_ner, 'spacy_ner')\n", "\n", "spacy_pos_df = pd.DataFrame(spacy_pos, columns=FULL_LANGUAGE_MODELS, index=FULL_DATASET_NAMES)\n", "spacy_pos_df.to_csv('results/spacy_pos.csv')\n", "spacy_pos_df" ] }, { "cell_type": "code", "execution_count": 21, "id": "77567361-b730-49f0-ab68-19ad335df1b1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>whisper_tiny</th>\n", " <th>whisper_base</th>\n", " <th>whisper_small</th>\n", " <th>whisper_medium</th>\n", " <th>whisper_large-v2</th>\n", " <th>facebook_wav2vec2</th>\n", " <th>nvidia_stt</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>nl_google_fleurs</th>\n", " <td>0.699699</td>\n", " <td>0.533595</td>\n", " <td>0.366764</td>\n", " <td>0.300730</td>\n", " <td>0.282070</td>\n", " <td>0.246416</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_minds14</th>\n", " <td>0.941359</td>\n", " <td>0.778265</td>\n", " <td>0.584732</td>\n", " <td>0.511929</td>\n", " <td>0.490065</td>\n", " <td>0.376911</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_voxpopuli</th>\n", " <td>0.553280</td>\n", " <td>0.435277</td>\n", " <td>0.304322</td>\n", " <td>0.252270</td>\n", " <td>0.268306</td>\n", " <td>0.430234</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>fr_google_fleurs</th>\n", " <td>0.580527</td>\n", " <td>0.429523</td>\n", " <td>0.337506</td>\n", " <td>0.275466</td>\n", " <td>0.259405</td>\n", " <td>0.205104</td>\n", " <td>0.114100</td>\n", " </tr>\n", " <tr>\n", " <th>fr_minds14</th>\n", " <td>0.800999</td>\n", " <td>0.714124</td>\n", " <td>0.647957</td>\n", " <td>0.592392</td>\n", " <td>0.613262</td>\n", " <td>0.421050</td>\n", " <td>0.284212</td>\n", " </tr>\n", " <tr>\n", " <th>fr_voxpopuli</th>\n", " <td>0.387866</td>\n", " <td>0.307476</td>\n", " <td>0.240038</td>\n", " <td>0.205174</td>\n", " <td>0.210248</td>\n", " <td>0.323655</td>\n", " <td>0.232059</td>\n", " </tr>\n", " <tr>\n", " <th>de_google_fleurs</th>\n", " <td>0.519535</td>\n", " <td>0.424735</td>\n", " <td>0.360695</td>\n", " <td>0.353459</td>\n", " <td>0.345089</td>\n", " <td>0.139605</td>\n", " <td>0.074235</td>\n", " </tr>\n", " <tr>\n", " <th>de_minds14</th>\n", " <td>0.693370</td>\n", " <td>0.628170</td>\n", " <td>0.570571</td>\n", " <td>0.543742</td>\n", " <td>0.546479</td>\n", " <td>0.288109</td>\n", " <td>0.216011</td>\n", " </tr>\n", " <tr>\n", " <th>de_voxpopuli</th>\n", " <td>0.396771</td>\n", " <td>0.298134</td>\n", " <td>0.236937</td>\n", " <td>0.204998</td>\n", " <td>0.241773</td>\n", " <td>0.385364</td>\n", " <td>0.271072</td>\n", " </tr>\n", " <tr>\n", " <th>it_google_fleurs</th>\n", " <td>0.453637</td>\n", " <td>0.334587</td>\n", " <td>0.269876</td>\n", " <td>0.234494</td>\n", " <td>0.232862</td>\n", " <td>0.168723</td>\n", " <td>0.089945</td>\n", " </tr>\n", " <tr>\n", " <th>it_minds14</th>\n", " <td>0.814580</td>\n", " <td>0.681371</td>\n", " <td>0.576940</td>\n", " <td>0.511340</td>\n", " <td>0.495661</td>\n", " <td>0.376479</td>\n", " <td>0.224318</td>\n", " </tr>\n", " <tr>\n", " <th>it_voxpopuli</th>\n", " <td>0.483728</td>\n", " <td>0.401518</td>\n", " <td>0.332556</td>\n", " <td>0.290310</td>\n", " <td>0.291917</td>\n", " <td>-1.000000</td>\n", " <td>0.288211</td>\n", " </tr>\n", " <tr>\n", " <th>pl_google_fleurs</th>\n", " <td>0.741445</td>\n", " <td>0.580439</td>\n", " <td>0.420468</td>\n", " <td>0.365168</td>\n", " <td>0.348206</td>\n", " <td>0.303350</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_minds14</th>\n", " <td>1.138465</td>\n", " <td>0.999350</td>\n", " <td>0.817470</td>\n", " <td>0.738430</td>\n", " <td>0.754548</td>\n", " <td>0.587577</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_voxpopuli</th>\n", " <td>0.479609</td>\n", " <td>0.366738</td>\n", " <td>0.257558</td>\n", " <td>0.210752</td>\n", " <td>0.201585</td>\n", " <td>0.422140</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>es_google_fleurs</th>\n", " <td>0.341816</td>\n", " <td>0.278543</td>\n", " <td>0.226821</td>\n", " <td>0.227239</td>\n", " <td>0.220248</td>\n", " <td>0.135718</td>\n", " <td>0.069997</td>\n", " </tr>\n", " <tr>\n", " <th>es_minds14</th>\n", " <td>0.992263</td>\n", " <td>0.828084</td>\n", " <td>0.799141</td>\n", " <td>0.791115</td>\n", " <td>0.799426</td>\n", " <td>0.591663</td>\n", " <td>0.435506</td>\n", " </tr>\n", " <tr>\n", " <th>es_voxpopuli</th>\n", " <td>0.304887</td>\n", " <td>0.249827</td>\n", " <td>0.277536</td>\n", " <td>0.240640</td>\n", " <td>0.280930</td>\n", " <td>0.276648</td>\n", " <td>0.210668</td>\n", " </tr>\n", " <tr>\n", " <th>en_google_fleurs</th>\n", " <td>0.341285</td>\n", " <td>0.285416</td>\n", " <td>0.262014</td>\n", " <td>0.249445</td>\n", " <td>0.251211</td>\n", " <td>0.398297</td>\n", " <td>0.099033</td>\n", " </tr>\n", " <tr>\n", " <th>en_minds14</th>\n", " <td>0.681148</td>\n", " <td>0.666131</td>\n", " <td>0.669723</td>\n", " <td>0.669332</td>\n", " <td>0.661842</td>\n", " <td>0.627539</td>\n", " <td>0.361619</td>\n", " </tr>\n", " <tr>\n", " <th>en_voxpopuli</th>\n", " <td>0.364437</td>\n", " <td>0.335141</td>\n", " <td>0.333144</td>\n", " <td>0.481083</td>\n", " <td>0.419667</td>\n", " <td>0.402100</td>\n", " <td>0.170951</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " whisper_tiny whisper_base whisper_small whisper_medium \\\n", "nl_google_fleurs 0.699699 0.533595 0.366764 0.300730 \n", "nl_minds14 0.941359 0.778265 0.584732 0.511929 \n", "nl_voxpopuli 0.553280 0.435277 0.304322 0.252270 \n", "fr_google_fleurs 0.580527 0.429523 0.337506 0.275466 \n", "fr_minds14 0.800999 0.714124 0.647957 0.592392 \n", "fr_voxpopuli 0.387866 0.307476 0.240038 0.205174 \n", "de_google_fleurs 0.519535 0.424735 0.360695 0.353459 \n", "de_minds14 0.693370 0.628170 0.570571 0.543742 \n", "de_voxpopuli 0.396771 0.298134 0.236937 0.204998 \n", "it_google_fleurs 0.453637 0.334587 0.269876 0.234494 \n", "it_minds14 0.814580 0.681371 0.576940 0.511340 \n", "it_voxpopuli 0.483728 0.401518 0.332556 0.290310 \n", "pl_google_fleurs 0.741445 0.580439 0.420468 0.365168 \n", "pl_minds14 1.138465 0.999350 0.817470 0.738430 \n", "pl_voxpopuli 0.479609 0.366738 0.257558 0.210752 \n", "es_google_fleurs 0.341816 0.278543 0.226821 0.227239 \n", "es_minds14 0.992263 0.828084 0.799141 0.791115 \n", "es_voxpopuli 0.304887 0.249827 0.277536 0.240640 \n", "en_google_fleurs 0.341285 0.285416 0.262014 0.249445 \n", "en_minds14 0.681148 0.666131 0.669723 0.669332 \n", "en_voxpopuli 0.364437 0.335141 0.333144 0.481083 \n", "\n", " whisper_large-v2 facebook_wav2vec2 nvidia_stt \n", "nl_google_fleurs 0.282070 0.246416 -1.000000 \n", "nl_minds14 0.490065 0.376911 -1.000000 \n", "nl_voxpopuli 0.268306 0.430234 -1.000000 \n", "fr_google_fleurs 0.259405 0.205104 0.114100 \n", "fr_minds14 0.613262 0.421050 0.284212 \n", "fr_voxpopuli 0.210248 0.323655 0.232059 \n", "de_google_fleurs 0.345089 0.139605 0.074235 \n", "de_minds14 0.546479 0.288109 0.216011 \n", "de_voxpopuli 0.241773 0.385364 0.271072 \n", "it_google_fleurs 0.232862 0.168723 0.089945 \n", "it_minds14 0.495661 0.376479 0.224318 \n", "it_voxpopuli 0.291917 -1.000000 0.288211 \n", "pl_google_fleurs 0.348206 0.303350 -1.000000 \n", "pl_minds14 0.754548 0.587577 -1.000000 \n", "pl_voxpopuli 0.201585 0.422140 -1.000000 \n", "es_google_fleurs 0.220248 0.135718 0.069997 \n", "es_minds14 0.799426 0.591663 0.435506 \n", "es_voxpopuli 0.280930 0.276648 0.210668 \n", "en_google_fleurs 0.251211 0.398297 0.099033 \n", "en_minds14 0.661842 0.627539 0.361619 \n", "en_voxpopuli 0.419667 0.402100 0.170951 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spacy_dep_df = pd.DataFrame(spacy_dep, columns=FULL_LANGUAGE_MODELS, index=FULL_DATASET_NAMES)\n", "spacy_dep_df.to_csv('results/spacy_dep.csv')\n", "spacy_dep_df\n", "\n", "summarize_df(spacy_ner, 'spacy_ner')" ] }, { "cell_type": "code", "execution_count": 22, "id": "3dbfbb6e-c369-47fd-801c-6df211943dc1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>whisper_tiny</th>\n", " <th>whisper_base</th>\n", " <th>whisper_small</th>\n", " <th>whisper_medium</th>\n", " <th>whisper_large-v2</th>\n", " <th>facebook_wav2vec2</th>\n", " <th>nvidia_stt</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>nl_google_fleurs</th>\n", " <td>0.708020</td>\n", " <td>0.535692</td>\n", " <td>0.365346</td>\n", " <td>0.296100</td>\n", " <td>0.261951</td>\n", " <td>0.273752</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_minds14</th>\n", " <td>0.897447</td>\n", " <td>0.714498</td>\n", " <td>0.503436</td>\n", " <td>0.419083</td>\n", " <td>0.389125</td>\n", " <td>0.465494</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_voxpopuli</th>\n", " <td>0.645715</td>\n", " <td>0.526939</td>\n", " <td>0.396940</td>\n", " <td>0.345034</td>\n", " <td>0.358023</td>\n", " <td>0.380835</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>fr_google_fleurs</th>\n", " <td>0.600185</td>\n", " <td>0.470808</td>\n", " <td>0.378478</td>\n", " <td>0.324236</td>\n", " <td>0.309570</td>\n", " <td>0.305183</td>\n", " <td>0.206433</td>\n", " </tr>\n", " <tr>\n", " <th>fr_minds14</th>\n", " <td>0.805977</td>\n", " <td>0.700773</td>\n", " <td>0.642619</td>\n", " <td>0.583323</td>\n", " <td>0.616411</td>\n", " <td>0.564885</td>\n", " <td>0.441154</td>\n", " </tr>\n", " <tr>\n", " <th>fr_voxpopuli</th>\n", " <td>0.510623</td>\n", " <td>0.440340</td>\n", " <td>0.382961</td>\n", " <td>0.359633</td>\n", " <td>0.365811</td>\n", " <td>0.323351</td>\n", " <td>0.187074</td>\n", " </tr>\n", " <tr>\n", " <th>de_google_fleurs</th>\n", " <td>0.651989</td>\n", " <td>0.551766</td>\n", " <td>0.506944</td>\n", " <td>0.478476</td>\n", " <td>0.469045</td>\n", " <td>0.182395</td>\n", " <td>0.072162</td>\n", " </tr>\n", " <tr>\n", " <th>de_minds14</th>\n", " <td>0.659890</td>\n", " <td>0.554437</td>\n", " <td>0.474513</td>\n", " <td>0.429274</td>\n", " <td>0.425134</td>\n", " <td>0.437369</td>\n", " <td>0.357848</td>\n", " </tr>\n", " <tr>\n", " <th>de_voxpopuli</th>\n", " <td>0.645898</td>\n", " <td>0.558876</td>\n", " <td>0.518976</td>\n", " <td>0.488194</td>\n", " <td>0.525581</td>\n", " <td>0.292203</td>\n", " <td>0.088256</td>\n", " </tr>\n", " <tr>\n", " <th>it_google_fleurs</th>\n", " <td>0.465298</td>\n", " <td>0.355877</td>\n", " <td>0.287491</td>\n", " <td>0.254384</td>\n", " <td>0.251697</td>\n", " <td>0.218689</td>\n", " <td>0.140564</td>\n", " </tr>\n", " <tr>\n", " <th>it_minds14</th>\n", " <td>0.779429</td>\n", " <td>0.621546</td>\n", " <td>0.502670</td>\n", " <td>0.437805</td>\n", " <td>0.422781</td>\n", " <td>0.429940</td>\n", " <td>0.276002</td>\n", " </tr>\n", " <tr>\n", " <th>it_voxpopuli</th>\n", " <td>0.562729</td>\n", " <td>0.477854</td>\n", " <td>0.420387</td>\n", " <td>0.388904</td>\n", " <td>0.393964</td>\n", " <td>-1.000000</td>\n", " <td>0.233076</td>\n", " </tr>\n", " <tr>\n", " <th>pl_google_fleurs</th>\n", " <td>0.700853</td>\n", " <td>0.553073</td>\n", " <td>0.384142</td>\n", " <td>0.318203</td>\n", " <td>0.298247</td>\n", " <td>0.335870</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_minds14</th>\n", " <td>1.023324</td>\n", " <td>0.860626</td>\n", " <td>0.633766</td>\n", " <td>0.572826</td>\n", " <td>0.563293</td>\n", " <td>0.697584</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_voxpopuli</th>\n", " <td>0.588464</td>\n", " <td>0.489265</td>\n", " <td>0.380883</td>\n", " <td>0.345623</td>\n", " <td>0.349896</td>\n", " <td>0.324229</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>es_google_fleurs</th>\n", " <td>0.333658</td>\n", " <td>0.261352</td>\n", " <td>0.213950</td>\n", " <td>0.206351</td>\n", " <td>0.202078</td>\n", " <td>0.145522</td>\n", " <td>0.067686</td>\n", " </tr>\n", " <tr>\n", " <th>es_minds14</th>\n", " <td>0.884689</td>\n", " <td>0.740604</td>\n", " <td>0.664831</td>\n", " <td>0.656090</td>\n", " <td>0.650328</td>\n", " <td>0.602494</td>\n", " <td>0.436570</td>\n", " </tr>\n", " <tr>\n", " <th>es_voxpopuli</th>\n", " <td>0.347112</td>\n", " <td>0.294192</td>\n", " <td>0.333500</td>\n", " <td>0.295472</td>\n", " <td>0.353273</td>\n", " <td>0.191242</td>\n", " <td>0.067363</td>\n", " </tr>\n", " <tr>\n", " <th>en_google_fleurs</th>\n", " <td>0.348152</td>\n", " <td>0.307207</td>\n", " <td>0.278857</td>\n", " <td>0.268917</td>\n", " <td>0.270208</td>\n", " <td>1.031485</td>\n", " <td>0.114966</td>\n", " </tr>\n", " <tr>\n", " <th>en_minds14</th>\n", " <td>0.588375</td>\n", " <td>0.571845</td>\n", " <td>0.566381</td>\n", " <td>0.567538</td>\n", " <td>0.562651</td>\n", " <td>1.203252</td>\n", " <td>0.467297</td>\n", " </tr>\n", " <tr>\n", " <th>en_voxpopuli</th>\n", " <td>0.475612</td>\n", " <td>0.451586</td>\n", " <td>0.453132</td>\n", " <td>0.594546</td>\n", " <td>0.549755</td>\n", " <td>1.020514</td>\n", " <td>0.067919</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " whisper_tiny whisper_base whisper_small whisper_medium \\\n", "nl_google_fleurs 0.708020 0.535692 0.365346 0.296100 \n", "nl_minds14 0.897447 0.714498 0.503436 0.419083 \n", "nl_voxpopuli 0.645715 0.526939 0.396940 0.345034 \n", "fr_google_fleurs 0.600185 0.470808 0.378478 0.324236 \n", "fr_minds14 0.805977 0.700773 0.642619 0.583323 \n", "fr_voxpopuli 0.510623 0.440340 0.382961 0.359633 \n", "de_google_fleurs 0.651989 0.551766 0.506944 0.478476 \n", "de_minds14 0.659890 0.554437 0.474513 0.429274 \n", "de_voxpopuli 0.645898 0.558876 0.518976 0.488194 \n", "it_google_fleurs 0.465298 0.355877 0.287491 0.254384 \n", "it_minds14 0.779429 0.621546 0.502670 0.437805 \n", "it_voxpopuli 0.562729 0.477854 0.420387 0.388904 \n", "pl_google_fleurs 0.700853 0.553073 0.384142 0.318203 \n", "pl_minds14 1.023324 0.860626 0.633766 0.572826 \n", "pl_voxpopuli 0.588464 0.489265 0.380883 0.345623 \n", "es_google_fleurs 0.333658 0.261352 0.213950 0.206351 \n", "es_minds14 0.884689 0.740604 0.664831 0.656090 \n", "es_voxpopuli 0.347112 0.294192 0.333500 0.295472 \n", "en_google_fleurs 0.348152 0.307207 0.278857 0.268917 \n", "en_minds14 0.588375 0.571845 0.566381 0.567538 \n", "en_voxpopuli 0.475612 0.451586 0.453132 0.594546 \n", "\n", " whisper_large-v2 facebook_wav2vec2 nvidia_stt \n", "nl_google_fleurs 0.261951 0.273752 -1.000000 \n", "nl_minds14 0.389125 0.465494 -1.000000 \n", "nl_voxpopuli 0.358023 0.380835 -1.000000 \n", "fr_google_fleurs 0.309570 0.305183 0.206433 \n", "fr_minds14 0.616411 0.564885 0.441154 \n", "fr_voxpopuli 0.365811 0.323351 0.187074 \n", "de_google_fleurs 0.469045 0.182395 0.072162 \n", "de_minds14 0.425134 0.437369 0.357848 \n", "de_voxpopuli 0.525581 0.292203 0.088256 \n", "it_google_fleurs 0.251697 0.218689 0.140564 \n", "it_minds14 0.422781 0.429940 0.276002 \n", "it_voxpopuli 0.393964 -1.000000 0.233076 \n", "pl_google_fleurs 0.298247 0.335870 -1.000000 \n", "pl_minds14 0.563293 0.697584 -1.000000 \n", "pl_voxpopuli 0.349896 0.324229 -1.000000 \n", "es_google_fleurs 0.202078 0.145522 0.067686 \n", "es_minds14 0.650328 0.602494 0.436570 \n", "es_voxpopuli 0.353273 0.191242 0.067363 \n", "en_google_fleurs 0.270208 1.031485 0.114966 \n", "en_minds14 0.562651 1.203252 0.467297 \n", "en_voxpopuli 0.549755 1.020514 0.067919 " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "word_wer_classic_metrics_df = pd.DataFrame(word_wer_classic_metrics, columns=FULL_LANGUAGE_MODELS, index=FULL_DATASET_NAMES)\n", "word_wer_classic_metrics_df.to_csv('results/word_wer_classic_metrics.csv')\n", "word_wer_classic_metrics_df\n", "\n", "summarize_df(spacy_ner, 'spacy_ner')" ] }, { "cell_type": "code", "execution_count": 23, "id": "77a6e273-1f5e-4a2b-9568-66e53ba99c7b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>whisper_tiny</th>\n", " <th>whisper_base</th>\n", " <th>whisper_small</th>\n", " <th>whisper_medium</th>\n", " <th>whisper_large-v2</th>\n", " <th>facebook_wav2vec2</th>\n", " <th>nvidia_stt</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>nl_google_fleurs</th>\n", " <td>0.487020</td>\n", " <td>0.332826</td>\n", " <td>0.173815</td>\n", " <td>0.118312</td>\n", " <td>0.092164</td>\n", " <td>0.186138</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_minds14</th>\n", " <td>0.696387</td>\n", " <td>0.528807</td>\n", " <td>0.323153</td>\n", " <td>0.251855</td>\n", " <td>0.234766</td>\n", " <td>0.306648</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_voxpopuli</th>\n", " <td>0.440765</td>\n", " <td>0.349226</td>\n", " <td>0.233398</td>\n", " <td>0.187694</td>\n", " <td>0.203840</td>\n", " <td>0.295450</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>fr_google_fleurs</th>\n", " <td>0.422005</td>\n", " <td>0.308031</td>\n", " <td>0.230959</td>\n", " <td>0.181520</td>\n", " <td>0.167575</td>\n", " <td>0.225745</td>\n", " <td>0.154588</td>\n", " </tr>\n", " <tr>\n", " <th>fr_minds14</th>\n", " <td>0.598664</td>\n", " <td>0.499632</td>\n", " <td>0.447757</td>\n", " <td>0.395654</td>\n", " <td>0.429327</td>\n", " <td>0.441224</td>\n", " <td>0.342637</td>\n", " </tr>\n", " <tr>\n", " <th>fr_voxpopuli</th>\n", " <td>0.349906</td>\n", " <td>0.291653</td>\n", " <td>0.242314</td>\n", " <td>0.218193</td>\n", " <td>0.226681</td>\n", " <td>0.251004</td>\n", " <td>0.147786</td>\n", " </tr>\n", " <tr>\n", " <th>de_google_fleurs</th>\n", " <td>0.328928</td>\n", " <td>0.213515</td>\n", " <td>0.151060</td>\n", " <td>0.116871</td>\n", " <td>0.104827</td>\n", " <td>0.118999</td>\n", " <td>0.048663</td>\n", " </tr>\n", " <tr>\n", " <th>de_minds14</th>\n", " <td>0.425754</td>\n", " <td>0.331317</td>\n", " <td>0.255620</td>\n", " <td>0.222602</td>\n", " <td>0.220104</td>\n", " <td>0.232533</td>\n", " <td>0.143306</td>\n", " </tr>\n", " <tr>\n", " <th>de_voxpopuli</th>\n", " <td>0.349224</td>\n", " <td>0.259910</td>\n", " <td>0.208328</td>\n", " <td>0.176478</td>\n", " <td>0.215692</td>\n", " <td>0.228572</td>\n", " <td>0.065661</td>\n", " </tr>\n", " <tr>\n", " <th>it_google_fleurs</th>\n", " <td>0.297877</td>\n", " <td>0.201276</td>\n", " <td>0.139435</td>\n", " <td>0.114579</td>\n", " <td>0.103925</td>\n", " <td>0.161414</td>\n", " <td>0.101285</td>\n", " </tr>\n", " <tr>\n", " <th>it_minds14</th>\n", " <td>0.603743</td>\n", " <td>0.455306</td>\n", " <td>0.323527</td>\n", " <td>0.264797</td>\n", " <td>0.255383</td>\n", " <td>0.299216</td>\n", " <td>0.162753</td>\n", " </tr>\n", " <tr>\n", " <th>it_voxpopuli</th>\n", " <td>0.418096</td>\n", " <td>0.345687</td>\n", " <td>0.298079</td>\n", " <td>0.266888</td>\n", " <td>0.270669</td>\n", " <td>-1.000000</td>\n", " <td>0.193692</td>\n", " </tr>\n", " <tr>\n", " <th>pl_google_fleurs</th>\n", " <td>0.493295</td>\n", " <td>0.336319</td>\n", " <td>0.183046</td>\n", " <td>0.119453</td>\n", " <td>0.096625</td>\n", " <td>0.232851</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_minds14</th>\n", " <td>0.822964</td>\n", " <td>0.633399</td>\n", " <td>0.420067</td>\n", " <td>0.353710</td>\n", " <td>0.342892</td>\n", " <td>0.519684</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_voxpopuli</th>\n", " <td>0.385923</td>\n", " <td>0.288336</td>\n", " <td>0.188413</td>\n", " <td>0.152321</td>\n", " <td>0.147463</td>\n", " <td>0.232410</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>es_google_fleurs</th>\n", " <td>0.196055</td>\n", " <td>0.130109</td>\n", " <td>0.084114</td>\n", " <td>0.077302</td>\n", " <td>0.067295</td>\n", " <td>0.102324</td>\n", " <td>0.048997</td>\n", " </tr>\n", " <tr>\n", " <th>es_minds14</th>\n", " <td>0.723086</td>\n", " <td>0.581624</td>\n", " <td>0.497037</td>\n", " <td>0.493568</td>\n", " <td>0.488170</td>\n", " <td>0.522209</td>\n", " <td>0.397315</td>\n", " </tr>\n", " <tr>\n", " <th>es_voxpopuli</th>\n", " <td>0.222505</td>\n", " <td>0.172764</td>\n", " <td>0.195746</td>\n", " <td>0.162495</td>\n", " <td>0.201468</td>\n", " <td>0.143578</td>\n", " <td>0.053721</td>\n", " </tr>\n", " <tr>\n", " <th>en_google_fleurs</th>\n", " <td>0.191329</td>\n", " <td>0.151693</td>\n", " <td>0.121134</td>\n", " <td>0.107578</td>\n", " <td>0.108609</td>\n", " <td>0.111466</td>\n", " <td>0.088609</td>\n", " </tr>\n", " <tr>\n", " <th>en_minds14</th>\n", " <td>0.449783</td>\n", " <td>0.433839</td>\n", " <td>0.427788</td>\n", " <td>0.431043</td>\n", " <td>0.424969</td>\n", " <td>0.424984</td>\n", " <td>0.363642</td>\n", " </tr>\n", " <tr>\n", " <th>en_voxpopuli</th>\n", " <td>0.314581</td>\n", " <td>0.286802</td>\n", " <td>0.297819</td>\n", " <td>0.439680</td>\n", " <td>0.402555</td>\n", " <td>0.118296</td>\n", " <td>0.054176</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " whisper_tiny whisper_base whisper_small whisper_medium \\\n", "nl_google_fleurs 0.487020 0.332826 0.173815 0.118312 \n", "nl_minds14 0.696387 0.528807 0.323153 0.251855 \n", "nl_voxpopuli 0.440765 0.349226 0.233398 0.187694 \n", "fr_google_fleurs 0.422005 0.308031 0.230959 0.181520 \n", "fr_minds14 0.598664 0.499632 0.447757 0.395654 \n", "fr_voxpopuli 0.349906 0.291653 0.242314 0.218193 \n", "de_google_fleurs 0.328928 0.213515 0.151060 0.116871 \n", "de_minds14 0.425754 0.331317 0.255620 0.222602 \n", "de_voxpopuli 0.349224 0.259910 0.208328 0.176478 \n", "it_google_fleurs 0.297877 0.201276 0.139435 0.114579 \n", "it_minds14 0.603743 0.455306 0.323527 0.264797 \n", "it_voxpopuli 0.418096 0.345687 0.298079 0.266888 \n", "pl_google_fleurs 0.493295 0.336319 0.183046 0.119453 \n", "pl_minds14 0.822964 0.633399 0.420067 0.353710 \n", "pl_voxpopuli 0.385923 0.288336 0.188413 0.152321 \n", "es_google_fleurs 0.196055 0.130109 0.084114 0.077302 \n", "es_minds14 0.723086 0.581624 0.497037 0.493568 \n", "es_voxpopuli 0.222505 0.172764 0.195746 0.162495 \n", "en_google_fleurs 0.191329 0.151693 0.121134 0.107578 \n", "en_minds14 0.449783 0.433839 0.427788 0.431043 \n", "en_voxpopuli 0.314581 0.286802 0.297819 0.439680 \n", "\n", " whisper_large-v2 facebook_wav2vec2 nvidia_stt \n", "nl_google_fleurs 0.092164 0.186138 -1.000000 \n", "nl_minds14 0.234766 0.306648 -1.000000 \n", "nl_voxpopuli 0.203840 0.295450 -1.000000 \n", "fr_google_fleurs 0.167575 0.225745 0.154588 \n", "fr_minds14 0.429327 0.441224 0.342637 \n", "fr_voxpopuli 0.226681 0.251004 0.147786 \n", "de_google_fleurs 0.104827 0.118999 0.048663 \n", "de_minds14 0.220104 0.232533 0.143306 \n", "de_voxpopuli 0.215692 0.228572 0.065661 \n", "it_google_fleurs 0.103925 0.161414 0.101285 \n", "it_minds14 0.255383 0.299216 0.162753 \n", "it_voxpopuli 0.270669 -1.000000 0.193692 \n", "pl_google_fleurs 0.096625 0.232851 -1.000000 \n", "pl_minds14 0.342892 0.519684 -1.000000 \n", "pl_voxpopuli 0.147463 0.232410 -1.000000 \n", "es_google_fleurs 0.067295 0.102324 0.048997 \n", "es_minds14 0.488170 0.522209 0.397315 \n", "es_voxpopuli 0.201468 0.143578 0.053721 \n", "en_google_fleurs 0.108609 0.111466 0.088609 \n", "en_minds14 0.424969 0.424984 0.363642 \n", "en_voxpopuli 0.402555 0.118296 0.054176 " ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "word_wer_soft_metrics_df = pd.DataFrame(word_wer_soft_metrics, columns=FULL_LANGUAGE_MODELS, index=FULL_DATASET_NAMES)\n", "word_wer_soft_metrics_df.to_csv('results/word_wer_soft_metrics.csv')\n", "word_wer_soft_metrics_df\n", "\n", "summarize_df(spacy_ner, 'spacy_ner')" ] }, { "cell_type": "code", "execution_count": 24, "id": "629318e6-8c00-413c-99d4-2b7ff559ac3f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>whisper_tiny</th>\n", " <th>whisper_base</th>\n", " <th>whisper_small</th>\n", " <th>whisper_medium</th>\n", " <th>whisper_large-v2</th>\n", " <th>facebook_wav2vec2</th>\n", " <th>nvidia_stt</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>nl_google_fleurs</th>\n", " <td>0.512857</td>\n", " <td>0.351476</td>\n", " <td>0.183268</td>\n", " <td>0.123803</td>\n", " <td>0.095700</td>\n", " <td>0.192525</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_minds14</th>\n", " <td>0.732501</td>\n", " <td>0.554846</td>\n", " <td>0.346042</td>\n", " <td>0.267858</td>\n", " <td>0.244768</td>\n", " <td>0.319302</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_voxpopuli</th>\n", " <td>0.472829</td>\n", " <td>0.364308</td>\n", " <td>0.241434</td>\n", " <td>0.193047</td>\n", " <td>0.210556</td>\n", " <td>0.304289</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>fr_google_fleurs</th>\n", " <td>0.442361</td>\n", " <td>0.321953</td>\n", " <td>0.240016</td>\n", " <td>0.188132</td>\n", " <td>0.174075</td>\n", " <td>0.233362</td>\n", " <td>0.159139</td>\n", " </tr>\n", " <tr>\n", " <th>fr_minds14</th>\n", " <td>0.628774</td>\n", " <td>0.527781</td>\n", " <td>0.472124</td>\n", " <td>0.417764</td>\n", " <td>0.451830</td>\n", " <td>0.456835</td>\n", " <td>0.353934</td>\n", " </tr>\n", " <tr>\n", " <th>fr_voxpopuli</th>\n", " <td>0.365471</td>\n", " <td>0.304097</td>\n", " <td>0.251867</td>\n", " <td>0.226099</td>\n", " <td>0.235006</td>\n", " <td>0.259228</td>\n", " <td>0.150950</td>\n", " </tr>\n", " <tr>\n", " <th>de_google_fleurs</th>\n", " <td>0.346586</td>\n", " <td>0.227203</td>\n", " <td>0.158453</td>\n", " <td>0.121399</td>\n", " <td>0.107550</td>\n", " <td>0.123204</td>\n", " <td>0.050265</td>\n", " </tr>\n", " <tr>\n", " <th>de_minds14</th>\n", " <td>0.446445</td>\n", " <td>0.346742</td>\n", " <td>0.265021</td>\n", " <td>0.229449</td>\n", " <td>0.226477</td>\n", " <td>0.238560</td>\n", " <td>0.147524</td>\n", " </tr>\n", " <tr>\n", " <th>de_voxpopuli</th>\n", " <td>0.366639</td>\n", " <td>0.270086</td>\n", " <td>0.215487</td>\n", " <td>0.181204</td>\n", " <td>0.221848</td>\n", " <td>0.234268</td>\n", " <td>0.067181</td>\n", " </tr>\n", " <tr>\n", " <th>it_google_fleurs</th>\n", " <td>0.313010</td>\n", " <td>0.210131</td>\n", " <td>0.144045</td>\n", " <td>0.117567</td>\n", " <td>0.106640</td>\n", " <td>0.165954</td>\n", " <td>0.104103</td>\n", " </tr>\n", " <tr>\n", " <th>it_minds14</th>\n", " <td>0.633334</td>\n", " <td>0.476970</td>\n", " <td>0.337584</td>\n", " <td>0.275103</td>\n", " <td>0.265102</td>\n", " <td>0.310508</td>\n", " <td>0.168097</td>\n", " </tr>\n", " <tr>\n", " <th>it_voxpopuli</th>\n", " <td>0.439105</td>\n", " <td>0.363577</td>\n", " <td>0.310733</td>\n", " <td>0.278968</td>\n", " <td>0.283103</td>\n", " <td>-1.000000</td>\n", " <td>0.198565</td>\n", " </tr>\n", " <tr>\n", " <th>pl_google_fleurs</th>\n", " <td>0.520524</td>\n", " <td>0.358929</td>\n", " <td>0.190407</td>\n", " <td>0.123706</td>\n", " <td>0.098981</td>\n", " <td>0.242890</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_minds14</th>\n", " <td>0.861366</td>\n", " <td>0.666738</td>\n", " <td>0.439214</td>\n", " <td>0.370198</td>\n", " <td>0.361172</td>\n", " <td>0.542831</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_voxpopuli</th>\n", " <td>0.404981</td>\n", " <td>0.301113</td>\n", " <td>0.194702</td>\n", " <td>0.156644</td>\n", " <td>0.151601</td>\n", " <td>0.240070</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>es_google_fleurs</th>\n", " <td>0.204884</td>\n", " <td>0.135018</td>\n", " <td>0.086281</td>\n", " <td>0.078608</td>\n", " <td>0.067940</td>\n", " <td>0.105327</td>\n", " <td>0.050019</td>\n", " </tr>\n", " <tr>\n", " <th>es_minds14</th>\n", " <td>0.752425</td>\n", " <td>0.601240</td>\n", " <td>0.511320</td>\n", " <td>0.505483</td>\n", " <td>0.497249</td>\n", " <td>0.535758</td>\n", " <td>0.401730</td>\n", " </tr>\n", " <tr>\n", " <th>es_voxpopuli</th>\n", " <td>0.233013</td>\n", " <td>0.179737</td>\n", " <td>0.202485</td>\n", " <td>0.167919</td>\n", " <td>0.208381</td>\n", " <td>0.148001</td>\n", " <td>0.054963</td>\n", " </tr>\n", " <tr>\n", " <th>en_google_fleurs</th>\n", " <td>0.198209</td>\n", " <td>0.157780</td>\n", " <td>0.125360</td>\n", " <td>0.111138</td>\n", " <td>0.112012</td>\n", " <td>0.116211</td>\n", " <td>0.092322</td>\n", " </tr>\n", " <tr>\n", " <th>en_minds14</th>\n", " <td>0.463499</td>\n", " <td>0.446222</td>\n", " <td>0.442346</td>\n", " <td>0.444175</td>\n", " <td>0.438048</td>\n", " <td>0.434445</td>\n", " <td>0.371188</td>\n", " </tr>\n", " <tr>\n", " <th>en_voxpopuli</th>\n", " <td>0.325976</td>\n", " <td>0.294154</td>\n", " <td>0.306453</td>\n", " <td>0.451091</td>\n", " <td>0.414535</td>\n", " <td>0.120754</td>\n", " <td>0.055428</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " whisper_tiny whisper_base whisper_small whisper_medium \\\n", "nl_google_fleurs 0.512857 0.351476 0.183268 0.123803 \n", "nl_minds14 0.732501 0.554846 0.346042 0.267858 \n", "nl_voxpopuli 0.472829 0.364308 0.241434 0.193047 \n", "fr_google_fleurs 0.442361 0.321953 0.240016 0.188132 \n", "fr_minds14 0.628774 0.527781 0.472124 0.417764 \n", "fr_voxpopuli 0.365471 0.304097 0.251867 0.226099 \n", "de_google_fleurs 0.346586 0.227203 0.158453 0.121399 \n", "de_minds14 0.446445 0.346742 0.265021 0.229449 \n", "de_voxpopuli 0.366639 0.270086 0.215487 0.181204 \n", "it_google_fleurs 0.313010 0.210131 0.144045 0.117567 \n", "it_minds14 0.633334 0.476970 0.337584 0.275103 \n", "it_voxpopuli 0.439105 0.363577 0.310733 0.278968 \n", "pl_google_fleurs 0.520524 0.358929 0.190407 0.123706 \n", "pl_minds14 0.861366 0.666738 0.439214 0.370198 \n", "pl_voxpopuli 0.404981 0.301113 0.194702 0.156644 \n", "es_google_fleurs 0.204884 0.135018 0.086281 0.078608 \n", "es_minds14 0.752425 0.601240 0.511320 0.505483 \n", "es_voxpopuli 0.233013 0.179737 0.202485 0.167919 \n", "en_google_fleurs 0.198209 0.157780 0.125360 0.111138 \n", "en_minds14 0.463499 0.446222 0.442346 0.444175 \n", "en_voxpopuli 0.325976 0.294154 0.306453 0.451091 \n", "\n", " whisper_large-v2 facebook_wav2vec2 nvidia_stt \n", "nl_google_fleurs 0.095700 0.192525 -1.000000 \n", "nl_minds14 0.244768 0.319302 -1.000000 \n", "nl_voxpopuli 0.210556 0.304289 -1.000000 \n", "fr_google_fleurs 0.174075 0.233362 0.159139 \n", "fr_minds14 0.451830 0.456835 0.353934 \n", "fr_voxpopuli 0.235006 0.259228 0.150950 \n", "de_google_fleurs 0.107550 0.123204 0.050265 \n", "de_minds14 0.226477 0.238560 0.147524 \n", "de_voxpopuli 0.221848 0.234268 0.067181 \n", "it_google_fleurs 0.106640 0.165954 0.104103 \n", "it_minds14 0.265102 0.310508 0.168097 \n", "it_voxpopuli 0.283103 -1.000000 0.198565 \n", "pl_google_fleurs 0.098981 0.242890 -1.000000 \n", "pl_minds14 0.361172 0.542831 -1.000000 \n", "pl_voxpopuli 0.151601 0.240070 -1.000000 \n", "es_google_fleurs 0.067940 0.105327 0.050019 \n", "es_minds14 0.497249 0.535758 0.401730 \n", "es_voxpopuli 0.208381 0.148001 0.054963 \n", "en_google_fleurs 0.112012 0.116211 0.092322 \n", "en_minds14 0.438048 0.434445 0.371188 \n", "en_voxpopuli 0.414535 0.120754 0.055428 " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "word_wer_embedding_metrics_df = pd.DataFrame(word_wer_embedding_metrics, columns=FULL_LANGUAGE_MODELS, index=FULL_DATASET_NAMES)\n", "word_wer_embedding_metrics_df.to_csv('results/word_wer_embedding_metrics.csv')\n", "word_wer_embedding_metrics_df\n", "\n", "summarize_df(spacy_ner, 'spacy_ner')" ] }, { "cell_type": "code", "execution_count": 25, "id": "99bfad3e-3c9f-42d6-9a36-ce1914b16bb5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>whisper_tiny</th>\n", " <th>whisper_base</th>\n", " <th>whisper_small</th>\n", " <th>whisper_medium</th>\n", " <th>whisper_large-v2</th>\n", " <th>facebook_wav2vec2</th>\n", " <th>nvidia_stt</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>nl_google_fleurs</th>\n", " <td>0.510993</td>\n", " <td>0.364093</td>\n", " <td>0.233944</td>\n", " <td>0.194375</td>\n", " <td>0.176388</td>\n", " <td>0.127387</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_minds14</th>\n", " <td>0.749436</td>\n", " <td>0.563341</td>\n", " <td>0.400222</td>\n", " <td>0.337951</td>\n", " <td>0.321183</td>\n", " <td>0.253165</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_voxpopuli</th>\n", " <td>0.393880</td>\n", " <td>0.294984</td>\n", " <td>0.187720</td>\n", " <td>0.148644</td>\n", " <td>0.167895</td>\n", " <td>0.314945</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>fr_google_fleurs</th>\n", " <td>0.433044</td>\n", " <td>0.304306</td>\n", " <td>0.222197</td>\n", " <td>0.178437</td>\n", " <td>0.165940</td>\n", " <td>0.114709</td>\n", " <td>0.062883</td>\n", " </tr>\n", " <tr>\n", " <th>fr_minds14</th>\n", " <td>0.664310</td>\n", " <td>0.565113</td>\n", " <td>0.509531</td>\n", " <td>0.449146</td>\n", " <td>0.490874</td>\n", " <td>0.329511</td>\n", " <td>0.231802</td>\n", " </tr>\n", " <tr>\n", " <th>fr_voxpopuli</th>\n", " <td>0.271206</td>\n", " <td>0.200462</td>\n", " <td>0.143015</td>\n", " <td>0.116287</td>\n", " <td>0.121793</td>\n", " <td>0.250052</td>\n", " <td>0.183570</td>\n", " </tr>\n", " <tr>\n", " <th>de_google_fleurs</th>\n", " <td>0.398803</td>\n", " <td>0.298202</td>\n", " <td>0.238771</td>\n", " <td>0.233886</td>\n", " <td>0.225833</td>\n", " <td>0.090344</td>\n", " <td>0.045677</td>\n", " </tr>\n", " <tr>\n", " <th>de_minds14</th>\n", " <td>0.541808</td>\n", " <td>0.458428</td>\n", " <td>0.394453</td>\n", " <td>0.366073</td>\n", " <td>0.366372</td>\n", " <td>0.216899</td>\n", " <td>0.167290</td>\n", " </tr>\n", " <tr>\n", " <th>de_voxpopuli</th>\n", " <td>0.310177</td>\n", " <td>0.212666</td>\n", " <td>0.156219</td>\n", " <td>0.127821</td>\n", " <td>0.165711</td>\n", " <td>0.318096</td>\n", " <td>0.215976</td>\n", " </tr>\n", " <tr>\n", " <th>it_google_fleurs</th>\n", " <td>0.334621</td>\n", " <td>0.248942</td>\n", " <td>0.206167</td>\n", " <td>0.171781</td>\n", " <td>0.175235</td>\n", " <td>0.110213</td>\n", " <td>0.066707</td>\n", " </tr>\n", " <tr>\n", " <th>it_minds14</th>\n", " <td>0.669161</td>\n", " <td>0.531287</td>\n", " <td>0.431609</td>\n", " <td>0.369645</td>\n", " <td>0.358544</td>\n", " <td>0.267590</td>\n", " <td>0.162753</td>\n", " </tr>\n", " <tr>\n", " <th>it_voxpopuli</th>\n", " <td>0.387963</td>\n", " <td>0.318896</td>\n", " <td>0.260937</td>\n", " <td>0.225710</td>\n", " <td>0.228727</td>\n", " <td>-1.000000</td>\n", " <td>0.253023</td>\n", " </tr>\n", " <tr>\n", " <th>pl_google_fleurs</th>\n", " <td>0.500755</td>\n", " <td>0.379842</td>\n", " <td>0.260604</td>\n", " <td>0.225961</td>\n", " <td>0.216013</td>\n", " <td>0.133794</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_minds14</th>\n", " <td>0.914857</td>\n", " <td>0.762013</td>\n", " <td>0.564110</td>\n", " <td>0.499567</td>\n", " <td>0.505554</td>\n", " <td>0.405276</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_voxpopuli</th>\n", " <td>0.303221</td>\n", " <td>0.213051</td>\n", " <td>0.135739</td>\n", " <td>0.102749</td>\n", " <td>0.098353</td>\n", " <td>0.296488</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>es_google_fleurs</th>\n", " <td>0.254765</td>\n", " <td>0.196087</td>\n", " <td>0.162556</td>\n", " <td>0.172763</td>\n", " <td>0.169661</td>\n", " <td>0.082617</td>\n", " <td>0.051719</td>\n", " </tr>\n", " <tr>\n", " <th>es_minds14</th>\n", " <td>0.830240</td>\n", " <td>0.721083</td>\n", " <td>0.667662</td>\n", " <td>0.661177</td>\n", " <td>0.669886</td>\n", " <td>0.480701</td>\n", " <td>0.396179</td>\n", " </tr>\n", " <tr>\n", " <th>es_voxpopuli</th>\n", " <td>0.210093</td>\n", " <td>0.164819</td>\n", " <td>0.174829</td>\n", " <td>0.142208</td>\n", " <td>0.168499</td>\n", " <td>0.232005</td>\n", " <td>0.188607</td>\n", " </tr>\n", " <tr>\n", " <th>en_google_fleurs</th>\n", " <td>0.263058</td>\n", " <td>0.214739</td>\n", " <td>0.192109</td>\n", " <td>0.188423</td>\n", " <td>0.189492</td>\n", " <td>0.156390</td>\n", " <td>0.075239</td>\n", " </tr>\n", " <tr>\n", " <th>en_minds14</th>\n", " <td>0.532602</td>\n", " <td>0.518411</td>\n", " <td>0.523873</td>\n", " <td>0.524760</td>\n", " <td>0.517753</td>\n", " <td>0.391206</td>\n", " <td>0.332978</td>\n", " </tr>\n", " <tr>\n", " <th>en_voxpopuli</th>\n", " <td>0.280769</td>\n", " <td>0.252505</td>\n", " <td>0.246012</td>\n", " <td>0.364994</td>\n", " <td>0.296381</td>\n", " <td>0.210788</td>\n", " <td>0.167197</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " whisper_tiny whisper_base whisper_small whisper_medium \\\n", "nl_google_fleurs 0.510993 0.364093 0.233944 0.194375 \n", "nl_minds14 0.749436 0.563341 0.400222 0.337951 \n", "nl_voxpopuli 0.393880 0.294984 0.187720 0.148644 \n", "fr_google_fleurs 0.433044 0.304306 0.222197 0.178437 \n", "fr_minds14 0.664310 0.565113 0.509531 0.449146 \n", "fr_voxpopuli 0.271206 0.200462 0.143015 0.116287 \n", "de_google_fleurs 0.398803 0.298202 0.238771 0.233886 \n", "de_minds14 0.541808 0.458428 0.394453 0.366073 \n", "de_voxpopuli 0.310177 0.212666 0.156219 0.127821 \n", "it_google_fleurs 0.334621 0.248942 0.206167 0.171781 \n", "it_minds14 0.669161 0.531287 0.431609 0.369645 \n", "it_voxpopuli 0.387963 0.318896 0.260937 0.225710 \n", "pl_google_fleurs 0.500755 0.379842 0.260604 0.225961 \n", "pl_minds14 0.914857 0.762013 0.564110 0.499567 \n", "pl_voxpopuli 0.303221 0.213051 0.135739 0.102749 \n", "es_google_fleurs 0.254765 0.196087 0.162556 0.172763 \n", "es_minds14 0.830240 0.721083 0.667662 0.661177 \n", "es_voxpopuli 0.210093 0.164819 0.174829 0.142208 \n", "en_google_fleurs 0.263058 0.214739 0.192109 0.188423 \n", "en_minds14 0.532602 0.518411 0.523873 0.524760 \n", "en_voxpopuli 0.280769 0.252505 0.246012 0.364994 \n", "\n", " whisper_large-v2 facebook_wav2vec2 nvidia_stt \n", "nl_google_fleurs 0.176388 0.127387 -1.000000 \n", "nl_minds14 0.321183 0.253165 -1.000000 \n", "nl_voxpopuli 0.167895 0.314945 -1.000000 \n", "fr_google_fleurs 0.165940 0.114709 0.062883 \n", "fr_minds14 0.490874 0.329511 0.231802 \n", "fr_voxpopuli 0.121793 0.250052 0.183570 \n", "de_google_fleurs 0.225833 0.090344 0.045677 \n", "de_minds14 0.366372 0.216899 0.167290 \n", "de_voxpopuli 0.165711 0.318096 0.215976 \n", "it_google_fleurs 0.175235 0.110213 0.066707 \n", "it_minds14 0.358544 0.267590 0.162753 \n", "it_voxpopuli 0.228727 -1.000000 0.253023 \n", "pl_google_fleurs 0.216013 0.133794 -1.000000 \n", "pl_minds14 0.505554 0.405276 -1.000000 \n", "pl_voxpopuli 0.098353 0.296488 -1.000000 \n", "es_google_fleurs 0.169661 0.082617 0.051719 \n", "es_minds14 0.669886 0.480701 0.396179 \n", "es_voxpopuli 0.168499 0.232005 0.188607 \n", "en_google_fleurs 0.189492 0.156390 0.075239 \n", "en_minds14 0.517753 0.391206 0.332978 \n", "en_voxpopuli 0.296381 0.210788 0.167197 " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "flair_pos_df = pd.DataFrame(flair_pos, columns=FULL_LANGUAGE_MODELS, index=FULL_DATASET_NAMES)\n", "flair_pos_df.to_csv('results/flair_pos.csv')\n", "flair_pos_df\n", "\n", "summarize_df(spacy_ner, 'spacy_ner')" ] }, { "cell_type": "code", "execution_count": 26, "id": "7275b2b0-957b-4618-9f66-7b88302f896a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>whisper_tiny</th>\n", " <th>whisper_base</th>\n", " <th>whisper_small</th>\n", " <th>whisper_medium</th>\n", " <th>whisper_large-v2</th>\n", " <th>facebook_wav2vec2</th>\n", " <th>nvidia_stt</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>nl_google_fleurs</th>\n", " <td>0.510993</td>\n", " <td>0.364093</td>\n", " <td>0.233944</td>\n", " <td>0.194375</td>\n", " <td>0.176388</td>\n", " <td>0.127387</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_minds14</th>\n", " <td>0.749436</td>\n", " <td>0.563341</td>\n", " <td>0.400222</td>\n", " <td>0.337951</td>\n", " <td>0.321183</td>\n", " <td>0.253165</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_voxpopuli</th>\n", " <td>0.393880</td>\n", " <td>0.294984</td>\n", " <td>0.187720</td>\n", " <td>0.148644</td>\n", " <td>0.167895</td>\n", " <td>0.314945</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>fr_google_fleurs</th>\n", " <td>0.433044</td>\n", " <td>0.304306</td>\n", " <td>0.222197</td>\n", " <td>0.178437</td>\n", " <td>0.165940</td>\n", " <td>0.114709</td>\n", " <td>0.062883</td>\n", " </tr>\n", " <tr>\n", " <th>fr_minds14</th>\n", " <td>0.664310</td>\n", " <td>0.565113</td>\n", " <td>0.509531</td>\n", " <td>0.449146</td>\n", " <td>0.490874</td>\n", " <td>0.329511</td>\n", " <td>0.231802</td>\n", " </tr>\n", " <tr>\n", " <th>fr_voxpopuli</th>\n", " <td>0.271206</td>\n", " <td>0.200462</td>\n", " <td>0.143015</td>\n", " <td>0.116287</td>\n", " <td>0.121793</td>\n", " <td>0.250052</td>\n", " <td>0.183570</td>\n", " </tr>\n", " <tr>\n", " <th>de_google_fleurs</th>\n", " <td>0.398803</td>\n", " <td>0.298202</td>\n", " <td>0.238771</td>\n", " <td>0.233886</td>\n", " <td>0.225833</td>\n", " <td>0.090344</td>\n", " <td>0.045677</td>\n", " </tr>\n", " <tr>\n", " <th>de_minds14</th>\n", " <td>0.541808</td>\n", " <td>0.458428</td>\n", " <td>0.394453</td>\n", " <td>0.366073</td>\n", " <td>0.366372</td>\n", " <td>0.216899</td>\n", " <td>0.167290</td>\n", " </tr>\n", " <tr>\n", " <th>de_voxpopuli</th>\n", " <td>0.310177</td>\n", " <td>0.212666</td>\n", " <td>0.156219</td>\n", " <td>0.127821</td>\n", " <td>0.165711</td>\n", " <td>0.318096</td>\n", " <td>0.215976</td>\n", " </tr>\n", " <tr>\n", " <th>it_google_fleurs</th>\n", " <td>0.334621</td>\n", " <td>0.248942</td>\n", " <td>0.206167</td>\n", " <td>0.171781</td>\n", " <td>0.175235</td>\n", " <td>0.110213</td>\n", " <td>0.066707</td>\n", " </tr>\n", " <tr>\n", " <th>it_minds14</th>\n", " <td>0.669161</td>\n", " <td>0.531287</td>\n", " <td>0.431609</td>\n", " <td>0.369645</td>\n", " <td>0.358544</td>\n", " <td>0.267590</td>\n", " <td>0.162753</td>\n", " </tr>\n", " <tr>\n", " <th>it_voxpopuli</th>\n", " <td>0.387963</td>\n", " <td>0.318896</td>\n", " <td>0.260937</td>\n", " <td>0.225710</td>\n", " <td>0.228727</td>\n", " <td>-1.000000</td>\n", " <td>0.253023</td>\n", " </tr>\n", " <tr>\n", " <th>pl_google_fleurs</th>\n", " <td>0.500755</td>\n", " <td>0.379842</td>\n", " <td>0.260604</td>\n", " <td>0.225961</td>\n", " <td>0.216013</td>\n", " <td>0.133794</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_minds14</th>\n", " <td>0.914857</td>\n", " <td>0.762013</td>\n", " <td>0.564110</td>\n", " <td>0.499567</td>\n", " <td>0.505554</td>\n", " <td>0.405276</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_voxpopuli</th>\n", " <td>0.303221</td>\n", " <td>0.213051</td>\n", " <td>0.135739</td>\n", " <td>0.102749</td>\n", " <td>0.098353</td>\n", " <td>0.296488</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>es_google_fleurs</th>\n", " <td>0.254765</td>\n", " <td>0.196087</td>\n", " <td>0.162556</td>\n", " <td>0.172763</td>\n", " <td>0.169661</td>\n", " <td>0.082617</td>\n", " <td>0.051719</td>\n", " </tr>\n", " <tr>\n", " <th>es_minds14</th>\n", " <td>0.830240</td>\n", " <td>0.721083</td>\n", " <td>0.667662</td>\n", " <td>0.661177</td>\n", " <td>0.669886</td>\n", " <td>0.480701</td>\n", " <td>0.396179</td>\n", " </tr>\n", " <tr>\n", " <th>es_voxpopuli</th>\n", " <td>0.210093</td>\n", " <td>0.164819</td>\n", " <td>0.174829</td>\n", " <td>0.142208</td>\n", " <td>0.168499</td>\n", " <td>0.232005</td>\n", " <td>0.188607</td>\n", " </tr>\n", " <tr>\n", " <th>en_google_fleurs</th>\n", " <td>0.263058</td>\n", " <td>0.214739</td>\n", " <td>0.192109</td>\n", " <td>0.188423</td>\n", " <td>0.189492</td>\n", " <td>0.156390</td>\n", " <td>0.075239</td>\n", " </tr>\n", " <tr>\n", " <th>en_minds14</th>\n", " <td>0.532602</td>\n", " <td>0.518411</td>\n", " <td>0.523873</td>\n", " <td>0.524760</td>\n", " <td>0.517753</td>\n", " <td>0.391206</td>\n", " <td>0.332978</td>\n", " </tr>\n", " <tr>\n", " <th>en_voxpopuli</th>\n", " <td>0.280769</td>\n", " <td>0.252505</td>\n", " <td>0.246012</td>\n", " <td>0.364994</td>\n", " <td>0.296381</td>\n", " <td>0.210788</td>\n", " <td>0.167197</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " whisper_tiny whisper_base whisper_small whisper_medium \\\n", "nl_google_fleurs 0.510993 0.364093 0.233944 0.194375 \n", "nl_minds14 0.749436 0.563341 0.400222 0.337951 \n", "nl_voxpopuli 0.393880 0.294984 0.187720 0.148644 \n", "fr_google_fleurs 0.433044 0.304306 0.222197 0.178437 \n", "fr_minds14 0.664310 0.565113 0.509531 0.449146 \n", "fr_voxpopuli 0.271206 0.200462 0.143015 0.116287 \n", "de_google_fleurs 0.398803 0.298202 0.238771 0.233886 \n", "de_minds14 0.541808 0.458428 0.394453 0.366073 \n", "de_voxpopuli 0.310177 0.212666 0.156219 0.127821 \n", "it_google_fleurs 0.334621 0.248942 0.206167 0.171781 \n", "it_minds14 0.669161 0.531287 0.431609 0.369645 \n", "it_voxpopuli 0.387963 0.318896 0.260937 0.225710 \n", "pl_google_fleurs 0.500755 0.379842 0.260604 0.225961 \n", "pl_minds14 0.914857 0.762013 0.564110 0.499567 \n", "pl_voxpopuli 0.303221 0.213051 0.135739 0.102749 \n", "es_google_fleurs 0.254765 0.196087 0.162556 0.172763 \n", "es_minds14 0.830240 0.721083 0.667662 0.661177 \n", "es_voxpopuli 0.210093 0.164819 0.174829 0.142208 \n", "en_google_fleurs 0.263058 0.214739 0.192109 0.188423 \n", "en_minds14 0.532602 0.518411 0.523873 0.524760 \n", "en_voxpopuli 0.280769 0.252505 0.246012 0.364994 \n", "\n", " whisper_large-v2 facebook_wav2vec2 nvidia_stt \n", "nl_google_fleurs 0.176388 0.127387 -1.000000 \n", "nl_minds14 0.321183 0.253165 -1.000000 \n", "nl_voxpopuli 0.167895 0.314945 -1.000000 \n", "fr_google_fleurs 0.165940 0.114709 0.062883 \n", "fr_minds14 0.490874 0.329511 0.231802 \n", "fr_voxpopuli 0.121793 0.250052 0.183570 \n", "de_google_fleurs 0.225833 0.090344 0.045677 \n", "de_minds14 0.366372 0.216899 0.167290 \n", "de_voxpopuli 0.165711 0.318096 0.215976 \n", "it_google_fleurs 0.175235 0.110213 0.066707 \n", "it_minds14 0.358544 0.267590 0.162753 \n", "it_voxpopuli 0.228727 -1.000000 0.253023 \n", "pl_google_fleurs 0.216013 0.133794 -1.000000 \n", "pl_minds14 0.505554 0.405276 -1.000000 \n", "pl_voxpopuli 0.098353 0.296488 -1.000000 \n", "es_google_fleurs 0.169661 0.082617 0.051719 \n", "es_minds14 0.669886 0.480701 0.396179 \n", "es_voxpopuli 0.168499 0.232005 0.188607 \n", "en_google_fleurs 0.189492 0.156390 0.075239 \n", "en_minds14 0.517753 0.391206 0.332978 \n", "en_voxpopuli 0.296381 0.210788 0.167197 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "flair_pos_df = pd.DataFrame(flair_pos, columns=FULL_LANGUAGE_MODELS, index=FULL_DATASET_NAMES)\n", "flair_pos_df.to_csv('results/flair_pos.csv')\n", "flair_pos_df\n", "\n", "summarize_df(spacy_ner, 'spacy_ner')" ] }, { "cell_type": "code", "execution_count": 27, "id": "5a4f9e8e-9c0e-44e5-9426-655c400ea054", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>whisper_tiny</th>\n", " <th>whisper_base</th>\n", " <th>whisper_small</th>\n", " <th>whisper_medium</th>\n", " <th>whisper_large-v2</th>\n", " <th>facebook_wav2vec2</th>\n", " <th>nvidia_stt</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>nl_google_fleurs</th>\n", " <td>0.215391</td>\n", " <td>0.174029</td>\n", " <td>0.125444</td>\n", " <td>0.115182</td>\n", " <td>0.113159</td>\n", " <td>0.089213</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_minds14</th>\n", " <td>0.343079</td>\n", " <td>0.280673</td>\n", " <td>0.216319</td>\n", " <td>0.201182</td>\n", " <td>0.198545</td>\n", " <td>0.151310</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>nl_voxpopuli</th>\n", " <td>0.216156</td>\n", " <td>0.189828</td>\n", " <td>0.141318</td>\n", " <td>0.133931</td>\n", " <td>0.147112</td>\n", " <td>0.176515</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>fr_google_fleurs</th>\n", " <td>0.208012</td>\n", " <td>0.146742</td>\n", " <td>0.128173</td>\n", " <td>0.106214</td>\n", " <td>0.097691</td>\n", " <td>0.068703</td>\n", " <td>0.045601</td>\n", " </tr>\n", " <tr>\n", " <th>fr_minds14</th>\n", " <td>0.383273</td>\n", " <td>0.356633</td>\n", " <td>0.346255</td>\n", " <td>0.330446</td>\n", " <td>0.365426</td>\n", " <td>0.248440</td>\n", " <td>0.193615</td>\n", " </tr>\n", " <tr>\n", " <th>fr_voxpopuli</th>\n", " <td>0.157518</td>\n", " <td>0.126534</td>\n", " <td>0.104213</td>\n", " <td>0.089124</td>\n", " <td>0.095847</td>\n", " <td>0.147897</td>\n", " <td>0.118277</td>\n", " </tr>\n", " <tr>\n", " <th>de_google_fleurs</th>\n", " <td>0.178717</td>\n", " <td>0.140455</td>\n", " <td>0.153612</td>\n", " <td>0.130936</td>\n", " <td>0.135413</td>\n", " <td>0.069640</td>\n", " <td>0.049105</td>\n", " </tr>\n", " <tr>\n", " <th>de_minds14</th>\n", " <td>0.295491</td>\n", " <td>0.264049</td>\n", " <td>0.246428</td>\n", " <td>0.232066</td>\n", " <td>0.234698</td>\n", " <td>0.172801</td>\n", " <td>0.140307</td>\n", " </tr>\n", " <tr>\n", " <th>de_voxpopuli</th>\n", " <td>0.178856</td>\n", " <td>0.137537</td>\n", " <td>0.105534</td>\n", " <td>0.087482</td>\n", " <td>0.124275</td>\n", " <td>0.164667</td>\n", " <td>0.126233</td>\n", " </tr>\n", " <tr>\n", " <th>it_google_fleurs</th>\n", " <td>0.154465</td>\n", " <td>0.123694</td>\n", " <td>0.123264</td>\n", " <td>0.107109</td>\n", " <td>0.110015</td>\n", " <td>0.060042</td>\n", " <td>0.037594</td>\n", " </tr>\n", " <tr>\n", " <th>it_minds14</th>\n", " <td>0.343251</td>\n", " <td>0.273268</td>\n", " <td>0.240797</td>\n", " <td>0.213506</td>\n", " <td>0.211958</td>\n", " <td>0.147876</td>\n", " <td>0.110599</td>\n", " </tr>\n", " <tr>\n", " <th>it_voxpopuli</th>\n", " <td>0.152579</td>\n", " <td>0.146635</td>\n", " <td>0.142331</td>\n", " <td>0.125299</td>\n", " <td>0.126924</td>\n", " <td>-1.000000</td>\n", " <td>0.145502</td>\n", " </tr>\n", " <tr>\n", " <th>pl_google_fleurs</th>\n", " <td>0.200009</td>\n", " <td>0.163202</td>\n", " <td>0.127170</td>\n", " <td>0.116060</td>\n", " <td>0.112860</td>\n", " <td>0.091837</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_minds14</th>\n", " <td>0.454800</td>\n", " <td>0.415696</td>\n", " <td>0.311585</td>\n", " <td>0.310715</td>\n", " <td>0.316154</td>\n", " <td>0.279046</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>pl_voxpopuli</th>\n", " <td>0.175804</td>\n", " <td>0.137694</td>\n", " <td>0.101624</td>\n", " <td>0.084531</td>\n", " <td>0.081097</td>\n", " <td>0.152815</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>es_google_fleurs</th>\n", " <td>0.139403</td>\n", " <td>0.105495</td>\n", " <td>0.095208</td>\n", " <td>0.106332</td>\n", " <td>0.104021</td>\n", " <td>0.063813</td>\n", " <td>0.038414</td>\n", " </tr>\n", " <tr>\n", " <th>es_minds14</th>\n", " <td>0.533874</td>\n", " <td>0.480372</td>\n", " <td>0.472338</td>\n", " <td>0.480882</td>\n", " <td>0.483780</td>\n", " <td>0.359815</td>\n", " <td>0.290631</td>\n", " </tr>\n", " <tr>\n", " <th>es_voxpopuli</th>\n", " <td>0.110894</td>\n", " <td>0.098927</td>\n", " <td>0.100773</td>\n", " <td>0.087911</td>\n", " <td>0.096432</td>\n", " <td>0.122212</td>\n", " <td>0.116315</td>\n", " </tr>\n", " <tr>\n", " <th>en_google_fleurs</th>\n", " <td>0.151080</td>\n", " <td>0.134344</td>\n", " <td>0.130206</td>\n", " <td>0.131738</td>\n", " <td>0.132967</td>\n", " <td>1.255453</td>\n", " <td>0.049170</td>\n", " </tr>\n", " <tr>\n", " <th>en_minds14</th>\n", " <td>0.381197</td>\n", " <td>0.386708</td>\n", " <td>0.395517</td>\n", " <td>0.399133</td>\n", " <td>0.393609</td>\n", " <td>1.444793</td>\n", " <td>0.284332</td>\n", " </tr>\n", " <tr>\n", " <th>en_voxpopuli</th>\n", " <td>0.258779</td>\n", " <td>0.212418</td>\n", " <td>0.217320</td>\n", " <td>0.337455</td>\n", " <td>0.292532</td>\n", " <td>1.211453</td>\n", " <td>0.120684</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " whisper_tiny whisper_base whisper_small whisper_medium \\\n", "nl_google_fleurs 0.215391 0.174029 0.125444 0.115182 \n", "nl_minds14 0.343079 0.280673 0.216319 0.201182 \n", "nl_voxpopuli 0.216156 0.189828 0.141318 0.133931 \n", "fr_google_fleurs 0.208012 0.146742 0.128173 0.106214 \n", "fr_minds14 0.383273 0.356633 0.346255 0.330446 \n", "fr_voxpopuli 0.157518 0.126534 0.104213 0.089124 \n", "de_google_fleurs 0.178717 0.140455 0.153612 0.130936 \n", "de_minds14 0.295491 0.264049 0.246428 0.232066 \n", "de_voxpopuli 0.178856 0.137537 0.105534 0.087482 \n", "it_google_fleurs 0.154465 0.123694 0.123264 0.107109 \n", "it_minds14 0.343251 0.273268 0.240797 0.213506 \n", "it_voxpopuli 0.152579 0.146635 0.142331 0.125299 \n", "pl_google_fleurs 0.200009 0.163202 0.127170 0.116060 \n", "pl_minds14 0.454800 0.415696 0.311585 0.310715 \n", "pl_voxpopuli 0.175804 0.137694 0.101624 0.084531 \n", "es_google_fleurs 0.139403 0.105495 0.095208 0.106332 \n", "es_minds14 0.533874 0.480372 0.472338 0.480882 \n", "es_voxpopuli 0.110894 0.098927 0.100773 0.087911 \n", "en_google_fleurs 0.151080 0.134344 0.130206 0.131738 \n", "en_minds14 0.381197 0.386708 0.395517 0.399133 \n", "en_voxpopuli 0.258779 0.212418 0.217320 0.337455 \n", "\n", " whisper_large-v2 facebook_wav2vec2 nvidia_stt \n", "nl_google_fleurs 0.113159 0.089213 -1.000000 \n", "nl_minds14 0.198545 0.151310 -1.000000 \n", "nl_voxpopuli 0.147112 0.176515 -1.000000 \n", "fr_google_fleurs 0.097691 0.068703 0.045601 \n", "fr_minds14 0.365426 0.248440 0.193615 \n", "fr_voxpopuli 0.095847 0.147897 0.118277 \n", "de_google_fleurs 0.135413 0.069640 0.049105 \n", "de_minds14 0.234698 0.172801 0.140307 \n", "de_voxpopuli 0.124275 0.164667 0.126233 \n", "it_google_fleurs 0.110015 0.060042 0.037594 \n", "it_minds14 0.211958 0.147876 0.110599 \n", "it_voxpopuli 0.126924 -1.000000 0.145502 \n", "pl_google_fleurs 0.112860 0.091837 -1.000000 \n", "pl_minds14 0.316154 0.279046 -1.000000 \n", "pl_voxpopuli 0.081097 0.152815 -1.000000 \n", "es_google_fleurs 0.104021 0.063813 0.038414 \n", "es_minds14 0.483780 0.359815 0.290631 \n", "es_voxpopuli 0.096432 0.122212 0.116315 \n", "en_google_fleurs 0.132967 1.255453 0.049170 \n", "en_minds14 0.393609 1.444793 0.284332 \n", "en_voxpopuli 0.292532 1.211453 0.120684 " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wikineural_ner_df = pd.DataFrame(wikineural_ner, columns=FULL_LANGUAGE_MODELS, index=FULL_DATASET_NAMES)\n", "wikineural_ner_df.to_csv('results/wikineural_ner.csv')\n", "wikineural_ner_df\n", "\n", "summarize_df(spacy_ner, 'spacy_ner')" ] }, { "cell_type": "code", "execution_count": null, "id": "8d4a212b-7437-4fa2-9e4e-06db21da1855", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.15" } }, "nbformat": 4, "nbformat_minor": 5 }