Newer
Older
"id": "955a0385-29fb-47dc-b012-729e49570594",
"metadata": {},
"source": [
"from new_experiment.utils.get_spacy_model_name import *\n",
"\n",
"from call_experiment_stats import *\n",
"\n",
"from new_experiment.utils.property_helper import PropertyHelper\n",
"from new_experiment.utils.get_spacy_model_name import get_spacy_model_name\n",
"from new_experiment.new_dependency_provider import get_experiment_repository\n",
"from new_experiment.add_to_queue_pipeline import get_hf_facebook_wav2vec2_model_by_language_code\n",
"id": "3f1221d3-5f70-4441-af07-58fa176e31e9",
"metadata": {},
"outputs": [],
"source": [
"METRICS_FILE = 'metrics.txt'"
]
},
{
"cell_type": "code",
"id": "eda46e65-8079-40b9-9c4e-37fe74caec45",
"metadata": {},
"outputs": [
{
"ename": "ServerSelectionTimeoutError",
"evalue": "192.168.0.124:27017: timed out, Timeout: 30s, Topology Description: <TopologyDescription id: 63caac355a13a212d6a8209f, topology_type: Unknown, servers: [<ServerDescription ('192.168.0.124', 27017) server_type: Unknown, rtt: None, error=NetworkTimeout('192.168.0.124:27017: timed out')>]>",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mServerSelectionTimeoutError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[3], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m metric_repository \u001b[38;5;241m=\u001b[39m get_experiment_repository(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmetric_stats\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(METRICS_FILE, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m writer:\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m dataset_property \u001b[38;5;129;01min\u001b[39;00m \u001b[43mmetric_repository\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_all_properties\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 4\u001b[0m values_dict \u001b[38;5;241m=\u001b[39m metric_repository\u001b[38;5;241m.\u001b[39mget_all_values_from_property(dataset_property)\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m value_key \u001b[38;5;129;01min\u001b[39;00m values_dict\u001b[38;5;241m.\u001b[39mkeys():\n",
"File \u001b[0;32m~/Desktop/WUST/asr-benchmarks/sziszapangma/integration/repository/mongo_experiment_repository.py:60\u001b[0m, in \u001b[0;36mMongoExperimentRepository.get_all_properties\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_all_properties\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Set[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[0;32m---> 60\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mset\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_database\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlist_collection_names\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n",
"File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/database.py:959\u001b[0m, in \u001b[0;36mDatabase.list_collection_names\u001b[0;34m(self, session, filter, comment, **kwargs)\u001b[0m\n\u001b[1;32m 956\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mfilter\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m (\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mfilter\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mfilter\u001b[39m):\n\u001b[1;32m 957\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnameOnly\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 959\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [result[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m result \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlist_collections\u001b[49m\u001b[43m(\u001b[49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msession\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m]\n",
"File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/database.py:911\u001b[0m, in \u001b[0;36mDatabase.list_collections\u001b[0;34m(self, session, filter, comment, **kwargs)\u001b[0m\n\u001b[1;32m 906\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_cmd\u001b[39m(session, server, sock_info, read_preference):\n\u001b[1;32m 907\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_list_collections(\n\u001b[1;32m 908\u001b[0m sock_info, session, read_preference\u001b[38;5;241m=\u001b[39mread_preference, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[1;32m 909\u001b[0m )\n\u001b[0;32m--> 911\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__client\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_retryable_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_cmd\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mread_pref\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msession\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/_csot.py:105\u001b[0m, in \u001b[0;36mapply.<locals>.csot_wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _TimeoutContext(timeout):\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/mongo_client.py:1441\u001b[0m, in \u001b[0;36mMongoClient._retryable_read\u001b[0;34m(self, func, read_pref, session, address, retryable)\u001b[0m\n\u001b[1;32m 1439\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m last_error\n\u001b[1;32m 1440\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1441\u001b[0m server \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_select_server\u001b[49m\u001b[43m(\u001b[49m\u001b[43mread_pref\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msession\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maddress\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maddress\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1442\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_socket_from_server(read_pref, server, session) \u001b[38;5;28;01mas\u001b[39;00m (sock_info, read_pref):\n\u001b[1;32m 1443\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m retrying \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m retryable:\n\u001b[1;32m 1444\u001b[0m \u001b[38;5;66;03m# A retry is not possible because this server does\u001b[39;00m\n\u001b[1;32m 1445\u001b[0m \u001b[38;5;66;03m# not support retryable reads, raise the last error.\u001b[39;00m\n",
"File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/mongo_client.py:1257\u001b[0m, in \u001b[0;36mMongoClient._select_server\u001b[0;34m(self, server_selector, session, address)\u001b[0m\n\u001b[1;32m 1255\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m AutoReconnect(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mserver \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m no longer available\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m address)\n\u001b[1;32m 1256\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1257\u001b[0m server \u001b[38;5;241m=\u001b[39m \u001b[43mtopology\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect_server\u001b[49m\u001b[43m(\u001b[49m\u001b[43mserver_selector\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1258\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m server\n\u001b[1;32m 1259\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m PyMongoError \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 1260\u001b[0m \u001b[38;5;66;03m# Server selection errors in a transaction are transient.\u001b[39;00m\n",
"File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/topology.py:272\u001b[0m, in \u001b[0;36mTopology.select_server\u001b[0;34m(self, selector, server_selection_timeout, address)\u001b[0m\n\u001b[1;32m 270\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mselect_server\u001b[39m(\u001b[38;5;28mself\u001b[39m, selector, server_selection_timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, address\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 271\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Like select_servers, but choose a random server if several match.\"\"\"\u001b[39;00m\n\u001b[0;32m--> 272\u001b[0m server \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_select_server\u001b[49m\u001b[43m(\u001b[49m\u001b[43mselector\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mserver_selection_timeout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maddress\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _csot\u001b[38;5;241m.\u001b[39mget_timeout():\n\u001b[1;32m 274\u001b[0m _csot\u001b[38;5;241m.\u001b[39mset_rtt(server\u001b[38;5;241m.\u001b[39mdescription\u001b[38;5;241m.\u001b[39mround_trip_time)\n",
"File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/topology.py:261\u001b[0m, in \u001b[0;36mTopology._select_server\u001b[0;34m(self, selector, server_selection_timeout, address)\u001b[0m\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_select_server\u001b[39m(\u001b[38;5;28mself\u001b[39m, selector, server_selection_timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, address\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m--> 261\u001b[0m servers \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect_servers\u001b[49m\u001b[43m(\u001b[49m\u001b[43mselector\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mserver_selection_timeout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maddress\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 262\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(servers) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 263\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m servers[\u001b[38;5;241m0\u001b[39m]\n",
"File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/topology.py:223\u001b[0m, in \u001b[0;36mTopology.select_servers\u001b[0;34m(self, selector, server_selection_timeout, address)\u001b[0m\n\u001b[1;32m 220\u001b[0m server_timeout \u001b[38;5;241m=\u001b[39m server_selection_timeout\n\u001b[1;32m 222\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lock:\n\u001b[0;32m--> 223\u001b[0m server_descriptions \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_select_servers_loop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mselector\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mserver_timeout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maddress\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 225\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_server_by_address(sd\u001b[38;5;241m.\u001b[39maddress) \u001b[38;5;28;01mfor\u001b[39;00m sd \u001b[38;5;129;01min\u001b[39;00m server_descriptions]\n",
"File \u001b[0;32m~/opt/miniconda3/envs/asr_benchmarks_39/lib/python3.9/site-packages/pymongo/topology.py:238\u001b[0m, in \u001b[0;36mTopology._select_servers_loop\u001b[0;34m(self, selector, timeout, address)\u001b[0m\n\u001b[1;32m 235\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m server_descriptions:\n\u001b[1;32m 236\u001b[0m \u001b[38;5;66;03m# No suitable servers.\u001b[39;00m\n\u001b[1;32m 237\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m now \u001b[38;5;241m>\u001b[39m end_time:\n\u001b[0;32m--> 238\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ServerSelectionTimeoutError(\n\u001b[1;32m 239\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m, Timeout: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124ms, Topology Description: \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 240\u001b[0m \u001b[38;5;241m%\u001b[39m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_error_message(selector), timeout, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdescription)\n\u001b[1;32m 241\u001b[0m )\n\u001b[1;32m 243\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_ensure_opened()\n\u001b[1;32m 244\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request_check_all()\n",
"\u001b[0;31mServerSelectionTimeoutError\u001b[0m: 192.168.0.124:27017: timed out, Timeout: 30s, Topology Description: <TopologyDescription id: 63caac355a13a212d6a8209f, topology_type: Unknown, servers: [<ServerDescription ('192.168.0.124', 27017) server_type: Unknown, rtt: None, error=NetworkTimeout('192.168.0.124:27017: timed out')>]>"
]
}
],
"source": [
"metric_repository = get_experiment_repository('metric_stats')\n",
"\n",
"# Read every metric BEFORE opening the output file: open(..., 'w') truncates METRICS_FILE\n",
"# immediately, so a failed repository read (e.g. a database timeout) would otherwise\n",
"# leave an empty file behind and discard the previously exported metrics.\n",
"metric_lines = []\n",
"for dataset_property in metric_repository.get_all_properties():\n",
"    values_dict = metric_repository.get_all_values_from_property(dataset_property)\n",
"    for value_key, value in values_dict.items():\n",
"        # One record per line: '<property> <key> <value>', separated by single spaces.\n",
"        metric_lines.append(f'{dataset_property} {value_key} {value}')\n",
"\n",
"with open(METRICS_FILE, 'w') as writer:\n",
"    writer.writelines(f'{line}\\n' for line in metric_lines)"
]
},
{
"cell_type": "code",
"id": "9f5e44a6-f211-4b61-8cb4-5636c7672c6a",
"metadata": {},
"outputs": [],
"source": [
"# Pipeline command names (not referenced by the cells visible in this part of the notebook).\n",
"COMMANDS = ['run_word_wer_classic_pipeline', 'run_word_wer_embedding_pipeline', 'run_spacy_dep_tag_wer_pipeline',\n",
"            'run_spacy_ner_wer_pipeline', 'run_spacy_pos_wer_pipeline']\n",
"LANGUAGES = ['nl', 'fr', 'de', 'it', 'pl', 'es', 'en']\n",
"WHISPER_ASR_MODEL = ['tiny', 'base', 'small', 'medium', 'large-v2']\n",
"DATASETS = ['google_fleurs', 'minds14', 'voxpopuli']\n",
"\n",
"# Every '<language>_<dataset>' combination; languages vary slowest, datasets fastest.\n",
"# A comprehension keeps the loop variables out of the notebook's global namespace.\n",
"FULL_DATASET_NAMES = [f'{language}_{dataset}' for language in LANGUAGES for dataset in DATASETS]\n",
"\n",
"# All Whisper sizes first, followed by the two non-Whisper model families.\n",
"FULL_LANGUAGE_MODELS = [f'whisper_{size}' for size in WHISPER_ASR_MODEL] + ['facebook_wav2vec2', 'nvidia_stt']"
"id": "d2465ceb-7439-4fa5-adf8-e95d7e6106b9",
"metadata": {},
"outputs": [],
"source": [
"vals = dict()\n",
"with open(METRICS_FILE, 'r') as reader:\n",
" lines = reader.read().splitlines(keepends=False)\n",
" for line in lines:\n",
" # print(line)\n",
" words = line.split()\n",
" key = f'{words[0]}_{words[1]}'\n",
"id": "e41b19d0-37cb-4810-896a-fa0f73dd86e0",
"metadata": {},
"outputs": [],
"source": [
"def get_model_for_dataset_name(dataset: str, model: str):\n",
"    \"\"\"Resolve an ASR model family to the concrete model name used for ``dataset``.\n",
"\n",
"    Args:\n",
"        dataset: Dataset name; its first two characters are taken as the language code.\n",
"        model: Model family name. Names starting with ``whisper`` are returned unchanged;\n",
"            ``facebook_wav2vec2`` and ``nvidia_stt`` are mapped to a language-specific model.\n",
"\n",
"    Returns:\n",
"        The model name to use for this dataset's language.\n",
"\n",
"    Raises:\n",
"        ValueError: If ``model`` does not start with any of the known family prefixes.\n",
"    \"\"\"\n",
"    language_code = dataset[:2]\n",
"    if model.startswith('whisper'):\n",
"        # Whisper names are used as-is, independent of the language.\n",
"        return model\n",
"    if model.startswith('facebook_wav2vec2'):\n",
"        return get_hf_facebook_wav2vec2_model_by_language_code(language_code)\n",
"    if model.startswith('nvidia_stt'):\n",
"        return f'nvidia_stt_{language_code}_conformer_transducer_large'\n",
"    # ValueError is still an Exception, so existing broad handlers keep working.\n",
"    raise ValueError(f'asr name not found: {model!r}')"
]
},
{
"cell_type": "code",
"id": "22d84451-b7e3-4dba-9758-068dae23ace4",
"metadata": {},
"outputs": [],
"source": [
"def build_metric_table(property_name_for):\n",
"    \"\"\"Look up one metric for every (dataset, ASR model) pair.\n",
"\n",
"    Args:\n",
"        property_name_for: Callable ``(dataset, asr_model) -> property name``, where\n",
"            ``asr_model`` is the result of ``get_model_for_dataset_name(dataset, model)``.\n",
"\n",
"    Returns:\n",
"        One row per entry of ``FULL_DATASET_NAMES``, each holding one value per entry of\n",
"        ``FULL_LANGUAGE_MODELS``. Keys missing from ``vals`` are reported as ``-1.0``.\n",
"    \"\"\"\n",
"    def lookup(dataset, model):\n",
"        asr_model = get_model_for_dataset_name(dataset, model)\n",
"        # Keys in ``vals`` have the form '<dataset>_<property name>'.\n",
"        return vals.get(f'{dataset}_{property_name_for(dataset, asr_model)}', -1.0)\n",
"\n",
"    return [\n",
"        [lookup(dataset, model) for model in FULL_LANGUAGE_MODELS]\n",
"        for dataset in FULL_DATASET_NAMES\n",
"    ]\n",
"\n",
"\n",
"# spaCy-based metrics: the spaCy model depends on the dataset's language code (first two characters).\n",
"spacy_ner = build_metric_table(\n",
"    lambda dataset, asr_model: PropertyHelper.ner_metrics(asr_model, get_spacy_model_name(dataset[:2])))\n",
"spacy_pos = build_metric_table(\n",
"    lambda dataset, asr_model: PropertyHelper.pos_metrics(asr_model, get_spacy_model_name(dataset[:2])))\n",
"spacy_dep = build_metric_table(\n",
"    lambda dataset, asr_model: PropertyHelper.dep_tag_metrics(asr_model, get_spacy_model_name(dataset[:2])))\n",
"\n",
"# Word-level WER variants depend only on the ASR model.\n",
"word_wer_classic_metrics = build_metric_table(\n",
"    lambda dataset, asr_model: PropertyHelper.word_wer_classic_metrics(asr_model))\n",
"word_wer_soft_metrics = build_metric_table(\n",
"    lambda dataset, asr_model: PropertyHelper.word_wer_soft_metrics(asr_model))\n",
"word_wer_embedding_metrics = build_metric_table(\n",
"    lambda dataset, asr_model: PropertyHelper.word_wer_embeddings_metrics(asr_model))\n",
"\n",
"# POS metrics computed with the fixed 'flair_upos_multi' tagger instead of a per-language spaCy model.\n",
"flair_pos = build_metric_table(\n",
"    lambda dataset, asr_model: PropertyHelper.pos_metrics(asr_model, 'flair_upos_multi'))\n",
"wikineural_ner = [\n",
" [vals.get(f'{dataset}_{PropertyHelper.ner_metrics(get_model_for_dataset_name(dataset, model), \"wikineural\")}', -1.0) for model in FULL_LANGUAGE_MODELS]\n",
"execution_count": 32,
"id": "0d9a4977-edce-4c8e-aebe-b76781901512",
"metadata": {},
"outputs": [],
"source": [
"def df_to_latex(df: pd.DataFrame, name: str) -> None:\n",
"    \"\"\"Write ``df`` rendered as a LaTeX table to the file at path ``name``.\n",
"\n",
"    Args:\n",
"        df: Frame to render. The frame passed in is the one written; no notebook\n",
"            global is consulted.\n",
"        name: Path of the output file; it is overwritten if it already exists.\n",
"    \"\"\"\n",
"    # Raise the column-width limit so long cell values are not truncated in the LaTeX output.\n",
"    with pd.option_context(\"max_colwidth\", 1000):\n",
"        with open(name, 'w') as writer:\n",
"            writer.write(df.to_latex())\n",
"\n",
"\n",
"def summarize_df(arr: List[List[float]], name: str) -> pd.DataFrame:\n",
"    \"\"\"Turn a metric table into a labelled DataFrame and export it as CSV and LaTeX.\n",
"\n",
"    Args:\n",
"        arr: One row per entry of ``FULL_DATASET_NAMES``, one value per entry of\n",
"            ``FULL_LANGUAGE_MODELS``.\n",
"        name: Base file name; the outputs are ``results/<name>.csv`` and ``results/<name>.tex``.\n",
"\n",
"    Returns:\n",
"        The DataFrame that was written, indexed by dataset with one column per model.\n",
"    \"\"\"\n",
"    summary_df = pd.DataFrame(arr, columns=FULL_LANGUAGE_MODELS, index=FULL_DATASET_NAMES)\n",
"    summary_df.to_csv(f'results/{name}.csv')\n",
"    # Export the frame built from ``arr`` so the .tex file matches the .csv file.\n",
"    df_to_latex(summary_df, f'results/{name}.tex')\n",
"    return summary_df"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "45fd851c-644f-48e6-b711-5bd312404b8b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/t8/4j9s5lbj1cbbn0xj92r0g31c0000gn/T/ipykernel_59977/2461695209.py:4: FutureWarning: In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.\n",
" writer.write(spacy_ner_df.to_latex())\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>whisper_tiny</th>\n",
" <th>whisper_base</th>\n",
" <th>whisper_small</th>\n",
" <th>whisper_medium</th>\n",
" <th>whisper_large-v2</th>\n",
" <th>facebook_wav2vec2</th>\n",
" <th>nvidia_stt</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>nl_google_fleurs</th>\n",
" <td>0.316124</td>\n",
" <td>0.230845</td>\n",
" <td>0.186936</td>\n",
" <td>0.170150</td>\n",
" <td>0.165057</td>\n",
" <td>0.082781</td>\n",
" <td>-1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>nl_minds14</th>\n",
" <td>0.463084</td>\n",
" <td>0.409993</td>\n",
" <td>0.360934</td>\n",
" <td>0.331613</td>\n",
" <td>0.324172</td>\n",
" <td>0.142155</td>\n",
" <td>-1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>nl_voxpopuli</th>\n",
" <td>0.215158</td>\n",
" <td>0.178716</td>\n",
" <td>0.132960</td>\n",
" <td>0.118042</td>\n",
" <td>0.139958</td>\n",
" <td>0.200403</td>\n",
" <td>-1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fr_google_fleurs</th>\n",
" <td>0.264291</td>\n",
" <td>0.193436</td>\n",
" <td>0.177302</td>\n",
" <td>0.147464</td>\n",
" <td>0.141276</td>\n",
" <td>0.083170</td>\n",
" <td>0.053155</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fr_minds14</th>\n",
" <td>0.466860</td>\n",
" <td>0.468822</td>\n",
" <td>0.471754</td>\n",
" <td>0.444854</td>\n",
" <td>0.485090</td>\n",
" <td>0.220358</td>\n",
" <td>0.189111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fr_voxpopuli</th>\n",
" <td>0.161386</td>\n",
" <td>0.131144</td>\n",
" <td>0.113097</td>\n",
" <td>0.099114</td>\n",
" <td>0.111776</td>\n",
" <td>0.169564</td>\n",
" <td>0.127958</td>\n",
" </tr>\n",
" <tr>\n",
" <th>de_google_fleurs</th>\n",
" <td>0.316175</td>\n",
" <td>0.257454</td>\n",
" <td>0.234163</td>\n",
" <td>0.239750</td>\n",
" <td>0.236715</td>\n",
" <td>0.083423</td>\n",
" <td>0.051673</td>\n",
" </tr>\n",
" <tr>\n",
" <th>de_minds14</th>\n",
" <td>0.435681</td>\n",
" <td>0.425712</td>\n",
" <td>0.412896</td>\n",
" <td>0.398617</td>\n",
" <td>0.398762</td>\n",
" <td>0.183933</td>\n",
" <td>0.146988</td>\n",
" </tr>\n",
" <tr>\n",
" <th>de_voxpopuli</th>\n",
" <td>0.200245</td>\n",
" <td>0.155502</td>\n",
" <td>0.133251</td>\n",
" <td>0.116949</td>\n",
" <td>0.156371</td>\n",
" <td>0.242498</td>\n",
" <td>0.168854</td>\n",
" </tr>\n",
" <tr>\n",
" <th>it_google_fleurs</th>\n",
" <td>0.206301</td>\n",
" <td>0.172527</td>\n",
" <td>0.161195</td>\n",
" <td>0.156655</td>\n",
" <td>0.160677</td>\n",
" <td>0.067181</td>\n",
" <td>0.039040</td>\n",
" </tr>\n",
" <tr>\n",
" <th>it_minds14</th>\n",
" <td>0.487493</td>\n",
" <td>0.448874</td>\n",
" <td>0.432679</td>\n",
" <td>0.416035</td>\n",
" <td>0.392705</td>\n",
" <td>0.198809</td>\n",
" <td>0.146235</td>\n",
" </tr>\n",
" <tr>\n",
" <th>it_voxpopuli</th>\n",
" <td>0.160365</td>\n",
" <td>0.139461</td>\n",
" <td>0.138966</td>\n",
" <td>0.123130</td>\n",
" <td>0.130691</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pl_google_fleurs</th>\n",
" <td>0.334936</td>\n",
" <td>0.273025</td>\n",
" <td>0.227662</td>\n",
" <td>0.210962</td>\n",
" <td>0.209027</td>\n",
" <td>0.088157</td>\n",
" <td>-1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pl_minds14</th>\n",
" <td>0.657194</td>\n",
" <td>0.591588</td>\n",
" <td>0.487344</td>\n",
" <td>0.474013</td>\n",
" <td>0.487891</td>\n",
" <td>0.237692</td>\n",
" <td>-1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pl_voxpopuli</th>\n",
" <td>0.203548</td>\n",
" <td>0.158526</td>\n",
" <td>0.126280</td>\n",
" <td>0.110784</td>\n",
" <td>0.117780</td>\n",
" <td>0.184368</td>\n",
" <td>-1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>es_google_fleurs</th>\n",
" <td>0.187607</td>\n",
" <td>0.159873</td>\n",
" <td>0.147104</td>\n",
" <td>0.155210</td>\n",
" <td>0.154657</td>\n",
" <td>0.057830</td>\n",
" <td>0.038903</td>\n",
" </tr>\n",
" <tr>\n",
" <th>es_minds14</th>\n",
" <td>0.721295</td>\n",
" <td>0.670363</td>\n",
" <td>0.666278</td>\n",
" <td>0.673058</td>\n",
" <td>0.680341</td>\n",
" <td>0.411927</td>\n",
" <td>0.342895</td>\n",
" </tr>\n",
" <tr>\n",
" <th>es_voxpopuli</th>\n",
" <td>0.133805</td>\n",
" <td>0.116222</td>\n",
" <td>0.119882</td>\n",
" <td>0.106610</td>\n",
" <td>0.122036</td>\n",
" <td>0.148225</td>\n",
" <td>0.128456</td>\n",
" </tr>\n",
" <tr>\n",
" <th>en_google_fleurs</th>\n",
" <td>0.217843</td>\n",
" <td>0.188810</td>\n",
" <td>0.186407</td>\n",
" <td>0.183656</td>\n",
" <td>0.184568</td>\n",
" <td>0.180523</td>\n",
" <td>0.071421</td>\n",
" </tr>\n",
" <tr>\n",
" <th>en_minds14</th>\n",
" <td>0.562068</td>\n",
" <td>0.566999</td>\n",
" <td>0.580369</td>\n",
" <td>0.583945</td>\n",
" <td>0.578079</td>\n",
" <td>0.325304</td>\n",
" <td>0.293083</td>\n",
" </tr>\n",
" <tr>\n",
" <th>en_voxpopuli</th>\n",
" <td>0.224980</td>\n",
" <td>0.203959</td>\n",
" <td>0.210278</td>\n",
" <td>0.322688</td>\n",
" <td>0.280877</td>\n",
" <td>0.182708</td>\n",
" <td>0.124416</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
" whisper_tiny whisper_base whisper_small whisper_medium \\\n",
"nl_google_fleurs 0.316124 0.230845 0.186936 0.170150 \n",
"nl_minds14 0.463084 0.409993 0.360934 0.331613 \n",
"nl_voxpopuli 0.215158 0.178716 0.132960 0.118042 \n",
"fr_google_fleurs 0.264291 0.193436 0.177302 0.147464 \n",
"fr_minds14 0.466860 0.468822 0.471754 0.444854 \n",
"fr_voxpopuli 0.161386 0.131144 0.113097 0.099114 \n",
"de_google_fleurs 0.316175 0.257454 0.234163 0.239750 \n",
"de_minds14 0.435681 0.425712 0.412896 0.398617 \n",
"de_voxpopuli 0.200245 0.155502 0.133251 0.116949 \n",
"it_google_fleurs 0.206301 0.172527 0.161195 0.156655 \n",
"it_minds14 0.487493 0.448874 0.432679 0.416035 \n",
"it_voxpopuli 0.160365 0.139461 0.138966 0.123130 \n",
"pl_google_fleurs 0.334936 0.273025 0.227662 0.210962 \n",
"pl_minds14 0.657194 0.591588 0.487344 0.474013 \n",
"pl_voxpopuli 0.203548 0.158526 0.126280 0.110784 \n",
"es_google_fleurs 0.187607 0.159873 0.147104 0.155210 \n",
"es_minds14 0.721295 0.670363 0.666278 0.673058 \n",
"es_voxpopuli 0.133805 0.116222 0.119882 0.106610 \n",
"en_google_fleurs 0.217843 0.188810 0.186407 0.183656 \n",
"en_minds14 0.562068 0.566999 0.580369 0.583945 \n",
"en_voxpopuli 0.224980 0.203959 0.210278 0.322688 \n",
"\n",
" whisper_large-v2 facebook_wav2vec2 nvidia_stt \n",
"nl_google_fleurs 0.165057 0.082781 -1.000000 \n",
"nl_minds14 0.324172 0.142155 -1.000000 \n",
"nl_voxpopuli 0.139958 0.200403 -1.000000 \n",
"fr_google_fleurs 0.141276 0.083170 0.053155 \n",
"fr_minds14 0.485090 0.220358 0.189111 \n",
"fr_voxpopuli 0.111776 0.169564 0.127958 \n",
"de_google_fleurs 0.236715 0.083423 0.051673 \n",
"de_minds14 0.398762 0.183933 0.146988 \n",
"de_voxpopuli 0.156371 0.242498 0.168854 \n",
"it_google_fleurs 0.160677 0.067181 0.039040 \n",
"it_minds14 0.392705 0.198809 0.146235 \n",
"it_voxpopuli 0.130691 -1.000000 0.153960 \n",
"pl_google_fleurs 0.209027 0.088157 -1.000000 \n",
"pl_minds14 0.487891 0.237692 -1.000000 \n",
"pl_voxpopuli 0.117780 0.184368 -1.000000 \n",
"es_google_fleurs 0.154657 0.057830 0.038903 \n",
"es_minds14 0.680341 0.411927 0.342895 \n",
"es_voxpopuli 0.122036 0.148225 0.128456 \n",
"en_google_fleurs 0.184568 0.180523 0.071421 \n",
"en_minds14 0.578079 0.325304 0.293083 \n",
"en_voxpopuli 0.280877 0.182708 0.124416 "
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"id": "6466877e-e744-4cb1-8d4f-f818e1d3ee7d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>whisper_tiny</th>\n",
" <th>whisper_base</th>\n",
" <th>whisper_small</th>\n",
" <th>whisper_medium</th>\n",
" <th>whisper_large-v2</th>\n",
" <th>facebook_wav2vec2</th>\n",
" <th>nvidia_stt</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>nl_google_fleurs</th>\n",
" <td>0.582916</td>\n",
" <td>0.427364</td>\n",
" <td>0.279190</td>\n",
" <td>0.229402</td>\n",
" <td>0.212373</td>\n",
" <td>0.160957</td>\n",
" <td>-1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>nl_minds14</th>\n",
" <td>0.888989</td>\n",
" <td>0.702107</td>\n",
" <td>0.511865</td>\n",
" <td>0.440081</td>\n",
" <td>0.415821</td>\n",
" <td>0.298583</td>\n",
" <td>-1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>nl_voxpopuli</th>\n",
" <td>0.451950</td>\n",
" <td>0.350228</td>\n",
" <td>0.233061</td>\n",
" <td>0.188461</td>\n",
" <td>0.208664</td>\n",
" <td>0.340656</td>\n",
" <td>-1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fr_google_fleurs</th>\n",
" <td>0.468415</td>\n",
" <td>0.338927</td>\n",
" <td>0.260157</td>\n",
" <td>0.207241</td>\n",
" <td>0.194587</td>\n",
" <td>0.141560</td>\n",
" <td>0.073667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fr_minds14</th>\n",
" <td>0.700735</td>\n",
" <td>0.619382</td>\n",
" <td>0.567487</td>\n",
" <td>0.513574</td>\n",
" <td>0.552826</td>\n",
" <td>0.336656</td>\n",
" <td>0.236770</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fr_voxpopuli</th>\n",
" <td>0.310661</td>\n",
" <td>0.235596</td>\n",
" <td>0.180943</td>\n",
" <td>0.153288</td>\n",
" <td>0.159867</td>\n",
" <td>0.245229</td>\n",
" <td>0.164607</td>\n",
" </tr>\n",
" <tr>\n",
" <th>de_google_fleurs</th>\n",
" <td>0.449640</td>\n",
" <td>0.344001</td>\n",
" <td>0.282088</td>\n",
" <td>0.275634</td>\n",
" <td>0.264093</td>\n",
" <td>0.094206</td>\n",
" <td>0.053148</td>\n",
" </tr>\n",
" <tr>\n",
" <th>de_minds14</th>\n",
" <td>0.608813</td>\n",
" <td>0.529599</td>\n",
" <td>0.472205</td>\n",
" <td>0.443094</td>\n",
" <td>0.441656</td>\n",
" <td>0.228980</td>\n",
" <td>0.157855</td>\n",
" </tr>\n",
" <tr>\n",
" <th>de_voxpopuli</th>\n",
" <td>0.347653</td>\n",
" <td>0.248060</td>\n",
" <td>0.198001</td>\n",
" <td>0.168237</td>\n",
" <td>0.205059</td>\n",
" <td>0.313704</td>\n",
" <td>0.203633</td>\n",
" </tr>\n",
" <tr>\n",
" <th>it_google_fleurs</th>\n",
" <td>0.364700</td>\n",
" <td>0.269092</td>\n",
" <td>0.218361</td>\n",
" <td>0.189632</td>\n",
" <td>0.189108</td>\n",
" <td>0.115212</td>\n",
" <td>0.057875</td>\n",
" </tr>\n",
" <tr>\n",
" <th>it_minds14</th>\n",
" <td>0.735663</td>\n",
" <td>0.597724</td>\n",
" <td>0.500377</td>\n",
" <td>0.438344</td>\n",
" <td>0.417785</td>\n",
" <td>0.285531</td>\n",
" <td>0.153250</td>\n",
" </tr>\n",
" <tr>\n",
" <th>it_voxpopuli</th>\n",
" <td>0.401738</td>\n",
" <td>0.332257</td>\n",
" <td>0.278988</td>\n",
" <td>0.245468</td>\n",
" <td>0.247638</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pl_google_fleurs</th>\n",
" <td>0.594285</td>\n",
" <td>0.452570</td>\n",
" <td>0.318702</td>\n",
" <td>0.276475</td>\n",
" <td>0.261194</td>\n",
" <td>0.184994</td>\n",
" <td>-1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pl_minds14</th>\n",
" <td>0.988993</td>\n",
" <td>0.853431</td>\n",
" <td>0.653693</td>\n",
" <td>0.585884</td>\n",
" <td>0.597468</td>\n",
" <td>0.454939</td>\n",
" <td>-1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pl_voxpopuli</th>\n",
" <td>0.374544</td>\n",
" <td>0.277290</td>\n",
" <td>0.198685</td>\n",
" <td>0.164524</td>\n",
" <td>0.161887</td>\n",
" <td>0.309752</td>\n",
" <td>-1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>es_google_fleurs</th>\n",
" <td>0.284499</td>\n",
" <td>0.224748</td>\n",
" <td>0.187365</td>\n",
" <td>0.189561</td>\n",
" <td>0.184028</td>\n",
" <td>0.096476</td>\n",
" <td>0.051401</td>\n",
" </tr>\n",
" <tr>\n",
" <th>es_minds14</th>\n",
" <td>0.880992</td>\n",
" <td>0.747677</td>\n",
" <td>0.695294</td>\n",
" <td>0.690749</td>\n",
" <td>0.697884</td>\n",
" <td>0.508818</td>\n",
" <td>0.384215</td>\n",
" </tr>\n",
" <tr>\n",
" <th>es_voxpopuli</th>\n",
" <td>0.252463</td>\n",
" <td>0.206225</td>\n",
" <td>0.229706</td>\n",
" <td>0.195846</td>\n",
" <td>0.231587</td>\n",
" <td>0.230351</td>\n",
" <td>0.173987</td>\n",
" </tr>\n",
" <tr>\n",
" <th>en_google_fleurs</th>\n",
" <td>0.295853</td>\n",
" <td>0.250928</td>\n",
" <td>0.224483</td>\n",
" <td>0.218855</td>\n",
" <td>0.218479</td>\n",
" <td>0.367414</td>\n",
" <td>0.078904</td>\n",
" </tr>\n",
" <tr>\n",
" <th>en_minds14</th>\n",
" <td>0.634351</td>\n",
" <td>0.623962</td>\n",
" <td>0.626942</td>\n",
" <td>0.626588</td>\n",
" <td>0.620953</td>\n",
" <td>0.584547</td>\n",
" <td>0.329282</td>\n",
" </tr>\n",
" <tr>\n",
" <th>en_voxpopuli</th>\n",
" <td>0.345836</td>\n",
" <td>0.319493</td>\n",
" <td>0.319060</td>\n",
" <td>0.466410</td>\n",
" <td>0.408949</td>\n",
" <td>0.377100</td>\n",
" <td>0.160883</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
" whisper_tiny whisper_base whisper_small whisper_medium \\\n",
"nl_google_fleurs 0.582916 0.427364 0.279190 0.229402 \n",
"nl_minds14 0.888989 0.702107 0.511865 0.440081 \n",
"nl_voxpopuli 0.451950 0.350228 0.233061 0.188461 \n",
"fr_google_fleurs 0.468415 0.338927 0.260157 0.207241 \n",
"fr_minds14 0.700735 0.619382 0.567487 0.513574 \n",
"fr_voxpopuli 0.310661 0.235596 0.180943 0.153288 \n",
"de_google_fleurs 0.449640 0.344001 0.282088 0.275634 \n",
"de_minds14 0.608813 0.529599 0.472205 0.443094 \n",
"de_voxpopuli 0.347653 0.248060 0.198001 0.168237 \n",
"it_google_fleurs 0.364700 0.269092 0.218361 0.189632 \n",
"it_minds14 0.735663 0.597724 0.500377 0.438344 \n",
"it_voxpopuli 0.401738 0.332257 0.278988 0.245468 \n",
"pl_google_fleurs 0.594285 0.452570 0.318702 0.276475 \n",
"pl_minds14 0.988993 0.853431 0.653693 0.585884 \n",
"pl_voxpopuli 0.374544 0.277290 0.198685 0.164524 \n",
"es_google_fleurs 0.284499 0.224748 0.187365 0.189561 \n",
"es_minds14 0.880992 0.747677 0.695294 0.690749 \n",
"es_voxpopuli 0.252463 0.206225 0.229706 0.195846 \n",
"en_google_fleurs 0.295853 0.250928 0.224483 0.218855 \n",
"en_minds14 0.634351 0.623962 0.626942 0.626588 \n",
"en_voxpopuli 0.345836 0.319493 0.319060 0.466410 \n",
"\n",
" whisper_large-v2 facebook_wav2vec2 nvidia_stt \n",
"nl_google_fleurs 0.212373 0.160957 -1.000000 \n",
"nl_minds14 0.415821 0.298583 -1.000000 \n",
"nl_voxpopuli 0.208664 0.340656 -1.000000 \n",
"fr_google_fleurs 0.194587 0.141560 0.073667 \n",
"fr_minds14 0.552826 0.336656 0.236770 \n",
"fr_voxpopuli 0.159867 0.245229 0.164607 \n",
"de_google_fleurs 0.264093 0.094206 0.053148 \n",
"de_minds14 0.441656 0.228980 0.157855 \n",
"de_voxpopuli 0.205059 0.313704 0.203633 \n",
"it_google_fleurs 0.189108 0.115212 0.057875 \n",
"it_minds14 0.417785 0.285531 0.153250 \n",
"it_voxpopuli 0.247638 -1.000000 0.236106 \n",
"pl_google_fleurs 0.261194 0.184994 -1.000000 \n",
"pl_minds14 0.597468 0.454939 -1.000000 \n",
"pl_voxpopuli 0.161887 0.309752 -1.000000 \n",
"es_google_fleurs 0.184028 0.096476 0.051401 \n",
"es_minds14 0.697884 0.508818 0.384215 \n",
"es_voxpopuli 0.231587 0.230351 0.173987 \n",
"en_google_fleurs 0.218479 0.367414 0.078904 \n",
"en_minds14 0.620953 0.584547 0.329282 \n",
"en_voxpopuli 0.408949 0.377100 0.160883 "
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tabulate spacy_pos with rows labelled by FULL_DATASET_NAMES and\n",
"# columns labelled by FULL_LANGUAGE_MODELS.\n",
"spacy_pos_df = pd.DataFrame(\n",
"    data=spacy_pos,\n",
"    index=FULL_DATASET_NAMES,\n",
"    columns=FULL_LANGUAGE_MODELS,\n",
")\n",
"# Save a CSV copy, then end the cell on the frame so the notebook renders it.\n",
"spacy_pos_df.to_csv('results/spacy_pos.csv')\n",
"spacy_pos_df"
"id": "77567361-b730-49f0-ab68-19ad335df1b1",
"metadata": {},
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>whisper_tiny</th>\n",
" <th>whisper_base</th>\n",
" <th>whisper_small</th>\n",
" <th>whisper_medium</th>\n",
" <th>whisper_large-v2</th>\n",
" <th>facebook_wav2vec2</th>\n",
" <th>nvidia_stt</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>nl_google_fleurs</th>\n",
" <td>0.699699</td>\n",
" <td>0.533595</td>\n",
" <td>0.366764</td>\n",
" <td>0.300730</td>\n",
" <td>0.282070</td>\n",
" <td>0.246416</td>\n",
" </tr>\n",
" <tr>\n",
" <th>nl_minds14</th>\n",
" <td>0.941359</td>\n",
" <td>0.778265</td>\n",
" <td>0.584732</td>\n",
" <td>0.511929</td>\n",
" <td>0.490065</td>\n",
" <td>0.376911</td>\n",
" </tr>\n",
" <tr>\n",
" <th>nl_voxpopuli</th>\n",
" <td>0.553280</td>\n",
" <td>0.435277</td>\n",
" <td>0.304322</td>\n",
" <td>0.252270</td>\n",
" <td>0.268306</td>\n",
" <td>0.430234</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fr_google_fleurs</th>\n",
" <td>0.580527</td>\n",
" <td>0.429523</td>\n",
" <td>0.337506</td>\n",
" <td>0.275466</td>\n",
" <td>0.259405</td>\n",
" <td>0.205104</td>\n",
" <td>0.114100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fr_minds14</th>\n",
" <td>0.800999</td>\n",
" <td>0.714124</td>\n",
" <td>0.647957</td>\n",
" <td>0.592392</td>\n",
" <td>0.613262</td>\n",
" <td>0.421050</td>\n",
" <td>0.284212</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fr_voxpopuli</th>\n",
" <td>0.387866</td>\n",
" <td>0.307476</td>\n",
" <td>0.240038</td>\n",
" <td>0.205174</td>\n",
" <td>0.210248</td>\n",
" <td>0.323655</td>\n",
" <td>0.232059</td>\n",
" </tr>\n",
" <tr>\n",
" <th>de_google_fleurs</th>\n",
" <td>0.519535</td>\n",
" <td>0.424735</td>\n",
" <td>0.360695</td>\n",
" <td>0.353459</td>\n",
" <td>0.345089</td>\n",
" <td>0.139605</td>\n",
" <td>0.074235</td>\n",
" </tr>\n",
" <tr>\n",
" <th>de_minds14</th>\n",
" <td>0.693370</td>\n",
" <td>0.628170</td>\n",
" <td>0.570571</td>\n",
" <td>0.543742</td>\n",
" <td>0.546479</td>\n",
" <td>0.288109</td>\n",
" <td>0.216011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>de_voxpopuli</th>\n",
" <td>0.396771</td>\n",
" <td>0.298134</td>\n",
" <td>0.236937</td>\n",
" <td>0.204998</td>\n",
" <td>0.241773</td>\n",
" <td>0.385364</td>\n",
" <td>0.271072</td>\n",
" </tr>\n",
" <tr>\n",
" <th>it_google_fleurs</th>\n",
" <td>0.453637</td>\n",
" <td>0.334587</td>\n",
" <td>0.269876</td>\n",
" <td>0.234494</td>\n",
" <td>0.232862</td>\n",
" <td>0.168723</td>\n",
" <td>0.089945</td>\n",
" </tr>\n",
" <tr>\n",
" <th>it_minds14</th>\n",
" <td>0.814580</td>\n",
" <td>0.681371</td>\n",
" <td>0.576940</td>\n",
" <td>0.511340</td>\n",
" <td>0.495661</td>\n",
" <td>0.376479</td>\n",
" <td>0.224318</td>\n",
" </tr>\n",
" <tr>\n",
" <th>it_voxpopuli</th>\n",
" <td>0.483728</td>\n",
" <td>0.401518</td>\n",
" <td>0.332556</td>\n",
" <td>0.290310</td>\n",
" <td>0.291917</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pl_google_fleurs</th>\n",
" <td>0.741445</td>\n",
" <td>0.580439</td>\n",
" <td>0.420468</td>\n",
" <td>0.365168</td>\n",
" <td>0.348206</td>\n",
" <td>0.303350</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pl_minds14</th>\n",
" <td>1.138465</td>\n",
" <td>0.999350</td>\n",
" <td>0.817470</td>\n",
" <td>0.738430</td>\n",
" <td>0.754548</td>\n",
" <td>0.587577</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pl_voxpopuli</th>\n",
" <td>0.479609</td>\n",
" <td>0.366738</td>\n",
" <td>0.257558</td>\n",