{ "cells": [ { "cell_type": "markdown", "id": "9febd313-2446-49bb-8508-997c0b2bc0ca", "metadata": {}, "source": [ "# Imports and configs" ] }, { "cell_type": "code", "execution_count": 1, "id": "1929f9bb-5060-4530-811b-823d69a5b00f", "metadata": {}, "outputs": [], "source": [ "from experiment.luna.luna_record_provider import LunaRecordProvider\n", "from sziszapangma.integration.experiment_manager import ExperimentManager\n", "from sziszapangma.integration.repository.multi_files_experiment_repository import MultiFilesExperimentRepository\n", "from sziszapangma.integration.path_filter import ExtensionPathFilter\n", "from pymongo import MongoClient\n", "from spacy.tokens.doc import Doc\n", "import pandas as pd\n", "from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider\n", "from sziszapangma.integration.repository.experiment_repository import ExperimentRepository\n", "from dataclasses import dataclass\n", "import itertools\n", "from typing import Optional, Any, List\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "id": "321a93d9-0c5d-4d42-ba8f-4b704a05d78c", "metadata": {}, "outputs": [], "source": [ "pd.set_option('display.max_rows', None)" ] }, { "cell_type": "markdown", "id": "1b0a963f-ef3b-4f0f-bebc-68a0ea3e4f6c", "metadata": {}, "source": [ "# Load datasets and other pipeline objects" ] }, { "cell_type": "code", "execution_count": 3, "id": "bbe2a7bf-bb6d-42ee-b5ce-48e6ec7fcd94", "metadata": {}, "outputs": [], "source": [ "VOICELAB_DATASET_DIRECTORY = 'experiment_data/dataset/voicelab_cbiz_testset_20220322'\n", "LUNA_DATASET_DIRECTORY = 'experiment_data/dataset/LUNA.PL'" ] }, { "cell_type": "code", "execution_count": 4, "id": "4bb1a1c0-8784-4e0d-9426-13495718e087", "metadata": {}, "outputs": [], "source": [ "## repository collections\n", "GOLD_TRANSCRIPT = 'gold_transcript'\n", "GOLD_TRANSCRIPT_SPACY = 'gold_transcript_spacy'\n", "\n", "TECHMO_POLISH_ASR = 'techmo_polish_asr'\n", "WORD_TECHMO_MERTICS_WER = 'word_techmo_metrics_wer'\n", "WORD_TECHMO_ALIGNMENT_WER = 'word_techmo_alignment_wer'\n", "TECHMO_SPACY = 'techmo_spacy'\n", "POS_TECHMO_ALIGNMENT_WER = 'pos_techmo_alignment_wer'\n", "POS_TECHMO_METRICS_WER = 'pos_techmo_metrics_wer'\n", "\n", "AJN_POLISH_ASR = 'ajn_polish_asr'\n", "WORD_AJN_MERTICS_WER = 'word_ajn_metrics_wer'\n", "WORD_AJN_ALIGNMENT_WER = 'word_ajn_alignment_wer'\n", "AJN_SPACY = 'ajn_spacy'\n", "POS_AJN_ALIGNMENT_WER = 'pos_ajn_metrics_wer'\n", "POS_AJN_METRICS_WER = 'pos_ajn_alignment_wer'" ] }, { "cell_type": "code", "execution_count": 5, "id": "d4265253-755a-4160-97f7-72604fdf41d1", "metadata": {}, "outputs": [], "source": [ "@dataclass\n", "class CollectionsConfig:\n", " config_name: str\n", " gold_transcript: str\n", " gold_transcript_spacy: str\n", " asr: str\n", " word_asr_metric_wer: str\n", " word_asr_alignment_wer: str\n", " asr_spacy: str\n", " pos_asr_metric_wer: str\n", " pos_asr_alignment_wer: str" ] }, { "cell_type": "code", "execution_count": 6, "id": "950b0bb8-e5ae-46e0-97a2-a832b7c8a70f", "metadata": {}, "outputs": [], "source": [ "techmo_connections_config = CollectionsConfig(\n", " config_name='TECHMO ASR',\n", " gold_transcript=GOLD_TRANSCRIPT,\n", " gold_transcript_spacy=GOLD_TRANSCRIPT_SPACY,\n", " asr=TECHMO_POLISH_ASR,\n", " word_asr_metric_wer=WORD_TECHMO_MERTICS_WER,\n", " word_asr_alignment_wer=WORD_TECHMO_ALIGNMENT_WER,\n", " asr_spacy=TECHMO_SPACY,\n", " pos_asr_metric_wer=POS_TECHMO_METRICS_WER,\n", " pos_asr_alignment_wer=POS_TECHMO_ALIGNMENT_WER\n", ")\n", "ajn_connections_config = CollectionsConfig(\n", " config_name='AJN ASR',\n", " gold_transcript=GOLD_TRANSCRIPT,\n", " gold_transcript_spacy=GOLD_TRANSCRIPT_SPACY,\n", " asr=AJN_POLISH_ASR,\n", " word_asr_metric_wer=WORD_AJN_MERTICS_WER,\n", " word_asr_alignment_wer=WORD_AJN_ALIGNMENT_WER,\n", " asr_spacy=AJN_SPACY,\n", " pos_asr_metric_wer=POS_AJN_ALIGNMENT_WER,\n", " pos_asr_alignment_wer=POS_AJN_METRICS_WER\n", ")" ] }, { "cell_type": "code", "execution_count": 7, "id": "4dec626b-02e4-4c78-a238-04ef2f090ea5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "voicelab examples count 800\n", "luna examples count 500\n" ] } ], "source": [ "voicelab_experiment_repository = MultiFilesExperimentRepository(\n", " 'experiment_data/pipeline',\n", " 'asr_benchmark_voicelab_cbiz_testset_20220322'\n", ")\n", "luna_experiment_repository = MultiFilesExperimentRepository(\n", " 'experiment_data/pipeline',\n", " 'asr_benchmark_luna'\n", ")\n", "print(f'voicelab examples count {len(voicelab_experiment_repository.get_all_record_ids())}')\n", "print(f'luna examples count {len(luna_experiment_repository.get_all_record_ids())}')" ] }, { "cell_type": "code", "execution_count": 8, "id": "98c6ff1d-4fbd-4b68-9e23-ecea33852b12", "metadata": {}, "outputs": [], "source": [ "voicelab_record_provider = VoicelabTelcoRecordProvider(ExtensionPathFilter(\n", " root_directory=VOICELAB_DATASET_DIRECTORY,\n", " extension='wav',\n", "), relation_manager_root_path='experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322')\n", "\n", "luna_record_provider = LunaRecordProvider(ExtensionPathFilter(\n", " root_directory=f'{LUNA_DATASET_DIRECTORY}/LUNA.PL',\n", " extension='wav',\n", "), relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna')" ] }, { "cell_type": "code", "execution_count": 9, "id": "74bf17ba-8572-4758-8116-794708b6ea08", "metadata": {}, "outputs": [], "source": [ "def get_gold_transcript_for(record_id: str, experiment_repository: ExperimentRepository, \n", " collections_config: CollectionsConfig) -> str:\n", " gold_trnascript_property = experiment_repository.get_property_for_key(\n", " record_id, collections_config.gold_transcript)\n", " return ' '.join([it['word'] for it in gold_trnascript_property])\n", "\n", "def get_asr_transcript_for(record_id: str, experiment_repository: ExperimentRepository,\n", " collections_config: CollectionsConfig) -> str:\n", " gold_trnascript_property = experiment_repository.get_property_for_key(record_id, collections_config.asr)\n", " return gold_trnascript_property['full_text']\n", "\n", "def get_word_alignment_df(record_id: str, experiment_repository: ExperimentRepository, \n", " collections_config: CollectionsConfig) -> pd.DataFrame:\n", " word_alignment_wer = experiment_repository.get_property_for_key(record_id, \n", " collections_config.word_asr_alignment_wer)\n", " arr = [\n", " {\n", " 'step_type': it['step_type'],\n", " 'reference_word_text': it['step_words']['reference_word']['text'] \n", " if 'reference_word' in it['step_words'] else '',\n", " 'hypothesis_word_text': it['step_words']['hypothesis_word']['text']\n", " if 'hypothesis_word' in it['step_words'] else '',\n", " }\n", " for it in word_alignment_wer\n", " ]\n", " return pd.DataFrame(arr)\n", "\n", "\n", "def get_pos_alignment_df(record_id: str, experiment_repository: ExperimentRepository, \n", " collections_config: CollectionsConfig) -> pd.DataFrame:\n", " pos_alignment_wer = experiment_repository.get_property_for_key(record_id, \n", " collections_config.pos_asr_alignment_wer)\n", " gold_transcript_spacy = experiment_repository.get_property_for_key(record_id,\n", " collections_config.gold_transcript_spacy)\n", " gold_trnascript_spacy_word_dict = {it['id']: it['word'] for it in gold_transcript_spacy}\n", " asr_spacy = experiment_repository.get_property_for_key(record_id, collections_config.asr_spacy)\n", " asr_spacy_word_dict = {it['id']: it['word'] for it in asr_spacy}\n", " arr = [\n", " {\n", " 'step_type': it['step_type'],\n", " 'reference_word_pos': it['step_words']['reference_word']['text'] \n", " if 'reference_word' in it['step_words'] else '',\n", " 'reference_word_text': gold_trnascript_spacy_word_dict[it['step_words']['reference_word']['id']] \n", " if 'reference_word' in it['step_words'] else '',\n", " 'hypothesis_word_pos': it['step_words']['hypothesis_word']['text']\n", " if 'hypothesis_word' in it['step_words'] else '',\n", " 'hypothesis_word_text': asr_spacy_word_dict[it['step_words']['hypothesis_word']['id']] \n", " if 'hypothesis_word' in it['step_words'] else ''\n", " }\n", " for it in pos_alignment_wer\n", " ]\n", " return pd.DataFrame(arr)\n", " \n", " \n", "def show_report_for(record_id: str, experiment_repository: ExperimentRepository,\n", " collections_config: CollectionsConfig):\n", " print('record_id:', record_id)\n", " print('properties_confiog:', collections_config.config_name)\n", " print()\n", " print(f'gold transcript: {get_gold_transcript_for(record_id, experiment_repository, collections_config)}') \n", " print()\n", " print(f'asr transcript: {get_asr_transcript_for(record_id, experiment_repository, collections_config)}')\n", " print()\n", " print(f\"word wer {experiment_repository.get_property_for_key(record_id, collections_config.word_asr_metric_wer)}\")\n", " print()\n", " print(f\"pos wer {experiment_repository.get_property_for_key(record_id, collections_config.pos_asr_metric_wer)}\")\n", " print()\n", " display(get_word_alignment_df(record_id, experiment_repository, collections_config))\n", " print()\n", " display(get_pos_alignment_df(record_id, experiment_repository, collections_config))\n", " print('--------------------------------------------------------------')\n", " print('--------------------------------------------------------------')\n", " print('--------------------------------------------------------------')" ] }, { "cell_type": "markdown", "id": "327bbc4b-7bcb-4a28-8d4a-22660ecaf6c2", "metadata": {}, "source": [ "# Report for LUNA and TECHMO ASR" ] }, { "cell_type": "code", "execution_count": null, "id": "3395e091-3488-4b49-aede-af15a0055a8b", "metadata": {}, "outputs": [], "source": [ "for it in list(luna_record_provider.get_all_records())[:1]:\n", " show_report_for(it, luna_experiment_repository, techmo_connections_config)" ] }, { "cell_type": "markdown", "id": "a4b4f1bc-f0b3-4d23-bf1d-97f75715f332", "metadata": {}, "source": [ "# Report for LUNA and AJN ASR" ] }, { "cell_type": "code", "execution_count": null, "id": "27797bad-c92a-45a6-87f2-071645c18150", "metadata": {}, "outputs": [], "source": [ "for it in list(luna_record_provider.get_all_records())[:1]:\n", " show_report_for(it, luna_experiment_repository, ajn_connections_config)" ] }, { "cell_type": "markdown", "id": "a698d611-d520-47bf-83ed-2f116a2327d7", "metadata": {}, "source": [ "# Report for VOICELAB and TECHMO ASR" ] }, { "cell_type": "code", "execution_count": null, "id": "b275d21a-7405-46d9-9764-8fc744fa0ea0", "metadata": {}, "outputs": [], "source": [ "for it in list(voicelab_record_provider.get_all_records())[:1]:\n", " show_report_for(it, voicelab_experiment_repository, techmo_connections_config)" ] }, { "cell_type": "code", "execution_count": null, "id": "413a547c-10fc-4873-9db3-8655678f6f9b", "metadata": {}, "outputs": [], "source": [ "for it in list(voicelab_record_provider.get_all_records())[:1]:\n", " show_report_for(it, voicelab_experiment_repository, ajn_connections_config)" ] }, { "cell_type": "markdown", "id": "32e0753f-debf-4946-9e14-1876e9e26e0c", "metadata": {}, "source": [ "# Statistics of POS incorrect data" ] }, { "cell_type": "code", "execution_count": 22, "id": "aec7538f-60e7-4eee-9c3a-9875135bed1f", "metadata": {}, "outputs": [], "source": [ "@dataclass\n", "class AlignDetails:\n", " value_reference: Optional[str]\n", " value_hypothesis: Optional[str]\n", " \n", " @staticmethod\n", " def _get_string_or_default(value: Optional[str]) -> str:\n", " return value if value is not None else '___'\n", " \n", " def to_string(self) -> str:\n", " return f'{self._get_string_or_default(self.value_reference)} -> {self._get_string_or_default(self.value_hypothesis)}'\n", " \n", " def is_correct(self) -> bool:\n", " return self.value_reference == self.value_hypothesis\n", " \n", " \n", "def get_align_details(repository_record: List[Any]) -> List[AlignDetails]:\n", " return [\n", " AlignDetails(\n", " value_reference=it['step_words']['reference_word']['text'] if 'reference_word' in it['step_words'] else None,\n", " value_hypothesis=it['step_words']['hypothesis_word']['text'] if 'hypothesis_word' in it['step_words'] else None\n", " )\n", " for it in repository_record\n", " ]\n", "\n", "\n", "def show_stats(experiment_repository: ExperimentRepository, collections_config: CollectionsConfig):\n", " aggregated_aligned = [\n", " get_align_details(experiment_repository.get_property_for_key(record_id, collections_config.pos_asr_alignment_wer))\n", " for record_id in list(experiment_repository.get_all_record_ids())[:]\n", " ]\n", " list_of_aligns = list(itertools.chain(*aggregated_aligned))\n", " list_of_changes = [it.to_string() for it in list_of_aligns if not it.is_correct()]\n", " unique = np.unique(np.array(list_of_changes), return_counts=True)\n", " return pd.DataFrame({'values': unique[0], 'counts': unique[1]}).sort_values(by=['counts'], ascending=False).head(16)" ] }, { "cell_type": "markdown", "id": "264c3fd3-b890-4e24-ac8d-2756d171ccb8", "metadata": {}, "source": [ "# Luna Techmo" ] }, { "cell_type": "code", "execution_count": 24, "id": "1d456e71-301c-416f-b694-03ed7ccfdee0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>values</th>\n", " <th>counts</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>131</th>\n", " <td>NUM -> ___</td>\n", " <td>2290</td>\n", " </tr>\n", " <tr>\n", " <th>192</th>\n", " <td>PUNCT -> ___</td>\n", " <td>1479</td>\n", " </tr>\n", " <tr>\n", " <th>114</th>\n", " <td>NOUN -> ___</td>\n", " <td>1204</td>\n", " </tr>\n", " <tr>\n", " <th>169</th>\n", " <td>PROPN -> NOUN</td>\n", " <td>1177</td>\n", " </tr>\n", " <tr>\n", " <th>45</th>\n", " <td>ADV -> ___</td>\n", " <td>1073</td>\n", " </tr>\n", " <tr>\n", " <th>30</th>\n", " <td>ADP -> ___</td>\n", " <td>988</td>\n", " </tr>\n", " <tr>\n", " <th>246</th>\n", " <td>___ -> NOUN</td>\n", " <td>918</td>\n", " </tr>\n", " <tr>\n", " <th>130</th>\n", " <td>NUM -> X</td>\n", " <td>900</td>\n", " </tr>\n", " <tr>\n", " <th>225</th>\n", " <td>VERB -> ___</td>\n", " <td>883</td>\n", " </tr>\n", " <tr>\n", " <th>176</th>\n", " <td>PROPN -> ___</td>\n", " <td>823</td>\n", " </tr>\n", " <tr>\n", " <th>162</th>\n", " <td>PROPN -> ADJ</td>\n", " <td>662</td>\n", " </tr>\n", " <tr>\n", " <th>252</th>\n", " <td>___ -> VERB</td>\n", " <td>632</td>\n", " </tr>\n", " <tr>\n", " <th>161</th>\n", " <td>PRON -> ___</td>\n", " <td>613</td>\n", " </tr>\n", " <tr>\n", " <th>146</th>\n", " <td>PART -> ___</td>\n", " <td>577</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>ADJ -> ___</td>\n", " <td>560</td>\n", " </tr>\n", " <tr>\n", " <th>73</th>\n", " <td>CCONJ -> ___</td>\n", " <td>495</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " values counts\n", "131 NUM -> ___ 2290\n", "192 PUNCT -> ___ 1479\n", "114 NOUN -> ___ 1204\n", "169 PROPN -> NOUN 1177\n", "45 ADV -> ___ 1073\n", "30 ADP -> ___ 988\n", "246 ___ -> NOUN 918\n", "130 NUM -> X 900\n", "225 VERB -> ___ 883\n", "176 PROPN -> ___ 823\n", "162 PROPN -> ADJ 662\n", "252 ___ -> VERB 632\n", "161 PRON -> ___ 613\n", "146 PART -> ___ 577\n", "15 ADJ -> ___ 560\n", "73 CCONJ -> ___ 495" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "show_stats(luna_experiment_repository, techmo_connections_config)" ] }, { "cell_type": "markdown", "id": "7f018f54-75dc-47a5-8b04-839a008a3edc", "metadata": {}, "source": [ "## Luna AJN" ] }, { "cell_type": "code", "execution_count": 25, "id": "f7dd73cf-1fff-416b-9fed-d368d2fc65d0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>values</th>\n", " <th>counts</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>284</th>\n", " <td>___ -> PUNCT</td>\n", " <td>3344</td>\n", " </tr>\n", " <tr>\n", " <th>256</th>\n", " <td>VERB -> ___</td>\n", " <td>1645</td>\n", " </tr>\n", " <tr>\n", " <th>251</th>\n", " <td>VERB -> PUNCT</td>\n", " <td>1582</td>\n", " </tr>\n", " <tr>\n", " <th>279</th>\n", " <td>___ -> NOUN</td>\n", " <td>1548</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>ADP -> PUNCT</td>\n", " <td>1522</td>\n", " </tr>\n", " <tr>\n", " <th>52</th>\n", " <td>ADV -> ___</td>\n", " <td>1409</td>\n", " </tr>\n", " <tr>\n", " <th>34</th>\n", " <td>ADP -> ___</td>\n", " <td>1316</td>\n", " </tr>\n", " <tr>\n", " <th>124</th>\n", " <td>NOUN -> PUNCT</td>\n", " <td>1300</td>\n", " </tr>\n", " <tr>\n", " <th>46</th>\n", " <td>ADV -> PUNCT</td>\n", " <td>1197</td>\n", " </tr>\n", " <tr>\n", " <th>246</th>\n", " <td>VERB -> NOUN</td>\n", " <td>1186</td>\n", " </tr>\n", " <tr>\n", " <th>41</th>\n", " <td>ADV -> NOUN</td>\n", " <td>1167</td>\n", " </tr>\n", " <tr>\n", " <th>129</th>\n", " <td>NOUN -> ___</td>\n", " <td>1137</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>ADJ -> ___</td>\n", " <td>879</td>\n", " </tr>\n", " <tr>\n", " <th>162</th>\n", " <td>PART -> ___</td>\n", " <td>876</td>\n", " </tr>\n", " <tr>\n", " <th>178</th>\n", " <td>PRON -> ___</td>\n", " <td>843</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>ADJ -> PUNCT</td>\n", " <td>830</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " values counts\n", "284 ___ -> PUNCT 3344\n", "256 VERB -> ___ 1645\n", "251 VERB -> PUNCT 1582\n", "279 ___ -> NOUN 1548\n", "28 ADP -> PUNCT 1522\n", "52 ADV -> ___ 1409\n", "34 ADP -> ___ 1316\n", "124 NOUN -> PUNCT 1300\n", "46 ADV -> PUNCT 1197\n", "246 VERB -> NOUN 1186\n", "41 ADV -> NOUN 1167\n", "129 NOUN -> ___ 1137\n", "16 ADJ -> ___ 879\n", "162 PART -> ___ 876\n", "178 PRON -> ___ 843\n", "11 ADJ -> PUNCT 830" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "show_stats(luna_experiment_repository, ajn_connections_config)" ] }, { "cell_type": "markdown", "id": "e15befae-02ad-4517-9e97-58b3edf8c607", "metadata": {}, "source": [ "## VoiceLab Techmo" ] }, { "cell_type": "code", "execution_count": 26, "id": "9acb8d8e-7359-4e89-9902-ec5067d9a65d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>values</th>\n", " <th>counts</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>30</th>\n", " <td>ADP -> ___</td>\n", " <td>4527</td>\n", " </tr>\n", " <tr>\n", " <th>121</th>\n", " <td>NOUN -> ___</td>\n", " <td>4129</td>\n", " </tr>\n", " <tr>\n", " <th>151</th>\n", " <td>PART -> ___</td>\n", " <td>3585</td>\n", " </tr>\n", " <tr>\n", " <th>167</th>\n", " <td>PRON -> ___</td>\n", " <td>3424</td>\n", " </tr>\n", " <tr>\n", " <th>221</th>\n", " <td>VERB -> ___</td>\n", " <td>2935</td>\n", " </tr>\n", " <tr>\n", " <th>46</th>\n", " <td>ADV -> ___</td>\n", " <td>2727</td>\n", " </tr>\n", " <tr>\n", " <th>77</th>\n", " <td>CCONJ -> ___</td>\n", " <td>2360</td>\n", " </tr>\n", " <tr>\n", " <th>135</th>\n", " <td>NUM -> X</td>\n", " <td>1842</td>\n", " </tr>\n", " <tr>\n", " <th>136</th>\n", " <td>NUM -> ___</td>\n", " <td>1726</td>\n", " </tr>\n", " <tr>\n", " <th>92</th>\n", " <td>DET -> ___</td>\n", " <td>1715</td>\n", " </tr>\n", " <tr>\n", " <th>61</th>\n", " <td>AUX -> ___</td>\n", " <td>1634</td>\n", " </tr>\n", " <tr>\n", " <th>204</th>\n", " <td>SCONJ -> ___</td>\n", " <td>1587</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>ADJ -> ___</td>\n", " <td>1461</td>\n", " </tr>\n", " <tr>\n", " <th>244</th>\n", " <td>___ -> NOUN</td>\n", " <td>1251</td>\n", " </tr>\n", " <tr>\n", " <th>250</th>\n", " <td>___ -> VERB</td>\n", " <td>1178</td>\n", " </tr>\n", " <tr>\n", " <th>119</th>\n", " <td>NOUN -> VERB</td>\n", " <td>677</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " values counts\n", "30 ADP -> ___ 4527\n", "121 NOUN -> ___ 4129\n", "151 PART -> ___ 3585\n", "167 PRON -> ___ 3424\n", "221 VERB -> ___ 2935\n", "46 ADV -> ___ 2727\n", "77 CCONJ -> ___ 2360\n", "135 NUM -> X 1842\n", "136 NUM -> ___ 1726\n", "92 DET -> ___ 1715\n", "61 AUX -> ___ 1634\n", "204 SCONJ -> ___ 1587\n", "15 ADJ -> ___ 1461\n", "244 ___ -> NOUN 1251\n", "250 ___ -> VERB 1178\n", "119 NOUN -> VERB 677" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "show_stats(voicelab_experiment_repository, techmo_connections_config)" ] }, { "cell_type": "markdown", "id": "41e5bd80-87e0-4791-a87a-4c247ddb27cb", "metadata": {}, "source": [ "## VoiceLab AJN" ] }, { "cell_type": "code", "execution_count": 27, "id": "96b5b6d1-d7cd-4afb-adca-f6ed4bc1fedf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>values</th>\n", " <th>counts</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>259</th>\n", " <td>VERB -> ___</td>\n", " <td>11609</td>\n", " </tr>\n", " <tr>\n", " <th>140</th>\n", " <td>NOUN -> ___</td>\n", " <td>10416</td>\n", " </tr>\n", " <tr>\n", " <th>53</th>\n", " <td>ADV -> ___</td>\n", " <td>10127</td>\n", " </tr>\n", " <tr>\n", " <th>175</th>\n", " <td>PART -> ___</td>\n", " <td>9282</td>\n", " </tr>\n", " <tr>\n", " <th>35</th>\n", " <td>ADP -> ___</td>\n", " <td>8663</td>\n", " </tr>\n", " <tr>\n", " <th>192</th>\n", " <td>PRON -> ___</td>\n", " <td>8066</td>\n", " </tr>\n", " <tr>\n", " <th>287</th>\n", " <td>___ -> PUNCT</td>\n", " <td>6354</td>\n", " </tr>\n", " <tr>\n", " <th>105</th>\n", " <td>DET -> ___</td>\n", " <td>6147</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>ADJ -> ___</td>\n", " <td>5935</td>\n", " </tr>\n", " <tr>\n", " <th>231</th>\n", " <td>SCONJ -> ___</td>\n", " <td>5385</td>\n", " </tr>\n", " <tr>\n", " <th>254</th>\n", " <td>VERB -> PUNCT</td>\n", " <td>4842</td>\n", " </tr>\n", " <tr>\n", " <th>134</th>\n", " <td>NOUN -> PUNCT</td>\n", " <td>4632</td>\n", " </tr>\n", " <tr>\n", " <th>70</th>\n", " <td>AUX -> ___</td>\n", " <td>4016</td>\n", " </tr>\n", " <tr>\n", " <th>249</th>\n", " <td>VERB -> NOUN</td>\n", " <td>3772</td>\n", " </tr>\n", " <tr>\n", " <th>47</th>\n", " <td>ADV -> PUNCT</td>\n", " <td>3453</td>\n", " </tr>\n", " <tr>\n", " <th>88</th>\n", " <td>CCONJ -> ___</td>\n", " <td>3438</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " values counts\n", "259 VERB -> ___ 11609\n", "140 NOUN -> ___ 10416\n", "53 ADV -> ___ 10127\n", "175 PART -> ___ 9282\n", "35 ADP -> ___ 8663\n", "192 PRON -> ___ 8066\n", "287 ___ -> PUNCT 6354\n", "105 DET -> ___ 6147\n", "17 ADJ -> ___ 5935\n", "231 SCONJ -> ___ 5385\n", "254 VERB -> PUNCT 4842\n", "134 NOUN -> PUNCT 4632\n", "70 AUX -> ___ 4016\n", "249 VERB -> NOUN 3772\n", "47 ADV -> PUNCT 3453\n", "88 CCONJ -> ___ 3438" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "show_stats(voicelab_experiment_repository, ajn_connections_config)" ] }, { "cell_type": "code", "execution_count": null, "id": "17823c33-7065-43e6-9d2f-49a59fba26c1", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 5 }