Newer
Older
{
"cells": [
{
"cell_type": "markdown",
"id": "9febd313-2446-49bb-8508-997c0b2bc0ca",
"metadata": {},
"source": [
"# Imports and configs"
]
},
{
"cell_type": "code",
"id": "1929f9bb-5060-4530-811b-823d69a5b00f",
"metadata": {},
"outputs": [],
"source": [
"from experiment.luna.luna_record_provider import LunaRecordProvider\n",
"from sziszapangma.integration.experiment_manager import ExperimentManager\n",
"from sziszapangma.integration.repository.multi_files_experiment_repository import MultiFilesExperimentRepository\n",
"from sziszapangma.integration.path_filter import ExtensionPathFilter\n",
"from pymongo import MongoClient\n",
"from spacy.tokens.doc import Doc\n",
"import pandas as pd\n",
"from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider\n",
"from sziszapangma.integration.repository.experiment_repository import ExperimentRepository\n",
"from dataclasses import dataclass\n",
"import itertools\n",
"from typing import Optional, Any, List\n",
"import numpy as np"
"id": "321a93d9-0c5d-4d42-ba8f-4b704a05d78c",
"metadata": {},
"outputs": [],
"source": [
"# Lift the row limit so displayed DataFrames are never truncated.\n",
"pd.options.display.max_rows = None"
]
},
{
"cell_type": "markdown",
"id": "1b0a963f-ef3b-4f0f-bebc-68a0ea3e4f6c",
"metadata": {},
"source": [
"# Load datasets and other pipeline objects"
]
},
{
"cell_type": "code",
"id": "bbe2a7bf-bb6d-42ee-b5ce-48e6ec7fcd94",
"metadata": {},
"outputs": [],
"source": [
"# Dataset root directories, given relative to the working directory.\n",
"VOICELAB_DATASET_DIRECTORY: str = 'experiment_data/dataset/voicelab_cbiz_testset_20220322'\n",
"LUNA_DATASET_DIRECTORY: str = 'experiment_data/dataset/LUNA.PL'"
]
},
{
"cell_type": "code",
"id": "4bb1a1c0-8784-4e0d-9426-13495718e087",
"metadata": {},
"outputs": [],
"source": [
"## repository collections\n",
"GOLD_TRANSCRIPT = 'gold_transcript'\n",
"GOLD_TRANSCRIPT_SPACY = 'gold_transcript_spacy'\n",
"\n",
"TECHMO_POLISH_ASR = 'techmo_polish_asr'\n",
"WORD_TECHMO_MERTICS_WER = 'word_techmo_metrics_wer'\n",
"WORD_TECHMO_ALIGNMENT_WER = 'word_techmo_alignment_wer'\n",
"TECHMO_SPACY = 'techmo_spacy'\n",
"POS_TECHMO_ALIGNMENT_WER = 'pos_techmo_alignment_wer'\n",
"POS_TECHMO_METRICS_WER = 'pos_techmo_metrics_wer'\n",
"TAG_SPACY_TECHMO_METRICS_WER_EMBEDDINGS = 'tag_spacy_techmo_metrics_wer_embeddings'\n",
"TAG_SPACY_TECHMO_ALIGNMENT_WER_EMBEDDINGS = 'tag_spacy_techmo_alignment_wer_embeddings'\n",
"\n",
"AJN_POLISH_ASR = 'ajn_polish_asr'\n",
"WORD_AJN_MERTICS_WER = 'word_ajn_metrics_wer'\n",
"WORD_AJN_ALIGNMENT_WER = 'word_ajn_alignment_wer'\n",
"AJN_SPACY = 'ajn_spacy'\n",
"POS_AJN_ALIGNMENT_WER = 'pos_ajn_metrics_wer'\n",
"POS_AJN_METRICS_WER = 'pos_ajn_alignment_wer'\n",
"TAG_SPACY_AJN_METRICS_WER_EMBEDDINGS = 'tag_spacy_ajn_metrics_wer_embeddings'\n",
"TAG_SPACY_AJN_ALIGNMENT_WER_EMBEDDINGS = 'tag_spacy_ajn_alignment_wer_embeddings'"
"id": "d4265253-755a-4160-97f7-72604fdf41d1",
"metadata": {},
"outputs": [],
"source": [
"@dataclass\n",
"class CollectionsConfig:\n",
" config_name: str\n",
" gold_transcript: str\n",
" gold_transcript_spacy: str\n",
" asr: str\n",
" word_asr_metric_wer: str\n",
" word_asr_alignment_wer: str\n",
" asr_spacy: str\n",
" pos_asr_metric_wer: str\n",
" pos_asr_alignment_wer: str\n",
" tag_metric_wer: str\n",
" tag_alignment_wer: str"
"id": "950b0bb8-e5ae-46e0-97a2-a832b7c8a70f",
"metadata": {},
"outputs": [],
"source": [
"techmo_connections_config = CollectionsConfig(\n",
" config_name='TECHMO ASR',\n",
" gold_transcript=GOLD_TRANSCRIPT,\n",
" gold_transcript_spacy=GOLD_TRANSCRIPT_SPACY,\n",
" asr=TECHMO_POLISH_ASR,\n",
" word_asr_metric_wer=WORD_TECHMO_MERTICS_WER,\n",
" word_asr_alignment_wer=WORD_TECHMO_ALIGNMENT_WER,\n",
" asr_spacy=TECHMO_SPACY,\n",
" pos_asr_metric_wer=POS_TECHMO_METRICS_WER,\n",
" pos_asr_alignment_wer=POS_TECHMO_ALIGNMENT_WER,\n",
" tag_metric_wer=TAG_SPACY_TECHMO_METRICS_WER_EMBEDDINGS,\n",
" tag_alignment_wer=TAG_SPACY_TECHMO_ALIGNMENT_WER_EMBEDDINGS\n",
")\n",
"ajn_connections_config = CollectionsConfig(\n",
" config_name='AJN ASR',\n",
" gold_transcript=GOLD_TRANSCRIPT,\n",
" gold_transcript_spacy=GOLD_TRANSCRIPT_SPACY,\n",
" asr=AJN_POLISH_ASR,\n",
" word_asr_metric_wer=WORD_AJN_MERTICS_WER,\n",
" word_asr_alignment_wer=WORD_AJN_ALIGNMENT_WER,\n",
" asr_spacy=AJN_SPACY,\n",
" pos_asr_metric_wer=POS_AJN_ALIGNMENT_WER,\n",
" pos_asr_alignment_wer=POS_AJN_METRICS_WER,\n",
" tag_metric_wer=TAG_SPACY_AJN_METRICS_WER_EMBEDDINGS,\n",
" tag_alignment_wer=TAG_SPACY_AJN_ALIGNMENT_WER_EMBEDDINGS\n",
"id": "4dec626b-02e4-4c78-a238-04ef2f090ea5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"voicelab examples count 800\n",
"luna examples count 500\n"
]
}
],
"source": [
"# One repository per dataset; both are created with the same first argument\n",
"# ('experiment_data/pipeline') and differ only in the second one.\n",
"voicelab_experiment_repository = MultiFilesExperimentRepository(\n",
"    'experiment_data/pipeline',\n",
"    'asr_benchmark_voicelab_cbiz_testset_20220322',\n",
")\n",
"luna_experiment_repository = MultiFilesExperimentRepository(\n",
"    'experiment_data/pipeline',\n",
"    'asr_benchmark_luna',\n",
")\n",
"\n",
"# Report how many record ids each repository returns.\n",
"print(f'voicelab examples count {len(voicelab_experiment_repository.get_all_record_ids())}')\n",
"print(f'luna examples count {len(luna_experiment_repository.get_all_record_ids())}')"
]
},
{
"cell_type": "code",
"id": "98c6ff1d-4fbd-4b68-9e23-ecea33852b12",
"metadata": {},
"outputs": [],
"source": [
"# Each provider receives a filter selecting the 'wav' files under its dataset\n",
"# root plus the root path handed to its relation manager.\n",
"voicelab_record_provider = VoicelabTelcoRecordProvider(\n",
"    ExtensionPathFilter(\n",
"        root_directory=VOICELAB_DATASET_DIRECTORY,\n",
"        extension='wav',\n",
"    ),\n",
"    relation_manager_root_path='experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322',\n",
")\n",
"\n",
"luna_record_provider = LunaRecordProvider(\n",
"    ExtensionPathFilter(\n",
"        # The LUNA root directory is extended with a further 'LUNA.PL' component.\n",
"        root_directory=f'{LUNA_DATASET_DIRECTORY}/LUNA.PL',\n",
"        extension='wav',\n",
"    ),\n",
"    relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "5bd3dcd6-6b32-480c-8937-07c770354ed1",
"metadata": {},
"outputs": [],
Loading full blame...