Skip to content
Snippets Groups Projects
pos.ipynb 1.68 MiB
Newer Older
Marcin Wątroba's avatar
Marcin Wątroba committed
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "9febd313-2446-49bb-8508-997c0b2bc0ca",
   "metadata": {},
   "source": [
    "# Imports and configs"
   ]
  },
  {
   "cell_type": "code",
Marcin Wątroba's avatar
Marcin Wątroba committed
   "execution_count": 26,
Marcin Wątroba's avatar
Marcin Wątroba committed
   "id": "1929f9bb-5060-4530-811b-823d69a5b00f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from experiment.luna.luna_record_provider import LunaRecordProvider\n",
    "from sziszapangma.integration.experiment_manager import ExperimentManager\n",
    "from sziszapangma.integration.repository.multi_files_experiment_repository import MultiFilesExperimentRepository\n",
    "from sziszapangma.integration.path_filter import ExtensionPathFilter\n",
    "from pymongo import MongoClient\n",
    "from spacy.tokens.doc import Doc\n",
    "import pandas as pd\n",
    "from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider\n",
    "from sziszapangma.integration.repository.experiment_repository import ExperimentRepository\n",
    "from dataclasses import dataclass\n",
    "import itertools\n",
    "from typing import Optional, Any, List\n",
    "import numpy as np"
Marcin Wątroba's avatar
Marcin Wątroba committed
   ]
  },
  {
   "cell_type": "code",
Marcin Wątroba's avatar
Marcin Wątroba committed
   "execution_count": 27,
Marcin Wątroba's avatar
Marcin Wątroba committed
   "id": "321a93d9-0c5d-4d42-ba8f-4b704a05d78c",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.set_option('display.max_rows', None)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b0a963f-ef3b-4f0f-bebc-68a0ea3e4f6c",
   "metadata": {},
   "source": [
    "# Load datasets and other pipeline objects"
   ]
  },
  {
   "cell_type": "code",
Marcin Wątroba's avatar
Marcin Wątroba committed
   "execution_count": 28,
Marcin Wątroba's avatar
Marcin Wątroba committed
   "id": "bbe2a7bf-bb6d-42ee-b5ce-48e6ec7fcd94",
   "metadata": {},
   "outputs": [],
   "source": [
    "VOICELAB_DATASET_DIRECTORY = 'experiment_data/dataset/voicelab_cbiz_testset_20220322'\n",
    "LUNA_DATASET_DIRECTORY = 'experiment_data/dataset/LUNA.PL'"
   ]
  },
  {
   "cell_type": "code",
Marcin Wątroba's avatar
Marcin Wątroba committed
   "execution_count": 29,
Marcin Wątroba's avatar
Marcin Wątroba committed
   "id": "4bb1a1c0-8784-4e0d-9426-13495718e087",
   "metadata": {},
   "outputs": [],
   "source": [
    "## repository collections\n",
    "GOLD_TRANSCRIPT = 'gold_transcript'\n",
    "GOLD_TRANSCRIPT_SPACY = 'gold_transcript_spacy'\n",
    "\n",
    "TECHMO_POLISH_ASR = 'techmo_polish_asr'\n",
    "WORD_TECHMO_MERTICS_WER = 'word_techmo_metrics_wer'\n",
    "WORD_TECHMO_ALIGNMENT_WER = 'word_techmo_alignment_wer'\n",
    "TECHMO_SPACY = 'techmo_spacy'\n",
    "POS_TECHMO_ALIGNMENT_WER = 'pos_techmo_alignment_wer'\n",
    "POS_TECHMO_METRICS_WER = 'pos_techmo_metrics_wer'\n",
Marcin Wątroba's avatar
Marcin Wątroba committed
    "TAG_SPACY_TECHMO_METRICS_WER_EMBEDDINGS = 'tag_spacy_techmo_metrics_wer_embeddings'\n",
    "TAG_SPACY_TECHMO_ALIGNMENT_WER_EMBEDDINGS = 'tag_spacy_techmo_alignment_wer_embeddings'\n",
Marcin Wątroba's avatar
Marcin Wątroba committed
    "\n",
    "AJN_POLISH_ASR = 'ajn_polish_asr'\n",
    "WORD_AJN_MERTICS_WER = 'word_ajn_metrics_wer'\n",
    "WORD_AJN_ALIGNMENT_WER = 'word_ajn_alignment_wer'\n",
    "AJN_SPACY = 'ajn_spacy'\n",
    "POS_AJN_ALIGNMENT_WER = 'pos_ajn_metrics_wer'\n",
Marcin Wątroba's avatar
Marcin Wątroba committed
    "POS_AJN_METRICS_WER = 'pos_ajn_alignment_wer'\n",
    "TAG_SPACY_AJN_METRICS_WER_EMBEDDINGS = 'tag_spacy_ajn_metrics_wer_embeddings'\n",
    "TAG_SPACY_AJN_ALIGNMENT_WER_EMBEDDINGS = 'tag_spacy_ajn_alignment_wer_embeddings'"
Marcin Wątroba's avatar
Marcin Wątroba committed
   ]
  },
  {
   "cell_type": "code",
Marcin Wątroba's avatar
Marcin Wątroba committed
   "execution_count": 30,
Marcin Wątroba's avatar
Marcin Wątroba committed
   "id": "d4265253-755a-4160-97f7-72604fdf41d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "@dataclass\n",
    "class CollectionsConfig:\n",
    "    config_name: str\n",
    "    gold_transcript: str\n",
    "    gold_transcript_spacy: str\n",
    "    asr: str\n",
    "    word_asr_metric_wer: str\n",
    "    word_asr_alignment_wer: str\n",
    "    asr_spacy: str\n",
    "    pos_asr_metric_wer: str\n",
Marcin Wątroba's avatar
Marcin Wątroba committed
    "    pos_asr_alignment_wer: str\n",
    "    tag_metric_wer: str\n",
    "    tag_alignment_wer: str"
Marcin Wątroba's avatar
Marcin Wątroba committed
   ]
  },
  {
   "cell_type": "code",
Marcin Wątroba's avatar
Marcin Wątroba committed
   "execution_count": 31,
Marcin Wątroba's avatar
Marcin Wątroba committed
   "id": "950b0bb8-e5ae-46e0-97a2-a832b7c8a70f",
   "metadata": {},
   "outputs": [],
   "source": [
    "techmo_connections_config = CollectionsConfig(\n",
    "    config_name='TECHMO ASR',\n",
    "    gold_transcript=GOLD_TRANSCRIPT,\n",
    "    gold_transcript_spacy=GOLD_TRANSCRIPT_SPACY,\n",
    "    asr=TECHMO_POLISH_ASR,\n",
    "    word_asr_metric_wer=WORD_TECHMO_MERTICS_WER,\n",
    "    word_asr_alignment_wer=WORD_TECHMO_ALIGNMENT_WER,\n",
    "    asr_spacy=TECHMO_SPACY,\n",
    "    pos_asr_metric_wer=POS_TECHMO_METRICS_WER,\n",
Marcin Wątroba's avatar
Marcin Wątroba committed
    "    pos_asr_alignment_wer=POS_TECHMO_ALIGNMENT_WER,\n",
    "    tag_metric_wer=TAG_SPACY_TECHMO_METRICS_WER_EMBEDDINGS,\n",
    "    tag_alignment_wer=TAG_SPACY_TECHMO_ALIGNMENT_WER_EMBEDDINGS\n",
Marcin Wątroba's avatar
Marcin Wątroba committed
    ")\n",
    "ajn_connections_config = CollectionsConfig(\n",
    "    config_name='AJN ASR',\n",
    "    gold_transcript=GOLD_TRANSCRIPT,\n",
    "    gold_transcript_spacy=GOLD_TRANSCRIPT_SPACY,\n",
    "    asr=AJN_POLISH_ASR,\n",
    "    word_asr_metric_wer=WORD_AJN_MERTICS_WER,\n",
    "    word_asr_alignment_wer=WORD_AJN_ALIGNMENT_WER,\n",
    "    asr_spacy=AJN_SPACY,\n",
    "    pos_asr_metric_wer=POS_AJN_ALIGNMENT_WER,\n",
Marcin Wątroba's avatar
Marcin Wątroba committed
    "    pos_asr_alignment_wer=POS_AJN_METRICS_WER,\n",
    "    tag_metric_wer=TAG_SPACY_AJN_METRICS_WER_EMBEDDINGS,\n",
    "    tag_alignment_wer=TAG_SPACY_AJN_ALIGNMENT_WER_EMBEDDINGS\n",
Marcin Wątroba's avatar
Marcin Wątroba committed
    ")"
   ]
  },
  {
   "cell_type": "code",
Marcin Wątroba's avatar
Marcin Wątroba committed
   "execution_count": 32,
Marcin Wątroba's avatar
Marcin Wątroba committed
   "id": "4dec626b-02e4-4c78-a238-04ef2f090ea5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "voicelab examples count 800\n",
      "luna examples count 500\n"
Marcin Wątroba's avatar
Marcin Wątroba committed
     ]
    }
   ],
   "source": [
    "# Both repositories are created with the same first argument and differ only in the second.\n",
    "voicelab_experiment_repository = MultiFilesExperimentRepository(\n",
    "    'experiment_data/pipeline', 'asr_benchmark_voicelab_cbiz_testset_20220322'\n",
    ")\n",
    "luna_experiment_repository = MultiFilesExperimentRepository(\n",
    "    'experiment_data/pipeline', 'asr_benchmark_luna'\n",
    ")\n",
    "\n",
    "# Report how many record ids each repository returns.\n",
    "print(\n",
    "    f'voicelab examples count {len(voicelab_experiment_repository.get_all_record_ids())}',\n",
    "    f'luna examples count {len(luna_experiment_repository.get_all_record_ids())}',\n",
    "    sep='\\n',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
Marcin Wątroba's avatar
Marcin Wątroba committed
   "execution_count": 33,
Marcin Wątroba's avatar
Marcin Wątroba committed
   "id": "98c6ff1d-4fbd-4b68-9e23-ecea33852b12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each provider is built from an ExtensionPathFilter (dataset root + 'wav' extension)\n",
    "# and the root path of its relation-manager data.\n",
    "voicelab_record_provider = VoicelabTelcoRecordProvider(\n",
    "    ExtensionPathFilter(root_directory=VOICELAB_DATASET_DIRECTORY, extension='wav'),\n",
    "    relation_manager_root_path='experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322',\n",
    ")\n",
    "\n",
    "luna_record_provider = LunaRecordProvider(\n",
    "    ExtensionPathFilter(root_directory=f'{LUNA_DATASET_DIRECTORY}/LUNA.PL', extension='wav'),\n",
    "    relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
Marcin Wątroba's avatar
Marcin Wątroba committed
   "execution_count": 34,
   "id": "5bd3dcd6-6b32-480c-8937-07c770354ed1",
   "metadata": {},
   "outputs": [],
Loading full blame...