# Imports and configs

In [12]:
from experiment.luna.luna_record_provider import LunaRecordProvider
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.repository.multi_files_experiment_repository import MultiFilesExperimentRepository
from sziszapangma.integration.path_filter import ExtensionPathFilter
from pymongo import MongoClient
from spacy.tokens.doc import Doc
import pandas as pd
from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
from dataclasses import dataclass
import itertools
from typing import Optional, Any, List
import numpy as np
from pprint import pprint

In [None]:
# Remove pandas' cap on displayed rows (None = no limit) so that long result
# tables further down are not truncated when rendered.
pd.options.display.max_rows = None

# Load datasets and other pipeline objects

In [39]:
# Dataset directories, given relative to the notebook's working directory.
# Used below as the root for the Voicelab record provider's path filter.
VOICELAB_DATASET_DIRECTORY = 'experiment_data/dataset/voicelab_cbiz_testset_20220322'
# The LUNA record provider below appends a further '/LUNA.PL' segment to this path.
LUNA_DATASET_DIRECTORY = 'experiment_data/dataset/LUNA.PL'

In [41]:
# One repository per benchmark; both live under the same pipeline directory
# and differ only in their second (benchmark name) argument.
voicelab_experiment_repository = MultiFilesExperimentRepository(
    'experiment_data/pipeline', 'asr_benchmark_voicelab_cbiz_testset_20220322')
luna_experiment_repository = MultiFilesExperimentRepository(
    'experiment_data/pipeline', 'asr_benchmark_luna')

# Sanity check: report how many record ids each repository exposes.
voicelab_repository_record_ids = voicelab_experiment_repository.get_all_record_ids()
print(f'voicelab examples count {len(voicelab_repository_record_ids)}')
luna_repository_record_ids = luna_experiment_repository.get_all_record_ids()
print(f'luna examples count {len(luna_repository_record_ids)}')

voicelab examples count 800
luna examples count 500


In [42]:
# Each record provider receives a path filter restricted to the 'wav'
# extension under the dataset directory, plus the directory holding that
# dataset's relation-manager data.
voicelab_wav_filter = ExtensionPathFilter(
    root_directory=VOICELAB_DATASET_DIRECTORY,
    extension='wav',
)
voicelab_record_provider = VoicelabTelcoRecordProvider(
    voicelab_wav_filter,
    relation_manager_root_path='experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322',
)

# The LUNA filter is rooted one level deeper, in the nested 'LUNA.PL' directory.
luna_wav_filter = ExtensionPathFilter(
    root_directory=f'{LUNA_DATASET_DIRECTORY}/LUNA.PL',
    extension='wav',
)
luna_record_provider = LunaRecordProvider(
    luna_wav_filter,
    relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna',
)

# Preview data

In [53]:
def show_ducklig_report(repository: ExperimentRepository, record_id: str):
    """Pretty-print the transcripts and duckling properties stored for one record.

    The report contains the gold transcript (its ``'word'`` entries joined
    with single spaces), the ``'full_text'`` of the ``techmo_polish_asr`` and
    ``ajn_polish_asr`` properties, and the ``*_duckling`` property stored
    alongside each of the three.

    Args:
        repository: Repository queried via ``get_property_for_key``.
        record_id: Identifier of the record to preview.

    Returns:
        None; the report is written with ``pprint``.
    """
    def fetch(property_name):
        # All lookups target the same record; only the property key varies.
        return repository.get_property_for_key(record_id, property_name)

    gold_words = (token['word'] for token in fetch('gold_transcript'))
    report = {'gold_transcript_text': ' '.join(gold_words)}
    report['gold_transcript_duckling'] = fetch('gold_transcript_duckling')
    report['techmo_asr'] = fetch('techmo_polish_asr')['full_text']
    report['techmo_duckling'] = fetch('techmo_duckling')
    report['ajn_asr'] = fetch('ajn_polish_asr')['full_text']
    report['ajn_duckling'] = fetch('ajn_duckling')
    pprint(report)

In [55]:
# Preview the first record returned by the LUNA provider. To inspect Voicelab
# instead, pass voicelab_experiment_repository together with the first item of
# list(voicelab_record_provider.get_all_records()).
luna_preview_records = list(luna_record_provider.get_all_records())
show_ducklig_report(luna_experiment_repository, luna_preview_records[0])

{'ajn_asr': 'czy dobre <unk> z parą petra <unk> <unk> <unk> <unk> chodziło o '
            'kopanie szczepień rtÉ to repeat od uczestników ochota tak jak '
            'przedtem - cenię i szanuję - <unk> <unk> <unk> <unk> hojnie trud '
            'wśród <unk> okazało się to echem nad nie zatrzymywał się nie '
            'zatrzymują się zatrzymywać w remont tunelu średnicowego i hołd '
            'koźla śródmieście z otrzymuje &quot; wykonano kociak otrzymują '
            'też fachowców furtokiem proszę bardzo  ',
 'ajn_duckling': [],
 'gold_transcript_duckling': [{'body': 'dzień',
                               'dim': 'duration',
                               'end': 9,
                               'latent': False,
                               'start': 4,
                               'value': {'day': 1,
                                         'normalized': {'unit': 'second',
                                                        'value': 86400},
                            