# Imports and configs

In [1]:
from experiment.luna.luna_record_provider import LunaRecordProvider
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.repository.multi_files_experiment_repository import MultiFilesExperimentRepository
from sziszapangma.integration.path_filter import ExtensionPathFilter
from pymongo import MongoClient
from spacy.tokens.doc import Doc
import pandas as pd
from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
from dataclasses import dataclass
import itertools
from typing import Optional, Any, List
import numpy as np

In [2]:
# Lift pandas' row limit so every displayed DataFrame is rendered in full.
pd.set_option('display.max_rows', None)

# Load datasets and other pipeline objects

In [3]:
# Root directories of the two datasets (relative paths); used below as the
# `root_directory` of the path filters given to the record providers.
VOICELAB_DATASET_DIRECTORY = 'experiment_data/dataset/voicelab_cbiz_testset_20220322'
LUNA_DATASET_DIRECTORY = 'experiment_data/dataset/LUNA.PL'

In [4]:
## repository collections
# String keys under which each pipeline artefact is read from the experiment
# repository (they reach `get_property_for_key` through `CollectionsConfig`).
# NOTE(review): "MERTICS" in the WORD_*_MERTICS_WER names is a typo for
# "METRICS"; the string values themselves are spelled correctly.
GOLD_TRANSCRIPT = 'gold_transcript'
GOLD_TRANSCRIPT_SPACY = 'gold_transcript_spacy'

TECHMO_POLISH_ASR = 'techmo_polish_asr'
WORD_TECHMO_MERTICS_WER = 'word_techmo_metrics_wer'
WORD_TECHMO_ALIGNMENT_WER = 'word_techmo_alignment_wer'
TECHMO_SPACY = 'techmo_spacy'
POS_TECHMO_ALIGNMENT_WER = 'pos_techmo_alignment_wer'
POS_TECHMO_METRICS_WER = 'pos_techmo_metrics_wer'

AJN_POLISH_ASR = 'ajn_polish_asr'
WORD_AJN_MERTICS_WER = 'word_ajn_metrics_wer'
WORD_AJN_ALIGNMENT_WER = 'word_ajn_alignment_wer'
AJN_SPACY = 'ajn_spacy'
# NOTE(review): the two POS_AJN_* names below are crossed with respect to their
# values (compare the POS_TECHMO_* pair above). `ajn_connections_config` passes
# them crossed a second time, so the keys that end up in the config are the
# intended ones. Any fix must change this pair and that config together.
POS_AJN_ALIGNMENT_WER = 'pos_ajn_metrics_wer'
POS_AJN_METRICS_WER = 'pos_ajn_alignment_wer'

In [5]:
@dataclass
class CollectionsConfig:
    """Repository property keys that describe the artefacts of one ASR system.

    Every field except ``config_name`` is a key that the helpers in this
    notebook pass to ``experiment_repository.get_property_for_key``.
    """
    # Label printed at the top of a per-record report.
    config_name: str
    # Key of the gold transcript; its items are read through their 'word' entry.
    gold_transcript: str
    # Key of the spaCy-processed gold transcript; items expose 'id' and 'word'.
    gold_transcript_spacy: str
    # Key of the ASR result; its 'full_text' entry is the hypothesis text.
    asr: str
    # Key of the word-level WER metric (printed as "word wer").
    word_asr_metric_wer: str
    # Key of the word-level alignment steps.
    word_asr_alignment_wer: str
    # Key of the spaCy-processed ASR output; items expose 'id' and 'word'.
    asr_spacy: str
    # Key of the POS-level WER metric (printed as "pos wer").
    pos_asr_metric_wer: str
    # Key of the POS-level alignment steps.
    pos_asr_alignment_wer: str
In [6]:
# Repository keys for the artefacts computed from the Techmo ASR output.
techmo_connections_config = CollectionsConfig(
    config_name='TECHMO ASR',
    gold_transcript=GOLD_TRANSCRIPT,
    gold_transcript_spacy=GOLD_TRANSCRIPT_SPACY,
    asr=TECHMO_POLISH_ASR,
    word_asr_metric_wer=WORD_TECHMO_MERTICS_WER,
    word_asr_alignment_wer=WORD_TECHMO_ALIGNMENT_WER,
    asr_spacy=TECHMO_SPACY,
    pos_asr_metric_wer=POS_TECHMO_METRICS_WER,
    pos_asr_alignment_wer=POS_TECHMO_ALIGNMENT_WER
)
# Repository keys for the artefacts computed from the AJN ASR output.
ajn_connections_config = CollectionsConfig(
    config_name='AJN ASR',
    gold_transcript=GOLD_TRANSCRIPT,
    gold_transcript_spacy=GOLD_TRANSCRIPT_SPACY,
    asr=AJN_POLISH_ASR,
    word_asr_metric_wer=WORD_AJN_MERTICS_WER,
    word_asr_alignment_wer=WORD_AJN_ALIGNMENT_WER,
    asr_spacy=AJN_SPACY,
    # NOTE(review): the next two lines look crossed, but the constants' values
    # are crossed too (POS_AJN_ALIGNMENT_WER == 'pos_ajn_metrics_wer' and
    # POS_AJN_METRICS_WER == 'pos_ajn_alignment_wer'), so each field receives
    # the intended key. Fix the constants and these two lines together.
    pos_asr_metric_wer=POS_AJN_ALIGNMENT_WER,
    pos_asr_alignment_wer=POS_AJN_METRICS_WER
)

In [7]:
# One repository per benchmark; both are created with the same first argument.
voicelab_experiment_repository = MultiFilesExperimentRepository(
    'experiment_data/pipeline', 'asr_benchmark_voicelab_cbiz_testset_20220322')
luna_experiment_repository = MultiFilesExperimentRepository(
    'experiment_data/pipeline', 'asr_benchmark_luna')

# Sanity check: report how many record ids each repository returns.
for dataset_label, repository in (('voicelab', voicelab_experiment_repository),
                                  ('luna', luna_experiment_repository)):
    print(f'{dataset_label} examples count {len(repository.get_all_record_ids())}')

voicelab examples count 800
luna examples count 500


In [8]:
# Record providers for both datasets, each built on an ExtensionPathFilter
# restricted to the 'wav' extension.
voicelab_path_filter = ExtensionPathFilter(
    root_directory=VOICELAB_DATASET_DIRECTORY,
    extension='wav',
)
voicelab_record_provider = VoicelabTelcoRecordProvider(
    voicelab_path_filter,
    relation_manager_root_path='experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322',
)

luna_path_filter = ExtensionPathFilter(
    root_directory=f'{LUNA_DATASET_DIRECTORY}/LUNA.PL',
    extension='wav',
)
luna_record_provider = LunaRecordProvider(
    luna_path_filter,
    relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna',
)

In [9]:
def get_gold_transcript_for(record_id: str, experiment_repository: ExperimentRepository,
                            collections_config: CollectionsConfig) -> str:
    """Return the gold transcript of a record as one space-separated string.

    The property stored under ``collections_config.gold_transcript`` is
    iterated and the ``'word'`` entry of every item is joined with single spaces.
    """
    gold_words = []
    for token in experiment_repository.get_property_for_key(
            record_id, collections_config.gold_transcript):
        gold_words.append(token['word'])
    return ' '.join(gold_words)

def get_asr_transcript_for(record_id: str, experiment_repository: ExperimentRepository,
                           collections_config: CollectionsConfig) -> str:
    """Return the ASR hypothesis text stored for a record.

    Reads the property stored under ``collections_config.asr`` and returns its
    ``'full_text'`` entry.
    """
    return experiment_repository.get_property_for_key(
        record_id, collections_config.asr)['full_text']

def get_word_alignment_df(record_id: str, experiment_repository: ExperimentRepository,
                        collections_config: CollectionsConfig) -> pd.DataFrame:
    """Build a table of the word-level alignment steps of one record.

    Each step read from ``collections_config.word_asr_alignment_wer`` becomes
    one row with the columns ``step_type``, ``reference_word_text`` and
    ``hypothesis_word_text``. A side that is absent from the step's
    ``'step_words'`` is rendered as an empty string.
    """
    alignment_steps = experiment_repository.get_property_for_key(
        record_id, collections_config.word_asr_alignment_wer)

    rows = []
    for step in alignment_steps:
        step_type = step['step_type']
        step_words = step['step_words']

        reference_text = ''
        if 'reference_word' in step_words:
            reference_text = step_words['reference_word']['text']

        hypothesis_text = ''
        if 'hypothesis_word' in step_words:
            hypothesis_text = step_words['hypothesis_word']['text']

        rows.append({
            'step_type': step_type,
            'reference_word_text': reference_text,
            'hypothesis_word_text': hypothesis_text,
        })
    return pd.DataFrame(rows)


def get_pos_alignment_df(record_id: str, experiment_repository: ExperimentRepository,
                        collections_config: CollectionsConfig) -> pd.DataFrame:
    """Build a table of the POS-level alignment steps of one record.

    Each step read from ``collections_config.pos_asr_alignment_wer`` becomes one
    row. The step's ``'text'`` values fill the ``*_word_pos`` columns, and the
    step's ``'id'`` values are looked up in the spaCy properties (gold and ASR)
    to fill the ``*_word_text`` columns. A side that is absent from the step's
    ``'step_words'`` yields empty strings in both of its columns.
    """
    alignment_steps = experiment_repository.get_property_for_key(
        record_id, collections_config.pos_asr_alignment_wer)

    # id -> word lookup tables for the reference and hypothesis sides.
    gold_tokens = experiment_repository.get_property_for_key(
        record_id, collections_config.gold_transcript_spacy)
    gold_word_by_id = {token['id']: token['word'] for token in gold_tokens}
    asr_tokens = experiment_repository.get_property_for_key(
        record_id, collections_config.asr_spacy)
    asr_word_by_id = {token['id']: token['word'] for token in asr_tokens}

    rows = []
    for step in alignment_steps:
        step_type = step['step_type']
        step_words = step['step_words']

        reference_pos = reference_text = ''
        if 'reference_word' in step_words:
            reference_pos = step_words['reference_word']['text']
            reference_text = gold_word_by_id[step_words['reference_word']['id']]

        hypothesis_pos = hypothesis_text = ''
        if 'hypothesis_word' in step_words:
            hypothesis_pos = step_words['hypothesis_word']['text']
            hypothesis_text = asr_word_by_id[step_words['hypothesis_word']['id']]

        rows.append({
            'step_type': step_type,
            'reference_word_pos': reference_pos,
            'reference_word_text': reference_text,
            'hypothesis_word_pos': hypothesis_pos,
            'hypothesis_word_text': hypothesis_text,
        })
    return pd.DataFrame(rows)
    
    
def show_report_for(record_id: str, experiment_repository: ExperimentRepository,
                    collections_config: CollectionsConfig) -> None:
    """Print and display a per-record comparison of the gold and ASR transcripts.

    The report contains, in order: the record id, the config name, the gold
    transcript, the ASR transcript, the stored word-level and POS-level WER
    properties, the word alignment table and the POS alignment table, followed
    by three separator lines.

    Args:
        record_id: Identifier of the record to report on.
        experiment_repository: Repository the properties are read from.
        collections_config: Property keys of the ASR system being reported.
    """
    print('record_id:', record_id)
    # Label fixed: the original printed the misspelled 'properties_confiog:'.
    print('properties_config:', collections_config.config_name)
    print()
    print(f'gold transcript: {get_gold_transcript_for(record_id, experiment_repository, collections_config)}')
    print()
    print(f'asr transcript: {get_asr_transcript_for(record_id, experiment_repository, collections_config)}')
    print()
    print(f"word wer {experiment_repository.get_property_for_key(record_id, collections_config.word_asr_metric_wer)}")
    print()
    print(f"pos wer {experiment_repository.get_property_for_key(record_id, collections_config.pos_asr_metric_wer)}")
    print()
    # NOTE(review): `display` is not imported in the visible import cell;
    # presumably it is the one the IPython kernel injects — confirm.
    display(get_word_alignment_df(record_id, experiment_repository, collections_config))
    print()
    display(get_pos_alignment_df(record_id, experiment_repository, collections_config))
    print('--------------------------------------------------------------')
    print('--------------------------------------------------------------')
    print('--------------------------------------------------------------')

# Report for LUNA and TECHMO ASR

In [None]:
# Only the first record returned by the provider is reported (`[:1]`).
luna_records = list(luna_record_provider.get_all_records())
for record_id in luna_records[:1]:
    show_report_for(record_id, luna_experiment_repository, techmo_connections_config)

# Report for LUNA and AJN ASR

In [None]:
# Only the first record returned by the provider is reported (`[:1]`).
luna_records = list(luna_record_provider.get_all_records())
for record_id in luna_records[:1]:
    show_report_for(record_id, luna_experiment_repository, ajn_connections_config)

# Report for VOICELAB and TECHMO ASR

In [None]:
# Only the first record returned by the provider is reported (`[:1]`).
voicelab_records = list(voicelab_record_provider.get_all_records())
for record_id in voicelab_records[:1]:
    show_report_for(record_id, voicelab_experiment_repository, techmo_connections_config)

In [None]:
# Report for VOICELAB and AJN ASR.
# Only the first record returned by the provider is reported (`[:1]`).
voicelab_records = list(voicelab_record_provider.get_all_records())
for record_id in voicelab_records[:1]:
    show_report_for(record_id, voicelab_experiment_repository, ajn_connections_config)

# Statistics of incorrect POS alignments

In [22]:
@dataclass
class AlignDetails:
    """Reference/hypothesis value pair of a single alignment step.

    Either side may be ``None``; it is then rendered as ``'___'``.
    """
    value_reference: Optional[str]
    value_hypothesis: Optional[str]

    @staticmethod
    def _get_string_or_default(value: Optional[str]) -> str:
        """Return ``value`` unchanged, or the ``'___'`` placeholder for ``None``."""
        if value is None:
            return '___'
        return value

    def to_string(self) -> str:
        """Render the pair as ``'<reference> -> <hypothesis>'``."""
        reference_label = self._get_string_or_default(self.value_reference)
        hypothesis_label = self._get_string_or_default(self.value_hypothesis)
        return f'{reference_label} -> {hypothesis_label}'

    def is_correct(self) -> bool:
        """Tell whether the reference and hypothesis values are equal."""
        values_match = self.value_reference == self.value_hypothesis
        return values_match
    
    
def get_align_details(repository_record: List[Any]) -> List[AlignDetails]:
    """Convert stored alignment steps into ``AlignDetails`` objects.

    For every step, the ``'text'`` of its reference and hypothesis words is
    taken from ``step['step_words']``. A side that is absent from the step
    becomes ``None``.
    """
    details = []
    for step in repository_record:
        reference_value = None
        if 'reference_word' in step['step_words']:
            reference_value = step['step_words']['reference_word']['text']

        hypothesis_value = None
        if 'hypothesis_word' in step['step_words']:
            hypothesis_value = step['step_words']['hypothesis_word']['text']

        details.append(AlignDetails(value_reference=reference_value,
                                    value_hypothesis=hypothesis_value))
    return details


def show_stats(experiment_repository: ExperimentRepository, collections_config: CollectionsConfig,
               top_n: int = 16) -> pd.DataFrame:
    """Count the most frequent POS alignment mismatches across all records.

    The POS alignment steps of every record in the repository are collected.
    Steps whose reference and hypothesis values differ are rendered as
    ``'<reference> -> <hypothesis>'`` (``'___'`` marks a missing side) and
    counted.

    Args:
        experiment_repository: Repository to read record ids and properties from.
        collections_config: Supplies the ``pos_asr_alignment_wer`` property key.
        top_n: Number of most frequent mismatches to return. Defaults to 16,
            the value that used to be hard-coded.

    Returns:
        A DataFrame with the columns ``values`` and ``counts``, sorted by
        ``counts`` in descending order and truncated to ``top_n`` rows.
    """
    # Iterate the record ids directly; the original's extra `list(...)[:]`
    # copy had no effect on the result.
    alignments_per_record = [
        get_align_details(experiment_repository.get_property_for_key(
            record_id, collections_config.pos_asr_alignment_wer))
        for record_id in experiment_repository.get_all_record_ids()
    ]
    mismatch_labels = [
        align.to_string()
        for align in itertools.chain.from_iterable(alignments_per_record)
        if not align.is_correct()
    ]
    values, counts = np.unique(np.array(mismatch_labels), return_counts=True)
    return (
        pd.DataFrame({'values': values, 'counts': counts})
        .sort_values(by=['counts'], ascending=False)
        .head(top_n)
    )

## Luna Techmo

In [24]:
# Most frequent POS mismatches: LUNA repository, Techmo ASR keys.
show_stats(luna_experiment_repository, techmo_connections_config)

Unnamed: 0,values,counts
131,NUM -> ___,2290
192,PUNCT -> ___,1479
114,NOUN -> ___,1204
169,PROPN -> NOUN,1177
45,ADV -> ___,1073
30,ADP -> ___,988
246,___ -> NOUN,918
130,NUM -> X,900
225,VERB -> ___,883
176,PROPN -> ___,823


## Luna AJN

In [25]:
# Most frequent POS mismatches: LUNA repository, AJN ASR keys.
show_stats(luna_experiment_repository, ajn_connections_config)

Unnamed: 0,values,counts
284,___ -> PUNCT,3344
256,VERB -> ___,1645
251,VERB -> PUNCT,1582
279,___ -> NOUN,1548
28,ADP -> PUNCT,1522
52,ADV -> ___,1409
34,ADP -> ___,1316
124,NOUN -> PUNCT,1300
46,ADV -> PUNCT,1197
246,VERB -> NOUN,1186


## VoiceLab Techmo

In [26]:
# Most frequent POS mismatches: VoiceLab repository, Techmo ASR keys.
show_stats(voicelab_experiment_repository, techmo_connections_config)

Unnamed: 0,values,counts
30,ADP -> ___,4527
121,NOUN -> ___,4129
151,PART -> ___,3585
167,PRON -> ___,3424
221,VERB -> ___,2935
46,ADV -> ___,2727
77,CCONJ -> ___,2360
135,NUM -> X,1842
136,NUM -> ___,1726
92,DET -> ___,1715


## VoiceLab AJN

In [27]:
# Most frequent POS mismatches: VoiceLab repository, AJN ASR keys.
show_stats(voicelab_experiment_repository, ajn_connections_config)

Unnamed: 0,values,counts
259,VERB -> ___,11609
140,NOUN -> ___,10416
53,ADV -> ___,10127
175,PART -> ___,9282
35,ADP -> ___,8663
192,PRON -> ___,8066
287,___ -> PUNCT,6354
105,DET -> ___,6147
17,ADJ -> ___,5935
231,SCONJ -> ___,5385
