import json
from pathlib import Path
from typing import Any, List

from nltk import RegexpTokenizer

from new_experiment.new_dependency_provider import get_experiment_repository
from new_experiment.utils.property_helper import PropertyHelper
from sziszapangma.model.model_creators import create_new_word


# Dataset names follow the pattern '<lang>_voxpopuli', e.g. 'de_voxpopuli'.

def get_words(raw: str) -> List[str]:
    """Tokenize ``raw`` into words.

    The text is run through an NLTK ``RegexpTokenizer`` built from the
    word-character pattern below, and the tokens it produces are returned.
    """
    word_pattern = r'\w+'
    return RegexpTokenizer(word_pattern).tokenize(raw)


_DEFAULT_CACHE_DIR = Path('/Users/marcinwatroba/Desktop/MY_PROJECTS/playground/librispeech')


def import_from_file(lang: str, cache_dir: Path = _DEFAULT_CACHE_DIR) -> None:
    """Import gold transcripts for one language from a JSONL cache file.

    Reads ``cache_items_{lang}_voxpopuli.jsonl`` from ``cache_dir`` and, for
    every record in it, stores two properties in the experiment repository
    of the ``{lang}_voxpopuli`` dataset:

    * the gold transcript words, built from the record's ``normalized_text``;
    * the raw gold transcript, taken from the record's ``raw_text``.

    Args:
        lang: Language code used in both the file name and the dataset name.
        cache_dir: Directory holding the cache files. Defaults to the
            location this script has always read from.

    Raises:
        FileNotFoundError: If the cache file for ``lang`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``audio_unique_id``, ``raw_text`` or
            ``normalized_text``.
    """
    path = Path(cache_dir) / f'cache_items_{lang}_voxpopuli.jsonl'
    # Decode explicitly as UTF-8 (the JSON interchange encoding) rather than
    # relying on the platform default, which breaks non-ASCII transcripts on
    # some systems.
    with path.open('r', encoding='utf-8') as reader:
        dataset_name = f'{lang}_voxpopuli'
        repo = get_experiment_repository(dataset_name)
        # Iterate the file lazily: this splits on real newlines only, whereas
        # str.splitlines() would also split on characters such as U+2028 that
        # may occur unescaped inside a JSON string.
        for line in reader:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            it_dict = json.loads(line)
            print(it_dict)
            record_id = str(it_dict['audio_unique_id'])
            raw_text = it_dict['raw_text']
            normalized_text_words = [
                create_new_word(it) for it in get_words(it_dict['normalized_text'])
            ]
            repo.update_property_for_key(
                record_id=record_id,
                property_name=PropertyHelper.get_gold_transcript_words(),
                property_value=normalized_text_words
            )
            repo.update_property_for_key(
                record_id=record_id,
                property_name=PropertyHelper.get_gold_transcript_raw(),
                property_value={'gold_transcript_raw': raw_text}
            )


if __name__ == '__main__':
    # Languages are imported one after another, in this order.
    # Italian ('it') is currently skipped; add it to the tuple to re-enable.
    for language in ('nl', 'fr', 'de', 'pl', 'es', 'en'):
        import_from_file(language)
