from pathlib import Path
from typing import Dict, Any, List

from minio import Minio
from nltk import RegexpTokenizer

from new_datasets.import_datasets.upload_audio import process_numpy_array_to_md5_hash
from new_experiment.utils.hf_dataset_importer import HfDatasetImporter
from new_experiment.utils.minio_audio_record_repository import MinioAudioRecordRepository
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository


class FleursDatasetImporter(HfDatasetImporter):
    """Adapter that maps FLEURS-style records onto the ``HfDatasetImporter`` hooks.

    Each hook receives one dataset record (a plain dict) and extracts a single
    piece of information from it: the tokenized words, the transcription text,
    the audio file location, or the record identifier.
    """

    # Built once and shared by all calls: the tokenizer is configured only by
    # its pattern, so there is no need to recreate it for every record.
    _WORD_TOKENIZER = RegexpTokenizer(r'\w+')

    def __init__(self, experiment_repository: ExperimentRepository,
                 minio_audio_record_repository: MinioAudioRecordRepository, dataset_name: str):
        """Forward all dependencies unchanged to ``HfDatasetImporter``."""
        super().__init__(experiment_repository, minio_audio_record_repository, dataset_name)

    def get_words(self, record: Dict[str, Any]) -> List[str]:
        """Return the words of ``record['transcription']``.

        A word is a maximal run of ``\\w`` characters, so punctuation and
        whitespace are dropped from the result.
        """
        return self._WORD_TOKENIZER.tokenize(record['transcription'])

    def get_raw_transcription(self, record: Dict[str, Any]) -> str:
        """Return ``record['transcription']`` exactly as stored in the record."""
        # NOTE(review): this reads the same field that get_words() tokenizes.
        # If the dataset exposes a separate un-normalized text field, confirm
        # that 'transcription' is really the intended source here.
        return record['transcription']

    def get_audio_file(self, record: Dict[str, Any]) -> Path:
        """Return the location of the record's audio file as a ``Path``.

        The value stored under ``record['path']`` is wrapped in ``Path`` so the
        declared return type holds even when the record carries a plain string.
        Raises ``TypeError`` if the stored value is not path-like (e.g. ``None``).
        """
        return Path(record['path'])

    def get_record_id(self, record: Dict[str, Any]) -> str:
        """Return ``record['id']`` converted to ``str``."""
        return str(record["id"])
