# main.py

from pydub import AudioSegment

from sziszapangma.integration.service_core.asr.asr_base_processor import AsrBaseProcessor
from sziszapangma.integration.service_core.asr.asr_result import AsrResult, WordTimeAlignment
from huggingsound import SpeechRecognitionModel


class SpeechbrainAsrProcessor(AsrBaseProcessor):
    """ASR processor that transcribes audio with the module-level ``model``.

    ``model`` is not defined by this class; it is resolved as a global each
    time ``process_asr`` runs, so it must be bound before the first call
    (this file binds it in its ``__main__`` block).
    """

    def process_asr(self, audio_file_path: str) -> AsrResult:
        """Transcribe one audio file and wrap the output in an ``AsrResult``.

        Args:
            audio_file_path: Path of the audio file passed to
                ``model.transcribe``.

        Returns:
            An ``AsrResult`` holding the whitespace-separated words of the
            transcription, the unmodified transcription text, and one
            ``WordTimeAlignment`` per start/end timestamp pair.
        """
        # transcribe() is given a one-element list, so only the first
        # result is relevant.
        transcription_result = model.transcribe([audio_file_path])[0]
        full_text = transcription_result['transcription']
        start_timestamps = transcription_result['start_timestamps']
        end_timestamps = transcription_result['end_timestamps']

        # NOTE(review): confirm that the timestamp lists are per word rather
        # than per character/token; otherwise the alignments will not line up
        # one-to-one with ``words``.
        return AsrResult(
            # split() without a separator yields no empty-string "words" for
            # an empty transcription or for leading/trailing/repeated spaces.
            words=full_text.split(),
            full_text=full_text,
            # Start and end timestamps are paired positionally.
            words_time_alignment=[
                WordTimeAlignment(start, end)
                for start, end in zip(start_timestamps, end_timestamps)
            ],
        )


if __name__ == '__main__':
    # Bind the module-level ``model`` that SpeechbrainAsrProcessor.process_asr
    # reads as a global; it must exist before the processor starts.
    model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-xls-r-1b-polish")

    # Create the processor and hand control to the base-class entry point.
    processor = SpeechbrainAsrProcessor()
    processor.start_processor()