diff --git a/Dockerfile b/Dockerfile index 0bcb59c55e9396d006b6efb8976f81efed7c163f..84d3df412a1e53c63316fddd3b6326465fca4019 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,15 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y tzdata RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update && apt-get install -y python3.8 python3-pip ffmpeg RUN alias python='python3' && alias pip='pip3' && pip install poetry +RUN pip install spacy +RUN python -m spacy download de_core_news_lg +RUN python -m spacy download pl_core_news_lg +RUN python -m spacy download en_core_news_lg +RUN python -m spacy download it_core_news_lg +RUN python -m spacy download nl_core_news_lg +RUN python -m spacy download sp_core_news_lg +RUN python -m spacy download pt_core_news_lg + ADD poetry.lock ./ ADD pyproject.toml ./ ADD README.rst ./ diff --git a/experiment/pipeline_process_spacy_pos_wer.py b/experiment/pipeline_process_spacy_pos_wer.py index d3e8ebf172d8a480185f1d3607b72e1c33736a1c..191f4f7265746cdcdc8d0cad1e22b220f300aecb 100644 --- a/experiment/pipeline_process_spacy_pos_wer.py +++ b/experiment/pipeline_process_spacy_pos_wer.py @@ -8,11 +8,12 @@ from sziszapangma.integration.experiment_manager import ExperimentManager def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str): record_provider = get_record_provider(dataset_name) + language_code = dataset_name[:2] experiment_processor = ExperimentManager( record_id_iterator=record_provider, processing_tasks=[ SpacyPosSentenceWerProcessor( - model_name='pl_core_news_lg', + model_name=f'{language_code}_core_news_lg', gold_transcript_property_name=GOLD_TRANSCRIPT, asr_property_name=f'{asr_name}__result', alignment_property_name=f'{asr_name}__spacy_pos_alignment', @@ -22,7 +23,6 @@ def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str): ) ], experiment_repository=get_repository(dataset_name), - relation_manager_provider=record_provider ) experiment_processor.process() diff --git 
a/experiment/sentence_wer_processor/sentence_wer_processor.py b/experiment/sentence_wer_processor/sentence_wer_processor.py index bda3f20695b19a3e8ffefaf460a33f05fcac0461..a51a0826ec3f56cb3bb7e4e66385e0d524518700 100644 --- a/experiment/sentence_wer_processor/sentence_wer_processor.py +++ b/experiment/sentence_wer_processor/sentence_wer_processor.py @@ -32,14 +32,13 @@ class SentenceWerProcessor(ProcessingTask): def get_gold_transcript_text(self, record_id: str, experiment_repository: ExperimentRepository) -> str: property_value = experiment_repository.get_property_for_key(record_id, self._gold_transcript_property_name) - return ' '.join([it['word'] for it in property_value]) + return property_value['gold_transcript_raw'] def get_asr_text(self, record_id: str, experiment_repository: ExperimentRepository) -> str: property_value = experiment_repository.get_property_for_key(record_id, self._asr_property_name) return property_value['full_text'] - def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository, - relation_manager: RelationManager): + def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository): gold_transcript_text = self.get_gold_transcript_text(record_id, experiment_repository) asr_text = self.get_asr_text(record_id, experiment_repository) alignment = self._alignment_classic_calculator.calculate_alignment( diff --git a/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py b/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py new file mode 100644 index 0000000000000000000000000000000000000000..f0727d17afa961a0c7a1c6ec327cd5ce17ed435e --- /dev/null +++ b/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py @@ -0,0 +1,37 @@ +import argparse + +from experiment.const_pipeline_names import GOLD_TRANSCRIPT +from experiment.experiment_dependency_provider import get_record_provider, get_repository +from experiment.sentence_wer_processor.spacy_pos_sentence_dep_tag_processor import 
SpacyDepTagSentenceWerProcessor +from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository +from new_experiment.utils.get_spacy_model_name import get_spacy_model_name +from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper +from new_experiment.utils.property_helper import PropertyHelper +from sziszapangma.integration.experiment_manager import ExperimentManager + + +def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str): + repository = get_experiment_repository(dataset_name) + record_provider = LoadedRemoteDatasetHelper(repository, get_minio_audio_record_repository(), dataset_name) + language_code = dataset_name[:2] + model_name = get_spacy_model_name(language_code) + experiment_processor = ExperimentManager( + record_id_iterator=record_provider, + processing_tasks=[ + SpacyDepTagSentenceWerProcessor( + model_name=get_spacy_model_name(language_code), + gold_transcript_property_name=PropertyHelper.get_gold_transcript_raw(), + asr_property_name=PropertyHelper.asr_result(asr_name), + alignment_property_name=PropertyHelper.dep_tag_alignment(asr_name, model_name), + wer_property_name=PropertyHelper.dep_tag_metrics(asr_name, model_name), + task_name=f'SpacyDepTagSentenceWerProcessor___{dataset_name}___{asr_name}', + require_update=False + ) + ], + experiment_repository=repository, + ) + experiment_processor.process() + + +if __name__ == '__main__': + run_spacy_pos_wer_pipeline('de_minds14', 'whisper_tiny') diff --git a/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py b/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py new file mode 100644 index 0000000000000000000000000000000000000000..96cc9efcdd2b986ec32e9d3e1eaef8bf101d0d04 --- /dev/null +++ b/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py @@ -0,0 +1,37 @@ +import argparse + +from experiment.const_pipeline_names import GOLD_TRANSCRIPT +from experiment.experiment_dependency_provider import 
get_record_provider, get_repository +from experiment.sentence_wer_processor.spacy_ner_sentence_wer_processor import SpacyNerSentenceWerProcessor +from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository +from new_experiment.utils.get_spacy_model_name import get_spacy_model_name +from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper +from new_experiment.utils.property_helper import PropertyHelper +from sziszapangma.integration.experiment_manager import ExperimentManager + + +def run_spacy_ner_wer_pipeline(dataset_name: str, asr_name: str): + repository = get_experiment_repository(dataset_name) + record_provider = LoadedRemoteDatasetHelper(repository, get_minio_audio_record_repository(), dataset_name) + language_code = dataset_name[:2] + model_name = get_spacy_model_name(language_code) + experiment_processor = ExperimentManager( + record_id_iterator=record_provider, + processing_tasks=[ + SpacyNerSentenceWerProcessor( + model_name=model_name, + gold_transcript_property_name=PropertyHelper.get_gold_transcript_raw(), + asr_property_name=PropertyHelper.asr_result(asr_name), + alignment_property_name=PropertyHelper.ner_alignment(asr_name, model_name), + wer_property_name=PropertyHelper.ner_metrics(asr_name, model_name), + task_name=f'SpacyNerSentenceWerProcessor___{dataset_name}___{asr_name}', + require_update=False + ) + ], + experiment_repository=repository, + ) + experiment_processor.process() + + +if __name__ == '__main__': + run_spacy_ner_wer_pipeline('de_minds14', 'whisper_tiny') diff --git a/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py b/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py new file mode 100644 index 0000000000000000000000000000000000000000..5959dc97a59cc9e3d345aaa86da7d67ba4d499ac --- /dev/null +++ b/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py @@ -0,0 +1,34 @@ +from experiment.experiment_dependency_provider import 
get_record_provider, get_repository +from experiment.sentence_wer_processor.spacy_pos_sentence_wer_processor import SpacyPosSentenceWerProcessor +from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository +from new_experiment.utils.get_spacy_model_name import get_spacy_model_name +from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper +from new_experiment.utils.property_helper import PropertyHelper +from sziszapangma.integration.experiment_manager import ExperimentManager + + +def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str): + repository = get_experiment_repository(dataset_name) + record_provider = LoadedRemoteDatasetHelper(repository, get_minio_audio_record_repository(), dataset_name) + language_code = dataset_name[:2] + model_name = get_spacy_model_name(language_code) + experiment_processor = ExperimentManager( + record_id_iterator=record_provider, + processing_tasks=[ + SpacyPosSentenceWerProcessor( + model_name=model_name, + gold_transcript_property_name=PropertyHelper.get_gold_transcript_raw(), + asr_property_name=PropertyHelper.asr_result(asr_name), + alignment_property_name=PropertyHelper.pos_alignment(asr_name, model_name), + wer_property_name=PropertyHelper.pos_metrics(asr_name, model_name), + task_name=f'SpacyPosSentenceWerProcessor___{dataset_name}___{asr_name}', + require_update=True + ) + ], + experiment_repository=repository, + ) + experiment_processor.process() + + +if __name__ == '__main__': + run_spacy_pos_wer_pipeline('de_minds14', 'whisper_tiny') diff --git a/new_experiment/pipeline/pipeline_process_word_classic_wer.py b/new_experiment/pipeline/pipeline_process_word_classic_wer.py index 9489f64a4244d48aa05f34b81876c5634e02c958..231a696ae4190ab8c261e6718f56148e98f480ec 100644 --- a/new_experiment/pipeline/pipeline_process_word_classic_wer.py +++ b/new_experiment/pipeline/pipeline_process_word_classic_wer.py @@ -1,14 +1,8 @@ -import argparse - 
-from experiment.const_pipeline_names import GOLD_TRANSCRIPT -from experiment.experiment_dependency_provider import get_record_provider, get_repository from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper from new_experiment.utils.property_helper import PropertyHelper -from sziszapangma.core.transformer.web_embedding_transformer import WebEmbeddingTransformer from sziszapangma.integration.experiment_manager import ExperimentManager from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask -from sziszapangma.integration.task.embedding_wer_metrics_task import EmbeddingWerMetricsTask def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str): @@ -24,15 +18,6 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str): require_update=True, alignment_property_name=PropertyHelper.word_wer_classic_alignment(asr_name) ), - # EmbeddingWerMetricsTask( - # task_name='EmbeddingWerMetricsTask', - # asr_property_name=f'{asr_name}__result', - # gold_transcript_property_name=GOLD_TRANSCRIPT, - # metrics_property_name=f'{asr_name}__word_wer_embeddings_metrics', - # require_update=False, - # embedding_transformer=WebEmbeddingTransformer('pl', 'http://localhost:5003', 'fjsd-mkwe-oius-m9h2'), - # alignment_property_name=f'{asr_name}__word_wer_embeddings_alignment' - # ) ], experiment_repository=repository ) @@ -40,4 +25,4 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str): if __name__ == '__main__': - run_word_wer_classic_pipeline('de_google_fleurs', 'whisper_tiny') + run_word_wer_classic_pipeline('de_minds14', 'whisper_tiny') diff --git a/new_experiment/pipeline/pipeline_process_word_embedding_wer.py b/new_experiment/pipeline/pipeline_process_word_embedding_wer.py index 071be0ee8ac566ab36ea6d10e757f00f292dc84b..dbdecb6461931f656395353da518d0178e4ad8ae 100644 --- 
a/new_experiment/pipeline/pipeline_process_word_embedding_wer.py +++ b/new_experiment/pipeline/pipeline_process_word_embedding_wer.py @@ -18,11 +18,11 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str): record_id_iterator=LoadedRemoteDatasetHelper(repository, get_minio_audio_record_repository(), dataset_name), processing_tasks=[ EmbeddingWerMetricsTask( - task_name='EmbeddingWerMetricsTask', + task_name=f'EmbeddingWerMetricsTask___{dataset_name}___{asr_name}', asr_property_name=PropertyHelper.asr_result(asr_name), gold_transcript_property_name=PropertyHelper.get_gold_transcript_words(), metrics_property_name=PropertyHelper.word_wer_embeddings_metrics(asr_name), - require_update=False, + require_update=True, embedding_transformer=FasttextEmbeddingTransformer(dataset_name[:2]), alignment_property_name=PropertyHelper.word_wer_embeddings_alignment(asr_name) ) @@ -33,4 +33,4 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str): if __name__ == '__main__': - run_word_wer_classic_pipeline('de_google_fleurs', 'whisper_tiny') + run_word_wer_classic_pipeline('de_minds14', 'whisper_tiny') diff --git a/new_experiment/utils/get_spacy_model_name.py b/new_experiment/utils/get_spacy_model_name.py new file mode 100644 index 0000000000000000000000000000000000000000..bf910a63c6df634f1634ff08a719532d444a1d05 --- /dev/null +++ b/new_experiment/utils/get_spacy_model_name.py @@ -0,0 +1,2 @@ +def get_spacy_model_name(language_code_2_letter: str) -> str: + return f'{language_code_2_letter}_core_news_lg' diff --git a/new_experiment/utils/property_helper.py b/new_experiment/utils/property_helper.py index f00ebe88c470938b03b0dc4303cc5a6fd21ae0b5..41bddcad11e1fe8ed8083ecbba967d9f9f2c351b 100644 --- a/new_experiment/utils/property_helper.py +++ b/new_experiment/utils/property_helper.py @@ -14,27 +14,27 @@ class PropertyHelper: @staticmethod def pos_alignment(asr_name: str, model_name: str) -> str: - return f'{asr_name}__{model_name}_pos_alignment' + return 
f'{asr_name}__{model_name}__pos_alignment' @staticmethod def pos_metrics(asr_name: str, model_name: str) -> str: - return f'{asr_name}__{model_name}_pos_metrics' + return f'{asr_name}__{model_name}__pos_metrics' @staticmethod def dep_tag_alignment(asr_name: str, model_name: str) -> str: - return f'{asr_name}__{model_name}_dep_tag_alignment' + return f'{asr_name}__{model_name}__dep_tag_alignment' @staticmethod def dep_tag_metrics(asr_name: str, model_name: str) -> str: - return f'{asr_name}__{model_name}_dep_tag_metrics' + return f'{asr_name}__{model_name}__dep_tag_metrics' @staticmethod def ner_alignment(asr_name: str, model_name: str) -> str: - return f'{asr_name}__{model_name}_ner_alignment' + return f'{asr_name}__{model_name}__ner_alignment' @staticmethod def ner_metrics(asr_name: str, model_name: str) -> str: - return f'{asr_name}__{model_name}_ner_metrics' + return f'{asr_name}__{model_name}__ner_metrics' @staticmethod def word_wer_classic_alignment(asr_name: str) -> str: diff --git a/new_experiment/worker.py b/new_experiment/worker.py new file mode 100644 index 0000000000000000000000000000000000000000..3a867426d5e93d983a80cd2d15b41a2a0566bda1 --- /dev/null +++ b/new_experiment/worker.py @@ -0,0 +1,44 @@ +import json +import os +import uuid + +import pika +from minio import Minio +from pymongo import MongoClient +from urllib3 import HTTPResponse + +from new_datasets.whisper_processor import WhisperAsrProcessor +from sziszapangma.integration.repository.mongo_experiment_repository import MongoExperimentRepository + + +def get_param(name: str, default: str) -> str: + return os.environ[name] if name in os.environ else default + + +_RABBIT_URL = get_param('RABBIT_URL', + 'amqps://rabbit_user:kz6m4972OUHFmtUcPOHx4kF3Lj6yw7lo@rabbit-asr-benchmarks.theliver.pl:5671/') +def main(): + parameters = pika.URLParameters(_RABBIT_URL) + connection = pika.BlockingConnection(parameters=parameters) + channel = connection.channel() + channel.basic_qos(prefetch_count=1) + + 
queue_name = 'asr_benchmark_experiments'
all_collections: - print('collection not found') return False else: - print('self.get_all_record_ids_for_property(property_name)', record_id, record_id.__class__, - # record_id in self.get_all_record_ids_for_property(property_name), - # len(self.get_all_record_ids_for_property(property_name)), - list(self.get_all_record_ids_for_property(property_name))[0], - list(self.get_all_record_ids_for_property(property_name))[0].__class__) return database[property_name].find_one({ID: record_id}) is not None def update_property_for_key(self, record_id: str, property_name: str, property_value: Any): diff --git a/sziszapangma/integration/task/classic_wer_metric_task.py b/sziszapangma/integration/task/classic_wer_metric_task.py index af7a46bc233d90e47e5a0a492286aed452cf3792..2a178090601a2327df183f4b1c79b9620a2a36f2 100644 --- a/sziszapangma/integration/task/classic_wer_metric_task.py +++ b/sziszapangma/integration/task/classic_wer_metric_task.py @@ -49,9 +49,7 @@ class ClassicWerMetricTask(ProcessingTask): record_id: str, experiment_repository: ExperimentRepository, ): - print('#############') gold_transcript = experiment_repository.get_property_for_key(record_id, self._gold_transcript_property_name) - print('$$$$$$$$$$$$$', gold_transcript) asr_result = experiment_repository.get_property_for_key(record_id, self._asr_property_name) if gold_transcript is not None and asr_result is not None and "transcription" in asr_result: alignment_steps = self._get_alignment(gold_transcript, asr_result["transcription"]) diff --git a/sziszapangma/integration/task/embedding_wer_metrics_task.py b/sziszapangma/integration/task/embedding_wer_metrics_task.py index eefc2fd4285adbbb7f0ba7315b959e45f20a937d..099773a33cbaf0ab3d98299e66b03c90ca975d25 100644 --- a/sziszapangma/integration/task/embedding_wer_metrics_task.py +++ b/sziszapangma/integration/task/embedding_wer_metrics_task.py @@ -10,7 +10,6 @@ from sziszapangma.integration.repository.experiment_repository import Experiment from 
sziszapangma.integration.task.processing_task import ProcessingTask from sziszapangma.integration.task.task_util import TaskUtil from sziszapangma.model.model import Word -from sziszapangma.model.relation_manager import RelationManager _SOFT_WER = "soft_wer" _EMBEDDING_WER = "embedding_wer" @@ -62,9 +61,8 @@ class EmbeddingWerMetricsTask(ProcessingTask): self, record_id: str, experiment_repository: ExperimentRepository, - relation_manager: RelationManager, ): - gold_transcript = TaskUtil.get_words_from_record(relation_manager) + gold_transcript = experiment_repository.get_property_for_key(record_id, self._gold_transcript_property_name) asr_result = experiment_repository.get_property_for_key(record_id, self._asr_property_name) if gold_transcript is not None and asr_result is not None and "transcription" in asr_result: gold_transcript_lower = self.filter_empty_words(TaskUtil.words_to_lower(gold_transcript))