From 4eb6ea659c26fc0ada729b86bd9bac8112337480 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com>
Date: Thu, 12 Jan 2023 02:43:08 +0100
Subject: [PATCH] Add new spaCy pipelines

---
 Dockerfile                                     |  9 ++++
 experiment/pipeline_process_spacy_pos_wer.py   |  4 +-
 .../sentence_wer_processor.py                  |  5 +--
 .../pipeline_process_spacy_dep_tag_wer.py      | 37 ++++++++++++++++
 .../pipeline_process_spacy_ner_wer.py          | 37 ++++++++++++++++
 .../pipeline_process_spacy_pos_wer.py          | 34 ++++++++++++++
 .../pipeline_process_word_classic_wer.py       | 17 +------
 .../pipeline_process_word_embedding_wer.py     |  6 +--
 new_experiment/utils/get_spacy_model_name.py   |  2 +
 new_experiment/utils/property_helper.py        | 12 ++---
 new_experiment/worker.py                       | 44 +++++++++++++++++++
 .../fasttext_embedding_transformer.py          |  2 +-
 .../repository/mongo_experiment_repository.py  |  7 ---
 .../task/classic_wer_metric_task.py            |  2 -
 .../task/embedding_wer_metrics_task.py         |  4 +-
 15 files changed, 179 insertions(+), 43 deletions(-)
 create mode 100644 new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py
 create mode 100644 new_experiment/pipeline/pipeline_process_spacy_ner_wer.py
 create mode 100644 new_experiment/pipeline/pipeline_process_spacy_pos_wer.py
 create mode 100644 new_experiment/utils/get_spacy_model_name.py
 create mode 100644 new_experiment/worker.py

diff --git a/Dockerfile b/Dockerfile
index 0bcb59c..84d3df4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,6 +6,15 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y tzdata
 RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update && apt-get install -y python3.8 python3-pip ffmpeg
 RUN alias python='python3' && alias pip='pip3' && pip install poetry
 
+RUN pip install spacy
+RUN python -m spacy download de_core_news_lg
+RUN python -m spacy download pl_core_news_lg
+RUN python -m spacy download en_core_web_lg
+RUN python -m spacy download it_core_news_lg
+RUN python -m spacy download nl_core_news_lg
+RUN python -m spacy download es_core_news_lg
+RUN python -m spacy download pt_core_news_lg
+
 ADD poetry.lock ./
 ADD pyproject.toml ./
 ADD README.rst ./
diff --git a/experiment/pipeline_process_spacy_pos_wer.py b/experiment/pipeline_process_spacy_pos_wer.py
index d3e8ebf..191f4f7 100644
--- a/experiment/pipeline_process_spacy_pos_wer.py
+++ b/experiment/pipeline_process_spacy_pos_wer.py
@@ -8,11 +8,12 @@ from sziszapangma.integration.experiment_manager import ExperimentManager
 
 def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str):
     record_provider = get_record_provider(dataset_name)
+    language_code = dataset_name[:2]
     experiment_processor = ExperimentManager(
         record_id_iterator=record_provider,
         processing_tasks=[
             SpacyPosSentenceWerProcessor(
-                model_name='pl_core_news_lg',
+                model_name=f'{language_code}_core_news_lg',
                 gold_transcript_property_name=GOLD_TRANSCRIPT,
                 asr_property_name=f'{asr_name}__result',
                 alignment_property_name=f'{asr_name}__spacy_pos_alignment',
@@ -22,7 +23,6 @@ def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str):
             )
         ],
         experiment_repository=get_repository(dataset_name),
-        relation_manager_provider=record_provider
     )
     experiment_processor.process()
 
diff --git a/experiment/sentence_wer_processor/sentence_wer_processor.py b/experiment/sentence_wer_processor/sentence_wer_processor.py
index bda3f20..a51a082 100644
--- a/experiment/sentence_wer_processor/sentence_wer_processor.py
+++ b/experiment/sentence_wer_processor/sentence_wer_processor.py
@@ -32,14 +32,13 @@ class SentenceWerProcessor(ProcessingTask):
 
     def
get_gold_transcript_text(self, record_id: str, experiment_repository: ExperimentRepository) -> str: property_value = experiment_repository.get_property_for_key(record_id, self._gold_transcript_property_name) - return ' '.join([it['word'] for it in property_value]) + return property_value['gold_transcript_raw'] def get_asr_text(self, record_id: str, experiment_repository: ExperimentRepository) -> str: property_value = experiment_repository.get_property_for_key(record_id, self._asr_property_name) return property_value['full_text'] - def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository, - relation_manager: RelationManager): + def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository): gold_transcript_text = self.get_gold_transcript_text(record_id, experiment_repository) asr_text = self.get_asr_text(record_id, experiment_repository) alignment = self._alignment_classic_calculator.calculate_alignment( diff --git a/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py b/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py new file mode 100644 index 0000000..f0727d1 --- /dev/null +++ b/new_experiment/pipeline/pipeline_process_spacy_dep_tag_wer.py @@ -0,0 +1,37 @@ +import argparse + +from experiment.const_pipeline_names import GOLD_TRANSCRIPT +from experiment.experiment_dependency_provider import get_record_provider, get_repository +from experiment.sentence_wer_processor.spacy_pos_sentence_dep_tag_processor import SpacyDepTagSentenceWerProcessor +from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository +from new_experiment.utils.get_spacy_model_name import get_spacy_model_name +from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper +from new_experiment.utils.property_helper import PropertyHelper +from sziszapangma.integration.experiment_manager import ExperimentManager + + +def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str): + repository = get_experiment_repository(dataset_name) + record_provider = LoadedRemoteDatasetHelper(repository, get_minio_audio_record_repository(), dataset_name) + language_code = dataset_name[:2] + model_name = get_spacy_model_name(language_code) + experiment_processor = ExperimentManager( + record_id_iterator=record_provider, + processing_tasks=[ + SpacyDepTagSentenceWerProcessor( + model_name=get_spacy_model_name(language_code), + gold_transcript_property_name=PropertyHelper.get_gold_transcript_raw(), + asr_property_name=PropertyHelper.asr_result(asr_name), + alignment_property_name=PropertyHelper.dep_tag_alignment(asr_name, model_name), + wer_property_name=PropertyHelper.dep_tag_metrics(asr_name, model_name), + task_name=f'SpacyDepTagSentenceWerProcessor___{dataset_name}___{asr_name}', + require_update=False + ) + ], + experiment_repository=repository, + ) + experiment_processor.process() + + +if __name__ == '__main__': + run_spacy_pos_wer_pipeline('de_minds14', 'whisper_tiny') diff --git a/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py b/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py new file mode 100644 index 0000000..96cc9ef --- /dev/null +++ b/new_experiment/pipeline/pipeline_process_spacy_ner_wer.py @@ -0,0 +1,37 @@ +import argparse + +from experiment.const_pipeline_names import GOLD_TRANSCRIPT +from experiment.experiment_dependency_provider import get_record_provider, get_repository +from experiment.sentence_wer_processor.spacy_ner_sentence_wer_processor import 
SpacyNerSentenceWerProcessor +from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository +from new_experiment.utils.get_spacy_model_name import get_spacy_model_name +from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper +from new_experiment.utils.property_helper import PropertyHelper +from sziszapangma.integration.experiment_manager import ExperimentManager + + +def run_spacy_ner_wer_pipeline(dataset_name: str, asr_name: str): + repository = get_experiment_repository(dataset_name) + record_provider = LoadedRemoteDatasetHelper(repository, get_minio_audio_record_repository(), dataset_name) + language_code = dataset_name[:2] + model_name = get_spacy_model_name(language_code) + experiment_processor = ExperimentManager( + record_id_iterator=record_provider, + processing_tasks=[ + SpacyNerSentenceWerProcessor( + model_name=model_name, + gold_transcript_property_name=PropertyHelper.get_gold_transcript_raw(), + asr_property_name=PropertyHelper.asr_result(asr_name), + alignment_property_name=PropertyHelper.ner_alignment(asr_name, model_name), + wer_property_name=PropertyHelper.ner_metrics(asr_name, model_name), + task_name=f'SpacyNerSentenceWerProcessor___{dataset_name}___{asr_name}', + require_update=False + ) + ], + experiment_repository=repository, + ) + experiment_processor.process() + + +if __name__ == '__main__': + run_spacy_ner_wer_pipeline('de_minds14', 'whisper_tiny') diff --git a/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py b/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py new file mode 100644 index 0000000..5959dc9 --- /dev/null +++ b/new_experiment/pipeline/pipeline_process_spacy_pos_wer.py @@ -0,0 +1,34 @@ +from experiment.experiment_dependency_provider import get_record_provider, get_repository +from experiment.sentence_wer_processor.spacy_pos_sentence_wer_processor import SpacyPosSentenceWerProcessor +from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository +from new_experiment.utils.get_spacy_model_name import get_spacy_model_name +from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper +from new_experiment.utils.property_helper import PropertyHelper +from sziszapangma.integration.experiment_manager import ExperimentManager + + +def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str): + repository = get_experiment_repository(dataset_name) + record_provider = LoadedRemoteDatasetHelper(repository, get_minio_audio_record_repository(), dataset_name) + language_code = dataset_name[:2] + model_name = get_spacy_model_name(language_code) + experiment_processor = ExperimentManager( + record_id_iterator=record_provider, + processing_tasks=[ + SpacyPosSentenceWerProcessor( + model_name=model_name, + gold_transcript_property_name=PropertyHelper.get_gold_transcript_raw(), + asr_property_name=PropertyHelper.asr_result(asr_name), + alignment_property_name=PropertyHelper.pos_alignment(asr_name, model_name), + wer_property_name=PropertyHelper.pos_metrics(asr_name, model_name), + task_name=f'SpacyPosSentenceWerProcessor___{dataset_name}___{asr_name}', + require_update=True + ) + ], + experiment_repository=repository, + ) + experiment_processor.process() + + +if __name__ == '__main__': + run_spacy_pos_wer_pipeline('de_minds14', 'whisper_tiny') diff --git a/new_experiment/pipeline/pipeline_process_word_classic_wer.py b/new_experiment/pipeline/pipeline_process_word_classic_wer.py index 
9489f64..231a696 100644 --- a/new_experiment/pipeline/pipeline_process_word_classic_wer.py +++ b/new_experiment/pipeline/pipeline_process_word_classic_wer.py @@ -1,14 +1,8 @@ -import argparse - -from experiment.const_pipeline_names import GOLD_TRANSCRIPT -from experiment.experiment_dependency_provider import get_record_provider, get_repository from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper from new_experiment.utils.property_helper import PropertyHelper -from sziszapangma.core.transformer.web_embedding_transformer import WebEmbeddingTransformer from sziszapangma.integration.experiment_manager import ExperimentManager from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask -from sziszapangma.integration.task.embedding_wer_metrics_task import EmbeddingWerMetricsTask def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str): @@ -24,15 +18,6 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str): require_update=True, alignment_property_name=PropertyHelper.word_wer_classic_alignment(asr_name) ), - # EmbeddingWerMetricsTask( - # task_name='EmbeddingWerMetricsTask', - # asr_property_name=f'{asr_name}__result', - # gold_transcript_property_name=GOLD_TRANSCRIPT, - # metrics_property_name=f'{asr_name}__word_wer_embeddings_metrics', - # require_update=False, - # embedding_transformer=WebEmbeddingTransformer('pl', 'http://localhost:5003', 'fjsd-mkwe-oius-m9h2'), - # alignment_property_name=f'{asr_name}__word_wer_embeddings_alignment' - # ) ], experiment_repository=repository ) @@ -40,4 +25,4 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str): if __name__ == '__main__': - run_word_wer_classic_pipeline('de_google_fleurs', 'whisper_tiny') + run_word_wer_classic_pipeline('de_minds14', 'whisper_tiny') diff --git a/new_experiment/pipeline/pipeline_process_word_embedding_wer.py b/new_experiment/pipeline/pipeline_process_word_embedding_wer.py index 071be0e..dbdecb6 100644 --- a/new_experiment/pipeline/pipeline_process_word_embedding_wer.py +++ b/new_experiment/pipeline/pipeline_process_word_embedding_wer.py @@ -18,11 +18,11 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str): record_id_iterator=LoadedRemoteDatasetHelper(repository, get_minio_audio_record_repository(), dataset_name), processing_tasks=[ EmbeddingWerMetricsTask( - task_name='EmbeddingWerMetricsTask', + task_name=f'EmbeddingWerMetricsTask___{dataset_name}___{asr_name}', asr_property_name=PropertyHelper.asr_result(asr_name), gold_transcript_property_name=PropertyHelper.get_gold_transcript_words(), metrics_property_name=PropertyHelper.word_wer_embeddings_metrics(asr_name), - require_update=False, + require_update=True, embedding_transformer=FasttextEmbeddingTransformer(dataset_name[:2]), alignment_property_name=PropertyHelper.word_wer_embeddings_alignment(asr_name) ) @@ -33,4 +33,4 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str): if __name__ == '__main__': - run_word_wer_classic_pipeline('de_google_fleurs', 'whisper_tiny') + run_word_wer_classic_pipeline('de_minds14', 'whisper_tiny') diff --git a/new_experiment/utils/get_spacy_model_name.py b/new_experiment/utils/get_spacy_model_name.py new file mode 100644 index 0000000..bf910a6 --- /dev/null +++ b/new_experiment/utils/get_spacy_model_name.py @@ -0,0 +1,2 @@ +def get_spacy_model_name(language_code_2_letter: str) -> str: + return 
f'{language_code_2_letter}_core_news_lg' diff --git a/new_experiment/utils/property_helper.py b/new_experiment/utils/property_helper.py index f00ebe8..41bddca 100644 --- a/new_experiment/utils/property_helper.py +++ b/new_experiment/utils/property_helper.py @@ -14,27 +14,27 @@ class PropertyHelper: @staticmethod def pos_alignment(asr_name: str, model_name: str) -> str: - return f'{asr_name}__{model_name}_pos_alignment' + return f'{asr_name}__{model_name}__pos_alignment' @staticmethod def pos_metrics(asr_name: str, model_name: str) -> str: - return f'{asr_name}__{model_name}_pos_metrics' + return f'{asr_name}__{model_name}__pos_metrics' @staticmethod def dep_tag_alignment(asr_name: str, model_name: str) -> str: - return f'{asr_name}__{model_name}_dep_tag_alignment' + return f'{asr_name}__{model_name}__dep_tag_alignment' @staticmethod def dep_tag_metrics(asr_name: str, model_name: str) -> str: - return f'{asr_name}__{model_name}_dep_tag_metrics' + return f'{asr_name}__{model_name}__dep_tag_metrics' @staticmethod def ner_alignment(asr_name: str, model_name: str) -> str: - return f'{asr_name}__{model_name}_ner_alignment' + return f'{asr_name}__{model_name}__ner_alignment' @staticmethod def ner_metrics(asr_name: str, model_name: str) -> str: - return f'{asr_name}__{model_name}_ner_metrics' + return f'{asr_name}__{model_name}__ner_metrics' @staticmethod def word_wer_classic_alignment(asr_name: str) -> str: diff --git a/new_experiment/worker.py b/new_experiment/worker.py new file mode 100644 index 0000000..3a86742 --- /dev/null +++ b/new_experiment/worker.py @@ -0,0 +1,44 @@ +import json +import os +import uuid + +import pika +from minio import Minio +from pymongo import MongoClient +from urllib3 import HTTPResponse + +from new_datasets.whisper_processor import WhisperAsrProcessor +from sziszapangma.integration.repository.mongo_experiment_repository import MongoExperimentRepository + + +def get_param(name: str, default: str) -> str: + return os.environ[name] if name in os.environ else default + + +_RABBIT_URL = get_param('RABBIT_URL', + 'amqps://rabbit_user:kz6m4972OUHFmtUcPOHx4kF3Lj6yw7lo@rabbit-asr-benchmarks.theliver.pl:5671/') +def main(): + parameters = pika.URLParameters(_RABBIT_URL) + connection = pika.BlockingConnection(parameters=parameters) + channel = connection.channel() + channel.basic_qos(prefetch_count=1) + + queue_name = f'asr_benchmark_experiments' + for method_frame, properties, body in channel.consume(queue_name): + print(method_frame, properties, body) + message_dict = json.loads(body.decode('utf-8')) + print(message_dict) + + task = message_dict['task'] + + + channel.basic_ack(method_frame.delivery_tag) + print('\n########################################################\n') + + requeued_messages = channel.cancel() + print('Requeued %i messages' % requeued_messages) + connection.close() + + +if __name__ == '__main__': + main() diff --git a/sziszapangma/core/transformer/fasttext_embedding_transformer.py b/sziszapangma/core/transformer/fasttext_embedding_transformer.py index b95b93c..fecd7c1 100644 --- a/sziszapangma/core/transformer/fasttext_embedding_transformer.py +++ b/sziszapangma/core/transformer/fasttext_embedding_transformer.py @@ -16,7 +16,7 @@ class FasttextEmbeddingTransformer(EmbeddingTransformer): def __init__(self, lang_id: str): self._lang_id = lang_id fasttext.util.download_model(lang_id, if_exists='ignore') - ft = fasttext.load_model(f'cc.{lang_id}.300.bin') + self._model = fasttext.load_model(f'cc.{lang_id}.300.bin') def get_embedding(self, word: str) -> 
np.ndarray: return self._model.get_word_vector(word) diff --git a/sziszapangma/integration/repository/mongo_experiment_repository.py b/sziszapangma/integration/repository/mongo_experiment_repository.py index 0e9fb11..6c87a1d 100644 --- a/sziszapangma/integration/repository/mongo_experiment_repository.py +++ b/sziszapangma/integration/repository/mongo_experiment_repository.py @@ -24,16 +24,9 @@ class MongoExperimentRepository(ExperimentRepository): def property_exists(self, record_id: str, property_name: str) -> bool: database = self._get_database() all_collections = database.list_collection_names() - print(property_name, all_collections) if property_name not in all_collections: - print('collection not found') return False else: - print('self.get_all_record_ids_for_property(property_name)', record_id, record_id.__class__, - # record_id in self.get_all_record_ids_for_property(property_name), - # len(self.get_all_record_ids_for_property(property_name)), - list(self.get_all_record_ids_for_property(property_name))[0], - list(self.get_all_record_ids_for_property(property_name))[0].__class__) return database[property_name].find_one({ID: record_id}) is not None def update_property_for_key(self, record_id: str, property_name: str, property_value: Any): diff --git a/sziszapangma/integration/task/classic_wer_metric_task.py b/sziszapangma/integration/task/classic_wer_metric_task.py index af7a46b..2a17809 100644 --- a/sziszapangma/integration/task/classic_wer_metric_task.py +++ b/sziszapangma/integration/task/classic_wer_metric_task.py @@ -49,9 +49,7 @@ class ClassicWerMetricTask(ProcessingTask): record_id: str, experiment_repository: ExperimentRepository, ): - print('#############') gold_transcript = experiment_repository.get_property_for_key(record_id, self._gold_transcript_property_name) - print('$$$$$$$$$$$$$', gold_transcript) asr_result = experiment_repository.get_property_for_key(record_id, self._asr_property_name) if gold_transcript is not None and asr_result is not None and "transcription" in asr_result: alignment_steps = self._get_alignment(gold_transcript, asr_result["transcription"]) diff --git a/sziszapangma/integration/task/embedding_wer_metrics_task.py b/sziszapangma/integration/task/embedding_wer_metrics_task.py index eefc2fd..099773a 100644 --- a/sziszapangma/integration/task/embedding_wer_metrics_task.py +++ b/sziszapangma/integration/task/embedding_wer_metrics_task.py @@ -10,7 +10,6 @@ from sziszapangma.integration.repository.experiment_repository import Experiment from sziszapangma.integration.task.processing_task import ProcessingTask from sziszapangma.integration.task.task_util import TaskUtil from sziszapangma.model.model import Word -from sziszapangma.model.relation_manager import RelationManager _SOFT_WER = "soft_wer" _EMBEDDING_WER = "embedding_wer" @@ -62,9 +61,8 @@ class EmbeddingWerMetricsTask(ProcessingTask): self, record_id: str, experiment_repository: ExperimentRepository, - relation_manager: RelationManager, ): - gold_transcript = TaskUtil.get_words_from_record(relation_manager) + gold_transcript = experiment_repository.get_property_for_key(record_id, self._gold_transcript_property_name) asr_result = experiment_repository.get_property_for_key(record_id, self._asr_property_name) if gold_transcript is not None and asr_result is not None and "transcription" in asr_result: gold_transcript_lower = self.filter_empty_words(TaskUtil.words_to_lower(gold_transcript)) -- GitLab
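
A note on the spaCy model names used above: spaCy does not publish an en_core_news_lg package (its large English model is en_core_web_lg), and Spanish uses the ISO 639-1 code es rather than sp, so sp_core_news_lg cannot be downloaded either. Because get_spacy_model_name builds every name from the {code}_core_news_lg pattern, an English dataset would resolve to a model that spaCy does not ship. The sketch below shows one way to guard the lookup for the languages installed in the image; the dictionary, the error handling and the module location are illustrative assumptions, not code taken from this repository.

# Hypothetical replacement for new_experiment/utils/get_spacy_model_name.py (a sketch, not the committed code).
_SPACY_LG_MODELS = {
    'de': 'de_core_news_lg',
    'pl': 'pl_core_news_lg',
    'en': 'en_core_web_lg',   # spaCy ships en_core_web_*; there is no en_core_news_lg
    'it': 'it_core_news_lg',
    'nl': 'nl_core_news_lg',
    'es': 'es_core_news_lg',  # Spanish is 'es'; 'sp' is not an ISO 639-1 code
    'pt': 'pt_core_news_lg',
}


def get_spacy_model_name(language_code_2_letter: str) -> str:
    # Fail fast on a language that has no model installed in the Docker image.
    if language_code_2_letter not in _SPACY_LG_MODELS:
        raise ValueError(f'no spaCy model configured for language: {language_code_2_letter}')
    return _SPACY_LG_MODELS[language_code_2_letter]

With a mapping like this, dataset prefixes such as de_minds14 resolve exactly as before, while an en or es dataset gets a model name that spacy.load can actually find.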
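
The new worker.py consumes the asr_benchmark_experiments queue, decodes the JSON body and reads message_dict['task'], but the dispatch itself is left empty in this commit. One possible shape for that dispatch, reusing the pipeline entry points added in this patch, is sketched below; the task identifiers and the dataset/asr_name message fields are assumptions, because the producer side of the queue is not part of the change.

# Hypothetical dispatch for worker.py (a sketch under the message-schema assumptions stated above).
from new_experiment.pipeline import (
    pipeline_process_spacy_dep_tag_wer,
    pipeline_process_spacy_ner_wer,
    pipeline_process_spacy_pos_wer,
    pipeline_process_word_classic_wer,
    pipeline_process_word_embedding_wer,
)

_TASK_HANDLERS = {
    'spacy_dep_tag_wer': pipeline_process_spacy_dep_tag_wer.run_spacy_pos_wer_pipeline,  # that module keeps the _pos_ function name
    'spacy_ner_wer': pipeline_process_spacy_ner_wer.run_spacy_ner_wer_pipeline,
    'spacy_pos_wer': pipeline_process_spacy_pos_wer.run_spacy_pos_wer_pipeline,
    'word_classic_wer': pipeline_process_word_classic_wer.run_word_wer_classic_pipeline,
    'word_embedding_wer': pipeline_process_word_embedding_wer.run_word_wer_classic_pipeline,
}


def handle_message(message_dict: dict) -> None:
    # Task names are hypothetical; align them with whatever the producer publishes.
    handler = _TASK_HANDLERS[message_dict['task']]
    handler(message_dict['dataset'], message_dict['asr_name'])

Inside the consume loop this would replace the bare task = message_dict['task'] line, keeping basic_ack as the last step so that a message whose task crashes mid-run is redelivered instead of silently dropped.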