Skip to content
Snippets Groups Projects
Commit 4eb6ea65 authored by Marcin Wątroba's avatar Marcin Wątroba
Browse files

Add spacy new pipeline

parent e407a441
No related branches found
No related tags found
No related merge requests found
Showing
with 179 additions and 43 deletions
......@@ -6,6 +6,15 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y tzdata
RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update && apt-get install -y python3.8 python3-pip ffmpeg
RUN alias python='python3' && alias pip='pip3' && pip install poetry
RUN pip install spacy
# Pre-download the large spaCy pipelines for every supported language.
# NOTE: English models are published under the *_core_web_* naming scheme and
# the Spanish ISO 639-1 code is "es" — "en_core_news_lg" and "sp_core_news_lg"
# do not exist, so those downloads would fail the image build.
RUN python -m spacy download de_core_news_lg
RUN python -m spacy download pl_core_news_lg
RUN python -m spacy download en_core_web_lg
RUN python -m spacy download it_core_news_lg
RUN python -m spacy download nl_core_news_lg
RUN python -m spacy download es_core_news_lg
RUN python -m spacy download pt_core_news_lg
ADD poetry.lock ./
ADD pyproject.toml ./
ADD README.rst ./
......
......@@ -8,11 +8,12 @@ from sziszapangma.integration.experiment_manager import ExperimentManager
def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str):
record_provider = get_record_provider(dataset_name)
language_code = dataset_name[:2]
experiment_processor = ExperimentManager(
record_id_iterator=record_provider,
processing_tasks=[
SpacyPosSentenceWerProcessor(
model_name='pl_core_news_lg',
model_name=f'{language_code}_core_news_lg',
gold_transcript_property_name=GOLD_TRANSCRIPT,
asr_property_name=f'{asr_name}__result',
alignment_property_name=f'{asr_name}__spacy_pos_alignment',
......@@ -22,7 +23,6 @@ def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str):
)
],
experiment_repository=get_repository(dataset_name),
relation_manager_provider=record_provider
)
experiment_processor.process()
......
......@@ -32,14 +32,13 @@ class SentenceWerProcessor(ProcessingTask):
def get_gold_transcript_text(self, record_id: str, experiment_repository: ExperimentRepository) -> str:
property_value = experiment_repository.get_property_for_key(record_id, self._gold_transcript_property_name)
return ' '.join([it['word'] for it in property_value])
return property_value['gold_transcript_raw']
def get_asr_text(self, record_id: str, experiment_repository: ExperimentRepository) -> str:
property_value = experiment_repository.get_property_for_key(record_id, self._asr_property_name)
return property_value['full_text']
def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository,
relation_manager: RelationManager):
def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository):
gold_transcript_text = self.get_gold_transcript_text(record_id, experiment_repository)
asr_text = self.get_asr_text(record_id, experiment_repository)
alignment = self._alignment_classic_calculator.calculate_alignment(
......
import argparse
from experiment.const_pipeline_names import GOLD_TRANSCRIPT
from experiment.experiment_dependency_provider import get_record_provider, get_repository
from experiment.sentence_wer_processor.spacy_pos_sentence_dep_tag_processor import SpacyDepTagSentenceWerProcessor
from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository
from new_experiment.utils.get_spacy_model_name import get_spacy_model_name
from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper
from new_experiment.utils.property_helper import PropertyHelper
from sziszapangma.integration.experiment_manager import ExperimentManager
def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str):
    """Run the spaCy dependency-tag WER pipeline for one dataset/ASR pair.

    Resolves the spaCy model from the dataset's 2-letter language prefix,
    then computes dep-tag alignments and metrics for every record in the
    remote dataset.

    :param dataset_name: dataset identifier, prefixed with the language code
        (e.g. ``'de_minds14'``)
    :param asr_name: name of the ASR system whose results are evaluated
    """
    repository = get_experiment_repository(dataset_name)
    record_provider = LoadedRemoteDatasetHelper(repository, get_minio_audio_record_repository(), dataset_name)
    language_code = dataset_name[:2]  # dataset names start with the language code
    model_name = get_spacy_model_name(language_code)
    experiment_processor = ExperimentManager(
        record_id_iterator=record_provider,
        processing_tasks=[
            SpacyDepTagSentenceWerProcessor(
                # Reuse the resolved model_name (instead of calling
                # get_spacy_model_name again) so the processor's model and the
                # property keys below always agree — matches the sibling
                # NER/POS pipeline files.
                model_name=model_name,
                gold_transcript_property_name=PropertyHelper.get_gold_transcript_raw(),
                asr_property_name=PropertyHelper.asr_result(asr_name),
                alignment_property_name=PropertyHelper.dep_tag_alignment(asr_name, model_name),
                wer_property_name=PropertyHelper.dep_tag_metrics(asr_name, model_name),
                task_name=f'SpacyDepTagSentenceWerProcessor___{dataset_name}___{asr_name}',
                require_update=False
            )
        ],
        experiment_repository=repository,
    )
    experiment_processor.process()


if __name__ == '__main__':
    run_spacy_pos_wer_pipeline('de_minds14', 'whisper_tiny')
import argparse
from experiment.const_pipeline_names import GOLD_TRANSCRIPT
from experiment.experiment_dependency_provider import get_record_provider, get_repository
from experiment.sentence_wer_processor.spacy_ner_sentence_wer_processor import SpacyNerSentenceWerProcessor
from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository
from new_experiment.utils.get_spacy_model_name import get_spacy_model_name
from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper
from new_experiment.utils.property_helper import PropertyHelper
from sziszapangma.integration.experiment_manager import ExperimentManager
def run_spacy_ner_wer_pipeline(dataset_name: str, asr_name: str):
    """Run the spaCy NER WER pipeline for a single dataset/ASR combination.

    :param dataset_name: dataset identifier, prefixed with the language code
        (e.g. ``'de_minds14'``)
    :param asr_name: name of the ASR system whose results are evaluated
    """
    experiment_repository = get_experiment_repository(dataset_name)
    dataset_records = LoadedRemoteDatasetHelper(
        experiment_repository, get_minio_audio_record_repository(), dataset_name)
    # The first two characters of the dataset name encode the language.
    spacy_model = get_spacy_model_name(dataset_name[:2])
    ner_task = SpacyNerSentenceWerProcessor(
        model_name=spacy_model,
        gold_transcript_property_name=PropertyHelper.get_gold_transcript_raw(),
        asr_property_name=PropertyHelper.asr_result(asr_name),
        alignment_property_name=PropertyHelper.ner_alignment(asr_name, spacy_model),
        wer_property_name=PropertyHelper.ner_metrics(asr_name, spacy_model),
        task_name=f'SpacyNerSentenceWerProcessor___{dataset_name}___{asr_name}',
        require_update=False
    )
    manager = ExperimentManager(
        record_id_iterator=dataset_records,
        processing_tasks=[ner_task],
        experiment_repository=experiment_repository,
    )
    manager.process()


if __name__ == '__main__':
    run_spacy_ner_wer_pipeline('de_minds14', 'whisper_tiny')
from experiment.experiment_dependency_provider import get_record_provider, get_repository
from experiment.sentence_wer_processor.spacy_pos_sentence_wer_processor import SpacyPosSentenceWerProcessor
from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository
from new_experiment.utils.get_spacy_model_name import get_spacy_model_name
from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper
from new_experiment.utils.property_helper import PropertyHelper
from sziszapangma.integration.experiment_manager import ExperimentManager
def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str):
    """Run the spaCy POS WER pipeline for a single dataset/ASR combination.

    :param dataset_name: dataset identifier, prefixed with the language code
        (e.g. ``'de_minds14'``)
    :param asr_name: name of the ASR system whose results are evaluated
    """
    repo = get_experiment_repository(dataset_name)
    records = LoadedRemoteDatasetHelper(repo, get_minio_audio_record_repository(), dataset_name)
    # Dataset names start with the 2-letter language code, e.g. 'de_minds14'.
    spacy_model = get_spacy_model_name(dataset_name[:2])
    pos_processor = SpacyPosSentenceWerProcessor(
        model_name=spacy_model,
        gold_transcript_property_name=PropertyHelper.get_gold_transcript_raw(),
        asr_property_name=PropertyHelper.asr_result(asr_name),
        alignment_property_name=PropertyHelper.pos_alignment(asr_name, spacy_model),
        wer_property_name=PropertyHelper.pos_metrics(asr_name, spacy_model),
        task_name=f'SpacyPosSentenceWerProcessor___{dataset_name}___{asr_name}',
        require_update=True
    )
    ExperimentManager(
        record_id_iterator=records,
        processing_tasks=[pos_processor],
        experiment_repository=repo,
    ).process()


if __name__ == '__main__':
    run_spacy_pos_wer_pipeline('de_minds14', 'whisper_tiny')
import argparse
from experiment.const_pipeline_names import GOLD_TRANSCRIPT
from experiment.experiment_dependency_provider import get_record_provider, get_repository
from new_experiment.new_dependency_provider import get_experiment_repository, get_minio_audio_record_repository
from new_experiment.utils.loaded_remote_dataset_helper import LoadedRemoteDatasetHelper
from new_experiment.utils.property_helper import PropertyHelper
from sziszapangma.core.transformer.web_embedding_transformer import WebEmbeddingTransformer
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
from sziszapangma.integration.task.embedding_wer_metrics_task import EmbeddingWerMetricsTask
def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str):
......@@ -24,15 +18,6 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str):
require_update=True,
alignment_property_name=PropertyHelper.word_wer_classic_alignment(asr_name)
),
# EmbeddingWerMetricsTask(
# task_name='EmbeddingWerMetricsTask',
# asr_property_name=f'{asr_name}__result',
# gold_transcript_property_name=GOLD_TRANSCRIPT,
# metrics_property_name=f'{asr_name}__word_wer_embeddings_metrics',
# require_update=False,
# embedding_transformer=WebEmbeddingTransformer('pl', 'http://localhost:5003', 'fjsd-mkwe-oius-m9h2'),
# alignment_property_name=f'{asr_name}__word_wer_embeddings_alignment'
# )
],
experiment_repository=repository
)
......@@ -40,4 +25,4 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str):
if __name__ == '__main__':
run_word_wer_classic_pipeline('de_google_fleurs', 'whisper_tiny')
run_word_wer_classic_pipeline('de_minds14', 'whisper_tiny')
......@@ -18,11 +18,11 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str):
record_id_iterator=LoadedRemoteDatasetHelper(repository, get_minio_audio_record_repository(), dataset_name),
processing_tasks=[
EmbeddingWerMetricsTask(
task_name='EmbeddingWerMetricsTask',
task_name=f'EmbeddingWerMetricsTask___{dataset_name}___{asr_name}',
asr_property_name=PropertyHelper.asr_result(asr_name),
gold_transcript_property_name=PropertyHelper.get_gold_transcript_words(),
metrics_property_name=PropertyHelper.word_wer_embeddings_metrics(asr_name),
require_update=False,
require_update=True,
embedding_transformer=FasttextEmbeddingTransformer(dataset_name[:2]),
alignment_property_name=PropertyHelper.word_wer_embeddings_alignment(asr_name)
)
......@@ -33,4 +33,4 @@ def run_word_wer_classic_pipeline(dataset_name: str, asr_name: str):
if __name__ == '__main__':
run_word_wer_classic_pipeline('de_google_fleurs', 'whisper_tiny')
run_word_wer_classic_pipeline('de_minds14', 'whisper_tiny')
def get_spacy_model_name(language_code_2_letter: str) -> str:
    """Map a 2-letter ISO language code to its large spaCy pipeline name.

    spaCy publishes English models under the ``*_core_web_*`` naming scheme,
    while all other supported languages use ``*_core_news_*`` —
    ``en_core_news_lg`` does not exist, so English must be special-cased.

    :param language_code_2_letter: ISO 639-1 code, e.g. ``'pl'`` or ``'en'``
    :return: installable spaCy model name, e.g. ``'pl_core_news_lg'``
    """
    genre = 'web' if language_code_2_letter == 'en' else 'news'
    return f'{language_code_2_letter}_core_{genre}_lg'
......@@ -14,27 +14,27 @@ class PropertyHelper:
@staticmethod
def pos_alignment(asr_name: str, model_name: str) -> str:
return f'{asr_name}__{model_name}_pos_alignment'
return f'{asr_name}__{model_name}__pos_alignment'
@staticmethod
def pos_metrics(asr_name: str, model_name: str) -> str:
return f'{asr_name}__{model_name}_pos_metrics'
return f'{asr_name}__{model_name}__pos_metrics'
@staticmethod
def dep_tag_alignment(asr_name: str, model_name: str) -> str:
return f'{asr_name}__{model_name}_dep_tag_alignment'
return f'{asr_name}__{model_name}__dep_tag_alignment'
@staticmethod
def dep_tag_metrics(asr_name: str, model_name: str) -> str:
return f'{asr_name}__{model_name}_dep_tag_metrics'
return f'{asr_name}__{model_name}__dep_tag_metrics'
@staticmethod
def ner_alignment(asr_name: str, model_name: str) -> str:
return f'{asr_name}__{model_name}_ner_alignment'
return f'{asr_name}__{model_name}__ner_alignment'
@staticmethod
def ner_metrics(asr_name: str, model_name: str) -> str:
return f'{asr_name}__{model_name}_ner_metrics'
return f'{asr_name}__{model_name}__ner_metrics'
@staticmethod
def word_wer_classic_alignment(asr_name: str) -> str:
......
import json
import os
import uuid
import pika
from minio import Minio
from pymongo import MongoClient
from urllib3 import HTTPResponse
from new_datasets.whisper_processor import WhisperAsrProcessor
from sziszapangma.integration.repository.mongo_experiment_repository import MongoExperimentRepository
def get_param(name: str, default: str) -> str:
    """Return the environment variable *name*, or *default* when unset.

    Uses ``os.environ.get`` instead of a membership test + lookup — one
    lookup, same behavior.
    """
    return os.environ.get(name, default)
_RABBIT_URL = get_param('RABBIT_URL',
'amqps://rabbit_user:kz6m4972OUHFmtUcPOHx4kF3Lj6yw7lo@rabbit-asr-benchmarks.theliver.pl:5671/')
def main():
    """Consume experiment-task messages from the RabbitMQ work queue.

    Connects to the broker configured via ``_RABBIT_URL``, then logs and
    acknowledges each message. The ``'task'`` field is parsed but not yet
    dispatched anywhere — this looks like a consumer skeleton; confirm
    before relying on it to do real work.
    """
    parameters = pika.URLParameters(_RABBIT_URL)
    connection = pika.BlockingConnection(parameters=parameters)
    channel = connection.channel()
    # Deliver at most one unacked message at a time to this consumer.
    channel.basic_qos(prefetch_count=1)
    queue_name = f'asr_benchmark_experiments'
    for method_frame, properties, body in channel.consume(queue_name):
        print(method_frame, properties, body)
        # Messages are JSON-encoded dicts with (at least) a 'task' key.
        message_dict = json.loads(body.decode('utf-8'))
        print(message_dict)
        task = message_dict['task']  # NOTE(review): extracted but unused — dispatch not implemented yet
        # Ack only after the message has been handled above.
        channel.basic_ack(method_frame.delivery_tag)
        print('\n########################################################\n')
    # Reached only if the consume() generator exits; requeues any messages
    # pika buffered locally but did not deliver.
    requeued_messages = channel.cancel()
    print('Requeued %i messages' % requeued_messages)
    connection.close()


if __name__ == '__main__':
    main()
......@@ -16,7 +16,7 @@ class FasttextEmbeddingTransformer(EmbeddingTransformer):
def __init__(self, lang_id: str):
self._lang_id = lang_id
fasttext.util.download_model(lang_id, if_exists='ignore')
ft = fasttext.load_model(f'cc.{lang_id}.300.bin')
self._model = fasttext.load_model(f'cc.{lang_id}.300.bin')
def get_embedding(self, word: str) -> np.ndarray:
return self._model.get_word_vector(word)
......
......@@ -24,16 +24,9 @@ class MongoExperimentRepository(ExperimentRepository):
def property_exists(self, record_id: str, property_name: str) -> bool:
database = self._get_database()
all_collections = database.list_collection_names()
print(property_name, all_collections)
if property_name not in all_collections:
print('collection not found')
return False
else:
print('self.get_all_record_ids_for_property(property_name)', record_id, record_id.__class__,
# record_id in self.get_all_record_ids_for_property(property_name),
# len(self.get_all_record_ids_for_property(property_name)),
list(self.get_all_record_ids_for_property(property_name))[0],
list(self.get_all_record_ids_for_property(property_name))[0].__class__)
return database[property_name].find_one({ID: record_id}) is not None
def update_property_for_key(self, record_id: str, property_name: str, property_value: Any):
......
......@@ -49,9 +49,7 @@ class ClassicWerMetricTask(ProcessingTask):
record_id: str,
experiment_repository: ExperimentRepository,
):
print('#############')
gold_transcript = experiment_repository.get_property_for_key(record_id, self._gold_transcript_property_name)
print('$$$$$$$$$$$$$', gold_transcript)
asr_result = experiment_repository.get_property_for_key(record_id, self._asr_property_name)
if gold_transcript is not None and asr_result is not None and "transcription" in asr_result:
alignment_steps = self._get_alignment(gold_transcript, asr_result["transcription"])
......
......@@ -10,7 +10,6 @@ from sziszapangma.integration.repository.experiment_repository import Experiment
from sziszapangma.integration.task.processing_task import ProcessingTask
from sziszapangma.integration.task.task_util import TaskUtil
from sziszapangma.model.model import Word
from sziszapangma.model.relation_manager import RelationManager
_SOFT_WER = "soft_wer"
_EMBEDDING_WER = "embedding_wer"
......@@ -62,9 +61,8 @@ class EmbeddingWerMetricsTask(ProcessingTask):
self,
record_id: str,
experiment_repository: ExperimentRepository,
relation_manager: RelationManager,
):
gold_transcript = TaskUtil.get_words_from_record(relation_manager)
gold_transcript = experiment_repository.get_property_for_key(record_id, self._gold_transcript_property_name)
asr_result = experiment_repository.get_property_for_key(record_id, self._asr_property_name)
if gold_transcript is not None and asr_result is not None and "transcription" in asr_result:
gold_transcript_lower = self.filter_empty_words(TaskUtil.words_to_lower(gold_transcript))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment