From 428b93f6636a78a7cd7f9c8ce255f265e34dd2a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com>
Date: Fri, 29 Apr 2022 08:39:14 +0200
Subject: [PATCH] Add voicelab pipeline stages

---
 experiment/pos_processing/__init__.py          |  0
 .../asr_spacy_token_pos_processing_task.py     | 24 ++++++
 ...nscript_spacy_token_pos_processing_task.py  | 24 ++++++
 .../spacy_pos_wer_processing_task.py           | 85 +++++++++++++++++++
 .../spacy_token_pos_processing_task.py         | 58 +++++++++++++
 5 files changed, 191 insertions(+)
 create mode 100644 experiment/pos_processing/__init__.py
 create mode 100644 experiment/pos_processing/asr_spacy_token_pos_processing_task.py
 create mode 100644 experiment/pos_processing/gold_transcript_spacy_token_pos_processing_task.py
 create mode 100644 experiment/pos_processing/spacy_pos_wer_processing_task.py
 create mode 100644 experiment/pos_processing/spacy_token_pos_processing_task.py

diff --git a/experiment/pos_processing/__init__.py b/experiment/pos_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/experiment/pos_processing/asr_spacy_token_pos_processing_task.py b/experiment/pos_processing/asr_spacy_token_pos_processing_task.py
new file mode 100644
index 0000000..0abf7b1
--- /dev/null
+++ b/experiment/pos_processing/asr_spacy_token_pos_processing_task.py
@@ -0,0 +1,24 @@
+from dataclasses import dataclass
+from typing import Any
+
+from experiment.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask
+
+
+@dataclass
+class WordSpan:
+    text: str
+    index_start: int
+    index_end: int
+
+
+class AsrSpacyTokenPosProcessingTask(SpacyTokenPosProcessingTask):
+    _spacy_property_name: str
+    _nlp: Any
+    _input_property_name: str
+
+    def __init__(self, task_name: str, spacy_property_name: str, require_update: bool,
+                 input_property_name: str):
+        super().__init__(task_name, spacy_property_name, require_update, input_property_name)
+
+    def get_transcript_to_process(self, property_value: Any) -> str:
+        return property_value['full_text']
diff --git a/experiment/pos_processing/gold_transcript_spacy_token_pos_processing_task.py b/experiment/pos_processing/gold_transcript_spacy_token_pos_processing_task.py
new file mode 100644
index 0000000..870a3ac
--- /dev/null
+++ b/experiment/pos_processing/gold_transcript_spacy_token_pos_processing_task.py
@@ -0,0 +1,24 @@
+from dataclasses import dataclass
+from typing import Any
+
+from experiment.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask
+
+
+@dataclass
+class WordSpan:
+    text: str
+    index_start: int
+    index_end: int
+
+
+class GoldTranscriptSpacyTokenPosProcessingTask(SpacyTokenPosProcessingTask):
+    _spacy_property_name: str
+    _nlp: Any
+    _input_property_name: str
+
+    def __init__(self, task_name: str, spacy_property_name: str, require_update: bool,
+                 input_property_name: str):
+        super().__init__(task_name, spacy_property_name, require_update, input_property_name)
+
+    def get_transcript_to_process(self, property_value: Any) -> str:
+        return ' '.join([it['word'] for it in property_value])
diff --git a/experiment/pos_processing/spacy_pos_wer_processing_task.py b/experiment/pos_processing/spacy_pos_wer_processing_task.py
new file mode 100644
index 0000000..23d4f57
--- /dev/null
+++ b/experiment/pos_processing/spacy_pos_wer_processing_task.py
@@ -0,0 +1,85 @@
+from typing import Any, List, Dict
+
+from sziszapangma.core.alignment.alignment_classic_calculator import AlignmentClassicCalculator
+from sziszapangma.core.alignment.alignment_step import AlignmentStep
+from sziszapangma.core.wer.wer_calculator import WerCalculator
+from sziszapangma.integration.mapper.alignment_step_mapper import AlignmentStepMapper
+from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
+from sziszapangma.integration.task.processing_task import ProcessingTask
+from sziszapangma.model.model import Word
+from sziszapangma.model.relation_manager import RelationManager
+
+_CLASSIC_WER = "classic_wer"
+
+
+class SpacyPosWerProcessingTask(ProcessingTask):
+    _gold_transcript_pos_property_name: str
+    _asr_pos_property_name: str
+    _pos_alignment_wer: str
+    _pos_metrics_wer: str
+    _alignment_classic_calculator: AlignmentClassicCalculator
+    _wer_calculator: WerCalculator
+
+    def __init__(
+        self,
+        task_name: str,
+        gold_transcript_pos_property_name: str,
+        require_update: bool,
+        asr_pos_property_name: str,
+        pos_alignment_wer: str,
+        pos_metrics_wer: str
+    ):
+        super().__init__(task_name, require_update)
+        self._gold_transcript_pos_property_name = gold_transcript_pos_property_name
+        self._asr_pos_property_name = asr_pos_property_name
+        self._pos_alignment_wer = pos_alignment_wer
+        self._pos_metrics_wer = pos_metrics_wer
+        self._alignment_classic_calculator = AlignmentClassicCalculator()
+        self._wer_calculator = WerCalculator()
+
+    def run_single_process(
+        self,
+        record_id: str,
+        experiment_repository: ExperimentRepository,
+        relation_manager: RelationManager,
+    ):
+        gold_transcript_pos = [
+            Word(id=it['id'], type='Word', text=it['pos'])
+            for it in
+            experiment_repository.get_property_for_key(
+                record_id, self._gold_transcript_pos_property_name)
+        ]
+        asr_transcript_pos = [
+            Word(id=it['id'], type='Word', text=it['pos'])
+            for it in
+            experiment_repository.get_property_for_key(record_id, self._asr_pos_property_name)
+        ]
+        if gold_transcript_pos is not None and asr_transcript_pos is not None:
+            alignment_steps = self._get_alignment(gold_transcript_pos, asr_transcript_pos)
+            experiment_repository.update_property_for_key(
+                record_id,
+                self._pos_alignment_wer,
+                [AlignmentStepMapper.to_json_dict(it) for it in alignment_steps],
+            )
+            experiment_repository.update_property_for_key(
+                record_id, self._pos_metrics_wer, self.calculate_metrics(alignment_steps)
+            )
+
+    def _get_alignment(
+        self, gold_transcript: List[Word], asr_transcript: List[Word]
+    ) -> List[AlignmentStep]:
+        return self._alignment_classic_calculator.calculate_alignment(
+            reference=gold_transcript, hypothesis=asr_transcript
+        )
+
+    def calculate_metrics(self, alignment_steps: List[AlignmentStep]) -> Dict[str, Any]:
+        """Calculate all metrics for data sample."""
+        metrics = dict()
+        metrics[_CLASSIC_WER] = self._wer_calculator.calculate_wer(alignment_steps)
+        return metrics
+
+    def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool:
+        return (
+            experiment_repository.get_property_for_key(record_id, self._pos_metrics_wer)
+            is not None
+        )
diff --git a/experiment/pos_processing/spacy_token_pos_processing_task.py b/experiment/pos_processing/spacy_token_pos_processing_task.py
new file mode 100644
index 0000000..2c8bb0b
--- /dev/null
+++ b/experiment/pos_processing/spacy_token_pos_processing_task.py
@@ -0,0 +1,58 @@
+import uuid
+from abc import abstractmethod, ABC
+from dataclasses import dataclass
+from typing import Any, Dict
+
+import spacy
+from spacy.tokens import Token
+
+from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
+from sziszapangma.integration.task.processing_task import ProcessingTask
+from sziszapangma.model.relation_manager import RelationManager
+
+
+@dataclass
+class WordSpan:
+    text: str
+    index_start: int
+    index_end: int
+
+
+class SpacyTokenPosProcessingTask(ProcessingTask, ABC):
+    _spacy_model_name: str
+    _nlp: Any
+    _input_property_name: str
+    _spacy_property_name: str
+
+    def __init__(
+        self,
+        task_name: str,
+        spacy_property_name: str,
+        require_update: bool,
+        input_property_name: str
+    ):
+        super().__init__(task_name, require_update)
+        self._spacy_property_name = spacy_property_name
+        self._nlp = spacy.load("pl_core_news_lg")
+        self._input_property_name = input_property_name
+
+    def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository,
+                           relation_manager: RelationManager):
+        test_property = experiment_repository.get_property_for_key(
+            record_id, self._input_property_name)
+        document = self._nlp(self.get_transcript_to_process(test_property))
+
+        spacy_result = [self.token_to_result_dict(token) for token in document]
+        experiment_repository.update_property_for_key(record_id, self._spacy_property_name,
+                                                      spacy_result)
+
+    @staticmethod
+    def token_to_result_dict(token: Token) -> Dict[str, str]:
+        return {'id': str(uuid.uuid4()), 'word': token.text, 'pos': token.pos_}
+
+    @abstractmethod
+    def get_transcript_to_process(self, property_value: Any) -> str:
+        pass
+
+    def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool:
+        return experiment_repository.property_exists(record_id, self._spacy_property_name)
--
GitLab
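For reference, below is a minimal sketch of how the three stages added by this patch might be instantiated. The constructor signatures are taken from the diff above; everything else is an assumption: the task names and repository property keys ('gold_transcript', 'asr_result', 'gold_transcript_spacy_pos', 'asr_spacy_pos', 'pos_alignment_wer', 'pos_metrics_wer') are illustrative placeholders, and the code that would actually run the tasks against an ExperimentRepository is omitted because that runner API is not part of this patch.

    # Usage sketch (illustrative, not part of the commit).
    # Constructor signatures match the classes in the diff; property keys are hypothetical.
    from experiment.pos_processing.asr_spacy_token_pos_processing_task import (
        AsrSpacyTokenPosProcessingTask,
    )
    from experiment.pos_processing.gold_transcript_spacy_token_pos_processing_task import (
        GoldTranscriptSpacyTokenPosProcessingTask,
    )
    from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask

    # POS-tag the gold transcript (a list of {'word': ...} dicts) with spaCy.
    gold_pos_task = GoldTranscriptSpacyTokenPosProcessingTask(
        task_name='gold_transcript_spacy_pos_task',
        spacy_property_name='gold_transcript_spacy_pos',  # hypothetical property key
        require_update=False,
        input_property_name='gold_transcript',            # hypothetical property key
    )

    # POS-tag the ASR output (a dict with a 'full_text' field) with spaCy.
    asr_pos_task = AsrSpacyTokenPosProcessingTask(
        task_name='asr_spacy_pos_task',
        spacy_property_name='asr_spacy_pos',              # hypothetical property key
        require_update=False,
        input_property_name='asr_result',                 # hypothetical property key
    )

    # Align the two POS sequences and store the classic WER computed over POS tags.
    pos_wer_task = SpacyPosWerProcessingTask(
        task_name='spacy_pos_wer_task',
        gold_transcript_pos_property_name='gold_transcript_spacy_pos',
        require_update=False,
        asr_pos_property_name='asr_spacy_pos',
        pos_alignment_wer='pos_alignment_wer',            # hypothetical property key
        pos_metrics_wer='pos_metrics_wer',                # hypothetical property key
    )

    # The three tasks would then be executed in this order by the sziszapangma
    # experiment runner against an ExperimentRepository; that part is not shown here.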