From 428b93f6636a78a7cd7f9c8ce255f265e34dd2a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com>
Date: Fri, 29 Apr 2022 08:39:14 +0200
Subject: [PATCH] Add voicelab pipeline stages
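
Introduce the POS-based pipeline stages for the voicelab experiment:
an abstract spaCy token/POS tagging task, two concrete variants that tag
the gold transcript and the ASR output respectively, and a task that
computes classic WER over the resulting POS-tag sequences. Constructing
either POS task loads the Polish spaCy model "pl_core_news_lg", so that
model has to be installed.

Illustrative wiring sketch (the property names below are placeholders,
not values taken from any existing configuration):

    from experiment.pos_processing.asr_spacy_token_pos_processing_task import \
        AsrSpacyTokenPosProcessingTask
    from experiment.pos_processing.gold_transcript_spacy_token_pos_processing_task import \
        GoldTranscriptSpacyTokenPosProcessingTask
    from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask

    gold_pos_task = GoldTranscriptSpacyTokenPosProcessingTask(
        task_name='gold_transcript_spacy_pos', spacy_property_name='gold_transcript_pos',
        require_update=False, input_property_name='gold_transcript')
    asr_pos_task = AsrSpacyTokenPosProcessingTask(
        task_name='asr_spacy_pos', spacy_property_name='asr_pos',
        require_update=False, input_property_name='asr_result')
    pos_wer_task = SpacyPosWerProcessingTask(
        task_name='spacy_pos_wer', gold_transcript_pos_property_name='gold_transcript_pos',
        require_update=False, asr_pos_property_name='asr_pos',
        pos_alignment_wer='pos_alignment_wer', pos_metrics_wer='pos_metrics_wer')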

---
 experiment/pos_processing/__init__.py         |  0
 .../asr_spacy_token_pos_processing_task.py    | 24 ++++++
 ...nscript_spacy_token_pos_processing_task.py | 24 ++++++
 .../spacy_pos_wer_processing_task.py          | 85 +++++++++++++++++++
 .../spacy_token_pos_processing_task.py        | 58 +++++++++++++
 5 files changed, 191 insertions(+)
 create mode 100644 experiment/pos_processing/__init__.py
 create mode 100644 experiment/pos_processing/asr_spacy_token_pos_processing_task.py
 create mode 100644 experiment/pos_processing/gold_transcript_spacy_token_pos_processing_task.py
 create mode 100644 experiment/pos_processing/spacy_pos_wer_processing_task.py
 create mode 100644 experiment/pos_processing/spacy_token_pos_processing_task.py

diff --git a/experiment/pos_processing/__init__.py b/experiment/pos_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/experiment/pos_processing/asr_spacy_token_pos_processing_task.py b/experiment/pos_processing/asr_spacy_token_pos_processing_task.py
new file mode 100644
index 0000000..0abf7b1
--- /dev/null
+++ b/experiment/pos_processing/asr_spacy_token_pos_processing_task.py
@@ -0,0 +1,24 @@
+from dataclasses import dataclass
+from typing import Any
+
+from experiment.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask
+
+
+@dataclass
+class WordSpan:
+    text: str
+    index_start: int
+    index_end: int
+
+
+class AsrSpacyTokenPosProcessingTask(SpacyTokenPosProcessingTask):
+    _spacy_property_name: str
+    _nlp: Any
+    _input_property_name: str
+
+    def __init__(self, task_name: str, spacy_property_name: str, require_update: bool,
+                 input_property_name: str):
+        super().__init__(task_name, spacy_property_name, require_update, input_property_name)
+
+    def get_transcript_to_process(self, property_value: Any) -> str:
+        return property_value['full_text']
diff --git a/experiment/pos_processing/gold_transcript_spacy_token_pos_processing_task.py b/experiment/pos_processing/gold_transcript_spacy_token_pos_processing_task.py
new file mode 100644
index 0000000..870a3ac
--- /dev/null
+++ b/experiment/pos_processing/gold_transcript_spacy_token_pos_processing_task.py
@@ -0,0 +1,24 @@
+from dataclasses import dataclass
+from typing import Any
+
+from experiment.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask
+
+
+@dataclass
+class WordSpan:
+    text: str
+    index_start: int
+    index_end: int
+
+
+class GoldTranscriptSpacyTokenPosProcessingTask(SpacyTokenPosProcessingTask):
+    _spacy_property_name: str
+    _nlp: Any
+    _input_property_name: str
+
+    def __init__(self, task_name: str, spacy_property_name: str, require_update: bool,
+                 input_property_name: str):
+        super().__init__(task_name, spacy_property_name, require_update, input_property_name)
+
+    def get_transcript_to_process(self, property_value: Any) -> str:
+        return ' '.join([it['word'] for it in property_value])
diff --git a/experiment/pos_processing/spacy_pos_wer_processing_task.py b/experiment/pos_processing/spacy_pos_wer_processing_task.py
new file mode 100644
index 0000000..23d4f57
--- /dev/null
+++ b/experiment/pos_processing/spacy_pos_wer_processing_task.py
@@ -0,0 +1,85 @@
+from typing import Any, List, Dict
+
+from sziszapangma.core.alignment.alignment_classic_calculator import AlignmentClassicCalculator
+from sziszapangma.core.alignment.alignment_step import AlignmentStep
+from sziszapangma.core.wer.wer_calculator import WerCalculator
+from sziszapangma.integration.mapper.alignment_step_mapper import AlignmentStepMapper
+from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
+from sziszapangma.integration.task.processing_task import ProcessingTask
+from sziszapangma.model.model import Word
+from sziszapangma.model.relation_manager import RelationManager
+
+_CLASSIC_WER = "classic_wer"
+
+
+class SpacyPosWerProcessingTask(ProcessingTask):
+    _gold_transcript_pos_property_name: str
+    _asr_pos_property_name: str
+    _pos_alignment_wer: str
+    _pos_metrics_wer: str
+    _alignment_classic_calculator: AlignmentClassicCalculator
+    _wer_calculator: WerCalculator
+
+    def __init__(
+        self,
+        task_name: str,
+        gold_transcript_pos_property_name: str,
+        require_update: bool,
+        asr_pos_property_name: str,
+        pos_alignment_wer: str,
+        pos_metrics_wer: str
+    ):
+        super().__init__(task_name, require_update)
+        self._gold_transcript_pos_property_name = gold_transcript_pos_property_name
+        self._asr_pos_property_name = asr_pos_property_name
+        self._pos_alignment_wer = pos_alignment_wer
+        self._pos_metrics_wer = pos_metrics_wer
+        self._alignment_classic_calculator = AlignmentClassicCalculator()
+        self._wer_calculator = WerCalculator()
+
+    def run_single_process(
+        self,
+        record_id: str,
+        experiment_repository: ExperimentRepository,
+        relation_manager: RelationManager,
+    ):
+        gold_pos_property = experiment_repository.get_property_for_key(
+            record_id, self._gold_transcript_pos_property_name)
+        asr_pos_property = experiment_repository.get_property_for_key(
+            record_id, self._asr_pos_property_name)
+        # Skip records with a missing property instead of iterating over None.
+        if gold_pos_property is not None and asr_pos_property is not None:
+            gold_transcript_pos = [
+                Word(id=it['id'], type='Word', text=it['pos']) for it in gold_pos_property
+            ]
+            asr_transcript_pos = [
+                Word(id=it['id'], type='Word', text=it['pos']) for it in asr_pos_property
+            ]
+            alignment_steps = self._get_alignment(gold_transcript_pos, asr_transcript_pos)
+            experiment_repository.update_property_for_key(
+                record_id,
+                self._pos_alignment_wer,
+                [AlignmentStepMapper.to_json_dict(it) for it in alignment_steps],
+            )
+            experiment_repository.update_property_for_key(
+                record_id, self._pos_metrics_wer, self.calculate_metrics(alignment_steps)
+            )
+
+    def _get_alignment(
+        self, gold_transcript: List[Word], asr_transcript: List[Word]
+    ) -> List[AlignmentStep]:
+        return self._alignment_classic_calculator.calculate_alignment(
+            reference=gold_transcript, hypothesis=asr_transcript
+        )
+
+    def calculate_metrics(self, alignment_steps: List[AlignmentStep]) -> Dict[str, Any]:
+        """Calculate all metrics for data sample."""
+        metrics = dict()
+        metrics[_CLASSIC_WER] = self._wer_calculator.calculate_wer(alignment_steps)
+        return metrics
+
+    def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool:
+        return (
+            experiment_repository.get_property_for_key(record_id, self._pos_metrics_wer)
+            is not None
+        )
diff --git a/experiment/pos_processing/spacy_token_pos_processing_task.py b/experiment/pos_processing/spacy_token_pos_processing_task.py
new file mode 100644
index 0000000..2c8bb0b
--- /dev/null
+++ b/experiment/pos_processing/spacy_token_pos_processing_task.py
@@ -0,0 +1,58 @@
+import uuid
+from abc import abstractmethod, ABC
+from dataclasses import dataclass
+from typing import Any, Dict
+
+import spacy
+from spacy.tokens import Token
+
+from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
+from sziszapangma.integration.task.processing_task import ProcessingTask
+from sziszapangma.model.relation_manager import RelationManager
+
+
+@dataclass
+class WordSpan:
+    text: str
+    index_start: int
+    index_end: int
+
+
+class SpacyTokenPosProcessingTask(ProcessingTask, ABC):
+    # the spaCy pipeline is loaded in __init__ (Polish model "pl_core_news_lg")
+    _nlp: Any
+    _input_property_name: str
+    _spacy_property_name: str
+
+    def __init__(
+        self,
+        task_name: str,
+        spacy_property_name: str,
+        require_update: bool,
+        input_property_name: str
+    ):
+        super().__init__(task_name, require_update)
+        self._spacy_property_name = spacy_property_name
+        self._nlp = spacy.load("pl_core_news_lg")  # hard-coded Polish spaCy model
+        self._input_property_name = input_property_name
+
+    def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository,
+                           relation_manager: RelationManager):
+        test_property = experiment_repository.get_property_for_key(
+            record_id, self._input_property_name)
+        document = self._nlp(self.get_transcript_to_process(test_property))
+
+        spacy_result = [self.token_to_result_dict(token) for token in document]
+        experiment_repository.update_property_for_key(record_id, self._spacy_property_name,
+                                                      spacy_result)
+
+    @staticmethod
+    def token_to_result_dict(token: Token) -> Dict[str, str]:
+        return {'id': str(uuid.uuid4()), 'word': token.text, 'pos': token.pos_}
+
+    @abstractmethod
+    def get_transcript_to_process(self, property_value: Any) -> str:
+        """Return the transcript text to run through the spaCy pipeline."""
+
+    def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool:
+        return experiment_repository.property_exists(record_id, self._spacy_property_name)
-- 
GitLab