Divide WerCalculator into Alignment and WER calculators

02e76f7f · Marcin Wątroba · 2e9b00d6 · 02e76f7f · 02e76f7f · 02e76f7f
Unverified Commit 02e76f7f authored 4 years ago by Marcin Wątroba
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,6 +21,6 @@ exclude = docs
 # Define setup.py command aliases here
 test = pytest
-[tool:pytest]
+;[tool:pytest]
-collect_ignore = ['setup.py']
+;collect_ignore = ['setup.py']
--- a/sziszapangma/core/alignment/__init__.py
+++ b/sziszapangma/core/alignment/__init__.py
--- a/sziszapangma/core/alignment/alignment_calculator.py
+++ b/sziszapangma/core/alignment/alignment_calculator.py
+from abc import ABC
+from typing import List, Tuple, Optional
+import numpy as np
+from sziszapangma.core.alignment.step_type import StepType
+from sziszapangma.core.alignment.alignment_step import AlignmentStep
+from sziszapangma.core.alignment.distance_matrix_calculator import \
+    DistanceCalculator
+from sziszapangma.core.alignment.step_words import StepWords
+from sziszapangma.core.alignment.alignment_processing_step import \
+    AlignmentProcessingStep
+from sziszapangma.core.alignment.word import Word
+class AlignmentCalculator(ABC):
+    _distance_matrix_calculator: DistanceCalculator
+    def __init__(self, distance_matrix_calculator: DistanceCalculator):
+        self._distance_matrix_calculator = distance_matrix_calculator
+    def convert_processing_steps_to_result(
+        self,
+        processing_steps: List[AlignmentProcessingStep],
+    ) -> List[AlignmentStep]:
+        return [
+            AlignmentStep(step.step_type, step.step_words, step.step_cost)
+            for step in processing_steps
+        ]
+    def _get_reference_indexes_per_steps(
+        self,
+        steps: List[AlignmentProcessingStep]
+    ) -> List[int]:
+        counter = 0
+        indexes = []
+        for step in steps:
+            indexes.append(counter)
+            if step.step_type.contain_reference_word():
+                counter = counter + 1
+        return indexes
+    def get_distance_matrix_between_words(
+        self,
+        reference: List[Word],
+        hypothesis: List[Word]
+    ) -> np.ndarray:
+        return self._distance_matrix_calculator.calculate_distance_matrix(
+            reference, hypothesis)
+    @staticmethod
+    def _get_initialized_levenshtein_matrix(
+        reference: List[Word],
+        hypothesis: List[Word]
+    ) -> Tuple[np.ndarray, List[List[Optional[AlignmentProcessingStep]]]]:
+        # TODO: consider removing distance_arr and relying on steps_arr instead
+        reference_len = len(reference)
+        hypothesis_len = len(hypothesis)
+        distance_arr = np.zeros((reference_len + 1) * (hypothesis_len + 1)) \
+            .reshape((reference_len + 1, hypothesis_len + 1))
+        steps_arr = [
+            [None for _ in range(hypothesis_len + 1)]
+            for _ in range(reference_len + 1)
+        ]
+        # levenshtein initial
+        for ref_index in range(reference_len + 1):
+            distance_arr[ref_index][0] = ref_index
+            step_words = StepWords(
+                reference[ref_index - 1] if ref_index > 0 else None,
+                None
+            )
+            steps_arr[ref_index][0] = AlignmentProcessingStep\
+                .levenshtein_deletion(ref_index - 1, step_words)
+        for hyp_index in range(hypothesis_len + 1):
+            distance_arr[0][hyp_index] = hyp_index
+            step_words = StepWords(
+                None,
+                hypothesis[hyp_index - 1] if hyp_index > 0 else None
+            )
+            steps_arr[0][hyp_index] = AlignmentProcessingStep\
+                .levenshtein_insertion(hyp_index - 1, step_words)
+        return distance_arr, steps_arr
+    @staticmethod
+    def _get_levenshtein_processing_step_cross(
+        prev_cross_distance: float,
+        step_words: StepWords,
+        current_distance: float
+    ) -> AlignmentProcessingStep:
+        return AlignmentProcessingStep.levenshtein_correct(
+            prev_cross_distance, step_words, 0) \
+            if current_distance == 0 \
+            else AlignmentProcessingStep.levenshtein_substitution(
+            prev_cross_distance, step_words, current_distance)
+    def get_levenshtein_embedding_based(
+        self,
+        reference: List[Word],
+        hypothesis: List[Word],
+        distance_matrix: np.ndarray
+    ) -> Tuple[np.ndarray, List[List[AlignmentProcessingStep]]]:
+        reference_len = len(reference)
+        hypothesis_len = len(hypothesis)
+        distance_arr, steps_arr = self._get_initialized_levenshtein_matrix(
+            reference, hypothesis)
+        for ref_index in range(reference_len):
+            for hyp_index in range(hypothesis_len):
+                step_words = StepWords(reference[ref_index],
+                                       hypothesis[hyp_index])
+                current_distance = distance_matrix[ref_index][hyp_index]
+                prev_cross_distance = distance_arr[ref_index][hyp_index]
+                cross_go_step = self._get_levenshtein_processing_step_cross(
+                    prev_cross_distance, step_words, current_distance)
+                insertion_step = AlignmentProcessingStep.levenshtein_insertion(
+                    distance_arr[ref_index + 1][hyp_index], step_words)
+                deletion_step = AlignmentProcessingStep.levenshtein_deletion(
+                    distance_arr[ref_index][hyp_index + 1], step_words)
+                best_step = min([cross_go_step, insertion_step, deletion_step],
+                                key=lambda it: it.total_distance())
+                distance_arr[ref_index + 1][hyp_index + 1] = \
+                    best_step.total_distance()
+                steps_arr[ref_index + 1][hyp_index + 1] = best_step
+        return distance_arr, steps_arr
+    def extract_steps_path(
+        self,
+        steps_matrix: List[List[AlignmentProcessingStep]]
+    ) -> List[AlignmentProcessingStep]:
+        x = len(steps_matrix) - 1
+        y = len(steps_matrix[0]) - 1
+        to_return = []
+        while not (x == 0 and y == 0):
+            current_step = steps_matrix[x][y]
+            to_return.append(current_step)
+            if current_step.step_type == StepType.DELETION:
+                x = x - 1
+            elif current_step.step_type == StepType.INSERTION:
+                y = y - 1
+            else:  # correct and substitution
+                y = y - 1
+                x = x - 1
+        return to_return[::-1]
+    def _calculate_steps_path(
+        self,
+        reference: List[Word],
+        hypothesis: List[Word]
+    ) -> List[AlignmentProcessingStep]:
+        distance_between_words = self.get_distance_matrix_between_words(
+            reference, hypothesis)
+        _, steps_matrix = self.get_levenshtein_embedding_based(
+            reference, hypothesis, distance_between_words)
+        return self.extract_steps_path(steps_matrix)
+    def calculate_alignment(
+        self,
+        reference: List[Word],
+        hypothesis: List[Word]
+    ) -> List[AlignmentStep]:
+        steps_path = self._calculate_steps_path(reference, hypothesis)
+        return self.convert_processing_steps_to_result(steps_path)
+    def calculate_alignment_weighted(
+        self,
+        reference: List[Word],
+        hypothesis: List[Word],
+        weights: List[float]
+    ) -> List[AlignmentStep]:
+        steps_path = self._calculate_steps_path(reference, hypothesis)
+        return self.convert_processing_steps_to_result(steps_path)
--- a/sziszapangma/core/wer/classic_wer_calculator.py
+++ b/sziszapangma/core/wer/classic_wer_calculator.py
-from sziszapangma.core.wer.distance_matrix_calculator import \
+from sziszapangma.core.alignment.alignment_calculator import \
+    AlignmentCalculator
+from sziszapangma.core.alignment.distance_matrix_calculator import \
    BinaryDistanceCalculator
-from sziszapangma.core.wer.wer_calculator import WerCalculator
-class ClassicWerCalculator(WerCalculator):
+class AlignmentClassicCalculator(AlignmentCalculator):
    def __init__(self):
        super().__init__(BinaryDistanceCalculator())
--- a/sziszapangma/core/wer/wer_embedding_calculator.py
+++ b/sziszapangma/core/wer/wer_embedding_calculator.py
-from typing import List, Optional
+from typing import List
+from sziszapangma.core.alignment.alignment_calculator import \
+    AlignmentCalculator
+from sziszapangma.core.alignment.alignment_processing_step import \
+    AlignmentProcessingStep
+from sziszapangma.core.alignment.alignment_step import AlignmentStep
+from sziszapangma.core.alignment.distance_matrix_calculator import \
+    BinaryDistanceCalculator, DistanceCalculator, CosineDistanceCalculator
+from sziszapangma.core.alignment.step_words import StepWords
 from sziszapangma.core.transformer.embedding_transformer import \
    EmbeddingTransformer
-from sziszapangma.core.wer.distance_matrix_calculator import \
-    BinaryDistanceCalculator, DistanceCalculator, CosineDistanceCalculator
-from sziszapangma.core.wer.step_words import StepWords
-from sziszapangma.core.wer.wer_calculator import WerCalculator
-from sziszapangma.core.wer.wer_processing_step import WerProcessingStep
-from sziszapangma.core.wer.wer_step import WerStep
-class WerEmbeddingCalculator(WerCalculator):
+class AlignmentEmbeddingCalculator(AlignmentCalculator):
    _distance_calculator: DistanceCalculator
    def __init__(self, embedding_transformer: EmbeddingTransformer):
@@ -30,7 +32,7 @@ class WerEmbeddingCalculator(WerCalculator):
    def _calculate_result_cost_for_step(
        self,
-        processing_step: WerProcessingStep
+        processing_step: AlignmentProcessingStep
    ) -> float:
        step_words = processing_step.step_words
        return self._calculate_distance_for_word_step(step_words) \
@@ -39,25 +41,10 @@ class WerEmbeddingCalculator(WerCalculator):
    def convert_processing_steps_to_result(
        self,
-        processing_steps: List[WerProcessingStep],
+        processing_steps: List[AlignmentProcessingStep]
-        reference_weights: Optional[List[float]] = None
+    ) -> List[AlignmentStep]:
-    ) -> List[WerStep]:
+        return [
-        if reference_weights is None:
+            AlignmentStep(step.step_type, step.step_words,
-            return [
+                          self._calculate_result_cost_for_step(step))
-                WerStep(step.step_type, step.step_words,
+            for step in processing_steps
-                        self._calculate_result_cost_for_step(step))
+        ]
-                for step in processing_steps
-            ]
-        else:
-            indexes_per_steps = self._get_reference_indexes_per_steps(
-                processing_steps)
-            return [
-                WerStep(
-                    processing_steps[step_index].step_type,
-                    processing_steps[step_index].step_words,
-                    reference_weights[indexes_per_steps[step_index]] *
-                    self._calculate_result_cost_for_step(
-                        processing_steps[step_index])
-                )
-                for step_index in range(len(processing_steps))
-            ]
--- a/sziszapangma/core/wer/wer_processing_step.py
+++ b/sziszapangma/core/wer/wer_processing_step.py
 from dataclasses import dataclass
-from sziszapangma.core.wer.step_type import StepType
+from sziszapangma.core.alignment.step_type import StepType
-from sziszapangma.core.wer.step_words import StepWords
+from sziszapangma.core.alignment.step_words import StepWords
 @dataclass(frozen=True)
-class WerProcessingStep:
+class AlignmentProcessingStep:
    step_type: StepType
    step_words: StepWords
    previous_distance: float
@@ -15,27 +15,27 @@ class WerProcessingStep:
    def levenshtein_insertion(cls, previous_distance: float,
                              step_words: StepWords, step_cost: float = 1):
        words = StepWords(None, step_words.hypothesis_word)
-        return WerProcessingStep(StepType.INSERTION, words,
+        return AlignmentProcessingStep(StepType.INSERTION, words,
-                                 previous_distance, step_cost)
+                                       previous_distance, step_cost)
    @classmethod
    def levenshtein_deletion(cls, previous_distance: float,
                             step_words: StepWords, step_cost: float = 1):
        words = StepWords(step_words.reference_word, None)
-        return WerProcessingStep(StepType.DELETION, words,
+        return AlignmentProcessingStep(StepType.DELETION, words,
-                                 previous_distance, step_cost)
+                                       previous_distance, step_cost)
    @classmethod
    def levenshtein_substitution(cls, previous_distance: float,
                                 step_words: StepWords, step_cost: float):
-        return WerProcessingStep(StepType.SUBSTITUTION, step_words,
+        return AlignmentProcessingStep(StepType.SUBSTITUTION, step_words,
-                                 previous_distance, step_cost)
+                                       previous_distance, step_cost)
    @classmethod
    def levenshtein_correct(cls, previous_distance: float,
                            step_words: StepWords, step_cost: float):
-        return WerProcessingStep(StepType.CORRECT, step_words,
+        return AlignmentProcessingStep(StepType.CORRECT, step_words,
-                                 previous_distance, step_cost)
+                                       previous_distance, step_cost)
    def total_distance(self) -> float:
        return self.step_cost + self.previous_distance
--- a/sziszapangma/core/wer/wer_soft_calculator.py
+++ b/sziszapangma/core/wer/wer_soft_calculator.py
-from sziszapangma.core.wer.distance_matrix_calculator import \
+from sziszapangma.core.alignment.alignment_calculator import \
+    AlignmentCalculator
+from sziszapangma.core.alignment.distance_matrix_calculator import \
    CosineDistanceCalculator
-from sziszapangma.core.wer.wer_calculator import WerCalculator
 from sziszapangma.core.transformer.embedding_transformer import \
    EmbeddingTransformer
-class WerSoftCalculator(WerCalculator):
+class AlignmentSoftCalculator(AlignmentCalculator):
    def __init__(self, embedding_transformer: EmbeddingTransformer):
        super().__init__(CosineDistanceCalculator(embedding_transformer))
--- a/sziszapangma/core/alignment/alignment_step.py
+++ b/sziszapangma/core/alignment/alignment_step.py
+from dataclasses import dataclass
+from sziszapangma.core.alignment.step_type import StepType
+from sziszapangma.core.alignment.step_words import StepWords
+@dataclass(frozen=True)
+class AlignmentStep:
+    step_type: StepType
+    step_words: StepWords
+    step_cost: float
+    def with_weight_multiplication(self, weight: float):
+        return AlignmentStep(
+            step_type=self.step_type,
+            step_words=self.step_words,
+            step_cost=self.step_cost * weight
+        )
--- a/sziszapangma/core/alignment/alignment_util.py
+++ b/sziszapangma/core/alignment/alignment_util.py
+from typing import List, Optional
+import numpy as np
+import pandas as pd
+from sziszapangma.core.alignment.alignment_step import AlignmentStep
+class AlignmentUtil:
+    @staticmethod
+    def _optional_str_to_str(value: Optional[str]) -> str:
+        return value if value is not None else ''
+    @staticmethod
+    def _wer_step_to_pandas_row_lit(step: AlignmentStep) -> List[any]:
+        return [
+            step.step_type.get_short_name(),
+            AlignmentUtil._optional_str_to_str(step.step_words.reference_word),
+            AlignmentUtil._optional_str_to_str(
+                step.step_words.hypothesis_word),
+            round(step.step_cost, 3)
+        ]
+    @staticmethod
+    def steps_to_dataframe(steps: List[AlignmentStep]) -> pd.DataFrame:
+        arr = np.array([
+            AlignmentUtil._wer_step_to_pandas_row_lit(step)
+            for step in steps
+        ])
+        return pd.DataFrame(
+            arr,
+            columns=['step_type', 'reference', 'hypothesis', 'cost']
+        )
+    @staticmethod
+    def get_reference_indexes_per_steps(
+        steps: List[AlignmentStep]
+    ) -> List[int]:
+        counter = 0
+        indexes = []
+        for step in steps:
+            indexes.append(counter)
+            if step.step_type.contain_reference_word():
+                counter = counter + 1
+        return indexes
+    @staticmethod
+    def get_reference_length(steps: List[AlignmentStep]) -> int:
+        return sum([
+            1 if step.step_type.contain_reference_word() else 0
+            for step in steps
+        ])
+    @staticmethod
+    def apply_weights_to_alignment(
+        steps: List[AlignmentStep],
+        weights: List[float]
+    ) -> List[AlignmentStep]:
+        if AlignmentUtil.get_reference_length(steps) != len(weights):
+            raise Exception(
+                f"Incorrect length of weights, current={len(weights)}, "
+                f"required={AlignmentUtil.get_reference_length(steps)}"
+            )
+        reference_indexes_per_steps = \
+            AlignmentUtil.get_reference_indexes_per_steps(steps)
+        return [
+            steps[index].with_weight_multiplication(
+                weights[reference_indexes_per_steps[index]])
+            for index in range(len(steps))
+        ]
--- a/sziszapangma/core/wer/distance_matrix_calculator.py
+++ b/sziszapangma/core/wer/distance_matrix_calculator.py
@@ -5,7 +5,7 @@ import numpy as np
 from sziszapangma.core.transformer.embedding_transformer import \
    EmbeddingTransformer
-from sziszapangma.core.wer.word import Word
+from sziszapangma.core.alignment.word import Word
 class DistanceCalculator(ABC):

--- a/sziszapangma/core/wer/step_type.py
+++ b/sziszapangma/core/wer/step_type.py
--- a/sziszapangma/core/wer/step_words.py
+++ b/sziszapangma/core/wer/step_words.py
 from dataclasses import dataclass
 from typing import Optional
-from sziszapangma.core.wer.word import Word
+from sziszapangma.core.alignment.word import Word
 @dataclass(frozen=True)

--- a/sziszapangma/core/wer/word.py
+++ b/sziszapangma/core/wer/word.py
--- a/sziszapangma/core/wer/__pycache__/classic_wer_calculator.cpython-38.pyc
+++ b/sziszapangma/core/wer/__pycache__/classic_wer_calculator.cpython-38.pyc
--- a/sziszapangma/core/wer/__pycache__/distance_matrix_calculator.cpython-38.pyc
+++ b/sziszapangma/core/wer/__pycache__/distance_matrix_calculator.cpython-38.pyc
--- a/sziszapangma/core/wer/__pycache__/step_type.cpython-38.pyc
+++ b/sziszapangma/core/wer/__pycache__/step_type.cpython-38.pyc
--- a/sziszapangma/core/wer/__pycache__/step_words.cpython-38.pyc
+++ b/sziszapangma/core/wer/__pycache__/step_words.cpython-38.pyc
--- a/sziszapangma/core/wer/__pycache__/wer_embedding_calculator.cpython-38.pyc
+++ b/sziszapangma/core/wer/__pycache__/wer_embedding_calculator.cpython-38.pyc
--- a/sziszapangma/core/wer/__pycache__/wer_processing_step.cpython-38.pyc
+++ b/sziszapangma/core/wer/__pycache__/wer_processing_step.cpython-38.pyc
--- a/sziszapangma/core/wer/__pycache__/wer_soft_calculator.cpython-38.pyc
+++ b/sziszapangma/core/wer/__pycache__/wer_soft_calculator.cpython-38.pyc