diff --git a/setup.cfg b/setup.cfg index a65cf7a3f90c161536575830569515ffb564a0e6..2642f6ad6c22dacc518a2e26b7c4078cf623ff74 100644 --- a/setup.cfg +++ b/setup.cfg @@ -21,6 +21,6 @@ exclude = docs # Define setup.py command aliases here test = pytest -[tool:pytest] -collect_ignore = ['setup.py'] +;[tool:pytest] +;collect_ignore = ['setup.py'] diff --git a/sziszapangma/core/alignment/__init__.py b/sziszapangma/core/alignment/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sziszapangma/core/alignment/alignment_calculator.py b/sziszapangma/core/alignment/alignment_calculator.py new file mode 100644 index 0000000000000000000000000000000000000000..f69ec9537fac17d7a25d8768ea6bd6eec08c07af --- /dev/null +++ b/sziszapangma/core/alignment/alignment_calculator.py @@ -0,0 +1,179 @@ +from abc import ABC +from typing import List, Tuple, Optional + +import numpy as np + +from sziszapangma.core.alignment.step_type import StepType +from sziszapangma.core.alignment.alignment_step import AlignmentStep +from sziszapangma.core.alignment.distance_matrix_calculator import \ + DistanceCalculator +from sziszapangma.core.alignment.step_words import StepWords +from sziszapangma.core.alignment.alignment_processing_step import \ + AlignmentProcessingStep +from sziszapangma.core.alignment.word import Word + + +class AlignmentCalculator(ABC): + _distance_matrix_calculator: DistanceCalculator + + def __init__(self, distance_matrix_calculator: DistanceCalculator): + self._distance_matrix_calculator = distance_matrix_calculator + + def convert_processing_steps_to_result( + self, + processing_steps: List[AlignmentProcessingStep], + ) -> List[AlignmentStep]: + return [ + AlignmentStep(step.step_type, step.step_words, step.step_cost) + for step in processing_steps + ] + + def _get_reference_indexes_per_steps( + self, + steps: List[AlignmentProcessingStep] + ) -> List[int]: + counter = 0 + indexes = [] + for step in steps: + indexes.append(counter) + if step.step_type.contain_reference_word(): + counter = counter + 1 + return indexes + + def get_distance_matrix_between_words( + self, + reference: List[Word], + hypothesis: List[Word] + ) -> np.ndarray: + return self._distance_matrix_calculator.calculate_distance_matrix( + reference, hypothesis) + + @staticmethod + def _get_initialized_levenshtein_matrix( + reference: List[Word], + hypothesis: List[Word] + ) -> Tuple[np.ndarray, List[List[Optional[AlignmentProcessingStep]]]]: + + # TODO: consider about remove distance_arr replaced by steps_arr + reference_len = len(reference) + hypothesis_len = len(hypothesis) + distance_arr = np.zeros((reference_len + 1) * (hypothesis_len + 1)) \ + .reshape((reference_len + 1, hypothesis_len + 1)) + steps_arr = [ + [None for _ in range(hypothesis_len + 1)] + for _ in range(reference_len + 1) + ] + + # levenshtein initial + for ref_index in range(reference_len + 1): + distance_arr[ref_index][0] = ref_index + step_words = StepWords( + reference[ref_index - 1] if ref_index > 0 else None, + None + ) + steps_arr[ref_index][0] = AlignmentProcessingStep\ + .levenshtein_deletion(ref_index - 1, step_words) + for hyp_index in range(hypothesis_len + 1): + distance_arr[0][hyp_index] = hyp_index + step_words = StepWords( + None, + hypothesis[hyp_index - 1] if hyp_index > 0 else None + ) + steps_arr[0][hyp_index] = AlignmentProcessingStep\ + .levenshtein_insertion(hyp_index - 1, step_words) + + return distance_arr, steps_arr + + @staticmethod + def _get_levenshtein_processing_step_cross( + prev_cross_distance: float, + step_words: StepWords, + current_distance: float + ) -> AlignmentProcessingStep: + return AlignmentProcessingStep.levenshtein_correct( + prev_cross_distance, step_words, 0) \ + if current_distance == 0 \ + else AlignmentProcessingStep.levenshtein_substitution( + prev_cross_distance, step_words, current_distance) + + def get_levenshtein_embedding_based( + self, + reference: List[Word], + hypothesis: List[Word], + distance_matrix: np.ndarray + ) -> Tuple[np.ndarray, List[List[AlignmentProcessingStep]]]: + + reference_len = len(reference) + hypothesis_len = len(hypothesis) + distance_arr, steps_arr = self._get_initialized_levenshtein_matrix( + reference, hypothesis) + + for ref_index in range(reference_len): + for hyp_index in range(hypothesis_len): + step_words = StepWords(reference[ref_index], + hypothesis[hyp_index]) + current_distance = distance_matrix[ref_index][hyp_index] + prev_cross_distance = distance_arr[ref_index][hyp_index] + + cross_go_step = self._get_levenshtein_processing_step_cross( + prev_cross_distance, step_words, current_distance) + insertion_step = AlignmentProcessingStep.levenshtein_insertion( + distance_arr[ref_index + 1][hyp_index], step_words) + deletion_step = AlignmentProcessingStep.levenshtein_deletion( + distance_arr[ref_index][hyp_index + 1], step_words) + + best_step = min([cross_go_step, insertion_step, deletion_step], + key=lambda it: it.total_distance()) + + distance_arr[ref_index + 1][hyp_index + 1] = \ + best_step.total_distance() + steps_arr[ref_index + 1][hyp_index + 1] = best_step + + return distance_arr, steps_arr + + def extract_steps_path( + self, + steps_matrix: List[List[AlignmentProcessingStep]] + ) -> List[AlignmentProcessingStep]: + x = len(steps_matrix) - 1 + y = len(steps_matrix[0]) - 1 + to_return = [] + while not (x == 0 and y == 0): + current_step = steps_matrix[x][y] + to_return.append(current_step) + if current_step.step_type == StepType.DELETION: + x = x - 1 + elif current_step.step_type == StepType.INSERTION: + y = y - 1 + else: # creation and substitution + y = y - 1 + x = x - 1 + return to_return[::-1] + + def _calculate_steps_path( + self, + reference: List[Word], + hypothesis: List[Word] + ) -> List[AlignmentProcessingStep]: + distance_between_words = self.get_distance_matrix_between_words( + reference, hypothesis) + _, steps_matrix = self.get_levenshtein_embedding_based( + reference, hypothesis, distance_between_words) + return self.extract_steps_path(steps_matrix) + + def calculate_alignment( + self, + reference: List[Word], + hypothesis: List[Word] + ) -> List[AlignmentStep]: + steps_path = self._calculate_steps_path(reference, hypothesis) + return self.convert_processing_steps_to_result(steps_path) + + def calculate_alignment_weighted( + self, + reference: List[Word], + hypothesis: List[Word], + weights: List[float] + ) -> List[AlignmentStep]: + steps_path = self._calculate_steps_path(reference, hypothesis) + return self.convert_processing_steps_to_result(steps_path) diff --git a/sziszapangma/core/alignment/alignment_classic_calculator.py b/sziszapangma/core/alignment/alignment_classic_calculator.py new file mode 100644 index 0000000000000000000000000000000000000000..fbf60eb310caa9d7cd2370ba253f4168da7c62f7 --- /dev/null +++ b/sziszapangma/core/alignment/alignment_classic_calculator.py @@ -0,0 +1,10 @@ +from sziszapangma.core.alignment.alignment_calculator import \ + AlignmentCalculator +from sziszapangma.core.alignment.distance_matrix_calculator import \ + BinaryDistanceCalculator + + +class AlignmentClassicCalculator(AlignmentCalculator): + + def __init__(self): + super().__init__(BinaryDistanceCalculator()) diff --git a/sziszapangma/core/alignment/alignment_embedding_calculator.py b/sziszapangma/core/alignment/alignment_embedding_calculator.py new file mode 100644 index 0000000000000000000000000000000000000000..a20802dc2d2ad132edb57854b7c1e1ec71ef621d --- /dev/null +++ b/sziszapangma/core/alignment/alignment_embedding_calculator.py @@ -0,0 +1,50 @@ +from typing import List + +from sziszapangma.core.alignment.alignment_calculator import \ + AlignmentCalculator +from sziszapangma.core.alignment.alignment_processing_step import \ + AlignmentProcessingStep +from sziszapangma.core.alignment.alignment_step import AlignmentStep +from sziszapangma.core.alignment.distance_matrix_calculator import \ + BinaryDistanceCalculator, DistanceCalculator, CosineDistanceCalculator +from sziszapangma.core.alignment.step_words import StepWords +from sziszapangma.core.transformer.embedding_transformer import \ + EmbeddingTransformer + + +class AlignmentEmbeddingCalculator(AlignmentCalculator): + _distance_calculator: DistanceCalculator + + def __init__(self, embedding_transformer: EmbeddingTransformer): + super().__init__(BinaryDistanceCalculator()) + self._embedding_transformer = embedding_transformer + self._distance_calculator = CosineDistanceCalculator( + embedding_transformer) + + def _calculate_distance_for_word_step( + self, + step_words: StepWords + ) -> float: + return self._distance_calculator.calculate_distance_for_words( + step_words.reference_word, + step_words.hypothesis_word + ) + + def _calculate_result_cost_for_step( + self, + processing_step: AlignmentProcessingStep + ) -> float: + step_words = processing_step.step_words + return self._calculate_distance_for_word_step(step_words) \ + if processing_step.step_type.is_cross_step() \ + else processing_step.step_cost + + def convert_processing_steps_to_result( + self, + processing_steps: List[AlignmentProcessingStep] + ) -> List[AlignmentStep]: + return [ + AlignmentStep(step.step_type, step.step_words, + self._calculate_result_cost_for_step(step)) + for step in processing_steps + ] diff --git a/sziszapangma/core/wer/wer_processing_step.py b/sziszapangma/core/alignment/alignment_processing_step.py similarity index 58% rename from sziszapangma/core/wer/wer_processing_step.py rename to sziszapangma/core/alignment/alignment_processing_step.py index 15d92dddefca7deb436b7d24bd734e8e72acf94e..e4ab96dc1578348db186006797eb1801842a1588 100644 --- a/sziszapangma/core/wer/wer_processing_step.py +++ b/sziszapangma/core/alignment/alignment_processing_step.py @@ -1,11 +1,11 @@ from dataclasses import dataclass -from sziszapangma.core.wer.step_type import StepType -from sziszapangma.core.wer.step_words import StepWords +from sziszapangma.core.alignment.step_type import StepType +from sziszapangma.core.alignment.step_words import StepWords @dataclass(frozen=True) -class WerProcessingStep: +class AlignmentProcessingStep: step_type: StepType step_words: StepWords previous_distance: float @@ -15,27 +15,27 @@ class WerProcessingStep: def levenshtein_insertion(cls, previous_distance: float, step_words: StepWords, step_cost: float = 1): words = StepWords(None, step_words.hypothesis_word) - return WerProcessingStep(StepType.INSERTION, words, - previous_distance, step_cost) + return AlignmentProcessingStep(StepType.INSERTION, words, + previous_distance, step_cost) @classmethod def levenshtein_deletion(cls, previous_distance: float, step_words: StepWords, step_cost: float = 1): words = StepWords(step_words.reference_word, None) - return WerProcessingStep(StepType.DELETION, words, - previous_distance, step_cost) + return AlignmentProcessingStep(StepType.DELETION, words, + previous_distance, step_cost) @classmethod def levenshtein_substitution(cls, previous_distance: float, step_words: StepWords, step_cost: float): - return WerProcessingStep(StepType.SUBSTITUTION, step_words, - previous_distance, step_cost) + return AlignmentProcessingStep(StepType.SUBSTITUTION, step_words, + previous_distance, step_cost) @classmethod def levenshtein_correct(cls, previous_distance: float, step_words: StepWords, step_cost: float): - return WerProcessingStep(StepType.CORRECT, step_words, - previous_distance, step_cost) + return AlignmentProcessingStep(StepType.CORRECT, step_words, + previous_distance, step_cost) def total_distance(self) -> float: return self.step_cost + self.previous_distance diff --git a/sziszapangma/core/wer/wer_soft_calculator.py b/sziszapangma/core/alignment/alignment_soft_calculator.py similarity index 56% rename from sziszapangma/core/wer/wer_soft_calculator.py rename to sziszapangma/core/alignment/alignment_soft_calculator.py index e17728264cb922a6357005de289ed2e3cebf7cd9..c7de34cecef6693d6058260a1a2fd3a8997acb7b 100644 --- a/sziszapangma/core/wer/wer_soft_calculator.py +++ b/sziszapangma/core/alignment/alignment_soft_calculator.py @@ -1,11 +1,12 @@ -from sziszapangma.core.wer.distance_matrix_calculator import \ +from sziszapangma.core.alignment.alignment_calculator import \ + AlignmentCalculator +from sziszapangma.core.alignment.distance_matrix_calculator import \ CosineDistanceCalculator -from sziszapangma.core.wer.wer_calculator import WerCalculator from sziszapangma.core.transformer.embedding_transformer import \ EmbeddingTransformer -class WerSoftCalculator(WerCalculator): +class AlignmentSoftCalculator(AlignmentCalculator): def __init__(self, embedding_transformer: EmbeddingTransformer): super().__init__(CosineDistanceCalculator(embedding_transformer)) diff --git a/sziszapangma/core/alignment/alignment_step.py b/sziszapangma/core/alignment/alignment_step.py new file mode 100644 index 0000000000000000000000000000000000000000..cefd0d105f499b0e1d1ba68d29d8fedc16f2e17b --- /dev/null +++ b/sziszapangma/core/alignment/alignment_step.py @@ -0,0 +1,18 @@ +from dataclasses import dataclass + +from sziszapangma.core.alignment.step_type import StepType +from sziszapangma.core.alignment.step_words import StepWords + + +@dataclass(frozen=True) +class AlignmentStep: + step_type: StepType + step_words: StepWords + step_cost: float + + def with_weight_multiplication(self, weight: float): + return AlignmentStep( + step_type=self.step_type, + step_words=self.step_words, + step_cost=self.step_cost * weight + ) diff --git a/sziszapangma/core/alignment/alignment_util.py b/sziszapangma/core/alignment/alignment_util.py new file mode 100644 index 0000000000000000000000000000000000000000..c1887317e3925e2143fd85c32b0aa82199a410f9 --- /dev/null +++ b/sziszapangma/core/alignment/alignment_util.py @@ -0,0 +1,71 @@ +from typing import List, Optional + +import numpy as np +import pandas as pd + +from sziszapangma.core.alignment.alignment_step import AlignmentStep + + +class AlignmentUtil: + + @staticmethod + def _optional_str_to_str(value: Optional[str]) -> str: + return value if value is not None else '' + + @staticmethod + def _wer_step_to_pandas_row_lit(step: AlignmentStep) -> List[any]: + return [ + step.step_type.get_short_name(), + AlignmentUtil._optional_str_to_str(step.step_words.reference_word), + AlignmentUtil._optional_str_to_str( + step.step_words.hypothesis_word), + round(step.step_cost, 3) + ] + + @staticmethod + def steps_to_dataframe(steps: List[AlignmentStep]) -> pd.DataFrame: + arr = np.array([ + AlignmentUtil._wer_step_to_pandas_row_lit(step) + for step in steps + ]) + return pd.DataFrame( + arr, + columns=['step_type', 'reference', 'hypothesis', 'cost'] + ) + + @staticmethod + def get_reference_indexes_per_steps( + steps: List[AlignmentStep] + ) -> List[int]: + counter = 0 + indexes = [] + for step in steps: + indexes.append(counter) + if step.step_type.contain_reference_word(): + counter = counter + 1 + return indexes + + @staticmethod + def get_reference_length(steps: List[AlignmentStep]) -> int: + return sum([ + 1 if step.step_type.contain_reference_word() else 0 + for step in steps + ]) + + @staticmethod + def apply_weights_to_alignment( + steps: List[AlignmentStep], + weights: List[float] + ) -> List[AlignmentStep]: + if AlignmentUtil.get_reference_length(steps) != len(weights): + raise Exception( + f"Incorrect length of weights, current={len(weights)}, " + f"required={AlignmentUtil.get_reference_length(steps)}" + ) + reference_indexes_per_steps = \ + AlignmentUtil.get_reference_indexes_per_steps(steps) + return [ + steps[index].with_weight_multiplication( + weights[reference_indexes_per_steps[index]]) + for index in range(len(steps)) + ] diff --git a/sziszapangma/core/wer/distance_matrix_calculator.py b/sziszapangma/core/alignment/distance_matrix_calculator.py similarity index 98% rename from sziszapangma/core/wer/distance_matrix_calculator.py rename to sziszapangma/core/alignment/distance_matrix_calculator.py index 47e736a229165267193bb5ef926ce57875c22acc..5f17ea7f72e45d6f1bd0cad4f4732e786f591d22 100644 --- a/sziszapangma/core/wer/distance_matrix_calculator.py +++ b/sziszapangma/core/alignment/distance_matrix_calculator.py @@ -5,7 +5,7 @@ import numpy as np from sziszapangma.core.transformer.embedding_transformer import \ EmbeddingTransformer -from sziszapangma.core.wer.word import Word +from sziszapangma.core.alignment.word import Word class DistanceCalculator(ABC): diff --git a/sziszapangma/core/wer/step_type.py b/sziszapangma/core/alignment/step_type.py similarity index 100% rename from sziszapangma/core/wer/step_type.py rename to sziszapangma/core/alignment/step_type.py diff --git a/sziszapangma/core/wer/step_words.py b/sziszapangma/core/alignment/step_words.py similarity index 77% rename from sziszapangma/core/wer/step_words.py rename to sziszapangma/core/alignment/step_words.py index ced47b169f30b15f92cadcb79697cbdff2826add..067466f5318d20fd3d785ce9c1106403ac130574 100644 --- a/sziszapangma/core/wer/step_words.py +++ b/sziszapangma/core/alignment/step_words.py @@ -1,7 +1,7 @@ from dataclasses import dataclass from typing import Optional -from sziszapangma.core.wer.word import Word +from sziszapangma.core.alignment.word import Word @dataclass(frozen=True) diff --git a/sziszapangma/core/wer/word.py b/sziszapangma/core/alignment/word.py similarity index 100% rename from sziszapangma/core/wer/word.py rename to sziszapangma/core/alignment/word.py diff --git a/sziszapangma/core/wer/__pycache__/classic_wer_calculator.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/classic_wer_calculator.cpython-38.pyc deleted file mode 100644 index 5114407c01a0c978a0d91a8e6465be84cc749485..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/classic_wer_calculator.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/distance_matrix_calculator.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/distance_matrix_calculator.cpython-38.pyc deleted file mode 100644 index 2bf0c9c59168075c6523a4c441087072e2f1547f..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/distance_matrix_calculator.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/step_type.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/step_type.cpython-38.pyc deleted file mode 100644 index d3ee780df43be6c35dd46f10f72bad0a42c35bb8..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/step_type.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/step_words.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/step_words.cpython-38.pyc deleted file mode 100644 index 4142240350aabea762e0c740887510d3e13d56cf..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/step_words.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/wer_embedding_calculator.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/wer_embedding_calculator.cpython-38.pyc deleted file mode 100644 index 924ae83ebc3c912ad3acb439a5766bc626c0ac74..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/wer_embedding_calculator.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/wer_processing_step.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/wer_processing_step.cpython-38.pyc deleted file mode 100644 index e74e7f411d80ab89142f285eaaaf84f831cf9522..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/wer_processing_step.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/wer_soft_calculator.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/wer_soft_calculator.cpython-38.pyc deleted file mode 100644 index e66f737228a10bcea2f3c886a1ccb85c904c6d0c..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/wer_soft_calculator.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/wer_span_question.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/wer_span_question.cpython-38.pyc deleted file mode 100644 index 61e5fac5c3866d97cb474694ebc1fa60aba75bd9..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/wer_span_question.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/wer_step.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/wer_step.cpython-38.pyc deleted file mode 100644 index 15b375efa80ab3c0292468ee00bb3e178407600e..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/wer_step.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/wer_util.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/wer_util.cpython-38.pyc deleted file mode 100644 index 5caa38f0edc34f2f56e7ed8b79fd36da809fc616..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/wer_util.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/classic_wer_calculator.py b/sziszapangma/core/wer/classic_wer_calculator.py deleted file mode 100644 index db54d33764a3593ad53c0ddd40ea84ef3eaf8de9..0000000000000000000000000000000000000000 --- a/sziszapangma/core/wer/classic_wer_calculator.py +++ /dev/null @@ -1,9 +0,0 @@ -from sziszapangma.core.wer.distance_matrix_calculator import \ - BinaryDistanceCalculator -from sziszapangma.core.wer.wer_calculator import WerCalculator - - -class ClassicWerCalculator(WerCalculator): - - def __init__(self): - super().__init__(BinaryDistanceCalculator()) diff --git a/sziszapangma/core/wer/wer_span_question.py b/sziszapangma/core/wer/span.py similarity index 59% rename from sziszapangma/core/wer/wer_span_question.py rename to sziszapangma/core/wer/span.py index bfdf43488e7c8f3d4022de8fa892d3ad89b203f7..44cfe840d0f2c6f68743963e749ae00a27450191 100644 --- a/sziszapangma/core/wer/wer_span_question.py +++ b/sziszapangma/core/wer/span.py @@ -6,11 +6,11 @@ class Span: index_start: int index_end: int - def _is_index_belong(self, index: int) -> bool: + def is_index_belong(self, index: int) -> bool: return self.index_start <= index < self.index_end - def get_reference_weights_table(self, total_size: int): + def get_reference_mask_table(self, total_size: int): return [ - 1 if self._is_index_belong(it) else 0 + self.is_index_belong(it) for it in range(total_size) ] diff --git a/sziszapangma/core/wer/wer_calculator.py b/sziszapangma/core/wer/wer_calculator.py index edb06e4799f72d73327d635a87fe7991567db464..3fa65dbe5e0ff306a6ba118e4880db724a31d5bd 100644 --- a/sziszapangma/core/wer/wer_calculator.py +++ b/sziszapangma/core/wer/wer_calculator.py @@ -1,221 +1,52 @@ from abc import ABC -from typing import List, Tuple, Optional +from typing import List -import numpy as np - -from sziszapangma.core.wer.distance_matrix_calculator import \ - DistanceCalculator -from sziszapangma.core.wer.step_type import StepType -from sziszapangma.core.wer.wer_processing_step import WerProcessingStep -from sziszapangma.core.wer.wer_span_question import Span -from sziszapangma.core.wer.wer_step import WerStep, StepWords -from sziszapangma.core.wer.word import Word +from sziszapangma.core.alignment.alignment_step import AlignmentStep +from sziszapangma.core.alignment.alignment_util import AlignmentUtil +from sziszapangma.core.wer.span import Span class WerCalculator(ABC): - _distance_matrix_calculator: DistanceCalculator - - def __init__(self, distance_matrix_calculator: DistanceCalculator): - self._distance_matrix_calculator = distance_matrix_calculator - - def convert_processing_steps_to_result( - self, - processing_steps: List[WerProcessingStep], - reference_weights: Optional[List[float]] = None - ) -> List[WerStep]: - if reference_weights is None: - return [ - WerStep(step.step_type, step.step_words, step.step_cost) - for step in processing_steps - ] - else: - indexes_per_steps = self._get_reference_indexes_per_steps( - processing_steps) - return [ - WerStep( - processing_steps[step_index].step_type, - processing_steps[step_index].step_words, - reference_weights[indexes_per_steps[step_index]] * - processing_steps[step_index].step_cost - ) - for step_index in range(len(processing_steps)) - ] - - def get_distance_matrix_between_words( - self, - reference: List[Word], - hypothesis: List[Word] - ) -> np.ndarray: - return self._distance_matrix_calculator.calculate_distance_matrix( - reference, hypothesis) - - def extract_steps_path( - self, - steps_matrix: List[List[WerProcessingStep]] - ) -> List[WerProcessingStep]: - x = len(steps_matrix) - 1 - y = len(steps_matrix[0]) - 1 - to_return = [] - while not (x == 0 and y == 0): - current_step = steps_matrix[x][y] - to_return.append(current_step) - if current_step.step_type == StepType.DELETION: - x = x - 1 - elif current_step.step_type == StepType.INSERTION: - y = y - 1 - else: # creation and substitution - y = y - 1 - x = x - 1 - return to_return[::-1] - - @staticmethod - def _get_levenshtein_processing_step_cross( - prev_cross_distance: float, - step_words: StepWords, - current_distance: float - ) -> WerProcessingStep: - return WerProcessingStep.levenshtein_correct( - prev_cross_distance, step_words, 0) \ - if current_distance == 0 \ - else WerProcessingStep.levenshtein_substitution( - prev_cross_distance, step_words, current_distance) - - def get_levenshtein_embedding_based( - self, - reference: List[Word], - hypothesis: List[Word], - distance_matrix: np.ndarray - ) -> Tuple[np.ndarray, List[List[WerProcessingStep]]]: - - reference_len = len(reference) - hypothesis_len = len(hypothesis) - distance_arr, steps_arr = self._get_initialized_levenshtein_matrix( - reference, hypothesis) - - for ref_index in range(reference_len): - for hyp_index in range(hypothesis_len): - step_words = StepWords(reference[ref_index], - hypothesis[hyp_index]) - current_distance = distance_matrix[ref_index][hyp_index] - prev_cross_distance = distance_arr[ref_index][hyp_index] - - cross_go_step = self._get_levenshtein_processing_step_cross( - prev_cross_distance, step_words, current_distance) - insertion_step = WerProcessingStep.levenshtein_insertion( - distance_arr[ref_index + 1][hyp_index], step_words) - deletion_step = WerProcessingStep.levenshtein_deletion( - distance_arr[ref_index][hyp_index + 1], step_words) - - best_step = min([cross_go_step, insertion_step, deletion_step], - key=lambda it: it.total_distance()) - - distance_arr[ref_index + 1][hyp_index + 1] = \ - best_step.total_distance() - steps_arr[ref_index + 1][hyp_index + 1] = best_step - - return distance_arr, steps_arr @staticmethod - def _get_initialized_levenshtein_matrix( - reference: List[Word], - hypothesis: List[Word] - ) -> Tuple[np.ndarray, List[List[Optional[WerProcessingStep]]]]: - - # TODO: consider about remove distance_arr replaced by steps_arr - reference_len = len(reference) - hypothesis_len = len(hypothesis) - distance_arr = np.zeros((reference_len + 1) * (hypothesis_len + 1)) \ - .reshape((reference_len + 1, hypothesis_len + 1)) - steps_arr = [ - [None for _ in range(hypothesis_len + 1)] - for _ in range(reference_len + 1) - ] - - # levenshtein initial - for ref_index in range(reference_len + 1): - distance_arr[ref_index][0] = ref_index - step_words = StepWords( - reference[ref_index - 1] if ref_index > 0 else None, - None - ) - steps_arr[ref_index][0] = WerProcessingStep.levenshtein_deletion( - ref_index - 1, step_words) - for hyp_index in range(hypothesis_len + 1): - distance_arr[0][hyp_index] = hyp_index - step_words = StepWords( - None, - hypothesis[hyp_index - 1] if hyp_index > 0 else None + def _convert_processing_steps_to_result( + input_steps: List[AlignmentStep], + span: Span + ) -> List[AlignmentStep]: + indexes_per_steps = AlignmentUtil.get_reference_indexes_per_steps( + input_steps) + return [ + AlignmentStep( + input_steps[step_index].step_type, + input_steps[step_index].step_words, + input_steps[step_index].step_cost * span.is_index_belong( + indexes_per_steps[step_index]) ) - steps_arr[0][hyp_index] = WerProcessingStep.levenshtein_insertion( - hyp_index - 1, step_words) - - return distance_arr, steps_arr - - def _get_reference_indexes_per_steps( - self, - steps: List[WerProcessingStep] - ) -> List[int]: - counter = 0 - indexes = [] - for step in steps: - indexes.append(counter) - if step.step_type.contain_reference_word(): - counter = counter + 1 - return indexes + for step_index in range(len(input_steps)) + ] + @staticmethod def _calculate_wer( - self, - steps: List[WerStep], + steps: List[AlignmentStep], ) -> float: - reference_len = sum([ - 1 if step.step_type.contain_reference_word() else 0 - for step in steps - ]) + reference_len = AlignmentUtil.get_reference_length(steps) return sum([step.step_cost for step in steps]) / reference_len - def _calculate_steps_path( - self, - reference: List[Word], - hypothesis: List[Word] - ) -> List[WerProcessingStep]: - distance_between_words = self.get_distance_matrix_between_words( - reference, hypothesis) - _, steps_matrix = self.get_levenshtein_embedding_based( - reference, hypothesis, distance_between_words) - return self.extract_steps_path(steps_matrix) - def calculate_wer( self, - reference: List[Word], - hypothesis: List[Word] - ) -> Tuple[float, List[WerStep]]: - steps_path = self._calculate_steps_path(reference, hypothesis) - steps = self.convert_processing_steps_to_result(steps_path) - return self._calculate_wer(steps), steps + steps: List[AlignmentStep] + ) -> float: + return self._calculate_wer(steps) def calculate_wer_for_spans( self, - reference: List[Word], - hypothesis: List[Word], + steps: List[AlignmentStep], spans: List[Span] ) -> List[float]: - steps_path = self._calculate_steps_path(reference, hypothesis) - reference_len = len(reference) return [ - self._calculate_wer(self.convert_processing_steps_to_result( - processing_steps=steps_path, - reference_weights=span.get_reference_weights_table( - reference_len) + self._calculate_wer(self._convert_processing_steps_to_result( + input_steps=steps, + span=span )) for span in spans ] - - def calculate_wer_weighted( - self, - reference: List[Word], - hypothesis: List[Word], - weights: List[float] - ) -> Tuple[float, List[WerStep]]: - steps_path = self._calculate_steps_path(reference, hypothesis) - steps = self.convert_processing_steps_to_result(steps_path, weights) - return self._calculate_wer(steps), steps diff --git a/sziszapangma/core/wer/wer_embedding_calculator.py b/sziszapangma/core/wer/wer_embedding_calculator.py deleted file mode 100644 index 5f371b7f519dc6f17b618c0a19e252cefd900089..0000000000000000000000000000000000000000 --- a/sziszapangma/core/wer/wer_embedding_calculator.py +++ /dev/null @@ -1,63 +0,0 @@ -from typing import List, Optional - -from sziszapangma.core.transformer.embedding_transformer import \ - EmbeddingTransformer -from sziszapangma.core.wer.distance_matrix_calculator import \ - BinaryDistanceCalculator, DistanceCalculator, CosineDistanceCalculator -from sziszapangma.core.wer.step_words import StepWords -from sziszapangma.core.wer.wer_calculator import WerCalculator -from sziszapangma.core.wer.wer_processing_step import WerProcessingStep -from sziszapangma.core.wer.wer_step import WerStep - - -class WerEmbeddingCalculator(WerCalculator): - _distance_calculator: DistanceCalculator - - def __init__(self, embedding_transformer: EmbeddingTransformer): - super().__init__(BinaryDistanceCalculator()) - self._embedding_transformer = embedding_transformer - self._distance_calculator = CosineDistanceCalculator( - embedding_transformer) - - def _calculate_distance_for_word_step( - self, - step_words: StepWords - ) -> float: - return self._distance_calculator.calculate_distance_for_words( - step_words.reference_word, - step_words.hypothesis_word - ) - - def _calculate_result_cost_for_step( - self, - processing_step: WerProcessingStep - ) -> float: - step_words = processing_step.step_words - return self._calculate_distance_for_word_step(step_words) \ - if processing_step.step_type.is_cross_step() \ - else processing_step.step_cost - - def convert_processing_steps_to_result( - self, - processing_steps: List[WerProcessingStep], - reference_weights: Optional[List[float]] = None - ) -> List[WerStep]: - if reference_weights is None: - return [ - WerStep(step.step_type, step.step_words, - self._calculate_result_cost_for_step(step)) - for step in processing_steps - ] - else: - indexes_per_steps = self._get_reference_indexes_per_steps( - processing_steps) - return [ - WerStep( - processing_steps[step_index].step_type, - processing_steps[step_index].step_words, - reference_weights[indexes_per_steps[step_index]] * - self._calculate_result_cost_for_step( - processing_steps[step_index]) - ) - for step_index in range(len(processing_steps)) - ] diff --git a/sziszapangma/core/wer/wer_step.py b/sziszapangma/core/wer/wer_step.py deleted file mode 100644 index 5e1efa02ba9320b641c21c9a32712a9a23db94a3..0000000000000000000000000000000000000000 --- a/sziszapangma/core/wer/wer_step.py +++ /dev/null @@ -1,11 +0,0 @@ -from dataclasses import dataclass - -from sziszapangma.core.wer.step_type import StepType -from sziszapangma.core.wer.step_words import StepWords - - -@dataclass(frozen=True) -class WerStep: - step_type: StepType - step_words: StepWords - step_cost: float diff --git a/sziszapangma/core/wer/wer_util.py b/sziszapangma/core/wer/wer_util.py deleted file mode 100644 index a6ef66618105ed4162aad40988a15537f81e91e2..0000000000000000000000000000000000000000 --- a/sziszapangma/core/wer/wer_util.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import List, Optional - -import numpy as np -import pandas as pd - -from sziszapangma.core.wer.wer_step import WerStep - - -class WerUtil: - - @staticmethod - def _optional_str_to_str(value: Optional[str]) -> str: - return value if value is not None else '' - - @staticmethod - def _wer_step_to_pandas_row_lit(step: WerStep) -> List[any]: - return [ - step.step_type.get_short_name(), - WerUtil._optional_str_to_str(step.step_words.reference_word), - WerUtil._optional_str_to_str(step.step_words.hypothesis_word), - round(step.step_cost, 3) - ] - - @staticmethod - def steps_to_dataframe(steps: List[WerStep]) -> pd.DataFrame: - arr = np.array([ - WerUtil._wer_step_to_pandas_row_lit(step) - for step in steps - ]) - return pd.DataFrame( - arr, - columns=['step_type', 'reference', 'hypothesis', 'cost'] - ) diff --git a/sziszapangma/integration/task/classic_wer_metric_task.py b/sziszapangma/integration/task/classic_wer_metric_task.py index dfd8d2696e21a317e9272789e7a5a65e2cc8fb4e..1f4ba703fe56f77d14b11763e36549db75fd57cf 100644 --- a/sziszapangma/integration/task/classic_wer_metric_task.py +++ b/sziszapangma/integration/task/classic_wer_metric_task.py @@ -1,6 +1,8 @@ from typing import List, Dict -from sziszapangma.core.wer.classic_wer_calculator import ClassicWerCalculator +from sziszapangma.core.alignment.alignment_classic_calculator import \ + AlignmentClassicCalculator +from sziszapangma.core.wer.wer_calculator import WerCalculator from sziszapangma.integration.repository.experiment_repository import \ ExperimentRepository from sziszapangma.integration.task.processing_task import ProcessingTask @@ -12,7 +14,8 @@ _WORD = 'word' class ClassicWerMetricTask(ProcessingTask): _metrics_property_name: str _gold_transcript_property_name: str - _classic_wer_calculator: ClassicWerCalculator + _alignment_classic_calculator: AlignmentClassicCalculator + _wer_calculator: WerCalculator def __init__( self, @@ -26,7 +29,8 @@ class ClassicWerMetricTask(ProcessingTask): self._gold_transcript_property_name = gold_transcript_property_name self._asr_property_name = asr_property_name self._metrics_property_name = metrics_property_name - self._classic_wer_calculator = ClassicWerCalculator() + self._alignment_classic_calculator = AlignmentClassicCalculator() + self._wer_calculator = WerCalculator() def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool: @@ -55,10 +59,12 @@ class ClassicWerMetricTask(ProcessingTask): gold_transcript: List[Dict[str, any]], asr_result: List[str] ) -> float: - return self._classic_wer_calculator.calculate_wer( - reference=[it[_WORD] for it in gold_transcript], - hypothesis=[it for it in asr_result], - )[0] + return self._wer_calculator.calculate_wer( + self._alignment_classic_calculator.calculate_alignment( + reference=[it[_WORD] for it in gold_transcript], + hypothesis=[it for it in asr_result], + ) + ) def calculate_metrics( self, diff --git a/sziszapangma/integration/task/embedding_wer_metrics_task.py b/sziszapangma/integration/task/embedding_wer_metrics_task.py index 67f8b943c692b1fd0723f6067e63d79dcadd5fe4..3145fbfdf177e1db528b47c3916cb0b2e6b66c09 100644 --- a/sziszapangma/integration/task/embedding_wer_metrics_task.py +++ b/sziszapangma/integration/task/embedding_wer_metrics_task.py @@ -1,12 +1,14 @@ from typing import List, Dict +from sziszapangma.core.alignment.alignment_embedding_calculator import \ + AlignmentEmbeddingCalculator +from sziszapangma.core.alignment.alignment_soft_calculator import \ + AlignmentSoftCalculator from sziszapangma.core.transformer.cached_embedding_transformer import \ CachedEmbeddingTransformer from sziszapangma.core.transformer.embedding_transformer import \ EmbeddingTransformer -from sziszapangma.core.wer.wer_embedding_calculator import \ - WerEmbeddingCalculator -from sziszapangma.core.wer.wer_soft_calculator import WerSoftCalculator +from sziszapangma.core.wer.wer_calculator import WerCalculator from sziszapangma.integration.repository.experiment_repository import \ ExperimentRepository from sziszapangma.integration.task.processing_task import ProcessingTask @@ -20,8 +22,9 @@ class EmbeddingWerMetricsTask(ProcessingTask): _metrics_property_name: str _gold_transcript_property_name: str _embedding_transformer: CachedEmbeddingTransformer - _wer_embedding_calculator: WerEmbeddingCalculator - _wer_soft_calculator: WerSoftCalculator + _alignment_embedding_calculator: AlignmentEmbeddingCalculator + _alignment_soft_calculator: AlignmentSoftCalculator + _wer_calculator: WerCalculator def __init__( self, @@ -38,10 +41,11 @@ class EmbeddingWerMetricsTask(ProcessingTask): self._metrics_property_name = metrics_property_name self._embedding_transformer = \ CachedEmbeddingTransformer(embedding_transformer) - self._wer_embedding_calculator = \ - WerEmbeddingCalculator(self._embedding_transformer) - self._wer_soft_calculator = \ - WerSoftCalculator(self._embedding_transformer) + self._alignment_embedding_calculator = \ + AlignmentEmbeddingCalculator(self._embedding_transformer) + self._alignment_soft_calculator = \ + AlignmentSoftCalculator(self._embedding_transformer) + self._wer_calculator = WerCalculator() def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool: @@ -74,8 +78,8 @@ class EmbeddingWerMetricsTask(ProcessingTask): """Calculate all metrics for data sample.""" metrics = dict() reference = [it[_WORD] for it in gold_transcript] - metrics[_SOFT_WER] = self._wer_soft_calculator.calculate_wer( - reference, asr_result)[0] - metrics[_EMBEDDING_WER] = self._wer_embedding_calculator.calculate_wer( - reference, asr_result)[0] + metrics[_SOFT_WER] = self._alignment_soft_calculator\ + .calculate_alignment(reference, asr_result)[0] + metrics[_EMBEDDING_WER] = self._alignment_embedding_calculator\ + .calculate_wer(reference, asr_result)[0] return metrics diff --git a/tests/test_classic_wer.py b/tests/test_classic_wer.py index c78bc41f86fc37081a45d6b3be5f49b89c5910df..ff67ebd11117c13284f38efefe6a059bd5cd1c90 100644 --- a/tests/test_classic_wer.py +++ b/tests/test_classic_wer.py @@ -2,10 +2,12 @@ from typing import List, Tuple import pytest -from sziszapangma.core.wer.classic_wer_calculator import ClassicWerCalculator -from sziszapangma.core.wer.step_type import StepType -from sziszapangma.core.wer.step_words import StepWords -from sziszapangma.core.wer.word import Word +from sziszapangma.core.alignment.alignment_classic_calculator import \ + AlignmentClassicCalculator +from sziszapangma.core.alignment.step_type import StepType +from sziszapangma.core.alignment.step_words import StepWords +from sziszapangma.core.alignment.word import Word +from sziszapangma.core.wer.wer_calculator import WerCalculator def string_list_to_words(strings: List[str]) -> List[Word]: @@ -21,14 +23,17 @@ def get_sample_data() -> Tuple[List[Word], List[Word]]: def test_classic_calculate_wer_value(): """Sample test for core calculate.""" reference, hypothesis = get_sample_data() - wer_result = ClassicWerCalculator().calculate_wer(reference, hypothesis) - assert pytest.approx(wer_result[0]) == 0.8333333 + alignment = AlignmentClassicCalculator()\ + .calculate_alignment(reference, hypothesis) + wer_result = WerCalculator().calculate_wer(alignment) + assert pytest.approx(wer_result) == 0.8333333 def test_classic_calculate_wer_steps(): """Sample test for core calculate.""" reference, hypothesis = get_sample_data() - wer_result = ClassicWerCalculator().calculate_wer(reference, hypothesis) + alignment = AlignmentClassicCalculator().calculate_alignment( + reference, hypothesis) reference_words = [ StepWords(reference[0], hypothesis[0]), @@ -44,7 +49,7 @@ def test_classic_calculate_wer_steps(): StepType.CORRECT, StepType.INSERTION, StepType.INSERTION, StepType.SUBSTITUTION, StepType.SUBSTITUTION] - assert len(wer_result[1]) == 8 - assert [it.step_type for it in wer_result[1]] == step_types - assert [it.step_cost for it in wer_result[1]] == [0, 1, 0, 0, 1, 1, 1, 1] - assert [it.step_words for it in wer_result[1]] == reference_words + assert len(alignment) == 8 + assert [it.step_type for it in alignment] == step_types + assert [it.step_cost for it in alignment] == [0, 1, 0, 0, 1, 1, 1, 1] + assert [it.step_words for it in alignment] == reference_words diff --git a/tests/test_embedding_wer.py b/tests/test_embedding_wer.py index 8507e77d22b2c5e96b275e0f467754849d5815be..4f7cd55c9d0fdb1f9abff24bb80c0c040ec92061 100644 --- a/tests/test_embedding_wer.py +++ b/tests/test_embedding_wer.py @@ -2,9 +2,12 @@ from typing import List, Tuple import pytest -from sziszapangma.core.wer.wer_embedding_calculator import \ - WerEmbeddingCalculator -from sziszapangma.core.wer.word import Word +from sziszapangma.core.alignment.alignment_calculator import \ + AlignmentCalculator +from sziszapangma.core.alignment.alignment_embedding_calculator import \ + AlignmentEmbeddingCalculator +from sziszapangma.core.alignment.word import Word +from sziszapangma.core.wer.wer_calculator import WerCalculator from tests.file_stored_embedding_transformer import \ FileStoredEmbeddingTransformer @@ -19,14 +22,15 @@ def get_sample_data() -> Tuple[List[Word], List[Word]]: return string_list_to_words(reference), string_list_to_words(hypothesis) -def get_calculator() -> WerEmbeddingCalculator: - return WerEmbeddingCalculator( +def get_alignment_calculator() -> AlignmentCalculator: + return AlignmentEmbeddingCalculator( FileStoredEmbeddingTransformer('tests/embeddings_pl.json')) def test_classic_calculate_wer_value(): """Sample test for core calculate.""" reference, hypothesis = get_sample_data() - wer_result = get_calculator().calculate_wer(reference, hypothesis) - print(wer_result[0]) - assert pytest.approx(wer_result[0]) == 0.55879563 + alignment = get_alignment_calculator().calculate_alignment(reference, + hypothesis) + wer_result = WerCalculator().calculate_wer(alignment) + assert pytest.approx(wer_result) == 0.55879563 diff --git a/tests/test_soft_wer.py b/tests/test_soft_wer.py index e47240bad34bb935760ca513cc7570ed56e319d2..85a34338b831fff39525f557c69886911cb4e100 100644 --- a/tests/test_soft_wer.py +++ b/tests/test_soft_wer.py @@ -2,8 +2,12 @@ from typing import List, Tuple import pytest -from sziszapangma.core.wer.wer_soft_calculator import WerSoftCalculator -from sziszapangma.core.wer.word import Word +from sziszapangma.core.alignment.alignment_calculator import \ + AlignmentCalculator +from sziszapangma.core.alignment.alignment_soft_calculator import \ + AlignmentSoftCalculator +from sziszapangma.core.alignment.word import Word +from sziszapangma.core.wer.wer_calculator import WerCalculator from tests.file_stored_embedding_transformer import \ FileStoredEmbeddingTransformer @@ -18,14 +22,16 @@ def get_sample_data() -> Tuple[List[Word], List[Word]]: return string_list_to_words(reference), string_list_to_words(hypothesis) -def get_calculator() -> WerSoftCalculator: - return WerSoftCalculator( +def get_alignment_calculator() -> AlignmentCalculator: + return AlignmentSoftCalculator( FileStoredEmbeddingTransformer('tests/embeddings_pl.json')) def test_classic_calculate_wer_value(): """Sample test for core calculate.""" reference, hypothesis = get_sample_data() - wer_result = get_calculator().calculate_wer(reference, hypothesis) - print(wer_result[0]) - assert pytest.approx(wer_result[0]) == 0.50186761 + alignment = get_alignment_calculator().calculate_alignment( + reference, hypothesis) + wer_result = WerCalculator().calculate_wer(alignment) + print(wer_result) + assert pytest.approx(wer_result) == 0.50186761 diff --git a/tox.ini b/tox.ini index 58bfbf90e0a2440116375af2dfed2db93a63baa1..7062dd226ee8550c29f47c9d907e6e32b5c2bac7 100644 --- a/tox.ini +++ b/tox.ini @@ -17,8 +17,6 @@ deps = ; requirements.txt with the pinned versions and uncomment the following line: ; -r{toxinidir}/requirements.txt commands = - ls -la - pip list pytest ; pytest --basetemp={envtmpdir}