diff --git a/requirements_dev.txt b/requirements_dev.txt index 4211aa637ce1f8e33135540beed543f026c9886a..91f97d1b5aba1cc29f2f9e93252ef4eb81f0c028 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,7 +1,5 @@ -pip==21.1.2 -bump2version==1.0.1 wheel==0.36.2 -watchdog==2.1.2 +watchdog==2.1.3 flake8==3.9.2 tox==3.23.1 coverage==5.5 diff --git a/setup.cfg b/setup.cfg index a65cf7a3f90c161536575830569515ffb564a0e6..2642f6ad6c22dacc518a2e26b7c4078cf623ff74 100644 --- a/setup.cfg +++ b/setup.cfg @@ -21,6 +21,6 @@ exclude = docs # Define setup.py command aliases here test = pytest -[tool:pytest] -collect_ignore = ['setup.py'] +;[tool:pytest] +;collect_ignore = ['setup.py'] diff --git a/sziszapangma/__pycache__/__init__.cpython-39.pyc b/sziszapangma/__pycache__/__init__.cpython-39.pyc index 0da856c208c3cdb6494296026133e07e14235d3c..125189747dbbc47416dd2dc5f0eaaeb1d97f848b 100644 Binary files a/sziszapangma/__pycache__/__init__.cpython-39.pyc and b/sziszapangma/__pycache__/__init__.cpython-39.pyc differ diff --git a/sziszapangma/core/alignment/__init__.py b/sziszapangma/core/alignment/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sziszapangma/core/alignment/alignment_calculator.py b/sziszapangma/core/alignment/alignment_calculator.py new file mode 100644 index 0000000000000000000000000000000000000000..f69ec9537fac17d7a25d8768ea6bd6eec08c07af --- /dev/null +++ b/sziszapangma/core/alignment/alignment_calculator.py @@ -0,0 +1,179 @@ +from abc import ABC +from typing import List, Tuple, Optional + +import numpy as np + +from sziszapangma.core.alignment.step_type import StepType +from sziszapangma.core.alignment.alignment_step import AlignmentStep +from sziszapangma.core.alignment.distance_matrix_calculator import \ + DistanceCalculator +from sziszapangma.core.alignment.step_words import StepWords +from sziszapangma.core.alignment.alignment_processing_step import \ + AlignmentProcessingStep +from sziszapangma.core.alignment.word import Word + + +class AlignmentCalculator(ABC): + _distance_matrix_calculator: DistanceCalculator + + def __init__(self, distance_matrix_calculator: DistanceCalculator): + self._distance_matrix_calculator = distance_matrix_calculator + + def convert_processing_steps_to_result( + self, + processing_steps: List[AlignmentProcessingStep], + ) -> List[AlignmentStep]: + return [ + AlignmentStep(step.step_type, step.step_words, step.step_cost) + for step in processing_steps + ] + + def _get_reference_indexes_per_steps( + self, + steps: List[AlignmentProcessingStep] + ) -> List[int]: + counter = 0 + indexes = [] + for step in steps: + indexes.append(counter) + if step.step_type.contain_reference_word(): + counter = counter + 1 + return indexes + + def get_distance_matrix_between_words( + self, + reference: List[Word], + hypothesis: List[Word] + ) -> np.ndarray: + return self._distance_matrix_calculator.calculate_distance_matrix( + reference, hypothesis) + + @staticmethod + def _get_initialized_levenshtein_matrix( + reference: List[Word], + hypothesis: List[Word] + ) -> Tuple[np.ndarray, List[List[Optional[AlignmentProcessingStep]]]]: + + # TODO: consider about remove distance_arr replaced by steps_arr + reference_len = len(reference) + hypothesis_len = len(hypothesis) + distance_arr = np.zeros((reference_len + 1) * (hypothesis_len + 1)) \ + .reshape((reference_len + 1, hypothesis_len + 1)) + steps_arr = [ + [None for _ in range(hypothesis_len + 1)] + for _ in range(reference_len + 1) + ] + + # levenshtein initial + for ref_index in range(reference_len + 1): + distance_arr[ref_index][0] = ref_index + step_words = StepWords( + reference[ref_index - 1] if ref_index > 0 else None, + None + ) + steps_arr[ref_index][0] = AlignmentProcessingStep\ + .levenshtein_deletion(ref_index - 1, step_words) + for hyp_index in range(hypothesis_len + 1): + distance_arr[0][hyp_index] = hyp_index + step_words = StepWords( + None, + hypothesis[hyp_index - 1] if hyp_index > 0 else None + ) + steps_arr[0][hyp_index] = AlignmentProcessingStep\ + .levenshtein_insertion(hyp_index - 1, step_words) + + return distance_arr, steps_arr + + @staticmethod + def _get_levenshtein_processing_step_cross( + prev_cross_distance: float, + step_words: StepWords, + current_distance: float + ) -> AlignmentProcessingStep: + return AlignmentProcessingStep.levenshtein_correct( + prev_cross_distance, step_words, 0) \ + if current_distance == 0 \ + else AlignmentProcessingStep.levenshtein_substitution( + prev_cross_distance, step_words, current_distance) + + def get_levenshtein_embedding_based( + self, + reference: List[Word], + hypothesis: List[Word], + distance_matrix: np.ndarray + ) -> Tuple[np.ndarray, List[List[AlignmentProcessingStep]]]: + + reference_len = len(reference) + hypothesis_len = len(hypothesis) + distance_arr, steps_arr = self._get_initialized_levenshtein_matrix( + reference, hypothesis) + + for ref_index in range(reference_len): + for hyp_index in range(hypothesis_len): + step_words = StepWords(reference[ref_index], + hypothesis[hyp_index]) + current_distance = distance_matrix[ref_index][hyp_index] + prev_cross_distance = distance_arr[ref_index][hyp_index] + + cross_go_step = self._get_levenshtein_processing_step_cross( + prev_cross_distance, step_words, current_distance) + insertion_step = AlignmentProcessingStep.levenshtein_insertion( + distance_arr[ref_index + 1][hyp_index], step_words) + deletion_step = AlignmentProcessingStep.levenshtein_deletion( + distance_arr[ref_index][hyp_index + 1], step_words) + + best_step = min([cross_go_step, insertion_step, deletion_step], + key=lambda it: it.total_distance()) + + distance_arr[ref_index + 1][hyp_index + 1] = \ + best_step.total_distance() + steps_arr[ref_index + 1][hyp_index + 1] = best_step + + return distance_arr, steps_arr + + def extract_steps_path( + self, + steps_matrix: List[List[AlignmentProcessingStep]] + ) -> List[AlignmentProcessingStep]: + x = len(steps_matrix) - 1 + y = len(steps_matrix[0]) - 1 + to_return = [] + while not (x == 0 and y == 0): + current_step = steps_matrix[x][y] + to_return.append(current_step) + if current_step.step_type == StepType.DELETION: + x = x - 1 + elif current_step.step_type == StepType.INSERTION: + y = y - 1 + else: # creation and substitution + y = y - 1 + x = x - 1 + return to_return[::-1] + + def _calculate_steps_path( + self, + reference: List[Word], + hypothesis: List[Word] + ) -> List[AlignmentProcessingStep]: + distance_between_words = self.get_distance_matrix_between_words( + reference, hypothesis) + _, steps_matrix = self.get_levenshtein_embedding_based( + reference, hypothesis, distance_between_words) + return self.extract_steps_path(steps_matrix) + + def calculate_alignment( + self, + reference: List[Word], + hypothesis: List[Word] + ) -> List[AlignmentStep]: + steps_path = self._calculate_steps_path(reference, hypothesis) + return self.convert_processing_steps_to_result(steps_path) + + def calculate_alignment_weighted( + self, + reference: List[Word], + hypothesis: List[Word], + weights: List[float] + ) -> List[AlignmentStep]: + steps_path = self._calculate_steps_path(reference, hypothesis) + return self.convert_processing_steps_to_result(steps_path) diff --git a/sziszapangma/core/alignment/alignment_classic_calculator.py b/sziszapangma/core/alignment/alignment_classic_calculator.py new file mode 100644 index 0000000000000000000000000000000000000000..fbf60eb310caa9d7cd2370ba253f4168da7c62f7 --- /dev/null +++ b/sziszapangma/core/alignment/alignment_classic_calculator.py @@ -0,0 +1,10 @@ +from sziszapangma.core.alignment.alignment_calculator import \ + AlignmentCalculator +from sziszapangma.core.alignment.distance_matrix_calculator import \ + BinaryDistanceCalculator + + +class AlignmentClassicCalculator(AlignmentCalculator): + + def __init__(self): + super().__init__(BinaryDistanceCalculator()) diff --git a/sziszapangma/core/alignment/alignment_embedding_calculator.py b/sziszapangma/core/alignment/alignment_embedding_calculator.py new file mode 100644 index 0000000000000000000000000000000000000000..a20802dc2d2ad132edb57854b7c1e1ec71ef621d --- /dev/null +++ b/sziszapangma/core/alignment/alignment_embedding_calculator.py @@ -0,0 +1,50 @@ +from typing import List + +from sziszapangma.core.alignment.alignment_calculator import \ + AlignmentCalculator +from sziszapangma.core.alignment.alignment_processing_step import \ + AlignmentProcessingStep +from sziszapangma.core.alignment.alignment_step import AlignmentStep +from sziszapangma.core.alignment.distance_matrix_calculator import \ + BinaryDistanceCalculator, DistanceCalculator, CosineDistanceCalculator +from sziszapangma.core.alignment.step_words import StepWords +from sziszapangma.core.transformer.embedding_transformer import \ + EmbeddingTransformer + + +class AlignmentEmbeddingCalculator(AlignmentCalculator): + _distance_calculator: DistanceCalculator + + def __init__(self, embedding_transformer: EmbeddingTransformer): + super().__init__(BinaryDistanceCalculator()) + self._embedding_transformer = embedding_transformer + self._distance_calculator = CosineDistanceCalculator( + embedding_transformer) + + def _calculate_distance_for_word_step( + self, + step_words: StepWords + ) -> float: + return self._distance_calculator.calculate_distance_for_words( + step_words.reference_word, + step_words.hypothesis_word + ) + + def _calculate_result_cost_for_step( + self, + processing_step: AlignmentProcessingStep + ) -> float: + step_words = processing_step.step_words + return self._calculate_distance_for_word_step(step_words) \ + if processing_step.step_type.is_cross_step() \ + else processing_step.step_cost + + def convert_processing_steps_to_result( + self, + processing_steps: List[AlignmentProcessingStep] + ) -> List[AlignmentStep]: + return [ + AlignmentStep(step.step_type, step.step_words, + self._calculate_result_cost_for_step(step)) + for step in processing_steps + ] diff --git a/sziszapangma/core/wer/wer_processing_step.py b/sziszapangma/core/alignment/alignment_processing_step.py similarity index 58% rename from sziszapangma/core/wer/wer_processing_step.py rename to sziszapangma/core/alignment/alignment_processing_step.py index 15d92dddefca7deb436b7d24bd734e8e72acf94e..e4ab96dc1578348db186006797eb1801842a1588 100644 --- a/sziszapangma/core/wer/wer_processing_step.py +++ b/sziszapangma/core/alignment/alignment_processing_step.py @@ -1,11 +1,11 @@ from dataclasses import dataclass -from sziszapangma.core.wer.step_type import StepType -from sziszapangma.core.wer.step_words import StepWords +from sziszapangma.core.alignment.step_type import StepType +from sziszapangma.core.alignment.step_words import StepWords @dataclass(frozen=True) -class WerProcessingStep: +class AlignmentProcessingStep: step_type: StepType step_words: StepWords previous_distance: float @@ -15,27 +15,27 @@ class WerProcessingStep: def levenshtein_insertion(cls, previous_distance: float, step_words: StepWords, step_cost: float = 1): words = StepWords(None, step_words.hypothesis_word) - return WerProcessingStep(StepType.INSERTION, words, - previous_distance, step_cost) + return AlignmentProcessingStep(StepType.INSERTION, words, + previous_distance, step_cost) @classmethod def levenshtein_deletion(cls, previous_distance: float, step_words: StepWords, step_cost: float = 1): words = StepWords(step_words.reference_word, None) - return WerProcessingStep(StepType.DELETION, words, - previous_distance, step_cost) + return AlignmentProcessingStep(StepType.DELETION, words, + previous_distance, step_cost) @classmethod def levenshtein_substitution(cls, previous_distance: float, step_words: StepWords, step_cost: float): - return WerProcessingStep(StepType.SUBSTITUTION, step_words, - previous_distance, step_cost) + return AlignmentProcessingStep(StepType.SUBSTITUTION, step_words, + previous_distance, step_cost) @classmethod def levenshtein_correct(cls, previous_distance: float, step_words: StepWords, step_cost: float): - return WerProcessingStep(StepType.CORRECT, step_words, - previous_distance, step_cost) + return AlignmentProcessingStep(StepType.CORRECT, step_words, + previous_distance, step_cost) def total_distance(self) -> float: return self.step_cost + self.previous_distance diff --git a/sziszapangma/core/wer/wer_soft_calculator.py b/sziszapangma/core/alignment/alignment_soft_calculator.py similarity index 56% rename from sziszapangma/core/wer/wer_soft_calculator.py rename to sziszapangma/core/alignment/alignment_soft_calculator.py index e17728264cb922a6357005de289ed2e3cebf7cd9..c7de34cecef6693d6058260a1a2fd3a8997acb7b 100644 --- a/sziszapangma/core/wer/wer_soft_calculator.py +++ b/sziszapangma/core/alignment/alignment_soft_calculator.py @@ -1,11 +1,12 @@ -from sziszapangma.core.wer.distance_matrix_calculator import \ +from sziszapangma.core.alignment.alignment_calculator import \ + AlignmentCalculator +from sziszapangma.core.alignment.distance_matrix_calculator import \ CosineDistanceCalculator -from sziszapangma.core.wer.wer_calculator import WerCalculator from sziszapangma.core.transformer.embedding_transformer import \ EmbeddingTransformer -class WerSoftCalculator(WerCalculator): +class AlignmentSoftCalculator(AlignmentCalculator): def __init__(self, embedding_transformer: EmbeddingTransformer): super().__init__(CosineDistanceCalculator(embedding_transformer)) diff --git a/sziszapangma/core/alignment/alignment_step.py b/sziszapangma/core/alignment/alignment_step.py new file mode 100644 index 0000000000000000000000000000000000000000..cefd0d105f499b0e1d1ba68d29d8fedc16f2e17b --- /dev/null +++ b/sziszapangma/core/alignment/alignment_step.py @@ -0,0 +1,18 @@ +from dataclasses import dataclass + +from sziszapangma.core.alignment.step_type import StepType +from sziszapangma.core.alignment.step_words import StepWords + + +@dataclass(frozen=True) +class AlignmentStep: + step_type: StepType + step_words: StepWords + step_cost: float + + def with_weight_multiplication(self, weight: float): + return AlignmentStep( + step_type=self.step_type, + step_words=self.step_words, + step_cost=self.step_cost * weight + ) diff --git a/sziszapangma/core/alignment/alignment_util.py b/sziszapangma/core/alignment/alignment_util.py new file mode 100644 index 0000000000000000000000000000000000000000..c1887317e3925e2143fd85c32b0aa82199a410f9 --- /dev/null +++ b/sziszapangma/core/alignment/alignment_util.py @@ -0,0 +1,71 @@ +from typing import List, Optional + +import numpy as np +import pandas as pd + +from sziszapangma.core.alignment.alignment_step import AlignmentStep + + +class AlignmentUtil: + + @staticmethod + def _optional_str_to_str(value: Optional[str]) -> str: + return value if value is not None else '' + + @staticmethod + def _wer_step_to_pandas_row_lit(step: AlignmentStep) -> List[any]: + return [ + step.step_type.get_short_name(), + AlignmentUtil._optional_str_to_str(step.step_words.reference_word), + AlignmentUtil._optional_str_to_str( + step.step_words.hypothesis_word), + round(step.step_cost, 3) + ] + + @staticmethod + def steps_to_dataframe(steps: List[AlignmentStep]) -> pd.DataFrame: + arr = np.array([ + AlignmentUtil._wer_step_to_pandas_row_lit(step) + for step in steps + ]) + return pd.DataFrame( + arr, + columns=['step_type', 'reference', 'hypothesis', 'cost'] + ) + + @staticmethod + def get_reference_indexes_per_steps( + steps: List[AlignmentStep] + ) -> List[int]: + counter = 0 + indexes = [] + for step in steps: + indexes.append(counter) + if step.step_type.contain_reference_word(): + counter = counter + 1 + return indexes + + @staticmethod + def get_reference_length(steps: List[AlignmentStep]) -> int: + return sum([ + 1 if step.step_type.contain_reference_word() else 0 + for step in steps + ]) + + @staticmethod + def apply_weights_to_alignment( + steps: List[AlignmentStep], + weights: List[float] + ) -> List[AlignmentStep]: + if AlignmentUtil.get_reference_length(steps) != len(weights): + raise Exception( + f"Incorrect length of weights, current={len(weights)}, " + f"required={AlignmentUtil.get_reference_length(steps)}" + ) + reference_indexes_per_steps = \ + AlignmentUtil.get_reference_indexes_per_steps(steps) + return [ + steps[index].with_weight_multiplication( + weights[reference_indexes_per_steps[index]]) + for index in range(len(steps)) + ] diff --git a/sziszapangma/core/wer/distance_matrix_calculator.py b/sziszapangma/core/alignment/distance_matrix_calculator.py similarity index 68% rename from sziszapangma/core/wer/distance_matrix_calculator.py rename to sziszapangma/core/alignment/distance_matrix_calculator.py index 50e359af263175cde48c883b7157a02451e2384f..5f17ea7f72e45d6f1bd0cad4f4732e786f591d22 100644 --- a/sziszapangma/core/wer/distance_matrix_calculator.py +++ b/sziszapangma/core/alignment/distance_matrix_calculator.py @@ -5,30 +5,31 @@ import numpy as np from sziszapangma.core.transformer.embedding_transformer import \ EmbeddingTransformer +from sziszapangma.core.alignment.word import Word class DistanceCalculator(ABC): @abstractmethod def calculate_distance_matrix( self, - reference: List[str], - hypothesis: List[str] + reference: List[Word], + hypothesis: List[Word] ) -> np.array: pass @abstractmethod - def calculate_distance_for_words(self, word1: str, word2: str) -> float: + def calculate_distance_for_words(self, word1: Word, word2: Word) -> float: pass class BinaryDistanceCalculator(DistanceCalculator): - def calculate_distance_for_words(self, word1: str, word2: str) -> float: - return 0 if word1 == word2 else 1 + def calculate_distance_for_words(self, word1: Word, word2: Word) -> float: + return 0 if word1.value == word2.value else 1 def calculate_distance_matrix( self, - reference: List[str], - hypothesis: List[str] + reference: List[Word], + hypothesis: List[Word] ) -> np.array: return np.array([ [self.calculate_distance_for_words(reference_word, hypothesis_word) @@ -43,10 +44,10 @@ class CosineDistanceCalculator(DistanceCalculator): def __init__(self, embedding_transformer: EmbeddingTransformer): self._embedding_transformer = embedding_transformer - def calculate_distance_for_words(self, word1: str, word2: str) -> float: + def calculate_distance_for_words(self, word1: Word, word2: Word) -> float: return self.cosine_distance_between_words_embeddings( - self._embedding_transformer.get_embedding(word1), - self._embedding_transformer.get_embedding(word2) + self._embedding_transformer.get_embedding(word1.value), + self._embedding_transformer.get_embedding(word2.value) ) @staticmethod @@ -67,22 +68,22 @@ class CosineDistanceCalculator(DistanceCalculator): b_norm = np.linalg.norm(b, axis=1, keepdims=True) else: raise RuntimeError("array dimensions {} not right".format(a.ndim)) - similiarity = np.dot(a, b.T) / (a_norm * b_norm) - dist = 1. - similiarity + similarity = np.dot(a, b.T) / (a_norm * b_norm) + dist = 1. - similarity return dist def calculate_distance_matrix( self, - reference: List[str], - hypothesis: List[str] + reference: List[Word], + hypothesis: List[Word] ) -> np.array: embeddings_dict = self._embedding_transformer.get_embeddings( - list(set(reference + hypothesis)) + list(set(it.value for it in (reference + hypothesis))) ) return np.array([[ self.cosine_distance_between_words_embeddings( - embeddings_dict[reference_word], - embeddings_dict[hypothesis_word], + embeddings_dict[reference_word.value], + embeddings_dict[hypothesis_word.value], ) for hypothesis_word in hypothesis] for reference_word in reference diff --git a/sziszapangma/core/wer/step_type.py b/sziszapangma/core/alignment/step_type.py similarity index 100% rename from sziszapangma/core/wer/step_type.py rename to sziszapangma/core/alignment/step_type.py diff --git a/sziszapangma/core/alignment/step_words.py b/sziszapangma/core/alignment/step_words.py new file mode 100644 index 0000000000000000000000000000000000000000..067466f5318d20fd3d785ce9c1106403ac130574 --- /dev/null +++ b/sziszapangma/core/alignment/step_words.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass +from typing import Optional + +from sziszapangma.core.alignment.word import Word + + +@dataclass(frozen=True) +class StepWords: + reference_word: Optional[Word] + hypothesis_word: Optional[Word] diff --git a/sziszapangma/core/alignment/word.py b/sziszapangma/core/alignment/word.py new file mode 100644 index 0000000000000000000000000000000000000000..caf6e62b3fa7eb68a99360fa2dde03b968394d53 --- /dev/null +++ b/sziszapangma/core/alignment/word.py @@ -0,0 +1,12 @@ +import uuid +from dataclasses import dataclass + + +@dataclass(frozen=True) +class Word: + id: str + value: str + + @classmethod + def from_string(cls, string: str): + return cls(str(uuid.uuid4()), string) diff --git a/sziszapangma/core/wer/__pycache__/classic_wer_calculator.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/classic_wer_calculator.cpython-38.pyc deleted file mode 100644 index 5114407c01a0c978a0d91a8e6465be84cc749485..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/classic_wer_calculator.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/distance_matrix_calculator.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/distance_matrix_calculator.cpython-38.pyc deleted file mode 100644 index 2bf0c9c59168075c6523a4c441087072e2f1547f..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/distance_matrix_calculator.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/step_type.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/step_type.cpython-38.pyc deleted file mode 100644 index d3ee780df43be6c35dd46f10f72bad0a42c35bb8..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/step_type.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/step_words.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/step_words.cpython-38.pyc deleted file mode 100644 index 4142240350aabea762e0c740887510d3e13d56cf..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/step_words.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/wer_embedding_calculator.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/wer_embedding_calculator.cpython-38.pyc deleted file mode 100644 index 924ae83ebc3c912ad3acb439a5766bc626c0ac74..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/wer_embedding_calculator.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/wer_processing_step.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/wer_processing_step.cpython-38.pyc deleted file mode 100644 index e74e7f411d80ab89142f285eaaaf84f831cf9522..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/wer_processing_step.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/wer_soft_calculator.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/wer_soft_calculator.cpython-38.pyc deleted file mode 100644 index e66f737228a10bcea2f3c886a1ccb85c904c6d0c..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/wer_soft_calculator.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/wer_span_question.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/wer_span_question.cpython-38.pyc deleted file mode 100644 index 61e5fac5c3866d97cb474694ebc1fa60aba75bd9..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/wer_span_question.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/wer_step.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/wer_step.cpython-38.pyc deleted file mode 100644 index 15b375efa80ab3c0292468ee00bb3e178407600e..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/wer_step.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/__pycache__/wer_util.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/wer_util.cpython-38.pyc deleted file mode 100644 index 5caa38f0edc34f2f56e7ed8b79fd36da809fc616..0000000000000000000000000000000000000000 Binary files a/sziszapangma/core/wer/__pycache__/wer_util.cpython-38.pyc and /dev/null differ diff --git a/sziszapangma/core/wer/classic_wer_calculator.py b/sziszapangma/core/wer/classic_wer_calculator.py deleted file mode 100644 index db54d33764a3593ad53c0ddd40ea84ef3eaf8de9..0000000000000000000000000000000000000000 --- a/sziszapangma/core/wer/classic_wer_calculator.py +++ /dev/null @@ -1,9 +0,0 @@ -from sziszapangma.core.wer.distance_matrix_calculator import \ - BinaryDistanceCalculator -from sziszapangma.core.wer.wer_calculator import WerCalculator - - -class ClassicWerCalculator(WerCalculator): - - def __init__(self): - super().__init__(BinaryDistanceCalculator()) diff --git a/sziszapangma/core/wer/wer_span_question.py b/sziszapangma/core/wer/span.py similarity index 59% rename from sziszapangma/core/wer/wer_span_question.py rename to sziszapangma/core/wer/span.py index bfdf43488e7c8f3d4022de8fa892d3ad89b203f7..44cfe840d0f2c6f68743963e749ae00a27450191 100644 --- a/sziszapangma/core/wer/wer_span_question.py +++ b/sziszapangma/core/wer/span.py @@ -6,11 +6,11 @@ class Span: index_start: int index_end: int - def _is_index_belong(self, index: int) -> bool: + def is_index_belong(self, index: int) -> bool: return self.index_start <= index < self.index_end - def get_reference_weights_table(self, total_size: int): + def get_reference_mask_table(self, total_size: int): return [ - 1 if self._is_index_belong(it) else 0 + self.is_index_belong(it) for it in range(total_size) ] diff --git a/sziszapangma/core/wer/step_words.py b/sziszapangma/core/wer/step_words.py deleted file mode 100644 index 889c2fc79cc09bb1cea028a9526e3b1abc550e2f..0000000000000000000000000000000000000000 --- a/sziszapangma/core/wer/step_words.py +++ /dev/null @@ -1,8 +0,0 @@ -from dataclasses import dataclass -from typing import Optional - - -@dataclass(frozen=True) -class StepWords: - reference_word: Optional[str] - hypothesis_word: Optional[str] diff --git a/sziszapangma/core/wer/wer_calculator.py b/sziszapangma/core/wer/wer_calculator.py index 724fb949e83816e46e1dab3f9254d77aa8429bba..3fa65dbe5e0ff306a6ba118e4880db724a31d5bd 100644 --- a/sziszapangma/core/wer/wer_calculator.py +++ b/sziszapangma/core/wer/wer_calculator.py @@ -1,220 +1,52 @@ from abc import ABC -from typing import List, Tuple, Optional +from typing import List -import numpy as np - -from sziszapangma.core.wer.distance_matrix_calculator import \ - DistanceCalculator -from sziszapangma.core.wer.step_type import StepType -from sziszapangma.core.wer.wer_processing_step import WerProcessingStep -from sziszapangma.core.wer.wer_span_question import Span -from sziszapangma.core.wer.wer_step import WerStep, StepWords +from sziszapangma.core.alignment.alignment_step import AlignmentStep +from sziszapangma.core.alignment.alignment_util import AlignmentUtil +from sziszapangma.core.wer.span import Span class WerCalculator(ABC): - _distance_matrix_calculator: DistanceCalculator - - def __init__(self, distance_matrix_calculator: DistanceCalculator): - self._distance_matrix_calculator = distance_matrix_calculator - - def convert_processing_steps_to_result( - self, - processing_steps: List[WerProcessingStep], - reference_weights: Optional[List[float]] = None - ) -> List[WerStep]: - if reference_weights is None: - return [ - WerStep(step.step_type, step.step_words, step.step_cost) - for step in processing_steps - ] - else: - indexes_per_steps = self._get_reference_indexes_per_steps( - processing_steps) - return [ - WerStep( - processing_steps[step_index].step_type, - processing_steps[step_index].step_words, - reference_weights[indexes_per_steps[step_index]] * - processing_steps[step_index].step_cost - ) - for step_index in range(len(processing_steps)) - ] - - def get_distance_matrix_between_words( - self, - reference: List[str], - hypothesis: List[str] - ) -> np.ndarray: - return self._distance_matrix_calculator.calculate_distance_matrix( - reference, hypothesis) - - def extract_steps_path( - self, - steps_matrix: List[List[WerProcessingStep]] - ) -> List[WerProcessingStep]: - x = len(steps_matrix) - 1 - y = len(steps_matrix[0]) - 1 - to_return = [] - while not (x == 0 and y == 0): - current_step = steps_matrix[x][y] - to_return.append(current_step) - if current_step.step_type == StepType.DELETION: - x = x - 1 - elif current_step.step_type == StepType.INSERTION: - y = y - 1 - else: # creation and substitution - y = y - 1 - x = x - 1 - return to_return[::-1] - - @staticmethod - def _get_levenshtein_processing_step_cross( - prev_cross_distance: float, - step_words: StepWords, - current_distance: float - ) -> WerProcessingStep: - return WerProcessingStep.levenshtein_correct( - prev_cross_distance, step_words, 0) \ - if current_distance == 0 \ - else WerProcessingStep.levenshtein_substitution( - prev_cross_distance, step_words, current_distance) - - def get_levenshtein_embedding_based( - self, - reference: List[str], - hypothesis: List[str], - distance_matrix: np.ndarray - ) -> Tuple[np.ndarray, List[List[WerProcessingStep]]]: - - reference_len = len(reference) - hypothesis_len = len(hypothesis) - distance_arr, steps_arr = self._get_initialized_levenshtein_matrix( - reference, hypothesis) - - for ref_index in range(reference_len): - for hyp_index in range(hypothesis_len): - step_words = StepWords(reference[ref_index], - hypothesis[hyp_index]) - current_distance = distance_matrix[ref_index][hyp_index] - prev_cross_distance = distance_arr[ref_index][hyp_index] - - cross_go_step = self._get_levenshtein_processing_step_cross( - prev_cross_distance, step_words, current_distance) - insertion_step = WerProcessingStep.levenshtein_insertion( - distance_arr[ref_index + 1][hyp_index], step_words) - deletion_step = WerProcessingStep.levenshtein_deletion( - distance_arr[ref_index][hyp_index + 1], step_words) - - best_step = min([cross_go_step, insertion_step, deletion_step], - key=lambda it: it.total_distance()) - - distance_arr[ref_index + 1][hyp_index + 1] = \ - best_step.total_distance() - steps_arr[ref_index + 1][hyp_index + 1] = best_step - - return distance_arr, steps_arr @staticmethod - def _get_initialized_levenshtein_matrix( - reference: List[str], - hypothesis: List[str] - ) -> Tuple[np.ndarray, List[List[Optional[WerProcessingStep]]]]: - - # TODO: consider about remove distance_arr replaced by steps_arr - reference_len = len(reference) - hypothesis_len = len(hypothesis) - distance_arr = np.zeros((reference_len + 1) * (hypothesis_len + 1)) \ - .reshape((reference_len + 1, hypothesis_len + 1)) - steps_arr = [ - [None for _ in range(hypothesis_len + 1)] - for _ in range(reference_len + 1) - ] - - # levenshtein initial - for ref_index in range(reference_len + 1): - distance_arr[ref_index][0] = ref_index - step_words = StepWords( - reference[ref_index - 1] if ref_index > 0 else None, - None - ) - steps_arr[ref_index][0] = WerProcessingStep.levenshtein_deletion( - ref_index - 1, step_words) - for hyp_index in range(hypothesis_len + 1): - distance_arr[0][hyp_index] = hyp_index - step_words = StepWords( - None, - hypothesis[hyp_index - 1] if hyp_index > 0 else None + def _convert_processing_steps_to_result( + input_steps: List[AlignmentStep], + span: Span + ) -> List[AlignmentStep]: + indexes_per_steps = AlignmentUtil.get_reference_indexes_per_steps( + input_steps) + return [ + AlignmentStep( + input_steps[step_index].step_type, + input_steps[step_index].step_words, + input_steps[step_index].step_cost * span.is_index_belong( + indexes_per_steps[step_index]) ) - steps_arr[0][hyp_index] = WerProcessingStep.levenshtein_insertion( - hyp_index - 1, step_words) - - return distance_arr, steps_arr - - def _get_reference_indexes_per_steps( - self, - steps: List[WerProcessingStep] - ) -> List[int]: - counter = 0 - indexes = [] - for step in steps: - indexes.append(counter) - if step.step_type.contain_reference_word(): - counter = counter + 1 - return indexes + for step_index in range(len(input_steps)) + ] + @staticmethod def _calculate_wer( - self, - steps: List[WerStep], + steps: List[AlignmentStep], ) -> float: - reference_len = sum([ - 1 if step.step_type.contain_reference_word() else 0 - for step in steps - ]) + reference_len = AlignmentUtil.get_reference_length(steps) return sum([step.step_cost for step in steps]) / reference_len - def _calculate_steps_path( - self, - reference: List[str], - hypothesis: List[str] - ) -> List[WerProcessingStep]: - distance_between_words = self.get_distance_matrix_between_words( - reference, hypothesis) - _, steps_matrix = self.get_levenshtein_embedding_based( - reference, hypothesis, distance_between_words) - return self.extract_steps_path(steps_matrix) - def calculate_wer( self, - reference: List[str], - hypothesis: List[str] - ) -> Tuple[float, List[WerStep]]: - steps_path = self._calculate_steps_path(reference, hypothesis) - steps = self.convert_processing_steps_to_result(steps_path) - return self._calculate_wer(steps), steps + steps: List[AlignmentStep] + ) -> float: + return self._calculate_wer(steps) def calculate_wer_for_spans( self, - reference: List[str], - hypothesis: List[str], + steps: List[AlignmentStep], spans: List[Span] ) -> List[float]: - steps_path = self._calculate_steps_path(reference, hypothesis) - reference_len = len(reference) return [ - self._calculate_wer(self.convert_processing_steps_to_result( - processing_steps=steps_path, - reference_weights=span.get_reference_weights_table( - reference_len) + self._calculate_wer(self._convert_processing_steps_to_result( + input_steps=steps, + span=span )) for span in spans ] - - def calculate_wer_weighted( - self, - reference: List[str], - hypothesis: List[str], - weights: List[float] - ) -> Tuple[float, List[WerStep]]: - steps_path = self._calculate_steps_path(reference, hypothesis) - steps = self.convert_processing_steps_to_result(steps_path, weights) - return self._calculate_wer(steps), steps diff --git a/sziszapangma/core/wer/wer_embedding_calculator.py b/sziszapangma/core/wer/wer_embedding_calculator.py deleted file mode 100644 index 5f371b7f519dc6f17b618c0a19e252cefd900089..0000000000000000000000000000000000000000 --- a/sziszapangma/core/wer/wer_embedding_calculator.py +++ /dev/null @@ -1,63 +0,0 @@ -from typing import List, Optional - -from sziszapangma.core.transformer.embedding_transformer import \ - EmbeddingTransformer -from sziszapangma.core.wer.distance_matrix_calculator import \ - BinaryDistanceCalculator, DistanceCalculator, CosineDistanceCalculator -from sziszapangma.core.wer.step_words import StepWords -from sziszapangma.core.wer.wer_calculator import WerCalculator -from sziszapangma.core.wer.wer_processing_step import WerProcessingStep -from sziszapangma.core.wer.wer_step import WerStep - - -class WerEmbeddingCalculator(WerCalculator): - _distance_calculator: DistanceCalculator - - def __init__(self, embedding_transformer: EmbeddingTransformer): - super().__init__(BinaryDistanceCalculator()) - self._embedding_transformer = embedding_transformer - self._distance_calculator = CosineDistanceCalculator( - embedding_transformer) - - def _calculate_distance_for_word_step( - self, - step_words: StepWords - ) -> float: - return self._distance_calculator.calculate_distance_for_words( - step_words.reference_word, - step_words.hypothesis_word - ) - - def _calculate_result_cost_for_step( - self, - processing_step: WerProcessingStep - ) -> float: - step_words = processing_step.step_words - return self._calculate_distance_for_word_step(step_words) \ - if processing_step.step_type.is_cross_step() \ - else processing_step.step_cost - - def convert_processing_steps_to_result( - self, - processing_steps: List[WerProcessingStep], - reference_weights: Optional[List[float]] = None - ) -> List[WerStep]: - if reference_weights is None: - return [ - WerStep(step.step_type, step.step_words, - self._calculate_result_cost_for_step(step)) - for step in processing_steps - ] - else: - indexes_per_steps = self._get_reference_indexes_per_steps( - processing_steps) - return [ - WerStep( - processing_steps[step_index].step_type, - processing_steps[step_index].step_words, - reference_weights[indexes_per_steps[step_index]] * - self._calculate_result_cost_for_step( - processing_steps[step_index]) - ) - for step_index in range(len(processing_steps)) - ] diff --git a/sziszapangma/core/wer/wer_step.py b/sziszapangma/core/wer/wer_step.py deleted file mode 100644 index 5e1efa02ba9320b641c21c9a32712a9a23db94a3..0000000000000000000000000000000000000000 --- a/sziszapangma/core/wer/wer_step.py +++ /dev/null @@ -1,11 +0,0 @@ -from dataclasses import dataclass - -from sziszapangma.core.wer.step_type import StepType -from sziszapangma.core.wer.step_words import StepWords - - -@dataclass(frozen=True) -class WerStep: - step_type: StepType - step_words: StepWords - step_cost: float diff --git a/sziszapangma/core/wer/wer_util.py b/sziszapangma/core/wer/wer_util.py deleted file mode 100644 index a6ef66618105ed4162aad40988a15537f81e91e2..0000000000000000000000000000000000000000 --- a/sziszapangma/core/wer/wer_util.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import List, Optional - -import numpy as np -import pandas as pd - -from sziszapangma.core.wer.wer_step import WerStep - - -class WerUtil: - - @staticmethod - def _optional_str_to_str(value: Optional[str]) -> str: - return value if value is not None else '' - - @staticmethod - def _wer_step_to_pandas_row_lit(step: WerStep) -> List[any]: - return [ - step.step_type.get_short_name(), - WerUtil._optional_str_to_str(step.step_words.reference_word), - WerUtil._optional_str_to_str(step.step_words.hypothesis_word), - round(step.step_cost, 3) - ] - - @staticmethod - def steps_to_dataframe(steps: List[WerStep]) -> pd.DataFrame: - arr = np.array([ - WerUtil._wer_step_to_pandas_row_lit(step) - for step in steps - ]) - return pd.DataFrame( - arr, - columns=['step_type', 'reference', 'hypothesis', 'cost'] - ) diff --git a/sziszapangma/integration/mapper/__init__.py b/sziszapangma/integration/mapper/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sziszapangma/integration/mapper/alignment_step_mapper.py b/sziszapangma/integration/mapper/alignment_step_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..8b3bf9b32aaef3b1be84cc5c036208e3b31d4bc4 --- /dev/null +++ b/sziszapangma/integration/mapper/alignment_step_mapper.py @@ -0,0 +1,16 @@ +from typing import Dict + +from sziszapangma.core.alignment.alignment_step import AlignmentStep +from sziszapangma.integration.mapper.step_words_mapper import StepWordsMapper + + +class AlignmentStepMapper: + + @staticmethod + def to_json_dict(alignment_step: AlignmentStep) -> Dict[str, any]: + return { + 'step_type': alignment_step.step_type.name, + 'step_words': StepWordsMapper.to_json_dict( + alignment_step.step_words), + 'step_cost': alignment_step.step_cost + } diff --git a/sziszapangma/integration/mapper/step_words_mapper.py b/sziszapangma/integration/mapper/step_words_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..a28b532411317d7510ef92723eb0274583a18430 --- /dev/null +++ b/sziszapangma/integration/mapper/step_words_mapper.py @@ -0,0 +1,27 @@ +from typing import Dict + +from sziszapangma.core.alignment.step_words import StepWords +from sziszapangma.integration.mapper.word_mapper import WordMapper + + +class StepWordsMapper: + + @staticmethod + def to_json_dict(step_words: StepWords) -> Dict[str, any]: + to_return = dict() + if step_words.hypothesis_word is not None: + to_return['hypothesis_word'] = WordMapper.to_json_dict( + step_words.hypothesis_word) + if step_words.reference_word is not None: + to_return['reference_word'] = WordMapper.to_json_dict( + step_words.reference_word) + return to_return + + @staticmethod + def from_json_dict(input_json_dict: Dict[str, any]) -> StepWords: + return StepWords( + None if 'reference_word' not in input_json_dict + else WordMapper.from_json_dict(input_json_dict['reference_word']), + None if 'hypothesis_word' not in input_json_dict + else WordMapper.from_json_dict(input_json_dict['hypothesis_word']), + ) diff --git a/sziszapangma/integration/mapper/word_mapper.py b/sziszapangma/integration/mapper/word_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..f7b0cd485c62cdb95b46c2154e09b91846aa8854 --- /dev/null +++ b/sziszapangma/integration/mapper/word_mapper.py @@ -0,0 +1,20 @@ +from typing import Dict + +from sziszapangma.core.alignment.word import Word + +_ID = 'id' +_VALUE = 'value' + + +class WordMapper: + + @staticmethod + def to_json_dict(word: Word) -> Dict[str, str]: + return { + _ID: word.id, + _VALUE: word.value + } + + @staticmethod + def from_json_dict(input_json_dict: Dict[str, str]) -> Word: + return Word(input_json_dict[_ID], input_json_dict[_VALUE]) diff --git a/sziszapangma/integration/task/classic_wer_metric_task.py b/sziszapangma/integration/task/classic_wer_metric_task.py index dfd8d2696e21a317e9272789e7a5a65e2cc8fb4e..4657a4aee4db1b723728ad5bd1ceaf76986af925 100644 --- a/sziszapangma/integration/task/classic_wer_metric_task.py +++ b/sziszapangma/integration/task/classic_wer_metric_task.py @@ -1,18 +1,25 @@ from typing import List, Dict -from sziszapangma.core.wer.classic_wer_calculator import ClassicWerCalculator +from sziszapangma.core.alignment.alignment_classic_calculator import \ + AlignmentClassicCalculator +from sziszapangma.core.alignment.alignment_step import AlignmentStep +from sziszapangma.core.wer.wer_calculator import WerCalculator +from sziszapangma.integration.mapper.alignment_step_mapper import \ + AlignmentStepMapper +from sziszapangma.integration.mapper.word_mapper import WordMapper from sziszapangma.integration.repository.experiment_repository import \ ExperimentRepository from sziszapangma.integration.task.processing_task import ProcessingTask _CLASSIC_WER = 'classic_wer' -_WORD = 'word' class ClassicWerMetricTask(ProcessingTask): _metrics_property_name: str + _alignment_property_name: str _gold_transcript_property_name: str - _classic_wer_calculator: ClassicWerCalculator + _alignment_classic_calculator: AlignmentClassicCalculator + _wer_calculator: WerCalculator def __init__( self, @@ -20,16 +27,22 @@ class ClassicWerMetricTask(ProcessingTask): gold_transcript_property_name: str, asr_property_name: str, metrics_property_name: str, + alignment_property_name: str, require_update: bool ): super().__init__(task_name, require_update) self._gold_transcript_property_name = gold_transcript_property_name self._asr_property_name = asr_property_name + self._alignment_property_name = alignment_property_name self._metrics_property_name = metrics_property_name - self._classic_wer_calculator = ClassicWerCalculator() + self._alignment_classic_calculator = AlignmentClassicCalculator() + self._wer_calculator = WerCalculator() - def skip_for_record(self, record_id: str, - experiment_repository: ExperimentRepository) -> bool: + def skip_for_record( + self, + record_id: str, + experiment_repository: ExperimentRepository + ) -> bool: return experiment_repository \ .get_property_for_key(record_id, self._metrics_property_name) @@ -41,32 +54,44 @@ class ClassicWerMetricTask(ProcessingTask): asr_result = experiment_repository \ .get_property_for_key(record_id, self._asr_property_name) if 'transcription' in asr_result: + alignment_steps = self._get_alignment( + gold_transcript, asr_result['transcription'] + ) + experiment_repository.update_property_for_key( + record_id, + self._alignment_property_name, + [AlignmentStepMapper.to_json_dict(it) + for it in alignment_steps] + ) experiment_repository.update_property_for_key( record_id, self._metrics_property_name, - self.calculate_metrics( - gold_transcript=gold_transcript, - asr_result=asr_result['transcription'] - ) + self.calculate_metrics(alignment_steps) ) - def _run_wer_calculations( + def _get_alignment( self, gold_transcript: List[Dict[str, any]], - asr_result: List[str] - ) -> float: - return self._classic_wer_calculator.calculate_wer( - reference=[it[_WORD] for it in gold_transcript], - hypothesis=[it for it in asr_result], - )[0] + asr_result: List[Dict[str, any]] + ) -> List[AlignmentStep]: + gold_transcript_words = [ + WordMapper.from_json_dict(word_dict) + for word_dict in gold_transcript + ] + asr_words = [ + WordMapper.from_json_dict(word_dict) + for word_dict in asr_result + ] + return self._alignment_classic_calculator \ + .calculate_alignment(reference=gold_transcript_words, + hypothesis=asr_words) def calculate_metrics( self, - gold_transcript: List[Dict[str, any]], - asr_result: List[str] + alignment_steps: List[AlignmentStep] ) -> Dict[str, any]: """Calculate all metrics for data sample.""" metrics = dict() - metrics[_CLASSIC_WER] = self._run_wer_calculations( - gold_transcript, asr_result) + metrics[_CLASSIC_WER] = self._wer_calculator.calculate_wer( + alignment_steps) return metrics diff --git a/sziszapangma/integration/task/embedding_wer_metrics_task.py b/sziszapangma/integration/task/embedding_wer_metrics_task.py index 67f8b943c692b1fd0723f6067e63d79dcadd5fe4..3eb3476171ee99318a434c70ba06627300d645ac 100644 --- a/sziszapangma/integration/task/embedding_wer_metrics_task.py +++ b/sziszapangma/integration/task/embedding_wer_metrics_task.py @@ -1,12 +1,18 @@ from typing import List, Dict +from sziszapangma.core.alignment.alignment_embedding_calculator import \ + AlignmentEmbeddingCalculator +from sziszapangma.core.alignment.alignment_soft_calculator import \ + AlignmentSoftCalculator +from sziszapangma.core.alignment.word import Word from sziszapangma.core.transformer.cached_embedding_transformer import \ CachedEmbeddingTransformer from sziszapangma.core.transformer.embedding_transformer import \ EmbeddingTransformer -from sziszapangma.core.wer.wer_embedding_calculator import \ - WerEmbeddingCalculator -from sziszapangma.core.wer.wer_soft_calculator import WerSoftCalculator +from sziszapangma.core.wer.wer_calculator import WerCalculator +from sziszapangma.integration.mapper.alignment_step_mapper import \ + AlignmentStepMapper +from sziszapangma.integration.mapper.word_mapper import WordMapper from sziszapangma.integration.repository.experiment_repository import \ ExperimentRepository from sziszapangma.integration.task.processing_task import ProcessingTask @@ -18,10 +24,12 @@ _WORD = 'word' class EmbeddingWerMetricsTask(ProcessingTask): _metrics_property_name: str + _alignment_property_name: str _gold_transcript_property_name: str _embedding_transformer: CachedEmbeddingTransformer - _wer_embedding_calculator: WerEmbeddingCalculator - _wer_soft_calculator: WerSoftCalculator + _alignment_embedding_calculator: AlignmentEmbeddingCalculator + _alignment_soft_calculator: AlignmentSoftCalculator + _wer_calculator: WerCalculator def __init__( self, @@ -29,6 +37,7 @@ class EmbeddingWerMetricsTask(ProcessingTask): gold_transcript_property_name: str, asr_property_name: str, metrics_property_name: str, + alignment_property_name: str, require_update: bool, embedding_transformer: EmbeddingTransformer ): @@ -38,10 +47,12 @@ class EmbeddingWerMetricsTask(ProcessingTask): self._metrics_property_name = metrics_property_name self._embedding_transformer = \ CachedEmbeddingTransformer(embedding_transformer) - self._wer_embedding_calculator = \ - WerEmbeddingCalculator(self._embedding_transformer) - self._wer_soft_calculator = \ - WerSoftCalculator(self._embedding_transformer) + self._alignment_embedding_calculator = \ + AlignmentEmbeddingCalculator(self._embedding_transformer) + self._alignment_soft_calculator = \ + AlignmentSoftCalculator(self._embedding_transformer) + self._wer_calculator = WerCalculator() + self._alignment_property_name = alignment_property_name def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool: @@ -56,26 +67,39 @@ class EmbeddingWerMetricsTask(ProcessingTask): asr_result = experiment_repository \ .get_property_for_key(record_id, self._asr_property_name) if 'transcription' in asr_result: + gold_transcript_words = self._map_words_to_domain(gold_transcript) + asr_words = self._map_words_to_domain(asr_result['transcription']) + + soft_alignment = self._alignment_soft_calculator \ + .calculate_alignment(gold_transcript_words, asr_words) + embedding_alignment = self._alignment_embedding_calculator \ + .calculate_alignment(gold_transcript_words, asr_words) + + soft_wer = self._wer_calculator.calculate_wer(soft_alignment) + embedding_wer = self._wer_calculator \ + .calculate_wer(embedding_alignment) + + alignment_results = { + 'soft_alignment': [AlignmentStepMapper.to_json_dict(it) + for it in soft_alignment], + 'embedding_alignment': [AlignmentStepMapper.to_json_dict(it) + for it in embedding_alignment], + } + wer_results = {'soft_wer': soft_wer, + 'embedding_wer': embedding_wer} + experiment_repository.update_property_for_key( - record_id, - self._metrics_property_name, - self.calculate_metrics( - gold_transcript=gold_transcript, - asr_result=asr_result['transcription'] - ) - ) + record_id, self._alignment_property_name, alignment_results) + experiment_repository.update_property_for_key( + record_id, self._metrics_property_name, wer_results) + self._embedding_transformer.clear() - def calculate_metrics( - self, - gold_transcript: List[Dict[str, any]], - asr_result: List[str] - ) -> Dict[str, any]: - """Calculate all metrics for data sample.""" - metrics = dict() - reference = [it[_WORD] for it in gold_transcript] - metrics[_SOFT_WER] = self._wer_soft_calculator.calculate_wer( - reference, asr_result)[0] - metrics[_EMBEDDING_WER] = self._wer_embedding_calculator.calculate_wer( - reference, asr_result)[0] - return metrics + @staticmethod + def _map_words_to_domain( + input_json_dicts: List[Dict[str, str]] + ) -> List[Word]: + return [ + WordMapper.from_json_dict(word_dict) + for word_dict in input_json_dicts + ] diff --git a/tests/test_classic_wer.py b/tests/test_classic_wer.py index cdead597bc6e4e2a20db4f2089fbdc90985a8f31..ff67ebd11117c13284f38efefe6a059bd5cd1c90 100644 --- a/tests/test_classic_wer.py +++ b/tests/test_classic_wer.py @@ -2,41 +2,54 @@ from typing import List, Tuple import pytest -from sziszapangma.core.wer.classic_wer_calculator import ClassicWerCalculator -from sziszapangma.core.wer.step_type import StepType -from sziszapangma.core.wer.step_words import StepWords +from sziszapangma.core.alignment.alignment_classic_calculator import \ + AlignmentClassicCalculator +from sziszapangma.core.alignment.step_type import StepType +from sziszapangma.core.alignment.step_words import StepWords +from sziszapangma.core.alignment.word import Word +from sziszapangma.core.wer.wer_calculator import WerCalculator -def get_sample_data() -> Tuple[List[str], List[str]]: +def string_list_to_words(strings: List[str]) -> List[Word]: + return [Word.from_string(it) for it in strings] + + +def get_sample_data() -> Tuple[List[Word], List[Word]]: reference = ['This', 'great', 'machine', 'can', 'recognize', 'speech'] hypothesis = ['This', 'machine', 'can', 'wreck', 'a', 'nice', 'beach'] - return reference, hypothesis + return string_list_to_words(reference), string_list_to_words(hypothesis) def test_classic_calculate_wer_value(): """Sample test for core calculate.""" reference, hypothesis = get_sample_data() - wer_result = ClassicWerCalculator().calculate_wer(reference, hypothesis) - assert pytest.approx(wer_result[0]) == 0.8333333 + alignment = AlignmentClassicCalculator()\ + .calculate_alignment(reference, hypothesis) + wer_result = WerCalculator().calculate_wer(alignment) + assert pytest.approx(wer_result) == 0.8333333 def test_classic_calculate_wer_steps(): """Sample test for core calculate.""" reference, hypothesis = get_sample_data() - wer_result = ClassicWerCalculator().calculate_wer(reference, hypothesis) + alignment = AlignmentClassicCalculator().calculate_alignment( + reference, hypothesis) reference_words = [ - StepWords('This', 'This'), StepWords('great', None), - StepWords('machine', 'machine'), StepWords('can', 'can'), - StepWords(None, 'wreck'), StepWords(None, 'a'), - StepWords('recognize', 'nice'), - StepWords('speech', 'beach')] + StepWords(reference[0], hypothesis[0]), + StepWords(reference[1], None), + StepWords(reference[2], hypothesis[1]), + StepWords(reference[3], hypothesis[2]), + StepWords(None, hypothesis[3]), + StepWords(None, hypothesis[4]), + StepWords(reference[4], hypothesis[5]), + StepWords(reference[5], hypothesis[6])] step_types = [ StepType.CORRECT, StepType.DELETION, StepType.CORRECT, StepType.CORRECT, StepType.INSERTION, StepType.INSERTION, StepType.SUBSTITUTION, StepType.SUBSTITUTION] - assert len(wer_result[1]) == 8 - assert [it.step_type for it in wer_result[1]] == step_types - assert [it.step_cost for it in wer_result[1]] == [0, 1, 0, 0, 1, 1, 1, 1] - assert [it.step_words for it in wer_result[1]] == reference_words + assert len(alignment) == 8 + assert [it.step_type for it in alignment] == step_types + assert [it.step_cost for it in alignment] == [0, 1, 0, 0, 1, 1, 1, 1] + assert [it.step_words for it in alignment] == reference_words diff --git a/tests/test_embedding_wer.py b/tests/test_embedding_wer.py index 876af9456e8f104d3761d6f5c3042ba9ed5e7623..4f7cd55c9d0fdb1f9abff24bb80c0c040ec92061 100644 --- a/tests/test_embedding_wer.py +++ b/tests/test_embedding_wer.py @@ -2,26 +2,35 @@ from typing import List, Tuple import pytest -from sziszapangma.core.wer.wer_embedding_calculator import \ - WerEmbeddingCalculator +from sziszapangma.core.alignment.alignment_calculator import \ + AlignmentCalculator +from sziszapangma.core.alignment.alignment_embedding_calculator import \ + AlignmentEmbeddingCalculator +from sziszapangma.core.alignment.word import Word +from sziszapangma.core.wer.wer_calculator import WerCalculator from tests.file_stored_embedding_transformer import \ FileStoredEmbeddingTransformer -def get_sample_data() -> Tuple[List[str], List[str]]: +def string_list_to_words(strings: List[str]) -> List[Word]: + return [Word.from_string(it) for it in strings] + + +def get_sample_data() -> Tuple[List[Word], List[Word]]: reference = ['ala', 'ma', 'dobrego', 'wielkiego', 'psa', 'rasowego'] hypothesis = ['alana', 'rego', 'kruchego', 'psa', 'rasowego'] - return reference, hypothesis + return string_list_to_words(reference), string_list_to_words(hypothesis) -def get_calculator() -> WerEmbeddingCalculator: - return WerEmbeddingCalculator( +def get_alignment_calculator() -> AlignmentCalculator: + return AlignmentEmbeddingCalculator( FileStoredEmbeddingTransformer('tests/embeddings_pl.json')) def test_classic_calculate_wer_value(): """Sample test for core calculate.""" reference, hypothesis = get_sample_data() - wer_result = get_calculator().calculate_wer(reference, hypothesis) - print(wer_result[0]) - assert pytest.approx(wer_result[0]) == 0.55879563 + alignment = get_alignment_calculator().calculate_alignment(reference, + hypothesis) + wer_result = WerCalculator().calculate_wer(alignment) + assert pytest.approx(wer_result) == 0.55879563 diff --git a/tests/test_soft_wer.py b/tests/test_soft_wer.py index c72b97f071b382b1b7f224d4692e46d99ce18b08..85a34338b831fff39525f557c69886911cb4e100 100644 --- a/tests/test_soft_wer.py +++ b/tests/test_soft_wer.py @@ -2,25 +2,36 @@ from typing import List, Tuple import pytest -from sziszapangma.core.wer.wer_soft_calculator import WerSoftCalculator +from sziszapangma.core.alignment.alignment_calculator import \ + AlignmentCalculator +from sziszapangma.core.alignment.alignment_soft_calculator import \ + AlignmentSoftCalculator +from sziszapangma.core.alignment.word import Word +from sziszapangma.core.wer.wer_calculator import WerCalculator from tests.file_stored_embedding_transformer import \ FileStoredEmbeddingTransformer -def get_sample_data() -> Tuple[List[str], List[str]]: +def string_list_to_words(strings: List[str]) -> List[Word]: + return [Word.from_string(it) for it in strings] + + +def get_sample_data() -> Tuple[List[Word], List[Word]]: reference = ['ala', 'ma', 'dobrego', 'wielkiego', 'psa', 'rasowego'] hypothesis = ['alana', 'rego', 'kruchego', 'psa', 'rasowego'] - return reference, hypothesis + return string_list_to_words(reference), string_list_to_words(hypothesis) -def get_calculator() -> WerSoftCalculator: - return WerSoftCalculator( +def get_alignment_calculator() -> AlignmentCalculator: + return AlignmentSoftCalculator( FileStoredEmbeddingTransformer('tests/embeddings_pl.json')) def test_classic_calculate_wer_value(): """Sample test for core calculate.""" reference, hypothesis = get_sample_data() - wer_result = get_calculator().calculate_wer(reference, hypothesis) - print(wer_result[0]) - assert pytest.approx(wer_result[0]) == 0.50186761 + alignment = get_alignment_calculator().calculate_alignment( + reference, hypothesis) + wer_result = WerCalculator().calculate_wer(alignment) + print(wer_result) + assert pytest.approx(wer_result) == 0.50186761 diff --git a/tests/test_sziszapangma.py b/tests/test_sziszapangma.py deleted file mode 100644 index 838dc136662e281f6cb87781d10c48f55bd3a5ef..0000000000000000000000000000000000000000 --- a/tests/test_sziszapangma.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python - -"""Tests for `sziszapangma` package.""" - -import pytest - -from click.testing import CliRunner - -from sziszapangma.core import cli - - -@pytest.fixture -def response(): - """Sample pytest fixture. - - See more at: http://doc.pytest.org/en/latest/fixture.html - """ - # import requests - # return requests.get('https://github.com/audreyr/cookiecutter-pypackage') - - -def test_content(response): - """Sample pytest test function with the pytest fixture as an argument.""" - # from bs4 import BeautifulSoup - # assert 'GitHub' in BeautifulSoup(response.content).title.string - - -def test_command_line_interface(): - """Test the CLI.""" - runner = CliRunner() - result = runner.invoke(cli.main) - assert result.exit_code == 0 - assert 'sziszapangma.cli.main' in result.output - help_result = runner.invoke(cli.main, ['--help']) - assert help_result.exit_code == 0 - assert '--help Show this message and exit.' in help_result.output diff --git a/tox.ini b/tox.ini index 4dc38fb541aa720338639b776759f9d0205a9e7e..7062dd226ee8550c29f47c9d907e6e32b5c2bac7 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = flake8 +envlist = flake8,testenv skipsdist = True [testenv:flake8] @@ -11,13 +11,12 @@ commands = flake8 sziszapangma tests setenv = PYTHONPATH = {toxinidir} deps = + -r{toxinidir}/requirements.txt -r{toxinidir}/requirements_dev.txt ; If you want to make tox run the tests with the same versions, create a ; requirements.txt with the pinned versions and uncomment the following line: ; -r{toxinidir}/requirements.txt commands = - ls -la - pip install -U pip pytest ; pytest --basetemp={envtmpdir}