class DistanceCalculator(ABC):
    """Interface for strategies that score how different two words are.

    Concrete calculators in this module compare the ``value`` of each
    ``Word`` (directly, or through its embedding); the ``id`` of a word is
    not consulted by them.
    """

    @abstractmethod
    def calculate_distance_matrix(
        self,
        reference: List[Word],
        hypothesis: List[Word]
    ) -> np.array:
        """Return the pairwise distances between two word sequences.

        :param reference: words of the reference sequence.
        :param hypothesis: words of the hypothesis sequence.
        :return: a numpy array of distances. The cosine implementation in
            this module produces one row per reference word and one column
            per hypothesis word; presumably every implementation follows
            that layout -- confirm against callers.
        """
        # NOTE(review): ``np.array`` is the factory function; the array type
        # is ``np.ndarray`` (which WerCalculator already uses in its
        # annotations). Left unchanged here to keep the signature as-is.
        pass

    @abstractmethod
    def calculate_distance_for_words(self, word1: Word, word2: Word) -> float:
        """Return the distance between a single pair of words.

        :param word1: first word of the pair.
        :param word2: second word of the pair.
        :return: the distance as a number; the binary implementation in this
            module returns 0 for equal values and 1 otherwise.
        """
        pass
@dataclass(frozen=True)
class StepWords:
    """Immutable pair of words associated with one alignment step.

    Either side is optional, so a step can carry only a reference word,
    only a hypothesis word, or both.
    """
    # Word from the reference sequence, or None when the step has no
    # reference-side word.
    reference_word: Optional[Word]
    # Word from the hypothesis sequence, or None when the step has no
    # hypothesis-side word.
    hypothesis_word: Optional[Word]
WerProcessingStep from sziszapangma.core.wer.wer_span_question import Span from sziszapangma.core.wer.wer_step import WerStep, StepWords +from sziszapangma.core.wer.word import Word class WerCalculator(ABC): @@ -42,8 +43,8 @@ class WerCalculator(ABC): def get_distance_matrix_between_words( self, - reference: List[str], - hypothesis: List[str] + reference: List[Word], + hypothesis: List[Word] ) -> np.ndarray: return self._distance_matrix_calculator.calculate_distance_matrix( reference, hypothesis) @@ -81,8 +82,8 @@ class WerCalculator(ABC): def get_levenshtein_embedding_based( self, - reference: List[str], - hypothesis: List[str], + reference: List[Word], + hypothesis: List[Word], distance_matrix: np.ndarray ) -> Tuple[np.ndarray, List[List[WerProcessingStep]]]: @@ -116,8 +117,8 @@ class WerCalculator(ABC): @staticmethod def _get_initialized_levenshtein_matrix( - reference: List[str], - hypothesis: List[str] + reference: List[Word], + hypothesis: List[Word] ) -> Tuple[np.ndarray, List[List[Optional[WerProcessingStep]]]]: # TODO: consider about remove distance_arr replaced by steps_arr @@ -174,8 +175,8 @@ class WerCalculator(ABC): def _calculate_steps_path( self, - reference: List[str], - hypothesis: List[str] + reference: List[Word], + hypothesis: List[Word] ) -> List[WerProcessingStep]: distance_between_words = self.get_distance_matrix_between_words( reference, hypothesis) @@ -185,8 +186,8 @@ class WerCalculator(ABC): def calculate_wer( self, - reference: List[str], - hypothesis: List[str] + reference: List[Word], + hypothesis: List[Word] ) -> Tuple[float, List[WerStep]]: steps_path = self._calculate_steps_path(reference, hypothesis) steps = self.convert_processing_steps_to_result(steps_path) @@ -194,8 +195,8 @@ class WerCalculator(ABC): def calculate_wer_for_spans( self, - reference: List[str], - hypothesis: List[str], + reference: List[Word], + hypothesis: List[Word], spans: List[Span] ) -> List[float]: steps_path = self._calculate_steps_path(reference, 
@dataclass(frozen=True)
class Word:
    """Immutable token made of a unique identifier and its text.

    Equality and hashing are the dataclass defaults and therefore take
    *both* fields into account: two ``Word`` objects holding the same text
    but different ids are not equal. Code that wants to compare the text
    only must compare ``value`` explicitly.
    """
    # Unique identifier of this particular occurrence of the word.
    id: str
    # The text of the word.
    value: str

    @classmethod
    def from_string(cls, string: str) -> "Word":
        """Build a ``Word`` for ``string`` with a freshly generated id.

        :param string: text of the word; stored unchanged in ``value``.
        :return: a new instance whose ``id`` is a random UUID4 rendered as
            a string, so repeated calls with the same text yield distinct
            (unequal) words.
        """
        # The instance must be returned: without ``return`` the classmethod
        # evaluates to None and every caller receives None instead of a Word.
        return cls(str(uuid.uuid4()), string)
def string_list_to_words(strings: List[str]) -> List[Word]:
    """Wrap every plain string in a ``Word`` created by ``Word.from_string``."""
    words = []
    for text in strings:
        words.append(Word.from_string(text))
    return words


def get_sample_data() -> Tuple[List[Word], List[Word]]:
    """Return the (reference, hypothesis) fixture as two lists of ``Word``."""
    raw_reference = ['ala', 'ma', 'dobrego', 'wielkiego', 'psa', 'rasowego']
    raw_hypothesis = ['alana', 'rego', 'kruchego', 'psa', 'rasowego']
    # Reference words are converted first, then hypothesis words.
    reference_words = string_list_to_words(raw_reference)
    hypothesis_words = string_list_to_words(raw_hypothesis)
    return reference_words, hypothesis_words
def string_list_to_words(strings: List[str]) -> List[Word]:
    """Convert plain strings to ``Word`` objects, preserving their order."""
    return list(map(Word.from_string, strings))


def get_sample_data() -> Tuple[List[Word], List[Word]]:
    """Provide the sample reference and hypothesis sequences as ``Word`` lists."""
    reference_texts = ['ala', 'ma', 'dobrego', 'wielkiego', 'psa', 'rasowego']
    hypothesis_texts = ['alana', 'rego', 'kruchego', 'psa', 'rasowego']
    # Built as a (reference, hypothesis) pair; reference is converted first.
    sample = (
        string_list_to_words(reference_texts),
        string_list_to_words(hypothesis_texts),
    )
    return sample