From fddf4ab41cafc62649355e28fb7c352bb11733ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com>
Date: Sat, 26 Jun 2021 19:33:12 +0200
Subject: [PATCH] Add ids to words

---
 .../core/wer/distance_matrix_calculator.py    | 35 ++++++++++---------
 sziszapangma/core/wer/step_words.py           |  6 ++--
 sziszapangma/core/wer/wer_calculator.py       | 29 +++++++--------
 sziszapangma/core/wer/word.py                 | 12 +++++++
 tests/test_classic_wer.py                     | 22 ++++++++----
 tests/test_embedding_wer.py                   |  8 +++--
 tests/test_soft_wer.py                        |  9 +++--
 7 files changed, 77 insertions(+), 44 deletions(-)
 create mode 100644 sziszapangma/core/wer/word.py

diff --git a/sziszapangma/core/wer/distance_matrix_calculator.py b/sziszapangma/core/wer/distance_matrix_calculator.py
index 50e359a..47e736a 100644
--- a/sziszapangma/core/wer/distance_matrix_calculator.py
+++ b/sziszapangma/core/wer/distance_matrix_calculator.py
@@ -5,30 +5,31 @@ import numpy as np
 
 from sziszapangma.core.transformer.embedding_transformer import \
     EmbeddingTransformer
+from sziszapangma.core.wer.word import Word
 
 
 class DistanceCalculator(ABC):
     @abstractmethod
     def calculate_distance_matrix(
         self,
-        reference: List[str],
-        hypothesis: List[str]
+        reference: List[Word],
+        hypothesis: List[Word]
     ) -> np.array:
         pass
 
     @abstractmethod
-    def calculate_distance_for_words(self, word1: str, word2: str) -> float:
+    def calculate_distance_for_words(self, word1: Word, word2: Word) -> float:
         pass
 
 
 class BinaryDistanceCalculator(DistanceCalculator):
-    def calculate_distance_for_words(self, word1: str, word2: str) -> float:
-        return 0 if word1 == word2 else 1
+    def calculate_distance_for_words(self, word1: Word, word2: Word) -> float:
+        return 0 if word1.value == word2.value else 1
 
     def calculate_distance_matrix(
         self,
-        reference: List[str],
-        hypothesis: List[str]
+        reference: List[Word],
+        hypothesis: List[Word]
     ) -> np.array:
         return np.array([
             [self.calculate_distance_for_words(reference_word, hypothesis_word)
@@ -43,10 +44,10 @@ class CosineDistanceCalculator(DistanceCalculator):
     def __init__(self, embedding_transformer: EmbeddingTransformer):
         self._embedding_transformer = embedding_transformer
 
-    def calculate_distance_for_words(self, word1: str, word2: str) -> float:
+    def calculate_distance_for_words(self, word1: Word, word2: Word) -> float:
         return self.cosine_distance_between_words_embeddings(
-            self._embedding_transformer.get_embedding(word1),
-            self._embedding_transformer.get_embedding(word2)
+            self._embedding_transformer.get_embedding(word1.value),
+            self._embedding_transformer.get_embedding(word2.value)
         )
 
     @staticmethod
@@ -67,22 +68,22 @@ class CosineDistanceCalculator(DistanceCalculator):
             b_norm = np.linalg.norm(b, axis=1, keepdims=True)
         else:
             raise RuntimeError("array dimensions {} not right".format(a.ndim))
-        similiarity = np.dot(a, b.T) / (a_norm * b_norm)
-        dist = 1. - similiarity
+        similarity = np.dot(a, b.T) / (a_norm * b_norm)
+        dist = 1. - similarity
         return dist
 
     def calculate_distance_matrix(
         self,
-        reference: List[str],
-        hypothesis: List[str]
+        reference: List[Word],
+        hypothesis: List[Word]
     ) -> np.array:
         embeddings_dict = self._embedding_transformer.get_embeddings(
-            list(set(reference + hypothesis))
+            list(set(it.value for it in (reference + hypothesis)))
         )
         return np.array([[
             self.cosine_distance_between_words_embeddings(
-                embeddings_dict[reference_word],
-                embeddings_dict[hypothesis_word],
+                embeddings_dict[reference_word.value],
+                embeddings_dict[hypothesis_word.value],
             )
             for hypothesis_word in hypothesis]
             for reference_word in reference
diff --git a/sziszapangma/core/wer/step_words.py b/sziszapangma/core/wer/step_words.py
index 889c2fc..ced47b1 100644
--- a/sziszapangma/core/wer/step_words.py
+++ b/sziszapangma/core/wer/step_words.py
@@ -1,8 +1,10 @@
 from dataclasses import dataclass
 from typing import Optional
 
+from sziszapangma.core.wer.word import Word
+
 
 @dataclass(frozen=True)
 class StepWords:
-    reference_word: Optional[str]
-    hypothesis_word: Optional[str]
+    reference_word: Optional[Word]
+    hypothesis_word: Optional[Word]
diff --git a/sziszapangma/core/wer/wer_calculator.py b/sziszapangma/core/wer/wer_calculator.py
index 724fb94..edb06e4 100644
--- a/sziszapangma/core/wer/wer_calculator.py
+++ b/sziszapangma/core/wer/wer_calculator.py
@@ -9,6 +9,7 @@ from sziszapangma.core.wer.step_type import StepType
 from sziszapangma.core.wer.wer_processing_step import WerProcessingStep
 from sziszapangma.core.wer.wer_span_question import Span
 from sziszapangma.core.wer.wer_step import WerStep, StepWords
+from sziszapangma.core.wer.word import Word
 
 
 class WerCalculator(ABC):
@@ -42,8 +43,8 @@ class WerCalculator(ABC):
 
     def get_distance_matrix_between_words(
         self,
-        reference: List[str],
-        hypothesis: List[str]
+        reference: List[Word],
+        hypothesis: List[Word]
     ) -> np.ndarray:
         return self._distance_matrix_calculator.calculate_distance_matrix(
             reference, hypothesis)
@@ -81,8 +82,8 @@ class WerCalculator(ABC):
 
     def get_levenshtein_embedding_based(
         self,
-        reference: List[str],
-        hypothesis: List[str],
+        reference: List[Word],
+        hypothesis: List[Word],
         distance_matrix: np.ndarray
     ) -> Tuple[np.ndarray, List[List[WerProcessingStep]]]:
 
@@ -116,8 +117,8 @@ class WerCalculator(ABC):
 
     @staticmethod
     def _get_initialized_levenshtein_matrix(
-        reference: List[str],
-        hypothesis: List[str]
+        reference: List[Word],
+        hypothesis: List[Word]
     ) -> Tuple[np.ndarray, List[List[Optional[WerProcessingStep]]]]:
 
         # TODO: consider about remove distance_arr replaced by steps_arr
@@ -174,8 +175,8 @@ class WerCalculator(ABC):
 
     def _calculate_steps_path(
         self,
-        reference: List[str],
-        hypothesis: List[str]
+        reference: List[Word],
+        hypothesis: List[Word]
     ) -> List[WerProcessingStep]:
         distance_between_words = self.get_distance_matrix_between_words(
             reference, hypothesis)
@@ -185,8 +186,8 @@ class WerCalculator(ABC):
 
     def calculate_wer(
         self,
-        reference: List[str],
-        hypothesis: List[str]
+        reference: List[Word],
+        hypothesis: List[Word]
     ) -> Tuple[float, List[WerStep]]:
         steps_path = self._calculate_steps_path(reference, hypothesis)
         steps = self.convert_processing_steps_to_result(steps_path)
@@ -194,8 +195,8 @@ class WerCalculator(ABC):
 
     def calculate_wer_for_spans(
         self,
-        reference: List[str],
-        hypothesis: List[str],
+        reference: List[Word],
+        hypothesis: List[Word],
         spans: List[Span]
     ) -> List[float]:
         steps_path = self._calculate_steps_path(reference, hypothesis)
@@ -211,8 +212,8 @@ class WerCalculator(ABC):
 
     def calculate_wer_weighted(
         self,
-        reference: List[str],
-        hypothesis: List[str],
+        reference: List[Word],
+        hypothesis: List[Word],
         weights: List[float]
     ) -> Tuple[float, List[WerStep]]:
         steps_path = self._calculate_steps_path(reference, hypothesis)
diff --git a/sziszapangma/core/wer/word.py b/sziszapangma/core/wer/word.py
new file mode 100644
index 0000000..b20d9e9
--- /dev/null
+++ b/sziszapangma/core/wer/word.py
@@ -0,0 +1,12 @@
+import uuid
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class Word:  # immutable pair: a word's text plus an identifier
+    id: str  # identifier (random UUID4 string when built by from_string)
+    value: str  # the word's text
+
+    @classmethod
+    def from_string(cls, string: str) -> "Word":  # fresh UUID4 id per call
+        return cls(str(uuid.uuid4()), string)
diff --git a/tests/test_classic_wer.py b/tests/test_classic_wer.py
index cdead59..c78bc41 100644
--- a/tests/test_classic_wer.py
+++ b/tests/test_classic_wer.py
@@ -5,12 +5,17 @@ import pytest
 from sziszapangma.core.wer.classic_wer_calculator import ClassicWerCalculator
 from sziszapangma.core.wer.step_type import StepType
 from sziszapangma.core.wer.step_words import StepWords
+from sziszapangma.core.wer.word import Word
 
 
-def get_sample_data() -> Tuple[List[str], List[str]]:
+def string_list_to_words(strings: List[str]) -> List[Word]:
+    return list(map(Word.from_string, strings))  # applied in input order
+
+
+def get_sample_data() -> Tuple[List[Word], List[Word]]:
     reference = ['This', 'great', 'machine', 'can', 'recognize', 'speech']
     hypothesis = ['This', 'machine', 'can', 'wreck', 'a', 'nice', 'beach']
-    return reference, hypothesis
+    return string_list_to_words(reference), string_list_to_words(hypothesis)
 
 
 def test_classic_calculate_wer_value():
@@ -26,11 +31,14 @@ def test_classic_calculate_wer_steps():
     wer_result = ClassicWerCalculator().calculate_wer(reference, hypothesis)
 
     reference_words = [
-        StepWords('This', 'This'), StepWords('great', None),
-        StepWords('machine', 'machine'), StepWords('can', 'can'),
-        StepWords(None, 'wreck'), StepWords(None, 'a'),
-        StepWords('recognize', 'nice'),
-        StepWords('speech', 'beach')]
+        StepWords(reference[0], hypothesis[0]),
+        StepWords(reference[1], None),
+        StepWords(reference[2], hypothesis[1]),
+        StepWords(reference[3], hypothesis[2]),
+        StepWords(None, hypothesis[3]),
+        StepWords(None, hypothesis[4]),
+        StepWords(reference[4], hypothesis[5]),
+        StepWords(reference[5], hypothesis[6])]
     step_types = [
         StepType.CORRECT, StepType.DELETION, StepType.CORRECT,
         StepType.CORRECT, StepType.INSERTION, StepType.INSERTION,
diff --git a/tests/test_embedding_wer.py b/tests/test_embedding_wer.py
index 876af94..a1a9dab 100644
--- a/tests/test_embedding_wer.py
+++ b/tests/test_embedding_wer.py
@@ -4,14 +4,18 @@ import pytest
 
 from sziszapangma.core.wer.wer_embedding_calculator import \
     WerEmbeddingCalculator
+from sziszapangma.core.wer.word import Word
 from tests.file_stored_embedding_transformer import \
     FileStoredEmbeddingTransformer
 
+def string_list_to_words(strings: List[str]) -> List[Word]:
+    return list(map(Word.from_string, strings))  # applied in input order
 
-def get_sample_data() -> Tuple[List[str], List[str]]:
+
+def get_sample_data() -> Tuple[List[Word], List[Word]]:
     reference = ['ala', 'ma', 'dobrego', 'wielkiego', 'psa', 'rasowego']
     hypothesis = ['alana', 'rego', 'kruchego', 'psa', 'rasowego']
-    return reference, hypothesis
+    return string_list_to_words(reference), string_list_to_words(hypothesis)
 
 
 def get_calculator() -> WerEmbeddingCalculator:
diff --git a/tests/test_soft_wer.py b/tests/test_soft_wer.py
index c72b97f..e47240b 100644
--- a/tests/test_soft_wer.py
+++ b/tests/test_soft_wer.py
@@ -3,14 +3,19 @@ from typing import List, Tuple
 import pytest
 
 from sziszapangma.core.wer.wer_soft_calculator import WerSoftCalculator
+from sziszapangma.core.wer.word import Word
 from tests.file_stored_embedding_transformer import \
     FileStoredEmbeddingTransformer
 
 
-def get_sample_data() -> Tuple[List[str], List[str]]:
+def string_list_to_words(strings: List[str]) -> List[Word]:
+    return list(map(Word.from_string, strings))  # applied in input order
+
+
+def get_sample_data() -> Tuple[List[Word], List[Word]]:
     reference = ['ala', 'ma', 'dobrego', 'wielkiego', 'psa', 'rasowego']
     hypothesis = ['alana', 'rego', 'kruchego', 'psa', 'rasowego']
-    return reference, hypothesis
+    return string_list_to_words(reference), string_list_to_words(hypothesis)
 
 
 def get_calculator() -> WerSoftCalculator:
-- 
GitLab