Skip to content
Snippets Groups Projects
Unverified Commit 02e76f7f authored by Marcin Wątroba's avatar Marcin Wątroba
Browse files

Divide WerCalculator for Alignment and Wer calculator

parent 2e9b00d6
Branches
2 merge requests!4Feature/add poetry,!3Add ids to words
Showing
with 347 additions and 18 deletions
......@@ -21,6 +21,6 @@ exclude = docs
# Define setup.py command aliases here
test = pytest
[tool:pytest]
collect_ignore = ['setup.py']
;[tool:pytest]
;collect_ignore = ['setup.py']
from abc import ABC
from typing import List, Tuple, Optional
import numpy as np
from sziszapangma.core.alignment.step_type import StepType
from sziszapangma.core.alignment.alignment_step import AlignmentStep
from sziszapangma.core.alignment.distance_matrix_calculator import \
DistanceCalculator
from sziszapangma.core.alignment.step_words import StepWords
from sziszapangma.core.alignment.alignment_processing_step import \
AlignmentProcessingStep
from sziszapangma.core.alignment.word import Word
class AlignmentCalculator(ABC):
    """Align a reference word sequence with a hypothesis word sequence.

    The alignment is a Levenshtein-style dynamic programme. The cost of
    matching two words is read from the matrix produced by the injected
    distance calculator; insertions and deletions use the default step cost
    of ``AlignmentProcessingStep``.
    """

    _distance_matrix_calculator: DistanceCalculator

    def __init__(self, distance_matrix_calculator: DistanceCalculator):
        """Store the calculator used to build the word distance matrix."""
        self._distance_matrix_calculator = distance_matrix_calculator

    def convert_processing_steps_to_result(
        self,
        processing_steps: List[AlignmentProcessingStep],
    ) -> List[AlignmentStep]:
        """Convert internal processing steps into result ``AlignmentStep``s.

        Only the step type, the step words and the step cost are carried
        over.
        """
        return [
            AlignmentStep(step.step_type, step.step_words, step.step_cost)
            for step in processing_steps
        ]

    def _get_reference_indexes_per_steps(
        self,
        steps: List[AlignmentProcessingStep]
    ) -> List[int]:
        """Return, for every step, how many reference words preceded it.

        For a step that contains a reference word this is that word's index.
        For a step without one it is the index of the next reference word,
        which equals the number of reference words when none follows.
        """
        counter = 0
        indexes = []
        for step in steps:
            indexes.append(counter)
            if step.step_type.contain_reference_word():
                counter += 1
        return indexes

    def get_distance_matrix_between_words(
        self,
        reference: List[Word],
        hypothesis: List[Word]
    ) -> np.ndarray:
        """Delegate to the injected calculator to get the distance matrix."""
        return self._distance_matrix_calculator.calculate_distance_matrix(
            reference, hypothesis)

    @staticmethod
    def _get_initialized_levenshtein_matrix(
        reference: List[Word],
        hypothesis: List[Word]
    ) -> Tuple[np.ndarray, List[List[Optional[AlignmentProcessingStep]]]]:
        """Build the distance and step matrices with their borders filled.

        Both matrices have ``len(reference) + 1`` rows and
        ``len(hypothesis) + 1`` columns. The first column holds deletion
        steps, the first row holds insertion steps; all inner step cells are
        ``None`` and all inner distances are zero.
        """
        # TODO: consider removing distance_arr in favour of steps_arr
        reference_len = len(reference)
        hypothesis_len = len(hypothesis)
        distance_arr = np.zeros((reference_len + 1, hypothesis_len + 1))
        steps_arr: List[List[Optional[AlignmentProcessingStep]]] = [
            [None] * (hypothesis_len + 1)
            for _ in range(reference_len + 1)
        ]

        # First column: reaching row i takes i deletions.
        for ref_index in range(reference_len + 1):
            distance_arr[ref_index][0] = ref_index
            step_words = StepWords(
                reference[ref_index - 1] if ref_index > 0 else None,
                None
            )
            # previous distance is ref_index - 1, so with the default step
            # cost of 1 the total matches distance_arr[ref_index][0]
            steps_arr[ref_index][0] = \
                AlignmentProcessingStep.levenshtein_deletion(
                    ref_index - 1, step_words)

        # First row: reaching column j takes j insertions. This loop runs
        # second, so cell [0][0] ends up holding an insertion step.
        for hyp_index in range(hypothesis_len + 1):
            distance_arr[0][hyp_index] = hyp_index
            step_words = StepWords(
                None,
                hypothesis[hyp_index - 1] if hyp_index > 0 else None
            )
            steps_arr[0][hyp_index] = \
                AlignmentProcessingStep.levenshtein_insertion(
                    hyp_index - 1, step_words)

        return distance_arr, steps_arr

    @staticmethod
    def _get_levenshtein_processing_step_cross(
        prev_cross_distance: float,
        step_words: StepWords,
        current_distance: float
    ) -> AlignmentProcessingStep:
        """Build the diagonal step for one cell.

        A word distance of exactly zero yields a "correct" step with cost 0;
        any other distance yields a substitution costing that distance.
        """
        if current_distance == 0:
            return AlignmentProcessingStep.levenshtein_correct(
                prev_cross_distance, step_words, 0)
        return AlignmentProcessingStep.levenshtein_substitution(
            prev_cross_distance, step_words, current_distance)

    def get_levenshtein_embedding_based(
        self,
        reference: List[Word],
        hypothesis: List[Word],
        distance_matrix: np.ndarray
    ) -> Tuple[np.ndarray, List[List[AlignmentProcessingStep]]]:
        """Fill the dynamic-programming matrices for the two sequences.

        ``distance_matrix[i][j]`` is read as the cost of matching
        ``reference[i]`` with ``hypothesis[j]``.

        Returns the matrix of accumulated distances together with the matrix
        of the best step chosen for every cell.
        """
        reference_len = len(reference)
        hypothesis_len = len(hypothesis)
        distance_arr, steps_arr = self._get_initialized_levenshtein_matrix(
            reference, hypothesis)

        for ref_index in range(reference_len):
            for hyp_index in range(hypothesis_len):
                step_words = StepWords(reference[ref_index],
                                       hypothesis[hyp_index])
                current_distance = distance_matrix[ref_index][hyp_index]
                prev_cross_distance = distance_arr[ref_index][hyp_index]

                cross_go_step = self._get_levenshtein_processing_step_cross(
                    prev_cross_distance, step_words, current_distance)
                insertion_step = AlignmentProcessingStep.levenshtein_insertion(
                    distance_arr[ref_index + 1][hyp_index], step_words)
                deletion_step = AlignmentProcessingStep.levenshtein_deletion(
                    distance_arr[ref_index][hyp_index + 1], step_words)

                # min() keeps the first minimum, so ties are resolved in the
                # order diagonal, insertion, deletion.
                best_step = min(
                    [cross_go_step, insertion_step, deletion_step],
                    key=lambda it: it.total_distance())

                distance_arr[ref_index + 1][hyp_index + 1] = \
                    best_step.total_distance()
                steps_arr[ref_index + 1][hyp_index + 1] = best_step

        return distance_arr, steps_arr

    def extract_steps_path(
        self,
        steps_matrix: List[List[AlignmentProcessingStep]]
    ) -> List[AlignmentProcessingStep]:
        """Walk back from the last cell to the origin and return the path.

        The steps are collected from the bottom-right cell towards cell
        ``[0][0]`` and returned in forward order.
        """
        x = len(steps_matrix) - 1
        y = len(steps_matrix[0]) - 1
        to_return = []
        while not (x == 0 and y == 0):
            current_step = steps_matrix[x][y]
            to_return.append(current_step)
            if current_step.step_type == StepType.DELETION:
                x -= 1
            elif current_step.step_type == StepType.INSERTION:
                y -= 1
            else:  # correct or substitution: move diagonally
                y -= 1
                x -= 1
        return to_return[::-1]

    def _calculate_steps_path(
        self,
        reference: List[Word],
        hypothesis: List[Word]
    ) -> List[AlignmentProcessingStep]:
        """Compute the word distances, run the DP and extract the path."""
        distance_between_words = self.get_distance_matrix_between_words(
            reference, hypothesis)
        _, steps_matrix = self.get_levenshtein_embedding_based(
            reference, hypothesis, distance_between_words)
        return self.extract_steps_path(steps_matrix)

    def calculate_alignment(
        self,
        reference: List[Word],
        hypothesis: List[Word]
    ) -> List[AlignmentStep]:
        """Return the alignment steps between reference and hypothesis."""
        steps_path = self._calculate_steps_path(reference, hypothesis)
        return self.convert_processing_steps_to_result(steps_path)

    def calculate_alignment_weighted(
        self,
        reference: List[Word],
        hypothesis: List[Word],
        weights: List[float]
    ) -> List[AlignmentStep]:
        """Return the alignment with every step cost scaled by a weight.

        ``weights`` holds one weight per reference word. A step that contains
        a reference word is scaled by that word's weight. A step without a
        reference word is scaled by the weight of the next reference word,
        or of the last one when no reference word follows. With an empty
        reference there is no weight to apply and the costs are unchanged.

        Raises:
            ValueError: if ``len(weights)`` differs from ``len(reference)``.
        """
        if len(weights) != len(reference):
            raise ValueError(
                f"Incorrect length of weights, current={len(weights)}, "
                f"required={len(reference)}"
            )
        alignment = self.calculate_alignment(reference, hypothesis)
        if not weights:
            return alignment

        # Trailing insertions get an index equal to len(weights); clamp it
        # so that they reuse the last weight instead of raising IndexError.
        last_weight_index = len(weights) - 1
        reference_indexes = self._get_reference_indexes_per_steps(alignment)
        return [
            step.with_weight_multiplication(
                weights[min(reference_index, last_weight_index)])
            for step, reference_index in zip(alignment, reference_indexes)
        ]
from sziszapangma.core.wer.distance_matrix_calculator import \
from sziszapangma.core.alignment.alignment_calculator import \
AlignmentCalculator
from sziszapangma.core.alignment.distance_matrix_calculator import \
BinaryDistanceCalculator
from sziszapangma.core.wer.wer_calculator import WerCalculator
class ClassicWerCalculator(WerCalculator):
class AlignmentClassicCalculator(AlignmentCalculator):
    """Alignment calculator backed by a ``BinaryDistanceCalculator``."""

    def __init__(self) -> None:
        """Hand a fresh ``BinaryDistanceCalculator`` to the base class."""
        binary_calculator = BinaryDistanceCalculator()
        super().__init__(binary_calculator)
from typing import List, Optional
from typing import List
from sziszapangma.core.alignment.alignment_calculator import \
AlignmentCalculator
from sziszapangma.core.alignment.alignment_processing_step import \
AlignmentProcessingStep
from sziszapangma.core.alignment.alignment_step import AlignmentStep
from sziszapangma.core.alignment.distance_matrix_calculator import \
BinaryDistanceCalculator, DistanceCalculator, CosineDistanceCalculator
from sziszapangma.core.alignment.step_words import StepWords
from sziszapangma.core.transformer.embedding_transformer import \
EmbeddingTransformer
from sziszapangma.core.wer.distance_matrix_calculator import \
BinaryDistanceCalculator, DistanceCalculator, CosineDistanceCalculator
from sziszapangma.core.wer.step_words import StepWords
from sziszapangma.core.wer.wer_calculator import WerCalculator
from sziszapangma.core.wer.wer_processing_step import WerProcessingStep
from sziszapangma.core.wer.wer_step import WerStep
class WerEmbeddingCalculator(WerCalculator):
class AlignmentEmbeddingCalculator(AlignmentCalculator):
_distance_calculator: DistanceCalculator
def __init__(self, embedding_transformer: EmbeddingTransformer):
......@@ -30,7 +32,7 @@ class WerEmbeddingCalculator(WerCalculator):
def _calculate_result_cost_for_step(
self,
processing_step: WerProcessingStep
processing_step: AlignmentProcessingStep
) -> float:
step_words = processing_step.step_words
return self._calculate_distance_for_word_step(step_words) \
......@@ -39,25 +41,10 @@ class WerEmbeddingCalculator(WerCalculator):
def convert_processing_steps_to_result(
self,
processing_steps: List[WerProcessingStep],
reference_weights: Optional[List[float]] = None
) -> List[WerStep]:
if reference_weights is None:
return [
WerStep(step.step_type, step.step_words,
self._calculate_result_cost_for_step(step))
for step in processing_steps
]
else:
indexes_per_steps = self._get_reference_indexes_per_steps(
processing_steps)
return [
WerStep(
processing_steps[step_index].step_type,
processing_steps[step_index].step_words,
reference_weights[indexes_per_steps[step_index]] *
self._calculate_result_cost_for_step(
processing_steps[step_index])
)
for step_index in range(len(processing_steps))
]
processing_steps: List[AlignmentProcessingStep]
) -> List[AlignmentStep]:
return [
AlignmentStep(step.step_type, step.step_words,
self._calculate_result_cost_for_step(step))
for step in processing_steps
]
from dataclasses import dataclass
from sziszapangma.core.wer.step_type import StepType
from sziszapangma.core.wer.step_words import StepWords
from sziszapangma.core.alignment.step_type import StepType
from sziszapangma.core.alignment.step_words import StepWords
@dataclass(frozen=True)
class WerProcessingStep:
class AlignmentProcessingStep:
step_type: StepType
step_words: StepWords
previous_distance: float
......@@ -15,27 +15,27 @@ class WerProcessingStep:
def levenshtein_insertion(cls, previous_distance: float,
step_words: StepWords, step_cost: float = 1):
words = StepWords(None, step_words.hypothesis_word)
return WerProcessingStep(StepType.INSERTION, words,
previous_distance, step_cost)
return AlignmentProcessingStep(StepType.INSERTION, words,
previous_distance, step_cost)
@classmethod
def levenshtein_deletion(cls, previous_distance: float,
step_words: StepWords, step_cost: float = 1):
words = StepWords(step_words.reference_word, None)
return WerProcessingStep(StepType.DELETION, words,
previous_distance, step_cost)
return AlignmentProcessingStep(StepType.DELETION, words,
previous_distance, step_cost)
@classmethod
def levenshtein_substitution(cls, previous_distance: float,
step_words: StepWords, step_cost: float):
return WerProcessingStep(StepType.SUBSTITUTION, step_words,
previous_distance, step_cost)
return AlignmentProcessingStep(StepType.SUBSTITUTION, step_words,
previous_distance, step_cost)
@classmethod
def levenshtein_correct(cls, previous_distance: float,
step_words: StepWords, step_cost: float):
return WerProcessingStep(StepType.CORRECT, step_words,
previous_distance, step_cost)
return AlignmentProcessingStep(StepType.CORRECT, step_words,
previous_distance, step_cost)
def total_distance(self) -> float:
return self.step_cost + self.previous_distance
from sziszapangma.core.wer.distance_matrix_calculator import \
from sziszapangma.core.alignment.alignment_calculator import \
AlignmentCalculator
from sziszapangma.core.alignment.distance_matrix_calculator import \
CosineDistanceCalculator
from sziszapangma.core.wer.wer_calculator import WerCalculator
from sziszapangma.core.transformer.embedding_transformer import \
EmbeddingTransformer
class WerSoftCalculator(WerCalculator):
class AlignmentSoftCalculator(AlignmentCalculator):
    """Alignment calculator backed by a ``CosineDistanceCalculator``."""

    def __init__(self, embedding_transformer: EmbeddingTransformer) -> None:
        """Wrap ``embedding_transformer`` in a ``CosineDistanceCalculator``
        and hand it to the base class.
        """
        cosine_calculator = CosineDistanceCalculator(embedding_transformer)
        super().__init__(cosine_calculator)
from dataclasses import dataclass
from sziszapangma.core.alignment.step_type import StepType
from sziszapangma.core.alignment.step_words import StepWords
@dataclass(frozen=True)
class AlignmentStep:
    """Immutable record of one alignment step: its type, words and cost."""

    step_type: StepType
    step_words: StepWords
    step_cost: float

    def with_weight_multiplication(self, weight: float) -> 'AlignmentStep':
        """Return a new step whose cost is this step's cost times ``weight``.

        The step type and the step words are carried over unchanged; the
        current instance is left untouched.
        """
        scaled_cost = self.step_cost * weight
        return AlignmentStep(self.step_type, self.step_words, scaled_cost)
from typing import List, Optional
import numpy as np
import pandas as pd
from sziszapangma.core.alignment.alignment_step import AlignmentStep
class AlignmentUtil:
    """Static helpers for tabulating, measuring and weighting alignments."""

    @staticmethod
    def _optional_str_to_str(value: Optional[str]) -> str:
        """Return ``value`` unchanged, or an empty string when it is None."""
        return value if value is not None else ''

    @staticmethod
    def _wer_step_to_pandas_row_lit(step: AlignmentStep) -> List[object]:
        """Build one table row for ``step``.

        The row holds the short name of the step type, the reference word,
        the hypothesis word (missing words become empty strings) and the
        step cost rounded to three decimal places.
        """
        return [
            step.step_type.get_short_name(),
            AlignmentUtil._optional_str_to_str(step.step_words.reference_word),
            AlignmentUtil._optional_str_to_str(
                step.step_words.hypothesis_word),
            round(step.step_cost, 3)
        ]

    @staticmethod
    def steps_to_dataframe(steps: List[AlignmentStep]) -> pd.DataFrame:
        """Return a DataFrame with one row per alignment step.

        The columns are ``step_type``, ``reference``, ``hypothesis`` and
        ``cost``. An empty ``steps`` list gives an empty frame that still has
        these four columns.
        """
        # Rows are handed to pandas as plain lists: routing them through
        # np.array would coerce a row of strings plus a float to a single
        # string dtype, and an empty list would not match the four columns.
        rows = [
            AlignmentUtil._wer_step_to_pandas_row_lit(step)
            for step in steps
        ]
        return pd.DataFrame(
            rows,
            columns=['step_type', 'reference', 'hypothesis', 'cost']
        )

    @staticmethod
    def get_reference_indexes_per_steps(
        steps: List[AlignmentStep]
    ) -> List[int]:
        """Return, for every step, how many reference words preceded it.

        For a step that contains a reference word this is that word's index.
        For a step without one it is the index of the next reference word,
        which equals the reference length when none follows.
        """
        counter = 0
        indexes = []
        for step in steps:
            indexes.append(counter)
            if step.step_type.contain_reference_word():
                counter += 1
        return indexes

    @staticmethod
    def get_reference_length(steps: List[AlignmentStep]) -> int:
        """Count the steps whose type contains a reference word."""
        return sum(
            1 for step in steps
            if step.step_type.contain_reference_word()
        )

    @staticmethod
    def apply_weights_to_alignment(
        steps: List[AlignmentStep],
        weights: List[float]
    ) -> List[AlignmentStep]:
        """Return the steps with each cost multiplied by a reference weight.

        ``weights`` holds one weight per reference word. A step that contains
        a reference word is scaled by that word's weight. A step without a
        reference word is scaled by the weight of the next reference word,
        or of the last one when no reference word follows. When the
        alignment contains no reference word at all there is no weight to
        apply and the steps are returned with their costs unchanged.

        Raises:
            ValueError: if ``len(weights)`` differs from the number of
                reference words in ``steps``.
        """
        reference_length = AlignmentUtil.get_reference_length(steps)
        if reference_length != len(weights):
            raise ValueError(
                f"Incorrect length of weights, current={len(weights)}, "
                f"required={reference_length}"
            )
        if not weights:
            return list(steps)

        # Trailing insertions get an index equal to len(weights); clamp it
        # so that they reuse the last weight instead of raising IndexError.
        last_weight_index = len(weights) - 1
        reference_indexes_per_steps = \
            AlignmentUtil.get_reference_indexes_per_steps(steps)
        return [
            step.with_weight_multiplication(
                weights[min(reference_index, last_weight_index)])
            for step, reference_index
            in zip(steps, reference_indexes_per_steps)
        ]
......@@ -5,7 +5,7 @@ import numpy as np
from sziszapangma.core.transformer.embedding_transformer import \
EmbeddingTransformer
from sziszapangma.core.wer.word import Word
from sziszapangma.core.alignment.word import Word
class DistanceCalculator(ABC):
......
from dataclasses import dataclass
from typing import Optional
from sziszapangma.core.wer.word import Word
from sziszapangma.core.alignment.word import Word
@dataclass(frozen=True)
......
File moved
File deleted
File deleted
File deleted
File deleted
File deleted
File deleted
File deleted
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment