Skip to content
Snippets Groups Projects
Commit 13f918e6 authored by Marcin Wątroba
Browse files

Fix dep-tags

parent f4762cc2
Branches change_data_model
1 merge request: !13 "Change data model"
Showing
with 312 additions and 279 deletions
This diff is collapsed.
......@@ -379,12 +379,12 @@ stages:
- dataset: pl_common_voice
asr: ajn
- dataset: pl_voicelab_cbiz
asr: google
- dataset: pl_voicelab_cbiz
asr: ajn
- dataset: pl_voicelab_cbiz
asr: techmo
# - dataset: pl_voicelab_cbiz
# asr: google
# - dataset: pl_voicelab_cbiz
# asr: ajn
# - dataset: pl_voicelab_cbiz
# asr: techmo
- dataset: pl_google_fleurs
asr: google
......
......@@ -25,4 +25,4 @@ class SpacyDepTagSentenceWerProcessor(SentenceWerProcessor):
def process_text(self, text: str) -> List[str]:
    """Return the dependency label of every token in *text*.

    The text is passed through the pipeline stored in ``self._nlp`` and the
    ``dep_`` attribute of each resulting token is collected in document
    order.

    Args:
        text: Raw sentence to analyse.

    Returns:
        One dependency label per token produced by ``self._nlp``.
    """
    document = self._nlp(text)
    # This is the dep-tag processor, so it must expose ``dep_`` (dependency
    # label) rather than ``pos_`` (part-of-speech tag).
    return [token.dep_ for token in document]
......@@ -40,3 +40,7 @@
/wav2vec2__spacy_dep_tag_metrics
/ajn__spacy_pos_alignment
/ajn__spacy_pos_metrics
/google__spacy_dep_tag_alignment
/google__spacy_dep_tag_metrics
/techmo__spacy_dep_tag_alignment
/techmo__spacy_dep_tag_metrics
......@@ -57,3 +57,5 @@
/google__word_wer_embeddings_alignment
/google__spacy_dep_tag_alignment
/google__spacy_dep_tag_metrics
/wav2vec2__spacy_dep_tag_alignment
/wav2vec2__spacy_dep_tag_metrics
......@@ -57,3 +57,5 @@
/google__spacy_dep_tag_metrics
/techmo__spacy_dep_tag_alignment
/techmo__spacy_dep_tag_metrics
/wav2vec2__spacy_dep_tag_alignment
/wav2vec2__spacy_dep_tag_metrics
......@@ -49,3 +49,11 @@
/ajn__word_wer_classic_alignment
/ajn__word_wer_embeddings_metrics
/ajn__word_wer_embeddings_alignment
/wav2vec2__spacy_dep_tag_alignment
/wav2vec2__spacy_dep_tag_metrics
/google__spacy_dep_tag_alignment
/google__spacy_dep_tag_metrics
/techmo__spacy_dep_tag_alignment
/techmo__spacy_dep_tag_metrics
/ajn__spacy_dep_tag_alignment
/ajn__spacy_dep_tag_metrics
......@@ -68,7 +68,12 @@ class CosineDistanceCalculator(DistanceCalculator):
raise RuntimeError("array dimensions {} not right".format(a.ndim))
similarity = np.dot(a, b.T) / (a_norm * b_norm)
dist = 1.0 - similarity
return float(dist)
# return float(dist)
float_dist = float(dist)
if abs(float_dist) < 0.000001:
return 0.0
else:
return float_dist
def calculate_distance_matrix(
self, reference: List[Word], hypothesis: List[Word]
......
No preview for this file type
from abc import ABC
from typing import List
import numpy as np
from sziszapangma.core.alignment.alignment_step import AlignmentStep
from sziszapangma.core.alignment.alignment_util import AlignmentUtil
from sziszapangma.core.alignment.step_type import StepType
from sziszapangma.core.wer.span import Span
......@@ -27,7 +30,8 @@ class WerCalculator(ABC):
steps: List[AlignmentStep],
) -> float:
reference_len = AlignmentUtil.get_reference_length(steps)
return sum([step.step_cost for step in steps]) / reference_len
fixed_step_costs = [step.step_cost for step in steps]
return sum(fixed_step_costs) / reference_len
def calculate_wer(self, steps: List[AlignmentStep]) -> float:
return self._calculate_wer(steps)
......
No preview for this file type
from typing import List
from sziszapangma.core.alignment.alignment_embedding_calculator import AlignmentEmbeddingCalculator
from sziszapangma.core.alignment.alignment_soft_calculator import AlignmentSoftCalculator
from sziszapangma.core.transformer.cached_embedding_transformer import CachedEmbeddingTransformer
......@@ -7,6 +9,7 @@ from sziszapangma.integration.mapper.alignment_step_mapper import AlignmentStepM
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
from sziszapangma.integration.task.processing_task import ProcessingTask
from sziszapangma.integration.task.task_util import TaskUtil
from sziszapangma.model.model import Word
from sziszapangma.model.relation_manager import RelationManager
_SOFT_WER = "soft_wer"
......@@ -51,6 +54,10 @@ class EmbeddingWerMetricsTask(ProcessingTask):
is not None
)
@staticmethod
def filter_empty_words(words: List[Word]) -> List[Word]:
    """Return *words* without the entries whose ``"text"`` value is empty.

    Args:
        words: Word records; each is indexed with the ``"text"`` key.

    Returns:
        A new list holding the words with non-empty text, in their
        original order. The input list is not modified.
    """
    # An empty string is falsy, so truthiness replaces the explicit
    # ``len(...) > 0`` comparison.
    return [word for word in words if word["text"]]
def run_single_process(
self,
record_id: str,
......@@ -60,8 +67,8 @@ class EmbeddingWerMetricsTask(ProcessingTask):
gold_transcript = TaskUtil.get_words_from_record(relation_manager)
asr_result = experiment_repository.get_property_for_key(record_id, self._asr_property_name)
if gold_transcript is not None and asr_result is not None and "transcription" in asr_result:
gold_transcript_lower = TaskUtil.words_to_lower(gold_transcript)
asr_transcript_lower = TaskUtil.words_to_lower(asr_result["transcription"])
gold_transcript_lower = self.filter_empty_words(TaskUtil.words_to_lower(gold_transcript))
asr_transcript_lower = self.filter_empty_words(TaskUtil.words_to_lower(asr_result["transcription"]))
soft_alignment = self._alignment_soft_calculator.calculate_alignment(
gold_transcript_lower, asr_transcript_lower
......@@ -80,6 +87,7 @@ class EmbeddingWerMetricsTask(ProcessingTask):
],
}
wer_results = {"soft_wer": soft_wer, "embedding_wer": embedding_wer}
print(wer_results)
experiment_repository.update_property_for_key(
record_id, self._alignment_property_name, alignment_results
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment