Skip to content
Snippets Groups Projects
Commit 428b93f6 authored by Marcin Wątroba's avatar Marcin Wątroba
Browse files

Add voicelab pipeline stages

parent 3be71a1b
Branches
1 merge request: !13 Change data model
from dataclasses import dataclass
from typing import Any
from experiment.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask
@dataclass()
class WordSpan:
    """A piece of text together with its start and end index.

    NOTE(review): not referenced by the code visible in this module; the
    indices are presumably offsets into a transcript - confirm with callers.
    """

    text: str
    index_start: int
    index_end: int
class AsrSpacyTokenPosProcessingTask(SpacyTokenPosProcessingTask):
    """POS-tagging task whose input property is an ASR result.

    The transcript handed to the base task is the ``'full_text'`` entry of
    the input property value.
    """

    _spacy_property_name: str
    _nlp: Any
    _input_property_name: str

    def __init__(
        self,
        task_name: str,
        spacy_property_name: str,
        require_update: bool,
        input_property_name: str,
    ):
        """Forward all settings unchanged to the base task."""
        super().__init__(
            task_name=task_name,
            spacy_property_name=spacy_property_name,
            require_update=require_update,
            input_property_name=input_property_name,
        )

    def get_transcript_to_process(self, property_value: Any) -> str:
        """Return the ``'full_text'`` entry of *property_value*."""
        full_text = property_value['full_text']
        return full_text
from dataclasses import dataclass
from typing import Any
from experiment.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask
@dataclass()
class WordSpan:
    """A piece of text together with its start and end index.

    NOTE(review): not referenced by the code visible in this module; the
    indices are presumably offsets into a transcript - confirm with callers.
    """

    text: str
    index_start: int
    index_end: int
class GoldTranscriptSpacyTokenPosProcessingTask(SpacyTokenPosProcessingTask):
    """POS-tagging task whose input property is a gold transcript.

    The input property value is iterated and the ``'word'`` entry of every
    item is joined with single spaces to form the text passed to the base
    task.
    """

    _spacy_property_name: str
    _nlp: Any
    _input_property_name: str

    def __init__(
        self,
        task_name: str,
        spacy_property_name: str,
        require_update: bool,
        input_property_name: str,
    ):
        """Forward all settings unchanged to the base task."""
        super().__init__(
            task_name=task_name,
            spacy_property_name=spacy_property_name,
            require_update=require_update,
            input_property_name=input_property_name,
        )

    def get_transcript_to_process(self, property_value: Any) -> str:
        """Return the ``'word'`` entries of *property_value* joined by spaces."""
        words = (entry['word'] for entry in property_value)
        return ' '.join(words)
from typing import Any, List, Dict
from sziszapangma.core.alignment.alignment_classic_calculator import AlignmentClassicCalculator
from sziszapangma.core.alignment.alignment_step import AlignmentStep
from sziszapangma.core.wer.wer_calculator import WerCalculator
from sziszapangma.integration.mapper.alignment_step_mapper import AlignmentStepMapper
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
from sziszapangma.integration.task.processing_task import ProcessingTask
from sziszapangma.model.model import Word
from sziszapangma.model.relation_manager import RelationManager
# Key under which the classic WER value is stored in the metrics dict
# built by SpacyPosWerProcessingTask.calculate_metrics.
_CLASSIC_WER = "classic_wer"
class SpacyPosWerProcessingTask(ProcessingTask):
    """Compute alignment and WER between gold and ASR part-of-speech tags.

    For each record the task reads two properties holding lists of dicts
    with ``'id'`` and ``'pos'`` keys, wraps every entry in a ``Word`` whose
    text is the POS tag, aligns the two sequences and stores both the
    alignment steps and the resulting metrics back in the repository.
    """

    _gold_transcript_pos_property_name: str
    _asr_pos_property_name: str
    _pos_alignment_wer: str
    _pos_metrics_wer: str
    _alignment_classic_calculator: AlignmentClassicCalculator
    _wer_calculator: WerCalculator

    def __init__(
        self,
        task_name: str,
        gold_transcript_pos_property_name: str,
        require_update: bool,
        asr_pos_property_name: str,
        pos_alignment_wer: str,
        pos_metrics_wer: str
    ):
        """Store property names and create the alignment/WER calculators.

        Args:
            task_name: Name forwarded to ``ProcessingTask``.
            gold_transcript_pos_property_name: Property holding the gold
                transcript POS entries.
            require_update: Flag forwarded to ``ProcessingTask``.
            asr_pos_property_name: Property holding the ASR POS entries.
            pos_alignment_wer: Property the alignment steps are written to.
            pos_metrics_wer: Property the metrics dict is written to.
        """
        super().__init__(task_name, require_update)
        self._gold_transcript_pos_property_name = gold_transcript_pos_property_name
        self._asr_pos_property_name = asr_pos_property_name
        self._pos_alignment_wer = pos_alignment_wer
        self._pos_metrics_wer = pos_metrics_wer
        self._alignment_classic_calculator = AlignmentClassicCalculator()
        self._wer_calculator = WerCalculator()

    def run_single_process(
        self,
        record_id: str,
        experiment_repository: ExperimentRepository,
        relation_manager: RelationManager,
    ) -> None:
        """Align gold and ASR POS sequences for one record and store results.

        Nothing is written when either input property is ``None``.
        """
        gold_pos_entries = experiment_repository.get_property_for_key(
            record_id, self._gold_transcript_pos_property_name)
        asr_pos_entries = experiment_repository.get_property_for_key(
            record_id, self._asr_pos_property_name)
        # The None check must happen on the raw properties: once they are
        # wrapped in a list comprehension the result is never None and a
        # missing property would raise TypeError during iteration instead.
        if gold_pos_entries is None or asr_pos_entries is None:
            return

        gold_transcript_pos = self._to_pos_words(gold_pos_entries)
        asr_transcript_pos = self._to_pos_words(asr_pos_entries)
        alignment_steps = self._get_alignment(gold_transcript_pos, asr_transcript_pos)
        experiment_repository.update_property_for_key(
            record_id,
            self._pos_alignment_wer,
            [AlignmentStepMapper.to_json_dict(it) for it in alignment_steps],
        )
        experiment_repository.update_property_for_key(
            record_id, self._pos_metrics_wer, self.calculate_metrics(alignment_steps)
        )

    @staticmethod
    def _to_pos_words(pos_entries: Any) -> List[Word]:
        """Wrap each entry as a ``Word`` whose text is the entry's POS tag."""
        return [
            Word(id=it['id'], type='Word', text=it['pos'])
            for it in pos_entries
        ]

    def _get_alignment(
        self, gold_transcript: List[Word], asr_transcript: List[Word]
    ) -> List[AlignmentStep]:
        """Align the ASR sequence (hypothesis) against the gold one (reference)."""
        return self._alignment_classic_calculator.calculate_alignment(
            reference=gold_transcript, hypothesis=asr_transcript
        )

    def calculate_metrics(self, alignment_steps: List[AlignmentStep]) -> Dict[str, Any]:
        """Calculate all metrics for data sample."""
        return {_CLASSIC_WER: self._wer_calculator.calculate_wer(alignment_steps)}

    def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool:
        """Return True when the metrics property already holds a value."""
        return (
            experiment_repository.get_property_for_key(record_id, self._pos_metrics_wer)
            is not None
        )
import uuid
from abc import abstractmethod, ABC
from dataclasses import dataclass
from typing import Any, Dict
import spacy
from spacy.tokens import Token
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
from sziszapangma.integration.task.processing_task import ProcessingTask
from sziszapangma.model.relation_manager import RelationManager
@dataclass()
class WordSpan:
    """A piece of text together with its start and end index.

    NOTE(review): not referenced by the code visible in this module; the
    indices are presumably offsets into a transcript - confirm with callers.
    """

    text: str
    index_start: int
    index_end: int
class SpacyTokenPosProcessingTask(ProcessingTask, ABC):
    """Base task that POS-tags a transcript with spaCy and stores the tokens.

    Subclasses implement ``get_transcript_to_process`` to turn the raw input
    property value into the plain text that is passed to the spaCy pipeline.
    The result written to the repository is a list of dicts with the keys
    ``'id'``, ``'word'`` and ``'pos'``.
    """

    _spacy_model_name: str
    _nlp: Any
    _input_property_name: str
    _spacy_property_name: str

    def __init__(
        self,
        task_name: str,
        spacy_property_name: str,
        require_update: bool,
        input_property_name: str,
        spacy_model_name: str = "pl_core_news_lg",
    ):
        """Load the spaCy pipeline and remember the property names.

        Args:
            task_name: Name forwarded to ``ProcessingTask``.
            spacy_property_name: Property the token list is written to.
            require_update: Flag forwarded to ``ProcessingTask``.
            input_property_name: Property the input value is read from.
            spacy_model_name: Name of the spaCy pipeline passed to
                ``spacy.load``; defaults to the previously hard-coded model.
        """
        super().__init__(task_name, require_update)
        self._spacy_property_name = spacy_property_name
        self._spacy_model_name = spacy_model_name
        self._nlp = spacy.load(spacy_model_name)
        self._input_property_name = input_property_name

    def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository,
                           relation_manager: RelationManager) -> None:
        """Tag the record's transcript and store one result dict per token.

        Nothing is written when the input property is ``None``.
        """
        input_value = experiment_repository.get_property_for_key(
            record_id, self._input_property_name)
        # NOTE(review): assumes get_property_for_key returns None for a missing
        # property - confirm. Skipping here avoids a TypeError in the subclass
        # hook; the record stays unprocessed and can be retried later.
        if input_value is None:
            return
        document = self._nlp(self.get_transcript_to_process(input_value))
        spacy_result = [self.token_to_result_dict(token) for token in document]
        experiment_repository.update_property_for_key(record_id, self._spacy_property_name,
                                                      spacy_result)

    @staticmethod
    def token_to_result_dict(token: Token) -> Dict[str, str]:
        """Return a dict with a fresh UUID, the token text and its POS tag."""
        return {'id': str(uuid.uuid4()), 'word': token.text, 'pos': token.pos_}

    @abstractmethod
    def get_transcript_to_process(self, property_value: Any) -> str:
        """Return the text to tag, extracted from the input property value."""
        pass

    def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool:
        """Return True when the output property already exists for the record."""
        return experiment_repository.property_exists(record_id, self._spacy_property_name)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment