Skip to content
Snippets Groups Projects
Commit 3d76363f authored by Marcin Wątroba's avatar Marcin Wątroba
Browse files

Add tag experiment

parent 7ca1b6d4
Branches
1 merge request: !13 "Change data model"
......@@ -332,3 +332,81 @@ stages:
md5: 1fc2985ad4c3cb00d05b1865ad5b22d4.dir
size: 56182
nfiles: 800
voicelab_tag_spacy_processing:
cmd: "PYTHONPATH=. python -u experiment/voicelab/spacy_tag_processing.py\n"
deps:
- path: experiment/voicelab/spacy_tag_processing.py
md5: b5f996e5be56cdf93eec23b9c0c066b9
size: 2580
- path: experiment_data/dataset/voicelab_cbiz_testset_20220322
md5: 3c2b18e1f1f89e4c5ad7b254e472b25e.dir
size: 4803739404
nfiles: 1600
- path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/ajn_spacy
md5: e8a48a0a63c1569ec734e1c8bb03c7db.dir
size: 20536889
nfiles: 800
- path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript_spacy
md5: c166937f6e8ae9d28412ca1e3e43469e.dir
size: 26643278
nfiles: 800
- path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/techmo_spacy
md5: cd89a91a33629088ba6fc30ef8427dee.dir
size: 24482297
nfiles: 800
outs:
- path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/tag_spacy_ajn_alignment_wer_embeddings
md5: c7914fa4d415a4815e371017ef505358.dir
size: 80129550
nfiles: 800
- path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/tag_spacy_ajn_metrics_wer_embeddings
md5: ac726b3b371256176aee1364cb1fec88.dir
size: 27258
nfiles: 800
- path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/tag_spacy_techmo_alignment_wer_embeddings
md5: 47d1b5a820806dbad941b19547eb1273.dir
size: 83756819
nfiles: 800
- path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/tag_spacy_techmo_metrics_wer_embeddings
md5: ba59691f467f0486f832bf86326c5142.dir
size: 27781
nfiles: 800
luna_tag_spacy_processing:
cmd: "PYTHONPATH=. python -u experiment/luna/pipeline/spacy_tag_processing.py\n"
deps:
- path: experiment/luna/pipeline/spacy_tag_processing.py
md5: bdaf1cae6863815ce59e022a493379da
size: 2567
- path: experiment_data/dataset/LUNA.PL
md5: d342155b1871e881797cf7da09d5dc3c.dir
size: 1578358645
nfiles: 4500
- path: experiment_data/pipeline/asr_benchmark_luna/ajn_spacy
md5: f06d2f1369b18e5fa126af5a00a8f0b8.dir
size: 6590702
nfiles: 500
- path: experiment_data/pipeline/asr_benchmark_luna/gold_transcript_spacy
md5: 24a399475b752737db0f2a8671507014.dir
size: 6785648
nfiles: 500
- path: experiment_data/pipeline/asr_benchmark_luna/techmo_spacy
md5: 337b6bf947ee47cda30b3cc75f954e8e.dir
size: 6124559
nfiles: 500
outs:
- path: experiment_data/pipeline/asr_benchmark_luna/tag_spacy_ajn_alignment_wer_embeddings
md5: bd0439d750f23b978adbfd4ef1151a9c.dir
size: 22511019
nfiles: 500
- path: experiment_data/pipeline/asr_benchmark_luna/tag_spacy_ajn_metrics_wer_embeddings
md5: 85a4a0b2d6b91d9745f55944fb3886df.dir
size: 17014
nfiles: 500
- path: experiment_data/pipeline/asr_benchmark_luna/tag_spacy_techmo_alignment_wer_embeddings
md5: a6f1ef0da2dc2f45f522e3b2e7dec0f9.dir
size: 21411162
nfiles: 500
- path: experiment_data/pipeline/asr_benchmark_luna/tag_spacy_techmo_metrics_wer_embeddings
md5: abb1f6102a1de3d63d0e9e2cb797d2b9.dir
size: 17369
nfiles: 500
......@@ -57,6 +57,22 @@ stages:
- experiment_data/pipeline/asr_benchmark_luna/word_ajn_metrics_wer_embeddings
- experiment_data/pipeline/asr_benchmark_luna/word_ajn_alignment_wer_embeddings
# Stage: runs experiment/luna/pipeline/spacy_tag_processing.py on the LUNA.PL dataset.
# Inputs are the script itself, the dataset, and the ajn / gold_transcript / techmo spaCy
# directories; outputs are the four tag_spacy_* alignment and metrics directories.
luna_tag_spacy_processing:
cmd: |
PYTHONPATH=. python -u experiment/luna/pipeline/spacy_tag_processing.py
deps:
- experiment/luna/pipeline/spacy_tag_processing.py
- experiment_data/dataset/LUNA.PL
- experiment_data/pipeline/asr_benchmark_luna/ajn_spacy
- experiment_data/pipeline/asr_benchmark_luna/gold_transcript_spacy
- experiment_data/pipeline/asr_benchmark_luna/techmo_spacy
outs:
- experiment_data/pipeline/asr_benchmark_luna/tag_spacy_techmo_metrics_wer_embeddings
- experiment_data/pipeline/asr_benchmark_luna/tag_spacy_techmo_alignment_wer_embeddings
- experiment_data/pipeline/asr_benchmark_luna/tag_spacy_ajn_metrics_wer_embeddings
- experiment_data/pipeline/asr_benchmark_luna/tag_spacy_ajn_alignment_wer_embeddings
voicelab_import_to_common_format:
cmd: PYTHONPATH=. python -u experiment/voicelab/import_data.py
deps:
......@@ -114,4 +130,19 @@ stages:
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_metrics_wer_embeddings
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_alignment_wer_embeddings
# Stage: runs experiment/voicelab/spacy_tag_processing.py on the
# voicelab_cbiz_testset_20220322 dataset. Inputs are the script itself, the dataset, and the
# ajn / gold_transcript / techmo spaCy directories; outputs are the four tag_spacy_*
# alignment and metrics directories.
voicelab_tag_spacy_processing:
cmd: |
PYTHONPATH=. python -u experiment/voicelab/spacy_tag_processing.py
deps:
- experiment/voicelab/spacy_tag_processing.py
- experiment_data/dataset/voicelab_cbiz_testset_20220322
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/ajn_spacy
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript_spacy
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/techmo_spacy
outs:
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/tag_spacy_techmo_metrics_wer_embeddings
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/tag_spacy_techmo_alignment_wer_embeddings
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/tag_spacy_ajn_metrics_wer_embeddings
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/tag_spacy_ajn_alignment_wer_embeddings
# concurrent features, multiprocessing
......@@ -15,6 +15,8 @@ POS_TECHMO_ALIGNMENT_WER = 'pos_techmo_alignment_wer'
POS_TECHMO_METRICS_WER = 'pos_techmo_metrics_wer'
WORD_TECHMO_METRICS_WER_EMBEDDINGS = 'word_techmo_metrics_wer_embeddings'
WORD_TECHMO_ALIGNMENT_WER_EMBEDDINGS = 'word_techmo_alignment_wer_embeddings'
TAG_SPACY_TECHMO_METRICS_WER_EMBEDDINGS = 'tag_spacy_techmo_metrics_wer_embeddings'
TAG_SPACY_TECHMO_ALIGNMENT_WER_EMBEDDINGS = 'tag_spacy_techmo_alignment_wer_embeddings'
AJN_POLISH_ASR = 'ajn_polish_asr'
WORD_AJN_METRICS_WER = 'word_ajn_metrics_wer'
......@@ -24,6 +26,9 @@ POS_AJN_ALIGNMENT_WER = 'pos_ajn_alignment_wer'
POS_AJN_METRICS_WER = 'pos_ajn_metrics_wer'
WORD_AJN_METRICS_WER_EMBEDDINGS = 'word_ajn_metrics_wer_embeddings'
WORD_AJN_ALIGNMENT_WER_EMBEDDINGS = 'word_ajn_alignment_wer_embeddings'
TAG_SPACY_AJN_METRICS_WER_EMBEDDINGS = 'tag_spacy_ajn_metrics_wer_embeddings'
TAG_SPACY_AJN_ALIGNMENT_WER_EMBEDDINGS = 'tag_spacy_ajn_alignment_wer_embeddings'
def get_record_provider() -> LunaRecordProvider:
......
from experiment.luna.pipeline.dependency_provider import get_record_provider, GOLD_TRANSCRIPT, TECHMO_POLISH_ASR, \
get_multiple_files_repository, \
GOLD_TRANSCRIPT_SPACY, POS_TECHMO_ALIGNMENT_WER, POS_TECHMO_METRICS_WER, WORD_TECHMO_METRICS_WER, \
WORD_TECHMO_ALIGNMENT_WER, TECHMO_SPACY, WORD_TECHMO_METRICS_WER_EMBEDDINGS, WORD_TECHMO_ALIGNMENT_WER_EMBEDDINGS, \
AJN_SPACY, TAG_SPACY_TECHMO_ALIGNMENT_WER_EMBEDDINGS, TAG_SPACY_TECHMO_METRICS_WER_EMBEDDINGS, \
TAG_SPACY_AJN_METRICS_WER_EMBEDDINGS, TAG_SPACY_AJN_ALIGNMENT_WER_EMBEDDINGS
from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask
from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
from experiment.pos_processing.tag_spacy_wer_processing_task import TagSpacyWerProcessingTask
from sziszapangma.core.transformer.web_embedding_transformer import WebEmbeddingTransformer
from sziszapangma.integration.asr_processor import AsrPathCacheClient
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.task.asr_task import AsrTask
from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
from sziszapangma.integration.task.embedding_wer_metrics_task import EmbeddingWerMetricsTask
def run_luna_experiment():
    """Run the two tag-spaCy WER tasks (techmo and ajn) over the LUNA record provider.

    Each task compares the gold-transcript spaCy property with one ASR spaCy
    property and stores alignment and metrics under the matching
    ``TAG_SPACY_*`` property names.
    """
    provider = get_record_provider()

    # Task comparing the techmo ASR spaCy output against the gold transcript.
    techmo_task = TagSpacyWerProcessingTask(
        task_name='techmo_pos_wer_processing',
        require_update=False,
        gold_transcript_pos_property_name=GOLD_TRANSCRIPT_SPACY,
        asr_pos_property_name=TECHMO_SPACY,
        pos_alignment_wer=TAG_SPACY_TECHMO_ALIGNMENT_WER_EMBEDDINGS,
        pos_metrics_wer=TAG_SPACY_TECHMO_METRICS_WER_EMBEDDINGS,
    )

    # Task comparing the ajn ASR spaCy output against the gold transcript.
    ajn_task = TagSpacyWerProcessingTask(
        task_name='ajn_pos_wer_processing',
        require_update=False,
        gold_transcript_pos_property_name=GOLD_TRANSCRIPT_SPACY,
        asr_pos_property_name=AJN_SPACY,
        pos_alignment_wer=TAG_SPACY_AJN_ALIGNMENT_WER_EMBEDDINGS,
        pos_metrics_wer=TAG_SPACY_AJN_METRICS_WER_EMBEDDINGS,
    )

    # The record provider serves both as the id iterator and as the relation manager provider.
    manager = ExperimentManager(
        record_id_iterator=provider,
        processing_tasks=[techmo_task, ajn_task],
        experiment_repository=get_multiple_files_repository(),
        relation_manager_provider=provider,
    )
    manager.process()
if __name__ == '__main__':
run_luna_experiment()
from typing import Any, List, Dict
from sziszapangma.core.alignment.alignment_classic_calculator import AlignmentClassicCalculator
from sziszapangma.core.alignment.alignment_step import AlignmentStep
from sziszapangma.core.wer.wer_calculator import WerCalculator
from sziszapangma.integration.mapper.alignment_step_mapper import AlignmentStepMapper
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
from sziszapangma.integration.task.processing_task import ProcessingTask
from sziszapangma.model.model import Word
from sziszapangma.model.relation_manager import RelationManager
_CLASSIC_WER = "classic_wer"
class TagSpacyWerProcessingTask(ProcessingTask):
    """Align gold and ASR spaCy token sequences and store classic WER results.

    For each record the task reads two token lists from the experiment
    repository, aligns them with ``AlignmentClassicCalculator``, and writes
    back the serialized alignment steps and a metrics dict holding the
    classic WER.
    """

    # Repository property holding the gold-transcript spaCy tokens.
    _gold_transcript_pos_property_name: str
    # Repository property holding the ASR spaCy tokens.
    _asr_pos_property_name: str
    # Repository property the serialized alignment steps are written to.
    _pos_alignment_wer: str
    # Repository property the metrics dict is written to.
    _pos_metrics_wer: str
    _alignment_classic_calculator: AlignmentClassicCalculator
    _wer_calculator: WerCalculator

    def __init__(
        self,
        task_name: str,
        gold_transcript_pos_property_name: str,
        require_update: bool,
        asr_pos_property_name: str,
        pos_alignment_wer: str,
        pos_metrics_wer: str
    ):
        """Store the property names and create the alignment and WER calculators.

        Args:
            task_name: Name forwarded to ``ProcessingTask``.
            gold_transcript_pos_property_name: Property to read gold tokens from.
            require_update: Flag forwarded to ``ProcessingTask``.
            asr_pos_property_name: Property to read ASR tokens from.
            pos_alignment_wer: Property to write the alignment steps to.
            pos_metrics_wer: Property to write the metrics to.
        """
        super().__init__(task_name, require_update)
        self._gold_transcript_pos_property_name = gold_transcript_pos_property_name
        self._asr_pos_property_name = asr_pos_property_name
        self._pos_alignment_wer = pos_alignment_wer
        self._pos_metrics_wer = pos_metrics_wer
        self._alignment_classic_calculator = AlignmentClassicCalculator()
        self._wer_calculator = WerCalculator()

    @staticmethod
    def _to_words(tokens: List[Dict[str, Any]]) -> List[Word]:
        """Convert stored token dicts into ``Word`` objects for alignment."""
        # NOTE(review): the text comes from the 'word' key even though this task is
        # named "tag" -- confirm this is the intended field to compare.
        return [Word(id=it['id'], type='Word', text=it['word']) for it in tokens]

    def run_single_process(
        self,
        record_id: str,
        experiment_repository: ExperimentRepository,
        relation_manager: RelationManager,
    ):
        """Compute and store alignment and metrics for a single record.

        Does nothing when either input property is missing for the record.
        """
        gold_tokens = experiment_repository.get_property_for_key(
            record_id, self._gold_transcript_pos_property_name)
        asr_tokens = experiment_repository.get_property_for_key(
            record_id, self._asr_pos_property_name)

        # Check the raw properties before iterating: a missing property is None,
        # and building the word list from None would raise TypeError.
        if gold_tokens is None or asr_tokens is None:
            return

        alignment_steps = self._get_alignment(
            self._to_words(gold_tokens), self._to_words(asr_tokens)
        )
        experiment_repository.update_property_for_key(
            record_id,
            self._pos_alignment_wer,
            [AlignmentStepMapper.to_json_dict(it) for it in alignment_steps],
        )
        experiment_repository.update_property_for_key(
            record_id, self._pos_metrics_wer, self.calculate_metrics(alignment_steps)
        )

    def _get_alignment(
        self, gold_transcript: List[Word], asr_transcript: List[Word]
    ) -> List[AlignmentStep]:
        """Align the ASR hypothesis against the gold reference."""
        return self._alignment_classic_calculator.calculate_alignment(
            reference=gold_transcript, hypothesis=asr_transcript
        )

    def calculate_metrics(self, alignment_steps: List[AlignmentStep]) -> Dict[str, Any]:
        """Calculate all metrics for data sample."""
        return {_CLASSIC_WER: self._wer_calculator.calculate_wer(alignment_steps)}

    def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool:
        """Return True when metrics are already stored for the record."""
        return (
            experiment_repository.get_property_for_key(record_id, self._pos_metrics_wer)
            is not None
        )
from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask
from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
from experiment.pos_processing.tag_spacy_wer_processing_task import TagSpacyWerProcessingTask
from experiment.voicelab.voicelab_dependency import get_record_provider, get_repository, GOLD_TRANSCRIPT, \
GOLD_TRANSCRIPT_SPACY, TECHMO_POLISH_ASR, WORD_TECHMO_METRICS_WER, WORD_TECHMO_ALIGNMENT_WER, TECHMO_SPACY, \
POS_TECHMO_METRICS_WER, POS_TECHMO_ALIGNMENT_WER, WORD_TECHMO_METRICS_WER_EMBEDDINGS, \
WORD_TECHMO_ALIGNMENT_WER_EMBEDDINGS, TAG_SPACY_TECHMO_ALIGNMENT_WER_EMBEDDINGS, \
TAG_SPACY_TECHMO_METRICS_WER_EMBEDDINGS, AJN_SPACY, TAG_SPACY_AJN_ALIGNMENT_WER_EMBEDDINGS, \
TAG_SPACY_AJN_METRICS_WER_EMBEDDINGS
from sziszapangma.core.transformer.web_embedding_transformer import WebEmbeddingTransformer
from sziszapangma.integration.asr_processor import AsrPathCacheClient
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.task.asr_task import AsrTask
from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
from sziszapangma.integration.task.embedding_wer_metrics_task import EmbeddingWerMetricsTask
def run_voicelab_experiment():
    """Run the two tag-spaCy WER tasks (techmo and ajn) over the voicelab record provider.

    Each task compares the gold-transcript spaCy property with one ASR spaCy
    property and stores alignment and metrics under the matching
    ``TAG_SPACY_*`` property names.
    """
    provider = get_record_provider()

    # Task comparing the techmo ASR spaCy output against the gold transcript.
    techmo_task = TagSpacyWerProcessingTask(
        task_name='techmo_pos_wer_processing',
        require_update=False,
        gold_transcript_pos_property_name=GOLD_TRANSCRIPT_SPACY,
        asr_pos_property_name=TECHMO_SPACY,
        pos_alignment_wer=TAG_SPACY_TECHMO_ALIGNMENT_WER_EMBEDDINGS,
        pos_metrics_wer=TAG_SPACY_TECHMO_METRICS_WER_EMBEDDINGS,
    )

    # Task comparing the ajn ASR spaCy output against the gold transcript.
    ajn_task = TagSpacyWerProcessingTask(
        task_name='ajn_pos_wer_processing',
        require_update=False,
        gold_transcript_pos_property_name=GOLD_TRANSCRIPT_SPACY,
        asr_pos_property_name=AJN_SPACY,
        pos_alignment_wer=TAG_SPACY_AJN_ALIGNMENT_WER_EMBEDDINGS,
        pos_metrics_wer=TAG_SPACY_AJN_METRICS_WER_EMBEDDINGS,
    )

    # The record provider serves both as the id iterator and as the relation manager provider.
    manager = ExperimentManager(
        record_id_iterator=provider,
        processing_tasks=[techmo_task, ajn_task],
        experiment_repository=get_repository(),
        relation_manager_provider=provider,
    )
    manager.process()
def example_run():
    """Script entry point: delegate to ``run_voicelab_experiment``."""
    run_voicelab_experiment()
if __name__ == '__main__':
example_run()
......@@ -15,6 +15,8 @@ POS_TECHMO_ALIGNMENT_WER = 'pos_techmo_alignment_wer'
POS_TECHMO_METRICS_WER = 'pos_techmo_metrics_wer'
WORD_TECHMO_METRICS_WER_EMBEDDINGS = 'word_techmo_metrics_wer_embeddings'
WORD_TECHMO_ALIGNMENT_WER_EMBEDDINGS = 'word_techmo_alignment_wer_embeddings'
TAG_SPACY_TECHMO_METRICS_WER_EMBEDDINGS = 'tag_spacy_techmo_metrics_wer_embeddings'
TAG_SPACY_TECHMO_ALIGNMENT_WER_EMBEDDINGS = 'tag_spacy_techmo_alignment_wer_embeddings'
AJN_POLISH_ASR = 'ajn_polish_asr'
WORD_AJN_METRICS_WER = 'word_ajn_metrics_wer'
......@@ -24,6 +26,8 @@ POS_AJN_ALIGNMENT_WER = 'pos_ajn_alignment_wer'
POS_AJN_METRICS_WER = 'pos_ajn_metrics_wer'
WORD_AJN_METRICS_WER_EMBEDDINGS = 'word_ajn_metrics_wer_embeddings'
WORD_AJN_ALIGNMENT_WER_EMBEDDINGS = 'word_ajn_alignment_wer_embeddings'
TAG_SPACY_AJN_METRICS_WER_EMBEDDINGS = 'tag_spacy_ajn_metrics_wer_embeddings'
TAG_SPACY_AJN_ALIGNMENT_WER_EMBEDDINGS = 'tag_spacy_ajn_alignment_wer_embeddings'
PIPELINE_DATA_DIRECTORY = 'experiment_data/pipeline'
EXPERIMENT_NAME = 'asr_benchmark_voicelab_cbiz_testset_20220322'
......
outs:
- md5: 0f60fb48fc5f9a46e6b2262bd994e8e8.dir
size: 1273907
nfiles: 494
- md5: 10454ef4568c2023e9d51ad418db2854.dir
size: 1276562
nfiles: 495
path: luna_ajn_polish_asr
......@@ -16,3 +16,7 @@
/word_techmo_alignment_wer_embeddings
/word_ajn_metrics_wer_embeddings
/word_ajn_alignment_wer_embeddings
/tag_spacy_techmo_metrics_wer_embeddings
/tag_spacy_techmo_alignment_wer_embeddings
/tag_spacy_ajn_metrics_wer_embeddings
/tag_spacy_ajn_alignment_wer_embeddings
source diff could not be displayed: it is too large. Options to address this: view the blob.
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment