Skip to content
Snippets Groups Projects
Commit 3be71a1b authored by Marcin Wątroba's avatar Marcin Wątroba
Browse files

Add voicelab pipeline stages

parent 8d234117
Branches
1 merge request!13Change data model
Showing
with 662 additions and 91 deletions
version: "3.8"
services:
techmo_asr:
image: docker-registry.theliver.pl/techmo-asr:1.1
container_name: techmo_asr
restart: always
ports:
- 5001:5000
volumes:
- /etc/localtime:/etc/localtime:ro
- /home/marcinwatroba/.ssh/keys/techmo_asr_server:/keys/techmo_rsa_key:ro
environment:
- TECHMO_SSH_SERVER_USERNAME=mwatroba
- TECHMO_SSH_SERVER_URL=jankocon.clarin-pl.eu
- TECHMO_SERVER_SSH_PORT=9222
- TECHMO_REMOTE_SERVICE_PORT=12321
- TECHMO_SERVER_URL=156.17.135.34
- AUTH_TOKEN=__example_token__
# techmo_asr:
# image: docker-registry.theliver.pl/techmo-asr:1.1
# container_name: techmo_asr
# restart: always
# ports:
# - 5001:5000
# volumes:
# - /etc/localtime:/etc/localtime:ro
# - /home/marcinwatroba/.ssh/keys/techmo_asr_server:/keys/techmo_rsa_key:ro
# environment:
# - TECHMO_SSH_SERVER_USERNAME=mwatroba
# - TECHMO_SSH_SERVER_URL=jankocon.clarin-pl.eu
# - TECHMO_SERVER_SSH_PORT=9222
# - TECHMO_REMOTE_SERVICE_PORT=12321
# - TECHMO_SERVER_URL=156.17.135.34
# - AUTH_TOKEN=__example_token__
transformers-wav2vec2for_ctc:
image: docker-registry.theliver.pl/transformers-wav2vec2for_ctc:1.0
container_name: transformers-wav2vec2for_ctc
restart: always
volumes:
- /etc/localtime:/etc/localtime:ro
- ./wav2vec2for_ctc_models:/models
ports:
- 5002:5000
environment:
- AUTH_TOKEN=__example_token__
- MODEL_NAME=jonatasgrosman/wav2vec2-large-xlsr-53-polish
- SAMPLING_RATE=16000
transformers-wav2vec2for_ctc:
image: docker-registry.theliver.pl/transformers-wav2vec2for_ctc:1.0
container_name: transformers-wav2vec2for_ctc
restart: always
volumes:
- /etc/localtime:/etc/localtime:ro
- ./wav2vec2for_ctc_models:/models
ports:
- "5430:5000"
environment:
- AUTH_TOKEN=__example_token__
- MODEL_NAME=jonatasgrosman/wav2vec2-large-xlsr-53-polish
- SAMPLING_RATE=16000
embedding_service:
image: docker-registry.theliver.pl/embedding_docker:1.0
container_name: embeddings_service
restart: always
ports:
- 5003:5000
environment:
- AUTH_TOKEN=__example_token__
volumes:
- /etc/localtime:/etc/localtime:ro
- ./embedding_models:/models
# embedding_service:
# image: docker-registry.theliver.pl/embedding_docker:1.0
# container_name: embeddings_service
# restart: always
# ports:
# - 5003:5000
# environment:
# - AUTH_TOKEN=__example_token__
# volumes:
# - /etc/localtime:/etc/localtime:ro
# - ./embedding_models:/models
ajn_asr:
image: docker-registry.theliver.pl/asr-clarin-pl-service:1.4
container_name: ajn_asr
restart: always
ports:
- 5004:5000
environment:
- AUTH_TOKEN=__example_token__
volumes:
- /etc/localtime:/etc/localtime:ro
ajn_asr:
image: docker-registry.theliver.pl/asr-clarin-pl-service:1.4
container_name: ajn_asr
restart: always
ports:
- "5431:5000"
environment:
- AUTH_TOKEN=__example_token__
volumes:
- /etc/localtime:/etc/localtime:ro
speechbrain_asr:
image: docker-registry.theliver.pl/speechbrain-asr:1.5
container_name: speechbrain_asr
restart: always
ports:
- 5005:5000
volumes:
- /etc/localtime:/etc/localtime:ro
- ./speechbrain_asr_models:/models
environment:
- AUTH_TOKEN=__example_token__
speechbrain_asr:
image: docker-registry.theliver.pl/speechbrain-asr:1.5
container_name: speechbrain_asr
restart: always
ports:
- "5432:5000"
volumes:
- /etc/localtime:/etc/localtime:ro
- ./speechbrain_asr_models:/models
environment:
- AUTH_TOKEN=__example_token__
......@@ -66,3 +66,116 @@ stages:
md5: 6d56f24b0ff78c0d44ade2114158150d.dir
size: 110711470
nfiles: 1600
luna_gold_transcript_processing:
cmd: "PYTHONPATH=. python experiment/luna/pipeline/luna_gold_transcript_processing.py\n"
deps:
- path: experiment/luna/pipeline/luna_gold_transcript_processing.py
md5: 2bae24d511febebb26b3264b204784f5
size: 1466
- path: experiment_data/dataset/LUNA.PL
md5: d342155b1871e881797cf7da09d5dc3c.dir
size: 1578358645
nfiles: 4500
- path: experiment_data/dataset_relation_manager_data/luna
md5: ff680a49296818460a49bd0c70089a4a.dir
size: 229007155
nfiles: 1000
outs:
- path: experiment_data/pipeline/asr_benchmark_luna/gold_transcript
md5: c9c51d94294eb1b30b39aef5d6abbe4b.dir
size: 6706925
nfiles: 500
- path: experiment_data/pipeline/asr_benchmark_luna/gold_transcript_spacy
md5: 24a399475b752737db0f2a8671507014.dir
size: 6785648
nfiles: 500
luna_ajn_processing:
cmd: "PYTHONPATH=. python experiment/luna/pipeline/luna_ajn_asr_processing.py\n"
deps:
- path: experiment/luna/pipeline/luna_ajn_asr_processing.py
md5: ec7d7b5384f845173d9fb77e9cfa9907
size: 2501
- path: experiment_data/dataset/LUNA.PL
md5: d342155b1871e881797cf7da09d5dc3c.dir
size: 1578358645
nfiles: 4500
- path: experiment_data/pipeline/asr_benchmark_luna/gold_transcript
md5: c9c51d94294eb1b30b39aef5d6abbe4b.dir
size: 6706925
nfiles: 500
- path: experiment_data/pipeline/asr_benchmark_luna/gold_transcript_spacy
md5: 24a399475b752737db0f2a8671507014.dir
size: 6785648
nfiles: 500
outs:
- path: experiment_data/pipeline/asr_benchmark_luna/ajn_polish_asr
md5: 620e178854dbcb69f49a608f34573a88.dir
size: 6159899
nfiles: 494
- path: experiment_data/pipeline/asr_benchmark_luna/ajn_spacy
md5: 312be284d4ec9e38986048e785fcbbc1.dir
size: 6535212
nfiles: 494
- path: experiment_data/pipeline/asr_benchmark_luna/pos_ajn_alignment_wer
md5: 8ad558edb6a8bd2508a7e25bcf53bf94.dir
size: 21936929
nfiles: 494
- path: experiment_data/pipeline/asr_benchmark_luna/pos_ajn_metrics_wer
md5: 98c74c5bf87637749eac1ed5ff3393b4.dir
size: 16842
nfiles: 494
- path: experiment_data/pipeline/asr_benchmark_luna/word_ajn_alignment_wer
md5: 1741fff740259398b28bf2a6ba3aec41.dir
size: 20671277
nfiles: 494
- path: experiment_data/pipeline/asr_benchmark_luna/word_ajn_metrics_wer
md5: 18605657ff9c7ef3221e27b671a3b4d1.dir
size: 16835
nfiles: 494
luna_techmo_processing:
cmd: "PYTHONPATH=. python experiment/luna/pipeline/luna_techmo_processing.py\n"
deps:
- path: experiment/luna/pipeline/luna_techmo_processing.py
md5: b4d5ad7a0d7fb0714a2dc02cb457e8c9
size: 2628
- path: experiment_data/cached_asr/luna_techmo
md5: 033ea7b5434dded73bf869bfdd299462.dir
size: 4256479
nfiles: 500
- path: experiment_data/dataset/LUNA.PL
md5: d342155b1871e881797cf7da09d5dc3c.dir
size: 1578358645
nfiles: 4500
- path: experiment_data/pipeline/asr_benchmark_luna/gold_transcript
md5: c9c51d94294eb1b30b39aef5d6abbe4b.dir
size: 6706925
nfiles: 500
- path: experiment_data/pipeline/asr_benchmark_luna/gold_transcript_spacy
md5: 24a399475b752737db0f2a8671507014.dir
size: 6785648
nfiles: 500
outs:
- path: experiment_data/pipeline/asr_benchmark_luna/pos_techmo_alignment_wer
md5: c71539f3889c627a371957958bd0907d.dir
size: 20897599
nfiles: 500
- path: experiment_data/pipeline/asr_benchmark_luna/pos_techmo_metrics_wer
md5: 4efbe309674d9d494bae3dac057025ba.dir
size: 17341
nfiles: 500
- path: experiment_data/pipeline/asr_benchmark_luna/techmo_polish_asr
md5: acfaec46b2415ed6a64e3a3464d164f8.dir
size: 9697519
nfiles: 500
- path: experiment_data/pipeline/asr_benchmark_luna/techmo_spacy
md5: e869581816457d1585a7e42d0a18b8b2.dir
size: 6124559
nfiles: 500
- path: experiment_data/pipeline/asr_benchmark_luna/word_techmo_alignment_wer
md5: 0dabd65b3981d588cd23d943abc6e231.dir
size: 21380796
nfiles: 500
- path: experiment_data/pipeline/asr_benchmark_luna/word_techmo_metrics_wer
md5: 4cfbb2830b280084ece14b1ef815b92a.dir
size: 17298
nfiles: 500
......@@ -8,17 +8,49 @@ stages:
outs:
- experiment_data/dataset_relation_manager_data/luna
luna_main_pipeline:
luna_gold_transcript_processing:
cmd: |
python -m spacy download pl_core_news_lg
PYTHONPATH=. python experiment/luna/pipeline/luna_main.py
PYTHONPATH=. python experiment/luna/pipeline/luna_gold_transcript_processing.py
deps:
- experiment/luna/pipeline/luna_main.py
- experiment/luna/pipeline/luna_gold_transcript_processing.py
- experiment_data/dataset_relation_manager_data/luna
- experiment_data/dataset/LUNA.PL
outs:
- experiment_data/pipeline/asr_benchmark_luna/gold_transcript
- experiment_data/pipeline/asr_benchmark_luna/gold_transcript_spacy
luna_techmo_processing:
cmd: |
PYTHONPATH=. python experiment/luna/pipeline/luna_techmo_processing.py
deps:
- experiment/luna/pipeline/luna_techmo_processing.py
- experiment_data/dataset/LUNA.PL
- experiment_data/cached_asr/luna_techmo
- experiment_data/pipeline/asr_benchmark_luna/gold_transcript
- experiment_data/pipeline/asr_benchmark_luna/gold_transcript_spacy
outs:
- experiment_data/pipeline/asr_benchmark_luna/techmo_polish_asr
- experiment_data/pipeline/asr_benchmark_luna/word_techmo_metrics_wer
- experiment_data/pipeline/asr_benchmark_luna/word_techmo_alignment_wer
- experiment_data/pipeline/asr_benchmark_luna/techmo_spacy
- experiment_data/pipeline/asr_benchmark_luna/pos_techmo_alignment_wer
- experiment_data/pipeline/asr_benchmark_luna/pos_techmo_metrics_wer
luna_ajn_processing:
cmd: |
PYTHONPATH=. python experiment/luna/pipeline/luna_ajn_asr_processing.py
deps:
- experiment/luna/pipeline/luna_ajn_asr_processing.py
- experiment_data/dataset/LUNA.PL
- experiment_data/pipeline/asr_benchmark_luna/gold_transcript
- experiment_data/pipeline/asr_benchmark_luna/gold_transcript_spacy
outs:
- experiment_data/pipeline/asr_benchmark_luna
- experiment_data/pipeline/asr_benchmark_luna/ajn_polish_asr
- experiment_data/pipeline/asr_benchmark_luna/word_ajn_metrics_wer
- experiment_data/pipeline/asr_benchmark_luna/word_ajn_alignment_wer
- experiment_data/pipeline/asr_benchmark_luna/ajn_spacy
- experiment_data/pipeline/asr_benchmark_luna/pos_ajn_alignment_wer
- experiment_data/pipeline/asr_benchmark_luna/pos_ajn_metrics_wer
voicelab_import_to_common_format:
cmd: PYTHONPATH=. python experiment/voicelab/import_data.py
......@@ -28,16 +60,48 @@ stages:
outs:
- experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322
voicelab_main_pipeline:
voicelab_gold_transcript_processing:
cmd: |
python -m spacy download pl_core_news_lg
PYTHONPATH=. python experiment/voicelab/voicelab_pipeline.py
PYTHONPATH=. python experiment/voicelab/voicelab_gold_transcript_processor.py
deps:
- experiment/voicelab/voicelab_pipeline.py
- experiment/voicelab/voicelab_gold_transcript_processor.py
- experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322
- experiment_data/dataset/voicelab_cbiz_testset_20220322
outs:
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript_spacy
voicelab_techmo_processing:
cmd: |
PYTHONPATH=. python experiment/voicelab/voicelab_pipeline_techmo.py
deps:
- experiment/voicelab/voicelab_pipeline_techmo.py
- experiment_data/dataset/voicelab_cbiz_testset_20220322
- experiment_data/cached_asr/voicelab_cbiz_testset_20220322_techmo
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript_spacy
outs:
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/techmo_polish_asr
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_techmo_metrics_wer
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_techmo_alignment_wer
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/techmo_spacy
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_techmo_alignment_wer
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_techmo_metrics_wer
voicelab_ajn_processing:
cmd: |
PYTHONPATH=. python experiment/voicelab/voicelab_pipeline_ajn_asr.py
deps:
- experiment/voicelab/voicelab_pipeline_ajn_asr.py
- experiment_data/dataset/voicelab_cbiz_testset_20220322
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript_spacy
outs:
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/ajn_polish_asr
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_metrics_wer
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_alignment_wer
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/ajn_spacy
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_ajn_alignment_wer
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_ajn_metrics_wer
# concurrent features, multiprocessing
from experiment.luna.luna_record_provider import LunaRecordProvider
from sziszapangma.integration.path_filter import ExtensionPathFilter
from sziszapangma.integration.repository.multi_files_experiment_repository import MultiFilesExperimentRepository
# Root directory of the LUNA.PL dataset, relative to the repository root.
LUNA_DIRECTORY = 'experiment_data/dataset/LUNA.PL'

# Property names shared by the LUNA pipeline modules. The string values are
# the same as the per-stage output directory names listed in dvc.yaml, so
# they must not be changed without updating the pipeline definition as well.
GOLD_TRANSCRIPT = 'gold_transcript'
GOLD_TRANSCRIPT_SPACY = 'gold_transcript_spacy'

# Techmo ASR properties.
TECHMO_POLISH_ASR = 'techmo_polish_asr'
WORD_TECHMO_METRICS_WER = 'word_techmo_metrics_wer'
# Misspelled alias ("MERTICS") kept because other modules import this name.
WORD_TECHMO_MERTICS_WER = WORD_TECHMO_METRICS_WER
WORD_TECHMO_ALIGNMENT_WER = 'word_techmo_alignment_wer'
TECHMO_SPACY = 'techmo_spacy'
POS_TECHMO_ALIGNMENT_WER = 'pos_techmo_alignment_wer'
POS_TECHMO_METRICS_WER = 'pos_techmo_metrics_wer'

# AJN ASR properties.
AJN_POLISH_ASR = 'ajn_polish_asr'
WORD_AJN_METRICS_WER = 'word_ajn_metrics_wer'
# Misspelled alias ("MERTICS") kept because other modules import this name.
WORD_AJN_MERTICS_WER = WORD_AJN_METRICS_WER
WORD_AJN_ALIGNMENT_WER = 'word_ajn_alignment_wer'
AJN_SPACY = 'ajn_spacy'
POS_AJN_ALIGNMENT_WER = 'pos_ajn_alignment_wer'
POS_AJN_METRICS_WER = 'pos_ajn_metrics_wer'
def get_record_provider() -> LunaRecordProvider:
    """Create the record provider used by every LUNA pipeline stage.

    The provider is given a path filter configured with the ``LUNA.PL``
    sub-directory of ``LUNA_DIRECTORY`` and the ``wav`` extension, plus the
    root path of the LUNA relation-manager data.
    """
    wav_filter = ExtensionPathFilter(
        root_directory=f'{LUNA_DIRECTORY}/LUNA.PL',
        extension='wav'
    )
    relation_root = 'experiment_data/dataset_relation_manager_data/luna'
    return LunaRecordProvider(wav_filter, relation_manager_root_path=relation_root)
def get_multiple_files_repository() -> MultiFilesExperimentRepository:
    """Create the repository for the ``asr_benchmark_luna`` experiment.

    The repository is constructed with ``experiment_data/pipeline`` as its
    first argument and the experiment name as its second.
    """
    pipeline_root = 'experiment_data/pipeline'
    experiment_name = 'asr_benchmark_luna'
    return MultiFilesExperimentRepository(pipeline_root, experiment_name)
from experiment.luna.pipeline.dependency_provider import get_record_provider, GOLD_TRANSCRIPT, \
get_multiple_files_repository, \
GOLD_TRANSCRIPT_SPACY, AJN_POLISH_ASR, WORD_AJN_MERTICS_WER, WORD_AJN_ALIGNMENT_WER, AJN_SPACY, \
POS_AJN_ALIGNMENT_WER, POS_AJN_METRICS_WER
from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask
from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.task.asr_task import AsrTask
from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
from sziszapangma.integration.asr_processor import AsrWebClient
def run_luna_experiment():
    """Run the AJN ASR tasks over the LUNA records.

    Builds an ``ExperimentManager`` with four tasks, in this order:

    1. ``AsrTask`` - calls the AJN web service and stores the result under
       ``AJN_POLISH_ASR``.
    2. ``ClassicWerMetricTask`` - word-level WER metrics and alignment against
       ``GOLD_TRANSCRIPT``.
    3. ``AsrSpacyTokenPosProcessingTask`` - spaCy processing of the ASR output
       into ``AJN_SPACY``.
    4. ``SpacyPosWerProcessingTask`` - POS-level WER metrics and alignment
       against ``GOLD_TRANSCRIPT_SPACY``.

    ``GOLD_TRANSCRIPT`` and ``GOLD_TRANSCRIPT_SPACY`` are only read here; they
    are expected to have been produced by the gold-transcript stage first.

    Task names carry the ``ajn_`` prefix so that they identify this engine
    rather than the Techmo pipeline they were modelled on.
    """
    record_provider = get_record_provider()
    experiment_processor = ExperimentManager(
        record_id_iterator=record_provider,
        processing_tasks=[
            AsrTask(
                task_name='ajn_polish_asr_task',
                # Port 5431 is the host port published for the ajn_asr
                # container in the docker-compose file.
                # NOTE(review): the token is a hard-coded placeholder;
                # consider reading it from the environment.
                asr_processor=AsrWebClient('http://localhost:5431/process_asr', '__example_token__'),
                asr_property_name=AJN_POLISH_ASR,
                require_update=False,
                record_path_provider=record_provider
            ),
            ClassicWerMetricTask(
                task_name='ajn_word_wer_processing',
                asr_property_name=AJN_POLISH_ASR,
                gold_transcript_property_name=GOLD_TRANSCRIPT,
                metrics_property_name=WORD_AJN_MERTICS_WER,
                require_update=False,
                alignment_property_name=WORD_AJN_ALIGNMENT_WER
            ),
            AsrSpacyTokenPosProcessingTask(
                task_name='ajn_spacy_task',
                input_property_name=AJN_POLISH_ASR,
                spacy_property_name=AJN_SPACY,
                require_update=True
            ),
            SpacyPosWerProcessingTask(
                task_name='ajn_pos_wer_processing',
                require_update=False,
                gold_transcript_pos_property_name=GOLD_TRANSCRIPT_SPACY,
                asr_pos_property_name=AJN_SPACY,
                pos_alignment_wer=POS_AJN_ALIGNMENT_WER,
                pos_metrics_wer=POS_AJN_METRICS_WER
            )
        ],
        experiment_repository=get_multiple_files_repository(),
        relation_manager_provider=record_provider
    )
    experiment_processor.process()


if __name__ == '__main__':
    run_luna_experiment()
from experiment.luna.pipeline.dependency_provider import get_record_provider, GOLD_TRANSCRIPT, \
get_multiple_files_repository, GOLD_TRANSCRIPT_SPACY
from experiment.luna.pipeline.task.luna_gold_transcript_processor import LunaGoldTranscriptProcessor
from experiment.pos_processing.gold_transcript_spacy_token_pos_processing_task import \
GoldTranscriptSpacyTokenPosProcessingTask
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.task.gold_transcript_task import GoldTranscriptTask
def run_luna_experiment():
    """Run the gold-transcript tasks over the LUNA records.

    Two tasks are executed in order: the first stores the gold transcript
    under ``GOLD_TRANSCRIPT``, the second reads that property and stores its
    spaCy processing result under ``GOLD_TRANSCRIPT_SPACY``.
    """
    record_provider = get_record_provider()

    gold_transcript_task = GoldTranscriptTask(
        task_name='gold_transcript_task',
        gold_transcript_processor=LunaGoldTranscriptProcessor(record_provider),
        gold_transcript_property_name=GOLD_TRANSCRIPT,
        require_update=False
    )
    gold_spacy_task = GoldTranscriptSpacyTokenPosProcessingTask(
        task_name='gold_transcript_spacy_task',
        input_property_name=GOLD_TRANSCRIPT,
        spacy_property_name=GOLD_TRANSCRIPT_SPACY,
        require_update=True
    )

    manager = ExperimentManager(
        record_id_iterator=record_provider,
        processing_tasks=[gold_transcript_task, gold_spacy_task],
        experiment_repository=get_multiple_files_repository(),
        relation_manager_provider=record_provider
    )
    manager.process()


if __name__ == '__main__':
    run_luna_experiment()
from experiment.luna.pipeline.dependency_provider import get_record_provider, GOLD_TRANSCRIPT, TECHMO_POLISH_ASR, \
get_multiple_files_repository, \
GOLD_TRANSCRIPT_SPACY, POS_TECHMO_ALIGNMENT_WER, POS_TECHMO_METRICS_WER, WORD_TECHMO_MERTICS_WER, \
WORD_TECHMO_ALIGNMENT_WER, TECHMO_SPACY
from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask
from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
from sziszapangma.integration.asr_processor import AsrPathCacheClient
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.task.asr_task import AsrTask
from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
def run_luna_experiment():
    """Run the Techmo ASR tasks over the LUNA records.

    The ASR step uses ``AsrPathCacheClient`` pointed at
    ``experiment_data/cached_asr/luna_techmo`` rather than a web client.
    It is followed by word-level WER, spaCy processing of the ASR output and
    POS-level WER, each writing to its ``*_TECHMO_*`` property.
    """
    record_provider = get_record_provider()

    # The record provider is passed twice to the cache client, exactly as the
    # client's positional signature is used elsewhere in this project.
    cached_asr_client = AsrPathCacheClient(
        'experiment_data/cached_asr/luna_techmo',
        record_provider,
        record_provider
    )
    asr_task = AsrTask(
        task_name='techmo_polish_asr_task',
        asr_processor=cached_asr_client,
        asr_property_name=TECHMO_POLISH_ASR,
        require_update=False,
        record_path_provider=record_provider
    )
    word_wer_task = ClassicWerMetricTask(
        task_name='techmo_word_wer_processing',
        asr_property_name=TECHMO_POLISH_ASR,
        gold_transcript_property_name=GOLD_TRANSCRIPT,
        metrics_property_name=WORD_TECHMO_MERTICS_WER,
        require_update=False,
        alignment_property_name=WORD_TECHMO_ALIGNMENT_WER
    )
    spacy_task = AsrSpacyTokenPosProcessingTask(
        task_name='techmo_spacy_task',
        input_property_name=TECHMO_POLISH_ASR,
        spacy_property_name=TECHMO_SPACY,
        require_update=True
    )
    pos_wer_task = SpacyPosWerProcessingTask(
        task_name='techmo_pos_wer_processing',
        require_update=False,
        gold_transcript_pos_property_name=GOLD_TRANSCRIPT_SPACY,
        asr_pos_property_name=TECHMO_SPACY,
        pos_alignment_wer=POS_TECHMO_ALIGNMENT_WER,
        pos_metrics_wer=POS_TECHMO_METRICS_WER
    )

    repository = get_multiple_files_repository()
    manager = ExperimentManager(
        record_id_iterator=record_provider,
        processing_tasks=[asr_task, word_wer_task, spacy_task, pos_wer_task],
        experiment_repository=repository,
        relation_manager_provider=record_provider
    )
    manager.process()


if __name__ == '__main__':
    run_luna_experiment()
......@@ -11,7 +11,7 @@ class LunaGoldTranscriptProcessor(GoldTranscriptProcessor):
def __init__(self, record_provider: LunaRecordProvider):
self._record_provider = record_provider
def parse_word(self, word, relation_manager: RelationManager):
def parse_word(self, word, relation_manager: RelationManager) -> Dict[str, str]:
all_relations = relation_manager.get_all_relations_for_item(word['id'])
pos_id = [it['second_id'] for it in all_relations if it['second_type'] in ['pos']][0]
return {
......
import os.path
from typing import List
from experiment.voicelab.voicelab_dependency import get_record_provider
from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider
from sziszapangma.integration.path_filter import ExtensionPathFilter
from sziszapangma.model.model import Word
......@@ -48,13 +49,7 @@ class VoicelabAdapter:
if __name__ == '__main__':
voicelab_record_provider = VoicelabTelcoRecordProvider(
ExtensionPathFilter(
'experiment_data/dataset/voicelab_cbiz_testset_20220322',
'wav'
),
'experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322'
)
voicelab_record_provider = get_record_provider()
adapter = VoicelabAdapter(voicelab_record_provider)
for it in voicelab_record_provider.get_all_records():
adapter.import_record(it)
from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider
from sziszapangma.integration.path_filter import ExtensionPathFilter
from sziszapangma.integration.repository.multi_files_experiment_repository import MultiFilesExperimentRepository
# Root directory of the Voicelab test set, relative to the repository root.
DATASET_DIRECTORY = 'experiment_data/dataset/voicelab_cbiz_testset_20220322'

# Property names shared by the Voicelab pipeline modules. The string values
# are the same as the per-stage output directory names listed in dvc.yaml, so
# they must not be changed without updating the pipeline definition as well.
GOLD_TRANSCRIPT = 'gold_transcript'
GOLD_TRANSCRIPT_SPACY = 'gold_transcript_spacy'

# Techmo ASR properties.
TECHMO_POLISH_ASR = 'techmo_polish_asr'
WORD_TECHMO_METRICS_WER = 'word_techmo_metrics_wer'
# Misspelled alias ("MERTICS") kept because other modules import this name.
WORD_TECHMO_MERTICS_WER = WORD_TECHMO_METRICS_WER
WORD_TECHMO_ALIGNMENT_WER = 'word_techmo_alignment_wer'
TECHMO_SPACY = 'techmo_spacy'
POS_TECHMO_ALIGNMENT_WER = 'pos_techmo_alignment_wer'
POS_TECHMO_METRICS_WER = 'pos_techmo_metrics_wer'

# AJN ASR properties.
AJN_POLISH_ASR = 'ajn_polish_asr'
WORD_AJN_METRICS_WER = 'word_ajn_metrics_wer'
# Misspelled alias ("MERTICS") kept because other modules import this name.
WORD_AJN_MERTICS_WER = WORD_AJN_METRICS_WER
WORD_AJN_ALIGNMENT_WER = 'word_ajn_alignment_wer'
AJN_SPACY = 'ajn_spacy'
POS_AJN_ALIGNMENT_WER = 'pos_ajn_alignment_wer'
POS_AJN_METRICS_WER = 'pos_ajn_metrics_wer'

# Arguments for the experiment repository and the relation-manager data.
PIPELINE_DATA_DIRECTORY = 'experiment_data/pipeline'
EXPERIMENT_NAME = 'asr_benchmark_voicelab_cbiz_testset_20220322'
RELATION_MANAGER_ROOT_PATH = 'experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322'
def get_record_provider() -> VoicelabTelcoRecordProvider:
    """Create the record provider used by every Voicelab pipeline stage.

    The provider is given a path filter configured with ``DATASET_DIRECTORY``
    and the ``wav`` extension, plus ``RELATION_MANAGER_ROOT_PATH``.
    """
    wav_filter = ExtensionPathFilter(
        root_directory=DATASET_DIRECTORY,
        extension='wav'
    )
    return VoicelabTelcoRecordProvider(
        wav_filter,
        relation_manager_root_path=RELATION_MANAGER_ROOT_PATH
    )
def get_repository() -> MultiFilesExperimentRepository:
    """Create the repository for the Voicelab experiment.

    Constructed from ``PIPELINE_DATA_DIRECTORY`` and ``EXPERIMENT_NAME``.
    """
    repository = MultiFilesExperimentRepository(
        PIPELINE_DATA_DIRECTORY,
        EXPERIMENT_NAME
    )
    return repository
......@@ -2,6 +2,7 @@ from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpa
from experiment.pos_processing.gold_transcript_spacy_token_pos_processing_task import \
GoldTranscriptSpacyTokenPosProcessingTask
from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
from experiment.voicelab.voicelab_dependency import get_record_provider, get_repository
from experiment.voicelab.voicelab_gold_transcript_processor import VoicelabGoldTranscriptProcessor
from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider
from sziszapangma.integration.asr_processor import AsrPathCacheClient
......@@ -29,14 +30,8 @@ EXPERIMENT_NAME = 'asr_benchmark_voicelab_cbiz_testset_20220322'
RELATION_MANAGER_ROOT_PATH = 'experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322'
def run_voicelab_experiment(experiment_repository: ExperimentRepository):
record_provider = VoicelabTelcoRecordProvider(
ExtensionPathFilter(
root_directory=DATASET_DIRECTORY,
extension='wav'
),
relation_manager_root_path=RELATION_MANAGER_ROOT_PATH
)
def run_voicelab_experiment():
record_provider = get_record_provider()
experiment_processor = ExperimentManager(
record_id_iterator=record_provider,
processing_tasks=[
......@@ -87,16 +82,14 @@ def run_voicelab_experiment(experiment_repository: ExperimentRepository):
pos_metrics_wer=POS_METRICS_WER
)
],
experiment_repository=experiment_repository,
experiment_repository=get_repository(),
relation_manager_provider=record_provider
)
experiment_processor.process()
def example_run():
experiment_repository = MultiFilesExperimentRepository(
PIPELINE_DATA_DIRECTORY, EXPERIMENT_NAME)
run_voicelab_experiment(experiment_repository)
run_voicelab_experiment()
if __name__ == '__main__':
......
from experiment.luna.pipeline.luna_gold_transcript_processor import LunaGoldTranscriptProcessor
from experiment.luna.luna_record_provider import LunaRecordProvider
from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask
from experiment.pos_processing.gold_transcript_spacy_token_pos_processing_task import \
GoldTranscriptSpacyTokenPosProcessingTask
from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
from sziszapangma.integration.asr_processor import AsrPathCacheClient
from experiment.voicelab.voicelab_dependency import get_record_provider, get_repository, GOLD_TRANSCRIPT, \
GOLD_TRANSCRIPT_SPACY, TECHMO_POLISH_ASR, AJN_POLISH_ASR, WORD_AJN_MERTICS_WER, WORD_AJN_ALIGNMENT_WER, AJN_SPACY, \
POS_AJN_ALIGNMENT_WER, POS_AJN_METRICS_WER
from experiment.voicelab.voicelab_gold_transcript_processor import VoicelabGoldTranscriptProcessor
from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider
from sziszapangma.integration.asr_processor import AsrPathCacheClient, AsrWebClient
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.path_filter import ExtensionPathFilter
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
......@@ -14,82 +17,50 @@ from sziszapangma.integration.task.asr_task import AsrTask
from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
from sziszapangma.integration.task.gold_transcript_task import GoldTranscriptTask
LUNA_DIRECTORY = 'experiment_data/dataset/LUNA.PL'
GOLD_TRANSCRIPT = 'gold_transcript'
TECHMO_POLISH_ASR = 'techmo_polish_asr'
TECHMO_POLISH_CLASSIC_WER_METRIC = 'techmo_polish_classic_wer_metric'
TECHMO_POLISH_CLASSIC_ALIGNMENT = 'techmo_polish_classic_alignment'
TECHMO_SPACY = 'techmo_spacy'
GOLD_TRANSCRIPT_SPACY = 'gold_transcript_spacy'
POS_ALIGNMENT_WER = 'pos_alignment_wer'
POS_METRICS_WER = 'pos_metrics_wer'
def run_luna_experiment(experiment_repository: ExperimentRepository):
record_provider = LunaRecordProvider(
ExtensionPathFilter(
root_directory=f'{LUNA_DIRECTORY}/LUNA.PL',
extension='wav'
),
relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna'
)
def run_voicelab_experiment():
record_provider = get_record_provider()
experiment_processor = ExperimentManager(
record_id_iterator=record_provider,
processing_tasks=[
GoldTranscriptTask(
task_name='gold_transcript_task',
gold_transcript_processor=LunaGoldTranscriptProcessor(record_provider),
gold_transcript_property_name=GOLD_TRANSCRIPT,
require_update=False
),
AsrTask(
task_name='techmo_polish_task',
# asr_processor=AsrWebClient('http://192.168.0.124:4999/process_asr', 'test1234'),
asr_processor=AsrPathCacheClient('experiment_data/cached_asr/luna_techmo', record_provider,
record_provider),
asr_property_name=TECHMO_POLISH_ASR,
task_name='ajn_polish_asr_task',
asr_processor=AsrWebClient('http://localhost:5431/process_asr', '__example_token__'),
asr_property_name=AJN_POLISH_ASR,
require_update=False,
record_path_provider=record_provider
),
ClassicWerMetricTask(
task_name='classic_wer_metric_task',
asr_property_name=TECHMO_POLISH_ASR,
task_name='techmo_word_wer_processing',
asr_property_name=AJN_POLISH_ASR,
gold_transcript_property_name=GOLD_TRANSCRIPT,
metrics_property_name=TECHMO_POLISH_CLASSIC_WER_METRIC,
metrics_property_name=WORD_AJN_MERTICS_WER,
require_update=False,
alignment_property_name=TECHMO_POLISH_CLASSIC_ALIGNMENT
),
GoldTranscriptSpacyTokenPosProcessingTask(
task_name='gold_transcript_spacy_task',
input_property_name=GOLD_TRANSCRIPT,
spacy_property_name=GOLD_TRANSCRIPT_SPACY,
require_update=True
alignment_property_name=WORD_AJN_ALIGNMENT_WER
),
AsrSpacyTokenPosProcessingTask(
task_name='techmo_spacy_task',
input_property_name=TECHMO_POLISH_ASR,
spacy_property_name=TECHMO_SPACY,
input_property_name=AJN_POLISH_ASR,
spacy_property_name=AJN_SPACY,
require_update=True
),
SpacyPosWerProcessingTask(
task_name='PosWerProcessor',
task_name='techmo_pos_wer_processing',
require_update=False,
gold_transcript_pos_property_name=GOLD_TRANSCRIPT_SPACY,
asr_pos_property_name=TECHMO_SPACY,
pos_alignment_wer=POS_ALIGNMENT_WER,
pos_metrics_wer=POS_METRICS_WER
asr_pos_property_name=AJN_SPACY,
pos_alignment_wer=POS_AJN_ALIGNMENT_WER,
pos_metrics_wer=POS_AJN_METRICS_WER
)
],
experiment_repository=experiment_repository,
experiment_repository=get_repository(),
relation_manager_provider=record_provider
)
experiment_processor.process()
def example_run():
experiment_repository = MultiFilesExperimentRepository(
'experiment_data/pipeline', 'asr_benchmark_luna')
run_luna_experiment(experiment_repository)
run_voicelab_experiment()
if __name__ == '__main__':
......
from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask
from experiment.pos_processing.gold_transcript_spacy_token_pos_processing_task import \
GoldTranscriptSpacyTokenPosProcessingTask
from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
from experiment.voicelab.voicelab_dependency import get_record_provider, get_repository, GOLD_TRANSCRIPT, \
GOLD_TRANSCRIPT_SPACY
from experiment.voicelab.voicelab_gold_transcript_processor import VoicelabGoldTranscriptProcessor
from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider
from sziszapangma.integration.asr_processor import AsrPathCacheClient
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.path_filter import ExtensionPathFilter
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
from sziszapangma.integration.repository.multi_files_experiment_repository import \
MultiFilesExperimentRepository
from sziszapangma.integration.task.asr_task import AsrTask
from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
from sziszapangma.integration.task.gold_transcript_task import GoldTranscriptTask
def run_voicelab_experiment():
    """Run the gold-transcript tasks over the Voicelab records.

    Two tasks are executed in order: the first stores the gold transcript
    under ``GOLD_TRANSCRIPT``, the second reads that property and stores its
    spaCy processing result under ``GOLD_TRANSCRIPT_SPACY``.
    """
    record_provider = get_record_provider()

    gold_transcript_task = GoldTranscriptTask(
        task_name='gold_transcript_task',
        gold_transcript_processor=VoicelabGoldTranscriptProcessor(record_provider),
        gold_transcript_property_name=GOLD_TRANSCRIPT,
        require_update=False
    )
    gold_spacy_task = GoldTranscriptSpacyTokenPosProcessingTask(
        task_name='gold_transcript_spacy_task',
        input_property_name=GOLD_TRANSCRIPT,
        spacy_property_name=GOLD_TRANSCRIPT_SPACY,
        require_update=True
    )

    repository = get_repository()
    manager = ExperimentManager(
        record_id_iterator=record_provider,
        processing_tasks=[gold_transcript_task, gold_spacy_task],
        experiment_repository=repository,
        relation_manager_provider=record_provider
    )
    manager.process()


def example_run():
    """Entry point used when the module is executed as a script."""
    run_voicelab_experiment()


if __name__ == '__main__':
    example_run()
from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask
from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
from experiment.voicelab.voicelab_dependency import get_record_provider, get_repository, GOLD_TRANSCRIPT, \
GOLD_TRANSCRIPT_SPACY, TECHMO_POLISH_ASR, WORD_TECHMO_MERTICS_WER, WORD_TECHMO_ALIGNMENT_WER, TECHMO_SPACY, \
POS_TECHMO_METRICS_WER, POS_TECHMO_ALIGNMENT_WER
from sziszapangma.integration.asr_processor import AsrPathCacheClient
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.task.asr_task import AsrTask
from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
def run_voicelab_experiment():
    """Run the Techmo ASR tasks over the Voicelab records.

    The ASR step uses ``AsrPathCacheClient`` pointed at
    ``experiment_data/cached_asr/voicelab_cbiz_testset_20220322_techmo``.
    It is followed by word-level WER, spaCy processing of the ASR output and
    POS-level WER, each writing to its ``*_TECHMO_*`` property.
    """
    record_provider = get_record_provider()

    cached_asr_client = AsrPathCacheClient(
        'experiment_data/cached_asr/voicelab_cbiz_testset_20220322_techmo',
        record_provider,
        record_provider
    )
    asr_task = AsrTask(
        task_name='techmo_polish_task',
        asr_processor=cached_asr_client,
        asr_property_name=TECHMO_POLISH_ASR,
        require_update=False,
        record_path_provider=record_provider
    )
    word_wer_task = ClassicWerMetricTask(
        task_name='techmo_word_wer_processing',
        asr_property_name=TECHMO_POLISH_ASR,
        gold_transcript_property_name=GOLD_TRANSCRIPT,
        metrics_property_name=WORD_TECHMO_MERTICS_WER,
        require_update=False,
        alignment_property_name=WORD_TECHMO_ALIGNMENT_WER
    )
    spacy_task = AsrSpacyTokenPosProcessingTask(
        task_name='techmo_spacy_task',
        input_property_name=TECHMO_POLISH_ASR,
        spacy_property_name=TECHMO_SPACY,
        require_update=True
    )
    pos_wer_task = SpacyPosWerProcessingTask(
        task_name='techmo_pos_wer_processing',
        require_update=False,
        gold_transcript_pos_property_name=GOLD_TRANSCRIPT_SPACY,
        asr_pos_property_name=TECHMO_SPACY,
        pos_alignment_wer=POS_TECHMO_ALIGNMENT_WER,
        pos_metrics_wer=POS_TECHMO_METRICS_WER
    )

    repository = get_repository()
    manager = ExperimentManager(
        record_id_iterator=record_provider,
        processing_tasks=[asr_task, word_wer_task, spacy_task, pos_wer_task],
        experiment_repository=repository,
        relation_manager_provider=record_provider
    )
    manager.process()


def example_run():
    """Entry point used when the module is executed as a script."""
    run_voicelab_experiment()


if __name__ == '__main__':
    example_run()
/luna_techmo
/voicelab_cbiz_testset_20220322_techmo
/luna_ajn_polish_asr
outs:
- md5: 620e178854dbcb69f49a608f34573a88.dir
size: 6159899
nfiles: 494
path: luna_ajn_polish_asr
No preview for this file type
No preview for this file type
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment