diff --git a/docker/ajn_asr/main.py b/docker/ajn_asr/main.py index 37459f0c10fcefd3f2d19ab655c9b4ac3ec87559..150ddde32759f53482c3f395601fe2f967d90b4c 100644 --- a/docker/ajn_asr/main.py +++ b/docker/ajn_asr/main.py @@ -8,7 +8,6 @@ from sziszapangma.integration.service_core.asr.asr_result import AsrResult class SpeechbrainAsrProcessor(AsrBaseProcessor): def process_asr(self, audio_file_path: str) -> AsrResult: - # prepare paths file_tag = str(uuid.uuid4()) file_extension = audio_file_path.split('.')[-1] file_name = f'{file_tag}.{file_extension}' diff --git a/docker/ajn_asr/prepare_docker.sh b/docker/ajn_asr/prepare_docker.sh index a47ad1318644338f1a3fc241fad3bb6cb58c0c67..c7e687d087cba4d4c5fd7305b9ec8b6afb8fe431 100755 --- a/docker/ajn_asr/prepare_docker.sh +++ b/docker/ajn_asr/prepare_docker.sh @@ -3,5 +3,5 @@ SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" docker build -t asr-clarin-pl-service "$SCRIPT_DIR" -docker tag asr-clarin-pl-service docker-registry.theliver.pl/asr-clarin-pl-service:1.4 -docker push docker-registry.theliver.pl/asr-clarin-pl-service:1.4 +docker tag asr-clarin-pl-service gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/asr-clarin-pl-service:1.4 +docker push gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/asr-clarin-pl-service:1.4 diff --git a/docker/build-all-dockers.sh b/docker/build-all-dockers.sh old mode 100644 new mode 100755 diff --git a/docker/fasttext_embedding/prepare_docker.sh b/docker/fasttext_embedding/prepare_docker.sh index 2e32082b2bc8883631ff748eb8f67a8c7bae1067..61827c8ea535eaf9a475525d37ca419944d3059a 100755 --- a/docker/fasttext_embedding/prepare_docker.sh +++ b/docker/fasttext_embedding/prepare_docker.sh @@ -3,5 +3,5 @@ SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" docker build -t embedding_docker "$SCRIPT_DIR" -docker tag embedding_docker docker-registry.theliver.pl/embedding_docker:1.0 -docker push docker-registry.theliver.pl/embedding_docker:1.0 +docker tag embedding_docker gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/embedding_docker:1.0 +docker push gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/embedding_docker:1.0 diff --git a/docker/polish_asr_hf/prepare_docker.sh b/docker/polish_asr_hf/prepare_docker.sh index c305c346eddbc44bcd4fa287bb835ac07eb221e6..b78193877eb5e112ab5c7e75da3aecac1b7e87b2 100755 --- a/docker/polish_asr_hf/prepare_docker.sh +++ b/docker/polish_asr_hf/prepare_docker.sh @@ -3,5 +3,5 @@ SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" docker build -t transformers-wav2vec2for_ctc "$SCRIPT_DIR" -docker tag transformers-wav2vec2for_ctc docker-registry.theliver.pl/transformers-wav2vec2for_ctc:1.0 -docker push docker-registry.theliver.pl/transformers-wav2vec2for_ctc:1.0 +docker tag transformers-wav2vec2for_ctc gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/transformers-wav2vec2for_ctc:1.0 +docker push gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/transformers-wav2vec2for_ctc:1.0 diff --git a/docker/speechbrain_asr/prepare_docker.sh b/docker/speechbrain_asr/prepare_docker.sh index 1731522f1854abc6abcf0ec6dbc3dd2191b86d5b..5a9c3395e6283a8399a47d2971a1ff22ff016cd1 100755 --- a/docker/speechbrain_asr/prepare_docker.sh +++ b/docker/speechbrain_asr/prepare_docker.sh @@ -3,5 +3,5 @@ SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" docker build --no-cache -t speechbrain-asr "$SCRIPT_DIR" -docker tag speechbrain-asr docker-registry.theliver.pl/speechbrain-asr:1.5 -docker push docker-registry.theliver.pl/speechbrain-asr:1.5 +docker tag speechbrain-asr gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/speechbrain-asr:1.5 +docker push gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/speechbrain-asr:1.5 diff --git a/docker/techmo_asr/prepare_docker.sh b/docker/techmo_asr/prepare_docker.sh index 0af4b816df543f0292a0e7f2317ea72a23e23129..309f7c378f2954d93298bf81922c4b766ea8b233 100755 --- a/docker/techmo_asr/prepare_docker.sh +++ b/docker/techmo_asr/prepare_docker.sh @@ -3,5 +3,5 @@ SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" docker build -t techmo-asr "$SCRIPT_DIR" -docker tag techmo-asr docker-registry.theliver.pl/techmo-asr:1.1 -docker push docker-registry.theliver.pl/techmo-asr:1.1 +docker tag techmo-asr gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/techmo-asr:1.1 +docker push gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/techmo-asr:1.1 diff --git a/dvc.lock b/dvc.lock index f611d3c3e1eab110957b83995a21c7b76b8d6271..686566c2ab5cdf6434e3801ae7775b819b7c2a60 100644 --- a/dvc.lock +++ b/dvc.lock @@ -36,3 +36,33 @@ stages: md5: 2e334734387ab4579b7b5269d5029e81.dir size: 71627685 nfiles: 4000 + luna_import_to_common_format: + cmd: PYTHONPATH=. python experiment/luna/import_dataset/import_luna.py + deps: + - path: experiment/luna/import_dataset/import_luna.py + md5: 44a1b914afda2ae74462e7afd83f658e + size: 8278 + - path: experiment_data/dataset/LUNA.PL + md5: d342155b1871e881797cf7da09d5dc3c.dir + size: 1578358645 + nfiles: 4500 + outs: + - path: experiment_data/dataset_relation_manager_data/luna + md5: ff680a49296818460a49bd0c70089a4a.dir + size: 229007155 + nfiles: 1000 + voicelab_import_to_common_format: + cmd: PYTHONPATH=. python experiment/voicelab/import_data.py + deps: + - path: experiment/voicelab/import_data.py + md5: 0cf7cf604b202489ce3b0cb51bb47fa2 + size: 2264 + - path: experiment_data/dataset/voicelab_cbiz_testset_20220322 + md5: 3c2b18e1f1f89e4c5ad7b254e472b25e.dir + size: 4803739404 + nfiles: 1600 + outs: + - path: experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322 + md5: 6d56f24b0ff78c0d44ade2114158150d.dir + size: 110711470 + nfiles: 1600 diff --git a/dvc.yaml b/dvc.yaml index 7fae9011ade7fb44f3b405a7a53b1670492fc7ad..f2151c0fcc756eaeef31716dcd104dec297414fe 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -1,6 +1,6 @@ stages: - import_luna_to_common_format: + luna_import_to_common_format: cmd: PYTHONPATH=. python experiment/luna/import_dataset/import_luna.py deps: - experiment/luna/import_dataset/import_luna.py @@ -10,11 +10,34 @@ stages: luna_main_pipeline: cmd: | - PYTHONPATH=. python -m spacy download pl_core_news_lg + python -m spacy download pl_core_news_lg PYTHONPATH=. python experiment/luna/pipeline/luna_main.py deps: + - experiment/luna/pipeline/luna_main.py - experiment_data/dataset_relation_manager_data/luna - experiment_data/dataset/LUNA.PL - experiment_data/cached_asr/luna_techmo outs: - experiment_data/pipeline/asr_benchmark_luna + + voicelab_import_to_common_format: + cmd: PYTHONPATH=. python experiment/voicelab/import_data.py + deps: + - experiment/voicelab/import_data.py + - experiment_data/dataset/voicelab_cbiz_testset_20220322 + outs: + - experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322 + + voicelab_main_pipeline: + cmd: | + python -m spacy download pl_core_news_lg + PYTHONPATH=. python experiment/voicelab/voicelab_pipeline.py + deps: + - experiment/voicelab/voicelab_pipeline.py + - experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322 + - experiment_data/dataset/voicelab_cbiz_testset_20220322 + - experiment_data/cached_asr/voicelab_cbiz_testset_20220322_techmo + outs: + - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322 + +# concurrent features, multiprocessing diff --git a/experiment/luna/import_dataset/import_luna.py b/experiment/luna/import_dataset/import_luna.py index 673e1869f47c3a2b02705a499d930dd411fd3697..45826431d7933ab5a666e8faf3e4670bf9a998ef 100644 --- a/experiment/luna/import_dataset/import_luna.py +++ b/experiment/luna/import_dataset/import_luna.py @@ -171,7 +171,7 @@ def main(): luna_directory = 'experiment_data/dataset/LUNA.PL' luna_record_provider = LunaRecordProvider( ExtensionPathFilter( - root_directory=luna_directory, + root_directory=f'{luna_directory}/LUNA.PL', extension='wav' ), relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna' diff --git a/experiment/luna/move_asr.py b/experiment/luna/move_asr.py deleted file mode 100644 index 87910a00d85560a743d6f47909323b563b7114f7..0000000000000000000000000000000000000000 --- a/experiment/luna/move_asr.py +++ /dev/null @@ -1,21 +0,0 @@ -import json -from pprint import pprint - -from experiment.luna.luna_record_provider import LunaRecordProvider -from sziszapangma.integration.path_filter import ExtensionPathFilter - -if __name__ == '__main__': - luna_directory = 'experiment_data/dataset/LUNA.PL' - luna_record_provider = LunaRecordProvider( - ExtensionPathFilter( - root_directory=luna_directory, - extension='wav' - ), - relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna' - ) - pprint(luna_record_provider.get_all_records()) - for record_id in luna_record_provider.get_all_records(): - path = f'experiment_data/cached_asr/LUNA_techmo_asr_cache/{record_id.replace("__", "/")}.wav.techmo.json' - raw = json.load(open(path, 'r')) - output_path = f'experiment_data/cached_asr/luna_techmo/{record_id}.json' - json.dump(raw, open(output_path, 'w')) diff --git a/experiment/luna/pipeline/luna_gold_transcript_processor.py b/experiment/luna/pipeline/luna_gold_transcript_processor.py index 4bfce0afaef4e8bbf618b9ccf5c1174cb0e43a0a..e7e36b2f8babdbb8558903969f78f3c76fa1f60b 100644 --- a/experiment/luna/pipeline/luna_gold_transcript_processor.py +++ b/experiment/luna/pipeline/luna_gold_transcript_processor.py @@ -1,6 +1,5 @@ from typing import List, Dict -# from experiment.luna.luna_record_provider import LunaRecordProvider from experiment.luna.luna_record_provider import LunaRecordProvider from sziszapangma.integration.gold_transcript_processor import GoldTranscriptProcessor from sziszapangma.model.relation_manager import RelationManager @@ -13,7 +12,6 @@ class LunaGoldTranscriptProcessor(GoldTranscriptProcessor): self._record_provider = record_provider def parse_word(self, word, relation_manager: RelationManager): - # print(word) all_relations = relation_manager.get_all_relations_for_item(word['id']) pos_id = [it['second_id'] for it in all_relations if it['second_type'] in ['pos']][0] return { diff --git a/experiment/luna/pipeline/luna_main.py b/experiment/luna/pipeline/luna_main.py index cc556e704cf17cbec70320d24f201d0d7c61cce1..d707202aeef5b9cb3433e11148551ea338e54641 100644 --- a/experiment/luna/pipeline/luna_main.py +++ b/experiment/luna/pipeline/luna_main.py @@ -1,11 +1,9 @@ from experiment.luna.pipeline.luna_gold_transcript_processor import LunaGoldTranscriptProcessor -from experiment.luna.pipeline.pos_processing.asr_spacy_token_pos_processing_task import \ - AsrSpacyTokenPosProcessingTask -from experiment.luna.pipeline.pos_processing.gold_transcript_spacy_token_pos_processing_task import \ - GoldTranscriptSpacyTokenPosProcessingTask -from experiment.luna.pipeline.pos_processing.spacy_pos_wer_processing_task import \ - SpacyPosWerProcessingTask from experiment.luna.luna_record_provider import LunaRecordProvider +from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask +from experiment.pos_processing.gold_transcript_spacy_token_pos_processing_task import \ + GoldTranscriptSpacyTokenPosProcessingTask +from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask from sziszapangma.integration.asr_processor import AsrPathCacheClient from sziszapangma.integration.experiment_manager import ExperimentManager from sziszapangma.integration.path_filter import ExtensionPathFilter @@ -30,7 +28,7 @@ POS_METRICS_WER = 'pos_metrics_wer' def run_luna_experiment(experiment_repository: ExperimentRepository): record_provider = LunaRecordProvider( ExtensionPathFilter( - root_directory=LUNA_DIRECTORY, + root_directory=f'{LUNA_DIRECTORY}/LUNA.PL', extension='wav' ), relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna' diff --git a/experiment/luna/pipeline/pos_processing/asr_spacy_token_pos_processing_task.py b/experiment/luna/pipeline/pos_processing/asr_spacy_token_pos_processing_task.py deleted file mode 100644 index 42a0e25175b9710f5e150c7382c8c550891b0540..0000000000000000000000000000000000000000 --- a/experiment/luna/pipeline/pos_processing/asr_spacy_token_pos_processing_task.py +++ /dev/null @@ -1,24 +0,0 @@ -from dataclasses import dataclass -from typing import Any - -from experiment.luna.pipeline.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask - - -@dataclass -class WordSpan: - text: str - index_start: int - index_end: int - - -class AsrSpacyTokenPosProcessingTask(SpacyTokenPosProcessingTask): - _spacy_property_name: str - _nlp: Any - _input_property_name: str - - def __init__(self, task_name: str, spacy_property_name: str, require_update: bool, - input_property_name: str): - super().__init__(task_name, spacy_property_name, require_update, input_property_name) - - def get_transcript_to_process(self, property_value: Any) -> str: - return property_value['full_text'] diff --git a/experiment/luna/pipeline/pos_processing/gold_transcript_spacy_token_pos_processing_task.py b/experiment/luna/pipeline/pos_processing/gold_transcript_spacy_token_pos_processing_task.py deleted file mode 100644 index e142212aa4b5727e4e1d1e78d0206500fdc646fe..0000000000000000000000000000000000000000 --- a/experiment/luna/pipeline/pos_processing/gold_transcript_spacy_token_pos_processing_task.py +++ /dev/null @@ -1,24 +0,0 @@ -from dataclasses import dataclass -from typing import Any - -from experiment.luna.pipeline.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask - - -@dataclass -class WordSpan: - text: str - index_start: int - index_end: int - - -class GoldTranscriptSpacyTokenPosProcessingTask(SpacyTokenPosProcessingTask): - _spacy_property_name: str - _nlp: Any - _input_property_name: str - - def __init__(self, task_name: str, spacy_property_name: str, require_update: bool, - input_property_name: str): - super().__init__(task_name, spacy_property_name, require_update, input_property_name) - - def get_transcript_to_process(self, property_value: Any) -> str: - return ' '.join([it['word'] for it in property_value]) diff --git a/experiment/luna/pipeline/pos_processing/spacy_pos_wer_processing_task.py b/experiment/luna/pipeline/pos_processing/spacy_pos_wer_processing_task.py deleted file mode 100644 index 23d4f57895cfbb2124705c2e40ce97b37ffb4c2a..0000000000000000000000000000000000000000 --- a/experiment/luna/pipeline/pos_processing/spacy_pos_wer_processing_task.py +++ /dev/null @@ -1,85 +0,0 @@ -from typing import Any, List, Dict - -from sziszapangma.core.alignment.alignment_classic_calculator import AlignmentClassicCalculator -from sziszapangma.core.alignment.alignment_step import AlignmentStep -from sziszapangma.core.wer.wer_calculator import WerCalculator -from sziszapangma.integration.mapper.alignment_step_mapper import AlignmentStepMapper -from sziszapangma.integration.repository.experiment_repository import ExperimentRepository -from sziszapangma.integration.task.processing_task import ProcessingTask -from sziszapangma.model.model import Word -from sziszapangma.model.relation_manager import RelationManager - -_CLASSIC_WER = "classic_wer" - - -class SpacyPosWerProcessingTask(ProcessingTask): - _gold_transcript_pos_property_name: str - _asr_pos_property_name: str - _pos_alignment_wer: str - _pos_metrics_wer: str - _alignment_classic_calculator: AlignmentClassicCalculator - _wer_calculator: WerCalculator - - def __init__( - self, - task_name: str, - gold_transcript_pos_property_name: str, - require_update: bool, - asr_pos_property_name: str, - pos_alignment_wer: str, - pos_metrics_wer: str - ): - super().__init__(task_name, require_update) - self._gold_transcript_pos_property_name = gold_transcript_pos_property_name - self._asr_pos_property_name = asr_pos_property_name - self._pos_alignment_wer = pos_alignment_wer - self._pos_metrics_wer = pos_metrics_wer - self._alignment_classic_calculator = AlignmentClassicCalculator() - self._wer_calculator = WerCalculator() - - def run_single_process( - self, - record_id: str, - experiment_repository: ExperimentRepository, - relation_manager: RelationManager, - ): - gold_transcript_pos = [ - Word(id=it['id'], type='Word', text=it['pos']) - for it in - experiment_repository.get_property_for_key( - record_id, self._gold_transcript_pos_property_name) - ] - asr_transcript_pos = [ - Word(id=it['id'], type='Word', text=it['pos']) - for it in - experiment_repository.get_property_for_key(record_id, self._asr_pos_property_name) - ] - if gold_transcript_pos is not None and asr_transcript_pos is not None: - alignment_steps = self._get_alignment(gold_transcript_pos, asr_transcript_pos) - experiment_repository.update_property_for_key( - record_id, - self._pos_alignment_wer, - [AlignmentStepMapper.to_json_dict(it) for it in alignment_steps], - ) - experiment_repository.update_property_for_key( - record_id, self._pos_metrics_wer, self.calculate_metrics(alignment_steps) - ) - - def _get_alignment( - self, gold_transcript: List[Word], asr_transcript: List[Word] - ) -> List[AlignmentStep]: - return self._alignment_classic_calculator.calculate_alignment( - reference=gold_transcript, hypothesis=asr_transcript - ) - - def calculate_metrics(self, alignment_steps: List[AlignmentStep]) -> Dict[str, Any]: - """Calculate all metrics for data sample.""" - metrics = dict() - metrics[_CLASSIC_WER] = self._wer_calculator.calculate_wer(alignment_steps) - return metrics - - def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool: - return ( - experiment_repository.get_property_for_key(record_id, self._pos_metrics_wer) - is not None - ) diff --git a/experiment/luna/pipeline/pos_processing/spacy_token_pos_processing_task.py b/experiment/luna/pipeline/pos_processing/spacy_token_pos_processing_task.py deleted file mode 100644 index 2c8bb0b52e9427a3725c45a4055868483eea0fd9..0000000000000000000000000000000000000000 --- a/experiment/luna/pipeline/pos_processing/spacy_token_pos_processing_task.py +++ /dev/null @@ -1,58 +0,0 @@ -import uuid -from abc import abstractmethod, ABC -from dataclasses import dataclass -from typing import Any, Dict - -import spacy -from spacy.tokens import Token - -from sziszapangma.integration.repository.experiment_repository import ExperimentRepository -from sziszapangma.integration.task.processing_task import ProcessingTask -from sziszapangma.model.relation_manager import RelationManager - - -@dataclass -class WordSpan: - text: str - index_start: int - index_end: int - - -class SpacyTokenPosProcessingTask(ProcessingTask, ABC): - _spacy_model_name: str - _nlp: Any - _input_property_name: str - _spacy_property_name: str - - def __init__( - self, - task_name: str, - spacy_property_name: str, - require_update: bool, - input_property_name: str - ): - super().__init__(task_name, require_update) - self._spacy_property_name = spacy_property_name - self._nlp = spacy.load("pl_core_news_lg") - self._input_property_name = input_property_name - - def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository, - relation_manager: RelationManager): - test_property = experiment_repository.get_property_for_key( - record_id, self._input_property_name) - document = self._nlp(self.get_transcript_to_process(test_property)) - - spacy_result = [self.token_to_result_dict(token) for token in document] - experiment_repository.update_property_for_key(record_id, self._spacy_property_name, - spacy_result) - - @staticmethod - def token_to_result_dict(token: Token) -> Dict[str, str]: - return {'id': str(uuid.uuid4()), 'word': token.text, 'pos': token.pos_} - - @abstractmethod - def get_transcript_to_process(self, property_value: Any) -> str: - pass - - def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool: - return experiment_repository.property_exists(record_id, self._spacy_property_name) diff --git a/experiment/luna/pipeline/pos_processing/__init__.py b/experiment/voicelab/__init__.py similarity index 100% rename from experiment/luna/pipeline/pos_processing/__init__.py rename to experiment/voicelab/__init__.py diff --git a/experiment/voicelab/import_data.py b/experiment/voicelab/import_data.py new file mode 100644 index 0000000000000000000000000000000000000000..a08b49ef45211b543153559b0d934b490af2bb4f --- /dev/null +++ b/experiment/voicelab/import_data.py @@ -0,0 +1,60 @@ +import os.path +from typing import List + +from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider +from sziszapangma.integration.path_filter import ExtensionPathFilter +from sziszapangma.model.model import Word +from sziszapangma.model.model_creators import create_new_word, create_new_document +from sziszapangma.model.relation_manager import RelationManager + + +class VoicelabAdapter: + _record_provider: VoicelabTelcoRecordProvider + + def __init__(self, record_provider: VoicelabTelcoRecordProvider): + self._record_provider = record_provider + + @staticmethod + def save_words( + words_path: str, + relation_manager: RelationManager + ) -> List[Word]: + with open(words_path, 'r') as f: + text = f.read() + words = [] + for single_word in text.split(): + word = create_new_word(text=single_word) + relation_manager.save_item(word) + words.append(word) + document = create_new_document([word['id'] for word in words]) + relation_manager.save_item(document) + for word in words: + relation_manager.save_relation(word, document) + return words + + def import_record(self, record_id: str): + print(f'record {record_id}') + relation_manager = self._record_provider.get_relation_manager(record_id) + relation_manager.clear_all() + root_path = '/home/marcinwatroba/PWR_ASR/asr-benchmarks/experiment_data/dataset/voicelab_cbiz_testset_20220322' + + words_path = [path for path in [ + f'{root_path}/{record_id.replace("__", "/")}.txt', + f'{root_path}/{record_id.replace("__", "/")}.asr' + ] if os.path.exists(path)][0] + + self.save_words(words_path, relation_manager) + relation_manager.commit() + + +if __name__ == '__main__': + voicelab_record_provider = VoicelabTelcoRecordProvider( + ExtensionPathFilter( + 'experiment_data/dataset/voicelab_cbiz_testset_20220322', + 'wav' + ), + 'experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322' + ) + adapter = VoicelabAdapter(voicelab_record_provider) + for it in voicelab_record_provider.get_all_records(): + adapter.import_record(it) diff --git a/experiment/voicelab/voicelab_gold_transcript_processor.py b/experiment/voicelab/voicelab_gold_transcript_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..b4cf94a7f308bb870ebbb9f022429116b5353614 --- /dev/null +++ b/experiment/voicelab/voicelab_gold_transcript_processor.py @@ -0,0 +1,23 @@ +from typing import List, Dict + +from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider +from sziszapangma.integration.gold_transcript_processor import GoldTranscriptProcessor + + +class VoicelabGoldTranscriptProcessor(GoldTranscriptProcessor): + _record_provider: VoicelabTelcoRecordProvider + + def __init__(self, record_provider: VoicelabTelcoRecordProvider): + self._record_provider = record_provider + + def parse_word(self, word): + return { + 'id': word['id'], + 'word': word['text'] + } + + def get_gold_transcript(self, record_id: str) -> List[Dict[str, any]]: + relation_manager = self._record_provider.get_relation_manager(record_id) + document = [itt for itt in relation_manager.get_all_items() if itt['type'] == 'Document'][0] + document_words = [relation_manager.get_item_by_id(item_id) for item_id in document['word_ids']] + return [self.parse_word(word) for word in document_words] diff --git a/experiment/voicelab/voicelab_pipeline.py b/experiment/voicelab/voicelab_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..37ef974a91f995da289a963abcd88762c25e7b10 --- /dev/null +++ b/experiment/voicelab/voicelab_pipeline.py @@ -0,0 +1,103 @@ +from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask +from experiment.pos_processing.gold_transcript_spacy_token_pos_processing_task import \ + GoldTranscriptSpacyTokenPosProcessingTask +from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask +from experiment.voicelab.voicelab_gold_transcript_processor import VoicelabGoldTranscriptProcessor +from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider +from sziszapangma.integration.asr_processor import AsrPathCacheClient +from sziszapangma.integration.experiment_manager import ExperimentManager +from sziszapangma.integration.path_filter import ExtensionPathFilter +from sziszapangma.integration.repository.experiment_repository import ExperimentRepository +from sziszapangma.integration.repository.multi_files_experiment_repository import \ + MultiFilesExperimentRepository +from sziszapangma.integration.task.asr_task import AsrTask +from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask +from sziszapangma.integration.task.gold_transcript_task import GoldTranscriptTask + +GOLD_TRANSCRIPT = 'gold_transcript' +TECHMO_POLISH_ASR = 'techmo_polish_asr' +TECHMO_POLISH_CLASSIC_WER_METRIC = 'techmo_polish_classic_wer_metric' +TECHMO_POLISH_CLASSIC_ALIGNMENT = 'techmo_polish_classic_alignment' +TECHMO_SPACY = 'techmo_spacy' +GOLD_TRANSCRIPT_SPACY = 'gold_transcript_spacy' +POS_ALIGNMENT_WER = 'pos_alignment_wer' +POS_METRICS_WER = 'pos_metrics_wer' + +DATASET_DIRECTORY = 'experiment_data/dataset/voicelab_cbiz_testset_20220322' +PIPELINE_DATA_DIRECTORY = 'experiment_data/pipeline' +EXPERIMENT_NAME = 'asr_benchmark_voicelab_cbiz_testset_20220322' +RELATION_MANAGER_ROOT_PATH = 'experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322' + + +def run_voicelab_experiment(experiment_repository: ExperimentRepository): + record_provider = VoicelabTelcoRecordProvider( + ExtensionPathFilter( + root_directory=DATASET_DIRECTORY, + extension='wav' + ), + relation_manager_root_path=RELATION_MANAGER_ROOT_PATH + ) + experiment_processor = ExperimentManager( + record_id_iterator=record_provider, + processing_tasks=[ + GoldTranscriptTask( + task_name='gold_transcript_task', + gold_transcript_processor=VoicelabGoldTranscriptProcessor(record_provider), + gold_transcript_property_name=GOLD_TRANSCRIPT, + require_update=False + ), + AsrTask( + task_name='techmo_polish_task', + # asr_processor=AsrWebClient('http://192.168.0.124:4999/process_asr', 'test1234'), + asr_processor=AsrPathCacheClient( + 'experiment_data/cached_asr/voicelab_cbiz_testset_20220322_techmo', + record_provider, + record_provider + ), + asr_property_name=TECHMO_POLISH_ASR, + require_update=False, + record_path_provider=record_provider + ), + ClassicWerMetricTask( + task_name='classic_wer_metric_task', + asr_property_name=TECHMO_POLISH_ASR, + gold_transcript_property_name=GOLD_TRANSCRIPT, + metrics_property_name=TECHMO_POLISH_CLASSIC_WER_METRIC, + require_update=False, + alignment_property_name=TECHMO_POLISH_CLASSIC_ALIGNMENT + ), + GoldTranscriptSpacyTokenPosProcessingTask( + task_name='gold_transcript_spacy_task', + input_property_name=GOLD_TRANSCRIPT, + spacy_property_name=GOLD_TRANSCRIPT_SPACY, + require_update=True + ), + AsrSpacyTokenPosProcessingTask( + task_name='techmo_spacy_task', + input_property_name=TECHMO_POLISH_ASR, + spacy_property_name=TECHMO_SPACY, + require_update=True + ), + SpacyPosWerProcessingTask( + task_name='PosWerProcessor', + require_update=False, + gold_transcript_pos_property_name=GOLD_TRANSCRIPT_SPACY, + asr_pos_property_name=TECHMO_SPACY, + pos_alignment_wer=POS_ALIGNMENT_WER, + pos_metrics_wer=POS_METRICS_WER + ) + ], + experiment_repository=experiment_repository, + relation_manager_provider=record_provider + ) + experiment_processor.process() + + +def example_run(): + experiment_repository = MultiFilesExperimentRepository( + PIPELINE_DATA_DIRECTORY, EXPERIMENT_NAME) + run_voicelab_experiment(experiment_repository) + + +if __name__ == '__main__': + example_run() diff --git a/experiment/voicelab/voicelab_telco_record_provider.py b/experiment/voicelab/voicelab_telco_record_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..8f00611ac55eb672f797f5115ef7465c49b31192 --- /dev/null +++ b/experiment/voicelab/voicelab_telco_record_provider.py @@ -0,0 +1,39 @@ +from pathlib import Path +from typing import Dict, Set + +from sziszapangma.integration.path_filter import PathFilter +from sziszapangma.integration.record_id_iterator import RecordIdIterator +from sziszapangma.integration.record_path_provider import RecordPathProvider +from sziszapangma.integration.relation_manager_provider import RelationManagerProvider +from sziszapangma.model.relation_manager import RelationManager, FileRelationManager + + +class VoicelabTelcoRecordProvider(RecordIdIterator, RecordPathProvider, RelationManagerProvider): + _path_by_id: Dict[str, str] + _relation_manager_root_path: str + + def __init__(self, path_filter: PathFilter, relation_manager_root_path: str): + self._path_by_id = dict({ + self._get_id(it): it + for it in path_filter.get_list_of_files() + }) + self._relation_manager_root_path = relation_manager_root_path + + def get_all_records(self) -> Set[str]: + return set(self._path_by_id.keys()) + + def get_path(self, record_id: str) -> str: + return self._path_by_id[record_id] + + def get_relation_manager(self, record_id: str) -> RelationManager: + record_path = Path(self._relation_manager_root_path).joinpath(record_id) + record_path.mkdir(parents=True, exist_ok=True) + return FileRelationManager( + str(record_path.joinpath('ab_relations.csv')), + str(record_path.joinpath('ab_items.json')) + ) + + @staticmethod + def _get_id(record_file_path: str) -> str: + path = record_file_path.replace('.wav', '') + return '__'.join(path.split('/')[-2:]) diff --git a/experiment_data/cached_asr/.gitignore b/experiment_data/cached_asr/.gitignore index eeada193c62a9d070b0bff4704c446c0a645a03a..c6c167008140b18ef457dd953f5adb88d8e939c6 100644 --- a/experiment_data/cached_asr/.gitignore +++ b/experiment_data/cached_asr/.gitignore @@ -1 +1,2 @@ /luna_techmo +/voicelab_cbiz_testset_20220322_techmo diff --git a/experiment_data/cached_asr/voicelab_cbiz_testset_20220322_techmo.dvc b/experiment_data/cached_asr/voicelab_cbiz_testset_20220322_techmo.dvc new file mode 100644 index 0000000000000000000000000000000000000000..629f0b94d1f75b50e40b0395277aa986e7b637ec --- /dev/null +++ b/experiment_data/cached_asr/voicelab_cbiz_testset_20220322_techmo.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 94b1709c05bd09b77c5a6850e2f2f373.dir + size: 34654307 + nfiles: 800 + path: voicelab_cbiz_testset_20220322_techmo diff --git a/experiment_data/dataset/.gitignore b/experiment_data/dataset/.gitignore index beaf70115d311f2cddb1fe658108705439bbf737..a2b9bfa575929ded20bea51da57a076bdb280dcf 100644 --- a/experiment_data/dataset/.gitignore +++ b/experiment_data/dataset/.gitignore @@ -1 +1,2 @@ /LUNA.PL +/voicelab_cbiz_testset_20220322 diff --git a/experiment_data/dataset/voicelab_cbiz_testset_20220322.dvc b/experiment_data/dataset/voicelab_cbiz_testset_20220322.dvc new file mode 100644 index 0000000000000000000000000000000000000000..f62672385124a534bb24849b3ba5ba6d005b4746 --- /dev/null +++ b/experiment_data/dataset/voicelab_cbiz_testset_20220322.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 3c2b18e1f1f89e4c5ad7b254e472b25e.dir + size: 4803739404 + nfiles: 1600 + path: voicelab_cbiz_testset_20220322 diff --git a/mypy.ini b/mypy.ini index 754ff4f419b37e74f1b15b5155b3454c7744101a..62368d74a698eaaa281c6f96390fb0f07893aea6 100644 --- a/mypy.ini +++ b/mypy.ini @@ -4,6 +4,9 @@ ignore_missing_imports = True [mypy-tensorflow.*] ignore_missing_imports = True +[mypy-minio.*] +ignore_missing_imports = True + [mypy-srsly.*] ignore_missing_imports = True diff --git a/sziszapangma/integration/__pycache__/path_filter.cpython-38.pyc b/sziszapangma/integration/__pycache__/path_filter.cpython-38.pyc index fe1989427f54e3a4261b6c2ebc65b4bd8629ce56..00df4512cd39cb98d7bd4067804d61f5e062d4f4 100644 Binary files a/sziszapangma/integration/__pycache__/path_filter.cpython-38.pyc and b/sziszapangma/integration/__pycache__/path_filter.cpython-38.pyc differ diff --git a/sziszapangma/integration/asr_processor.py b/sziszapangma/integration/asr_processor.py index 416baf243b5e7c98018fd5c7ab90312f0f30c303..56a8786b56e78adb5cb2942a2615cd8677be4ea6 100644 --- a/sziszapangma/integration/asr_processor.py +++ b/sziszapangma/integration/asr_processor.py @@ -37,7 +37,7 @@ class AsrWebClient(AsrProcessor): ) res = requests.post(self._url, files=files, headers=headers, timeout=600) json_response = res.json() - print(f'asr processing result {json_response}') + print(f"asr processing result {json_response}") return json_response @@ -45,11 +45,18 @@ class AsrPathCacheClient(AsrProcessor): cache_path: str path_to_id: Dict[str, str] - def __init__(self, cache_path: str, record_iterator: RecordIdIterator, record_path_provider: RecordPathProvider): + def __init__( + self, + cache_path: str, + record_iterator: RecordIdIterator, + record_path_provider: RecordPathProvider, + ): super(AsrPathCacheClient, self).__init__() self._cache_path = cache_path - self.path_to_id = {record_path_provider.get_path(it): it for it in record_iterator.get_all_records()} + self.path_to_id = { + record_path_provider.get_path(it): it for it in record_iterator.get_all_records() + } def call_recognise(self, file_path: str) -> Dict[str, Any]: - path = Path(self._cache_path).joinpath(f'{self.path_to_id[file_path]}.json') - return json.load(open(path, 'r')) + path = Path(self._cache_path).joinpath(f"{self.path_to_id[file_path]}.json") + return json.load(open(path, "r")) diff --git a/sziszapangma/integration/experiment_manager.py b/sziszapangma/integration/experiment_manager.py index 7732d7ca0a385a50208a24599eb4c8fd43f7f1fd..5ecd58f4f11a9965958e91a4135532f393f9a9ed 100644 --- a/sziszapangma/integration/experiment_manager.py +++ b/sziszapangma/integration/experiment_manager.py @@ -18,7 +18,7 @@ class ExperimentManager: experiment_repository: ExperimentRepository, record_id_iterator: RecordIdIterator, processing_tasks: List[ProcessingTask], - relation_manager_provider: RelationManagerProvider + relation_manager_provider: RelationManagerProvider, ): self._experiment_repository = experiment_repository self._record_id_iterator = record_id_iterator diff --git a/sziszapangma/integration/path_filter.py b/sziszapangma/integration/path_filter.py index 3053335d5d8aec049b0b008af7414ff926b9bf16..a5e55f90c90db25a564e94a982edd39af8ec2161 100644 --- a/sziszapangma/integration/path_filter.py +++ b/sziszapangma/integration/path_filter.py @@ -34,6 +34,6 @@ class ExtensionPathFilter(PathFilter): """ Implementation of searching files with extension. """ - path_generator = Path(self._root_directory).glob(f"LUNA.PL/**/*.{self._extension}") + path_generator = Path(self._root_directory).glob(f"**/*.{self._extension}") all_files = [str(it) for it in path_generator] return all_files if self._files_limit is None else all_files[: self._files_limit] diff --git a/sziszapangma/integration/repository/file_experiment_repository.py b/sziszapangma/integration/repository/file_experiment_repository.py index a528aad9e7c0f96214de0229e259522c4937a24a..6f8c658d92dc3548aaf40652b7d6ff2d2a09351b 100644 --- a/sziszapangma/integration/repository/file_experiment_repository.py +++ b/sziszapangma/integration/repository/file_experiment_repository.py @@ -2,8 +2,6 @@ import json import os from typing import Any, Dict, Optional, Set -import pandas as pd - from sziszapangma.integration.repository.experiment_repository import ExperimentRepository diff --git a/sziszapangma/integration/repository/minio_experiment_repository.py b/sziszapangma/integration/repository/minio_experiment_repository.py index 6d53f3a19c77989ff1fe9a71165ef9cf0e81affa..fef1689875371825f3a3d6b402dbb10b96791def 100644 --- a/sziszapangma/integration/repository/minio_experiment_repository.py +++ b/sziszapangma/integration/repository/minio_experiment_repository.py @@ -1,6 +1,6 @@ import io import json -from typing import Any, Optional, Set +from typing import Any, List, Optional, Set from minio import Minio @@ -28,10 +28,11 @@ class MinioExperimentRepository(ExperimentRepository): def update_property_for_key(self, record_id: str, property_name: str, property_value: Any): path = self._get_file_path(property_name, record_id) - content_bytes = json.dumps(property_value).encode('utf-8') + content_bytes = json.dumps(property_value).encode("utf-8") print(self._bucket_name, path) - self._client.put_object(self._bucket_name, path, io.BytesIO(content_bytes), - len(content_bytes)) + self._client.put_object( + self._bucket_name, path, io.BytesIO(content_bytes), len(content_bytes) + ) def delete_property_for_key(self, record_id: str, property_name: str): path = self._get_file_path(property_name, record_id) @@ -40,24 +41,32 @@ class MinioExperimentRepository(ExperimentRepository): def get_property_for_key(self, record_id: str, property_name: str) -> Optional[Any]: if self.property_exists(record_id, property_name): path = self._get_file_path(property_name, record_id) - json_content = self._client.get_object(self._bucket_name, path).read().decode('utf-8') + json_content = self._client.get_object(self._bucket_name, path).read().decode("utf-8") return json.loads(json_content) else: return None def get_all_record_ids(self) -> Set[str]: - ids = [] + ids: List[str] = [] for property_name in self.get_all_properties(): - path = f'{self._root_path}{self._experiment_name}/{property_name}/' - property_ids = set([obj.object_name.split('/')[-1].replace('.json', '') for obj in - self._client.list_objects(self._bucket_name, path)]) + path = f"{self._root_path}{self._experiment_name}/{property_name}/" + property_ids = set( + [ + obj.object_name.split("/")[-1].replace(".json", "") + for obj in self._client.list_objects(self._bucket_name, path) + ] + ) ids.extend(property_ids) return set(ids) def get_all_properties(self) -> Set[str]: - experiment_path = f'{self._root_path}{self._experiment_name}/' - return set([obj.object_name.split('/')[-1] for obj in - self._client.list_objects(self._bucket_name, experiment_path)]) + experiment_path = f"{self._root_path}{self._experiment_name}/" + return set( + [ + obj.object_name.split("/")[-1] + for obj in self._client.list_objects(self._bucket_name, experiment_path) + ] + ) def _get_file_path(self, property_name: str, record_id: str) -> str: - return f'{self._root_path}{self._experiment_name}/{property_name}/{record_id}.json' + return f"{self._root_path}{self._experiment_name}/{property_name}/{record_id}.json" diff --git a/sziszapangma/integration/repository/multi_files_experiment_repository.py b/sziszapangma/integration/repository/multi_files_experiment_repository.py index c6b5a23e8194cafaaa28e406eaf0122e9fdc4ff8..3d18f930ba1ec129a94f28837163b53d207dbfbf 100644 --- a/sziszapangma/integration/repository/multi_files_experiment_repository.py +++ b/sziszapangma/integration/repository/multi_files_experiment_repository.py @@ -1,6 +1,6 @@ import json from pathlib import Path -from typing import Any, Optional, Set +from typing import Any, List, Optional, Set from sziszapangma.integration.repository.experiment_repository import ExperimentRepository @@ -24,7 +24,7 @@ class MultiFilesExperimentRepository(ExperimentRepository): def update_property_for_key(self, record_id: str, property_name: str, property_value: Any): path = self._get_file_path(property_name, record_id) path.parent.mkdir(parents=True, exist_ok=True) - json.dump(property_value, open(path, 'w')) + json.dump(property_value, open(path, "w")) def delete_property_for_key(self, record_id: str, property_name: str): self._get_file_path(property_name, record_id).unlink() @@ -32,16 +32,17 @@ class MultiFilesExperimentRepository(ExperimentRepository): def get_property_for_key(self, record_id: str, property_name: str) -> Optional[Any]: if self.property_exists(record_id, property_name): path = self._get_file_path(property_name, record_id) - return json.load(open(path, 'r')) + return json.load(open(path, "r")) else: return None def get_all_record_ids(self) -> Set[str]: - ids = [] + ids: List[str] = [] for property_name in self.get_all_properties(): - path = Path(self._root_directory).joinpath(self._experiment_name)\ - .joinpath(property_name) - property_ids = set([children.name.replace('.json', '') for children in path.iterdir()]) + path = ( + Path(self._root_directory).joinpath(self._experiment_name).joinpath(property_name) + ) + property_ids = set([children.name.replace(".json", "") for children in path.iterdir()]) ids.extend(property_ids) return set(ids) @@ -49,13 +50,10 @@ class MultiFilesExperimentRepository(ExperimentRepository): experiment_path = Path(self._root_directory).joinpath(self._experiment_name) return set([it.name for it in experiment_path.iterdir()]) - def _get_file_path(self, property_name: str, record_id: str) -> Path: - return Path(self._root_directory) \ - .joinpath(self._experiment_name) \ - .joinpath(property_name) \ - .joinpath(f'{record_id}.json') - - -if __name__ == '__main__': - print(list(Path('./').iterdir())) + return ( + Path(self._root_directory) + .joinpath(self._experiment_name) + .joinpath(property_name) + .joinpath(f"{record_id}.json") + ) diff --git a/sziszapangma/integration/repository/multiple_experiment_repository.py b/sziszapangma/integration/repository/multiple_experiment_repository.py index b8e1d352f2b7cbb57f877ae5e4575460fd15175c..da43fe3938d64edce1f9a5d361153dac4f0addbd 100644 --- a/sziszapangma/integration/repository/multiple_experiment_repository.py +++ b/sziszapangma/integration/repository/multiple_experiment_repository.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Set, List +from typing import Any, List, Optional, Set from sziszapangma.integration.repository.experiment_repository import ExperimentRepository diff --git a/sziszapangma/integration/task/gold_transcript_task.py b/sziszapangma/integration/task/gold_transcript_task.py index d4cc6ba27c88db08393490528180cacf230d8e25..c8f547bdc106c0c5bfa62a0bcb8d8a8c3931aef2 100644 --- a/sziszapangma/integration/task/gold_transcript_task.py +++ b/sziszapangma/integration/task/gold_transcript_task.py @@ -27,7 +27,12 @@ class GoldTranscriptTask(ProcessingTask): is not None ) - def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository, relation_manager: RelationManager,): + def run_single_process( + self, + record_id: str, + experiment_repository: ExperimentRepository, + relation_manager: RelationManager, + ): experiment_repository.update_property_for_key( record_id, self._gold_transcript_property_name, diff --git a/sziszapangma/model/relation_manager.py b/sziszapangma/model/relation_manager.py index 64a742deb4f878525792fbb7a0ebb81cc138cacd..1360aa683d2ee7144f7a44d0485447cede5f88aa 100644 --- a/sziszapangma/model/relation_manager.py +++ b/sziszapangma/model/relation_manager.py @@ -114,4 +114,3 @@ class FileRelationManager(RelationManager): def clear_all(self) -> None: self.items_dict.clear() self.relations_dataframe = self.relations_dataframe[0:0] -