Skip to content
Snippets Groups Projects
Commit 8d234117 authored by Marcin Wątroba's avatar Marcin Wątroba
Browse files

Fix lint

parent c67a89c3
1 merge request!13Change data model
Showing
with 154 additions and 235 deletions
......@@ -8,7 +8,6 @@ from sziszapangma.integration.service_core.asr.asr_result import AsrResult
class SpeechbrainAsrProcessor(AsrBaseProcessor):
def process_asr(self, audio_file_path: str) -> AsrResult:
# prepare paths
file_tag = str(uuid.uuid4())
file_extension = audio_file_path.split('.')[-1]
file_name = f'{file_tag}.{file_extension}'
......
......@@ -3,5 +3,5 @@
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
docker build -t asr-clarin-pl-service "$SCRIPT_DIR"
docker tag asr-clarin-pl-service docker-registry.theliver.pl/asr-clarin-pl-service:1.4
docker push docker-registry.theliver.pl/asr-clarin-pl-service:1.4
docker tag asr-clarin-pl-service gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/asr-clarin-pl-service:1.4
docker push gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/asr-clarin-pl-service:1.4
File mode changed from 100644 to 100755
......@@ -3,5 +3,5 @@
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
docker build -t embedding_docker "$SCRIPT_DIR"
docker tag embedding_docker docker-registry.theliver.pl/embedding_docker:1.0
docker push docker-registry.theliver.pl/embedding_docker:1.0
docker tag embedding_docker gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/embedding_docker:1.0
docker push gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/embedding_docker:1.0
......@@ -3,5 +3,5 @@
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
docker build -t transformers-wav2vec2for_ctc "$SCRIPT_DIR"
docker tag transformers-wav2vec2for_ctc docker-registry.theliver.pl/transformers-wav2vec2for_ctc:1.0
docker push docker-registry.theliver.pl/transformers-wav2vec2for_ctc:1.0
docker tag transformers-wav2vec2for_ctc gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/transformers-wav2vec2for_ctc:1.0
docker push gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/transformers-wav2vec2for_ctc:1.0
......@@ -3,5 +3,5 @@
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
docker build --no-cache -t speechbrain-asr "$SCRIPT_DIR"
docker tag speechbrain-asr docker-registry.theliver.pl/speechbrain-asr:1.5
docker push docker-registry.theliver.pl/speechbrain-asr:1.5
docker tag speechbrain-asr gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/speechbrain-asr:1.5
docker push gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/speechbrain-asr:1.5
......@@ -3,5 +3,5 @@
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
docker build -t techmo-asr "$SCRIPT_DIR"
docker tag techmo-asr docker-registry.theliver.pl/techmo-asr:1.1
docker push docker-registry.theliver.pl/techmo-asr:1.1
docker tag techmo-asr gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/techmo-asr:1.1
docker push gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/techmo-asr:1.1
......@@ -36,3 +36,33 @@ stages:
md5: 2e334734387ab4579b7b5269d5029e81.dir
size: 71627685
nfiles: 4000
luna_import_to_common_format:
cmd: PYTHONPATH=. python experiment/luna/import_dataset/import_luna.py
deps:
- path: experiment/luna/import_dataset/import_luna.py
md5: 44a1b914afda2ae74462e7afd83f658e
size: 8278
- path: experiment_data/dataset/LUNA.PL
md5: d342155b1871e881797cf7da09d5dc3c.dir
size: 1578358645
nfiles: 4500
outs:
- path: experiment_data/dataset_relation_manager_data/luna
md5: ff680a49296818460a49bd0c70089a4a.dir
size: 229007155
nfiles: 1000
voicelab_import_to_common_format:
cmd: PYTHONPATH=. python experiment/voicelab/import_data.py
deps:
- path: experiment/voicelab/import_data.py
md5: 0cf7cf604b202489ce3b0cb51bb47fa2
size: 2264
- path: experiment_data/dataset/voicelab_cbiz_testset_20220322
md5: 3c2b18e1f1f89e4c5ad7b254e472b25e.dir
size: 4803739404
nfiles: 1600
outs:
- path: experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322
md5: 6d56f24b0ff78c0d44ade2114158150d.dir
size: 110711470
nfiles: 1600
stages:
import_luna_to_common_format:
luna_import_to_common_format:
cmd: PYTHONPATH=. python experiment/luna/import_dataset/import_luna.py
deps:
- experiment/luna/import_dataset/import_luna.py
......@@ -10,11 +10,34 @@ stages:
luna_main_pipeline:
cmd: |
PYTHONPATH=. python -m spacy download pl_core_news_lg
python -m spacy download pl_core_news_lg
PYTHONPATH=. python experiment/luna/pipeline/luna_main.py
deps:
- experiment/luna/pipeline/luna_main.py
- experiment_data/dataset_relation_manager_data/luna
- experiment_data/dataset/LUNA.PL
- experiment_data/cached_asr/luna_techmo
outs:
- experiment_data/pipeline/asr_benchmark_luna
voicelab_import_to_common_format:
cmd: PYTHONPATH=. python experiment/voicelab/import_data.py
deps:
- experiment/voicelab/import_data.py
- experiment_data/dataset/voicelab_cbiz_testset_20220322
outs:
- experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322
voicelab_main_pipeline:
cmd: |
python -m spacy download pl_core_news_lg
PYTHONPATH=. python experiment/voicelab/voicelab_pipeline.py
deps:
- experiment/voicelab/voicelab_pipeline.py
- experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322
- experiment_data/dataset/voicelab_cbiz_testset_20220322
- experiment_data/cached_asr/voicelab_cbiz_testset_20220322_techmo
outs:
- experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322
# concurrent features, multiprocessing
......@@ -171,7 +171,7 @@ def main():
luna_directory = 'experiment_data/dataset/LUNA.PL'
luna_record_provider = LunaRecordProvider(
ExtensionPathFilter(
root_directory=luna_directory,
root_directory=f'{luna_directory}/LUNA.PL',
extension='wav'
),
relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna'
......
import json
from pprint import pprint
from experiment.luna.luna_record_provider import LunaRecordProvider
from sziszapangma.integration.path_filter import ExtensionPathFilter

if __name__ == '__main__':
    # Build the LUNA record provider over the wav corpus.
    luna_directory = 'experiment_data/dataset/LUNA.PL'
    luna_record_provider = LunaRecordProvider(
        ExtensionPathFilter(
            root_directory=luna_directory,
            extension='wav'
        ),
        relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna'
    )
    pprint(luna_record_provider.get_all_records())
    # Copy each cached Techmo ASR result from the nested cache layout
    # (record ids encode directories with '__') into a flat per-record file.
    for record_id in luna_record_provider.get_all_records():
        path = f'experiment_data/cached_asr/LUNA_techmo_asr_cache/{record_id.replace("__", "/")}.wav.techmo.json'
        # Use context managers so file handles are closed deterministically;
        # the original left open() results to the garbage collector.
        with open(path, 'r') as input_file:
            raw = json.load(input_file)
        output_path = f'experiment_data/cached_asr/luna_techmo/{record_id}.json'
        with open(output_path, 'w') as output_file:
            json.dump(raw, output_file)
from typing import List, Dict
# from experiment.luna.luna_record_provider import LunaRecordProvider
from experiment.luna.luna_record_provider import LunaRecordProvider
from sziszapangma.integration.gold_transcript_processor import GoldTranscriptProcessor
from sziszapangma.model.relation_manager import RelationManager
......@@ -13,7 +12,6 @@ class LunaGoldTranscriptProcessor(GoldTranscriptProcessor):
self._record_provider = record_provider
def parse_word(self, word, relation_manager: RelationManager):
# print(word)
all_relations = relation_manager.get_all_relations_for_item(word['id'])
pos_id = [it['second_id'] for it in all_relations if it['second_type'] in ['pos']][0]
return {
......
from experiment.luna.pipeline.luna_gold_transcript_processor import LunaGoldTranscriptProcessor
from experiment.luna.pipeline.pos_processing.asr_spacy_token_pos_processing_task import \
AsrSpacyTokenPosProcessingTask
from experiment.luna.pipeline.pos_processing.gold_transcript_spacy_token_pos_processing_task import \
GoldTranscriptSpacyTokenPosProcessingTask
from experiment.luna.pipeline.pos_processing.spacy_pos_wer_processing_task import \
SpacyPosWerProcessingTask
from experiment.luna.luna_record_provider import LunaRecordProvider
from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask
from experiment.pos_processing.gold_transcript_spacy_token_pos_processing_task import \
GoldTranscriptSpacyTokenPosProcessingTask
from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
from sziszapangma.integration.asr_processor import AsrPathCacheClient
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.path_filter import ExtensionPathFilter
......@@ -30,7 +28,7 @@ POS_METRICS_WER = 'pos_metrics_wer'
def run_luna_experiment(experiment_repository: ExperimentRepository):
record_provider = LunaRecordProvider(
ExtensionPathFilter(
root_directory=LUNA_DIRECTORY,
root_directory=f'{LUNA_DIRECTORY}/LUNA.PL',
extension='wav'
),
relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna'
......
from dataclasses import dataclass
from typing import Any
from experiment.luna.pipeline.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask
@dataclass
class WordSpan:
    """Character span of a single word inside a transcript string."""

    # NOTE(review): WordSpan is not referenced anywhere in this module and the
    # same dataclass is duplicated across the pos_processing files — looks
    # like copy-paste leftovers; confirm before removing.
    text: str
    index_start: int
    index_end: int
class AsrSpacyTokenPosProcessingTask(SpacyTokenPosProcessingTask):
    """POS-tagging task for ASR output.

    The stored ASR property value is a dict whose ``full_text`` key holds
    the transcript handed to spaCy. Tokenisation, POS extraction and
    persistence are inherited from ``SpacyTokenPosProcessingTask``.
    """

    # The original overrode __init__ only to forward the identical argument
    # list to super(), and re-declared the parent's attribute annotations;
    # both were redundant and are removed — the inherited constructor has
    # the exact same signature, so callers are unaffected.

    def get_transcript_to_process(self, property_value: Any) -> str:
        """Return the raw transcript text from the ASR result payload."""
        return property_value['full_text']
from dataclasses import dataclass
from typing import Any
from experiment.luna.pipeline.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask
@dataclass
class WordSpan:
    """Character span of a single word inside a transcript string."""

    # NOTE(review): WordSpan is not referenced anywhere in this module and the
    # same dataclass is duplicated across the pos_processing files — looks
    # like copy-paste leftovers; confirm before removing.
    text: str
    index_start: int
    index_end: int
class GoldTranscriptSpacyTokenPosProcessingTask(SpacyTokenPosProcessingTask):
    """POS-tagging task for gold transcripts.

    The stored property value is a list of word dicts; the transcript
    handed to spaCy is their ``word`` fields joined by single spaces.
    """

    # The original overrode __init__ only to forward the identical argument
    # list to super(), and re-declared the parent's attribute annotations;
    # both were redundant and are removed — the inherited constructor has
    # the exact same signature, so callers are unaffected.

    def get_transcript_to_process(self, property_value: Any) -> str:
        """Join the gold-transcript word dicts into one space-separated string."""
        return ' '.join(it['word'] for it in property_value)
from typing import Any, List, Dict
from sziszapangma.core.alignment.alignment_classic_calculator import AlignmentClassicCalculator
from sziszapangma.core.alignment.alignment_step import AlignmentStep
from sziszapangma.core.wer.wer_calculator import WerCalculator
from sziszapangma.integration.mapper.alignment_step_mapper import AlignmentStepMapper
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
from sziszapangma.integration.task.processing_task import ProcessingTask
from sziszapangma.model.model import Word
from sziszapangma.model.relation_manager import RelationManager
_CLASSIC_WER = "classic_wer"


class SpacyPosWerProcessingTask(ProcessingTask):
    """Compute a classic WER over POS-tag sequences.

    Loads the spaCy POS annotations previously stored for the gold
    transcript and for the ASR transcript, aligns them with the classic
    alignment algorithm, and writes both the alignment steps and the
    resulting WER metric back into the experiment repository.
    """

    _gold_transcript_pos_property_name: str
    _asr_pos_property_name: str
    _pos_alignment_wer: str
    _pos_metrics_wer: str
    _alignment_classic_calculator: AlignmentClassicCalculator
    _wer_calculator: WerCalculator

    def __init__(
        self,
        task_name: str,
        gold_transcript_pos_property_name: str,
        require_update: bool,
        asr_pos_property_name: str,
        pos_alignment_wer: str,
        pos_metrics_wer: str
    ):
        """Store the property names to read/write and build the calculators."""
        super().__init__(task_name, require_update)
        self._gold_transcript_pos_property_name = gold_transcript_pos_property_name
        self._asr_pos_property_name = asr_pos_property_name
        self._pos_alignment_wer = pos_alignment_wer
        self._pos_metrics_wer = pos_metrics_wer
        self._alignment_classic_calculator = AlignmentClassicCalculator()
        self._wer_calculator = WerCalculator()

    def run_single_process(
        self,
        record_id: str,
        experiment_repository: ExperimentRepository,
        relation_manager: RelationManager,
    ):
        """Align gold vs ASR POS sequences for one record and persist results."""
        gold_pos_items = experiment_repository.get_property_for_key(
            record_id, self._gold_transcript_pos_property_name)
        asr_pos_items = experiment_repository.get_property_for_key(
            record_id, self._asr_pos_property_name)
        # Bug fix: the original built the Word lists BEFORE the None check,
        # so a missing property raised TypeError ('NoneType' not iterable)
        # instead of skipping the record as the check intended.
        if gold_pos_items is None or asr_pos_items is None:
            return
        # POS tags are wrapped in Word objects so the generic word-level
        # alignment calculator can operate on them unchanged.
        gold_transcript_pos = [
            Word(id=it['id'], type='Word', text=it['pos'])
            for it in gold_pos_items
        ]
        asr_transcript_pos = [
            Word(id=it['id'], type='Word', text=it['pos'])
            for it in asr_pos_items
        ]
        alignment_steps = self._get_alignment(gold_transcript_pos, asr_transcript_pos)
        experiment_repository.update_property_for_key(
            record_id,
            self._pos_alignment_wer,
            [AlignmentStepMapper.to_json_dict(it) for it in alignment_steps],
        )
        experiment_repository.update_property_for_key(
            record_id, self._pos_metrics_wer, self.calculate_metrics(alignment_steps)
        )

    def _get_alignment(
        self, gold_transcript: List[Word], asr_transcript: List[Word]
    ) -> List[AlignmentStep]:
        """Align the gold POS sequence (reference) against the ASR one (hypothesis)."""
        return self._alignment_classic_calculator.calculate_alignment(
            reference=gold_transcript, hypothesis=asr_transcript
        )

    def calculate_metrics(self, alignment_steps: List[AlignmentStep]) -> Dict[str, Any]:
        """Calculate all metrics for data sample."""
        metrics = dict()
        metrics[_CLASSIC_WER] = self._wer_calculator.calculate_wer(alignment_steps)
        return metrics

    def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool:
        """Skip records whose POS WER metric has already been computed."""
        return (
            experiment_repository.get_property_for_key(record_id, self._pos_metrics_wer)
            is not None
        )
import uuid
from abc import abstractmethod, ABC
from dataclasses import dataclass
from typing import Any, Dict
import spacy
from spacy.tokens import Token
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
from sziszapangma.integration.task.processing_task import ProcessingTask
from sziszapangma.model.relation_manager import RelationManager
@dataclass
class WordSpan:
    """Character span of a single word inside a transcript string."""

    # NOTE(review): WordSpan is not referenced anywhere in this module and the
    # same dataclass is duplicated across the pos_processing files — looks
    # like copy-paste leftovers; confirm before removing.
    text: str
    index_start: int
    index_end: int
class SpacyTokenPosProcessingTask(ProcessingTask, ABC):
    """Base task that POS-tags a stored transcript with spaCy.

    Subclasses implement ``get_transcript_to_process`` to extract raw text
    from the stored property value; this class runs the spaCy pipeline and
    persists one ``{'id', 'word', 'pos'}`` dict per token.
    """

    _spacy_model_name: str
    _nlp: Any
    _input_property_name: str
    _spacy_property_name: str

    def __init__(
        self,
        task_name: str,
        spacy_property_name: str,
        require_update: bool,
        input_property_name: str,
        spacy_model_name: str = "pl_core_news_lg"
    ):
        """Load the spaCy model and remember the property names.

        :param spacy_model_name: spaCy pipeline to load. Fix: the original
            declared ``_spacy_model_name`` but never assigned it and
            hard-coded the Polish model; the old value is kept as the
            default, so existing callers are unaffected.
        """
        super().__init__(task_name, require_update)
        self._spacy_property_name = spacy_property_name
        self._spacy_model_name = spacy_model_name
        self._nlp = spacy.load(spacy_model_name)
        self._input_property_name = input_property_name

    def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository,
                           relation_manager: RelationManager):
        """POS-tag one record's transcript and store the token dicts."""
        test_property = experiment_repository.get_property_for_key(
            record_id, self._input_property_name)
        document = self._nlp(self.get_transcript_to_process(test_property))
        spacy_result = [self.token_to_result_dict(token) for token in document]
        experiment_repository.update_property_for_key(record_id, self._spacy_property_name,
                                                      spacy_result)

    @staticmethod
    def token_to_result_dict(token: Token) -> Dict[str, str]:
        """Map a spaCy token onto a serialisable dict with a fresh UUID id."""
        return {'id': str(uuid.uuid4()), 'word': token.text, 'pos': token.pos_}

    @abstractmethod
    def get_transcript_to_process(self, property_value: Any) -> str:
        """Extract the raw transcript text from the stored property value."""
        pass

    def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool:
        """Skip records that already carry the spaCy POS property."""
        return experiment_repository.property_exists(record_id, self._spacy_property_name)
import os.path
from typing import List
from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider
from sziszapangma.integration.path_filter import ExtensionPathFilter
from sziszapangma.model.model import Word
from sziszapangma.model.model_creators import create_new_word, create_new_document
from sziszapangma.model.relation_manager import RelationManager
class VoicelabAdapter:
    """Imports Voicelab gold transcripts into the relation-manager model.

    For every record the transcript file (``.txt`` or ``.asr``) is read,
    split into whitespace-separated words, and stored together with a
    Document item plus word→document relations.
    """

    _record_provider: VoicelabTelcoRecordProvider
    _root_path: str

    def __init__(
        self,
        record_provider: VoicelabTelcoRecordProvider,
        root_path: str = '/home/marcinwatroba/PWR_ASR/asr-benchmarks/experiment_data/dataset/voicelab_cbiz_testset_20220322'
    ):
        """Create the adapter.

        :param record_provider: source of record ids and relation managers.
        :param root_path: dataset root directory. Fix: the original
            hard-coded this machine-specific absolute path inside
            ``import_record``; it is now a parameter whose default keeps the
            old value for backward compatibility.
        """
        self._record_provider = record_provider
        self._root_path = root_path

    @staticmethod
    def save_words(
        words_path: str,
        relation_manager: RelationManager
    ) -> List[Word]:
        """Persist every word of a transcript file and a Document linking them.

        :param words_path: path to a whitespace-separated transcript file.
        :param relation_manager: store receiving the items and relations.
        :return: the saved Word items in file order.
        """
        with open(words_path, 'r') as f:
            text = f.read()
        words = []
        for single_word in text.split():
            word = create_new_word(text=single_word)
            relation_manager.save_item(word)
            words.append(word)
        document = create_new_document([word['id'] for word in words])
        relation_manager.save_item(document)
        for word in words:
            relation_manager.save_relation(word, document)
        return words

    def import_record(self, record_id: str):
        """Import one record's gold transcript, replacing any previous data.

        :raises FileNotFoundError: if neither the ``.txt`` nor the ``.asr``
            transcript exists (the original raised a bare IndexError here).
        """
        print(f'record {record_id}')
        relation_manager = self._record_provider.get_relation_manager(record_id)
        relation_manager.clear_all()
        # Record ids encode the directory structure with '__' separators.
        candidate_paths = [
            f'{self._root_path}/{record_id.replace("__", "/")}.txt',
            f'{self._root_path}/{record_id.replace("__", "/")}.asr'
        ]
        existing_paths = [path for path in candidate_paths if os.path.exists(path)]
        if not existing_paths:
            raise FileNotFoundError(
                f'no transcript file found for record {record_id}: {candidate_paths}')
        self.save_words(existing_paths[0], relation_manager)
        relation_manager.commit()
if __name__ == '__main__':
    # Import every Voicelab record's gold transcript into the
    # relation-manager data directory.
    record_provider = VoicelabTelcoRecordProvider(
        ExtensionPathFilter(
            'experiment_data/dataset/voicelab_cbiz_testset_20220322',
            'wav'
        ),
        'experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322'
    )
    importer = VoicelabAdapter(record_provider)
    for record_id in record_provider.get_all_records():
        importer.import_record(record_id)
from typing import Any, Dict, List
from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider
from sziszapangma.integration.gold_transcript_processor import GoldTranscriptProcessor
class VoicelabGoldTranscriptProcessor(GoldTranscriptProcessor):
    """Serves Voicelab gold transcripts out of the relation-manager store."""

    _record_provider: VoicelabTelcoRecordProvider

    def __init__(self, record_provider: VoicelabTelcoRecordProvider):
        self._record_provider = record_provider

    def parse_word(self, word: Dict[str, Any]) -> Dict[str, Any]:
        """Project a relation-manager word item onto the {id, word} schema."""
        return {
            'id': word['id'],
            'word': word['text']
        }

    def get_gold_transcript(self, record_id: str) -> List[Dict[str, Any]]:
        """Return the gold transcript words for *record_id* in document order.

        Fix: the return annotation used the builtin function ``any`` instead
        of ``typing.Any``, which is not a valid type.
        """
        relation_manager = self._record_provider.get_relation_manager(record_id)
        # Exactly one Document item is expected per record; presumably
        # guaranteed by the importer — TODO confirm.
        document = [itt for itt in relation_manager.get_all_items() if itt['type'] == 'Document'][0]
        document_words = [relation_manager.get_item_by_id(item_id) for item_id in document['word_ids']]
        return [self.parse_word(word) for word in document_words]
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment