Skip to content
Snippets Groups Projects
Commit 8d234117 authored by Marcin Wątroba's avatar Marcin Wątroba
Browse files

Fix lint

parent c67a89c3
Branches
1 merge request: !13 Change data model
Showing
with 214 additions and 41 deletions
from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask
from experiment.pos_processing.gold_transcript_spacy_token_pos_processing_task import \
GoldTranscriptSpacyTokenPosProcessingTask
from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
from experiment.voicelab.voicelab_gold_transcript_processor import VoicelabGoldTranscriptProcessor
from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider
from sziszapangma.integration.asr_processor import AsrPathCacheClient
from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.path_filter import ExtensionPathFilter
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
from sziszapangma.integration.repository.multi_files_experiment_repository import \
MultiFilesExperimentRepository
from sziszapangma.integration.task.asr_task import AsrTask
from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
from sziszapangma.integration.task.gold_transcript_task import GoldTranscriptTask
# Property names under which the tasks built in run_voicelab_experiment
# read and write their per-record results.
GOLD_TRANSCRIPT = 'gold_transcript'
TECHMO_POLISH_ASR = 'techmo_polish_asr'
TECHMO_POLISH_CLASSIC_WER_METRIC = 'techmo_polish_classic_wer_metric'
TECHMO_POLISH_CLASSIC_ALIGNMENT = 'techmo_polish_classic_alignment'
TECHMO_SPACY = 'techmo_spacy'
GOLD_TRANSCRIPT_SPACY = 'gold_transcript_spacy'
POS_ALIGNMENT_WER = 'pos_alignment_wer'
POS_METRICS_WER = 'pos_metrics_wer'
# Root directory handed to ExtensionPathFilter when looking for 'wav' files.
DATASET_DIRECTORY = 'experiment_data/dataset/voicelab_cbiz_testset_20220322'
# Arguments passed to MultiFilesExperimentRepository in example_run.
PIPELINE_DATA_DIRECTORY = 'experiment_data/pipeline'
EXPERIMENT_NAME = 'asr_benchmark_voicelab_cbiz_testset_20220322'
# Passed to VoicelabTelcoRecordProvider as relation_manager_root_path.
RELATION_MANAGER_ROOT_PATH = 'experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322'
def run_voicelab_experiment(experiment_repository: ExperimentRepository):
    """Build the Voicelab processing pipeline and run it.

    The pipeline consists of six tasks, executed through an
    ``ExperimentManager``: gold transcript, Techmo ASR, classic WER metric,
    spaCy processing of the gold transcript, spaCy processing of the ASR
    output, and the POS-based WER task.

    :param experiment_repository: repository handed to the
        ``ExperimentManager`` for the processed properties.
    """
    wav_filter = ExtensionPathFilter(root_directory=DATASET_DIRECTORY, extension='wav')
    # One provider object plays three roles below: record id iterator,
    # record path provider and relation manager provider.
    provider = VoicelabTelcoRecordProvider(
        wav_filter,
        relation_manager_root_path=RELATION_MANAGER_ROOT_PATH,
    )

    gold_transcript_task = GoldTranscriptTask(
        task_name='gold_transcript_task',
        gold_transcript_processor=VoicelabGoldTranscriptProcessor(provider),
        gold_transcript_property_name=GOLD_TRANSCRIPT,
        require_update=False,
    )

    # Alternative processor kept from the original source for reference:
    # asr_processor=AsrWebClient('http://192.168.0.124:4999/process_asr', 'test1234'),
    cached_asr_client = AsrPathCacheClient(
        'experiment_data/cached_asr/voicelab_cbiz_testset_20220322_techmo',
        provider,
        provider,
    )
    asr_task = AsrTask(
        task_name='techmo_polish_task',
        asr_processor=cached_asr_client,
        asr_property_name=TECHMO_POLISH_ASR,
        require_update=False,
        record_path_provider=provider,
    )

    classic_wer_task = ClassicWerMetricTask(
        task_name='classic_wer_metric_task',
        asr_property_name=TECHMO_POLISH_ASR,
        gold_transcript_property_name=GOLD_TRANSCRIPT,
        metrics_property_name=TECHMO_POLISH_CLASSIC_WER_METRIC,
        require_update=False,
        alignment_property_name=TECHMO_POLISH_CLASSIC_ALIGNMENT,
    )

    gold_spacy_task = GoldTranscriptSpacyTokenPosProcessingTask(
        task_name='gold_transcript_spacy_task',
        input_property_name=GOLD_TRANSCRIPT,
        spacy_property_name=GOLD_TRANSCRIPT_SPACY,
        require_update=True,
    )

    asr_spacy_task = AsrSpacyTokenPosProcessingTask(
        task_name='techmo_spacy_task',
        input_property_name=TECHMO_POLISH_ASR,
        spacy_property_name=TECHMO_SPACY,
        require_update=True,
    )

    pos_wer_task = SpacyPosWerProcessingTask(
        task_name='PosWerProcessor',
        require_update=False,
        gold_transcript_pos_property_name=GOLD_TRANSCRIPT_SPACY,
        asr_pos_property_name=TECHMO_SPACY,
        pos_alignment_wer=POS_ALIGNMENT_WER,
        pos_metrics_wer=POS_METRICS_WER,
    )

    # List order is the same as in the original task list.
    pipeline = ExperimentManager(
        record_id_iterator=provider,
        processing_tasks=[
            gold_transcript_task,
            asr_task,
            classic_wer_task,
            gold_spacy_task,
            asr_spacy_task,
            pos_wer_task,
        ],
        experiment_repository=experiment_repository,
        relation_manager_provider=provider,
    )
    pipeline.process()
def example_run():
    """Run the Voicelab experiment against a multi-file repository.

    The repository is created from ``PIPELINE_DATA_DIRECTORY`` and
    ``EXPERIMENT_NAME``.
    """
    run_voicelab_experiment(
        MultiFilesExperimentRepository(PIPELINE_DATA_DIRECTORY, EXPERIMENT_NAME)
    )
# Script entry point: run the example only when executed directly.
if __name__ == '__main__':
    example_run()
from pathlib import Path
from typing import Dict, Set
from sziszapangma.integration.path_filter import PathFilter
from sziszapangma.integration.record_id_iterator import RecordIdIterator
from sziszapangma.integration.record_path_provider import RecordPathProvider
from sziszapangma.integration.relation_manager_provider import RelationManagerProvider
from sziszapangma.model.relation_manager import RelationManager, FileRelationManager
class VoicelabTelcoRecordProvider(RecordIdIterator, RecordPathProvider, RelationManagerProvider):
    """Record provider built from the files listed by a ``PathFilter``.

    Each file path is mapped to a record id of the form
    ``<parent directory>__<file name without the '.wav' suffix>``. The same
    object answers record-id, record-path and relation-manager queries.
    """

    # Maps record id -> original file path as returned by the path filter.
    _path_by_id: Dict[str, str]
    # Directory under which one sub-directory per record id is created.
    _relation_manager_root_path: str

    def __init__(self, path_filter: PathFilter, relation_manager_root_path: str):
        """Index every file reported by ``path_filter`` by its record id.

        :param path_filter: source of the file paths to expose as records.
        :param relation_manager_root_path: root directory for the per-record
            relation manager files.
        """
        # NOTE(review): two paths sharing the same parent directory name and
        # file name produce the same id; the later path replaces the earlier.
        self._path_by_id = {
            self._get_id(file_path): file_path
            for file_path in path_filter.get_list_of_files()
        }
        self._relation_manager_root_path = relation_manager_root_path

    def get_all_records(self) -> Set[str]:
        """Return the ids of all indexed records."""
        return set(self._path_by_id)

    def get_path(self, record_id: str) -> str:
        """Return the file path registered for ``record_id``.

        :raises KeyError: if ``record_id`` is not an indexed record.
        """
        return self._path_by_id[record_id]

    def get_relation_manager(self, record_id: str) -> RelationManager:
        """Return a ``FileRelationManager`` for ``record_id``.

        The directory ``<relation_manager_root_path>/<record_id>`` is created
        if it is missing; the manager is given the paths of
        ``ab_relations.csv`` and ``ab_items.json`` inside it.
        """
        record_directory = Path(self._relation_manager_root_path).joinpath(record_id)
        record_directory.mkdir(parents=True, exist_ok=True)
        return FileRelationManager(
            str(record_directory.joinpath('ab_relations.csv')),
            str(record_directory.joinpath('ab_items.json'))
        )

    @staticmethod
    def _get_id(record_file_path: str) -> str:
        """Derive the record id from a file path.

        The id is the last two ``/``-separated path components joined with
        ``__``, with a trailing ``.wav`` removed from the file name.

        :param record_file_path: path of the audio file.
        :return: the record id, e.g. ``'speaker__utt'`` for
            ``'data/speaker/utt.wav'``.
        """
        extension = '.wav'
        # Strip the extension only when it is the real suffix: a blanket
        # str.replace would also remove '.wav' occurring inside a directory
        # or file name and silently change the id.
        if record_file_path.endswith(extension):
            record_file_path = record_file_path[:-len(extension)]
        return '__'.join(record_file_path.split('/')[-2:])
/luna_techmo /luna_techmo
/voicelab_cbiz_testset_20220322_techmo
outs:
- md5: 94b1709c05bd09b77c5a6850e2f2f373.dir
size: 34654307
nfiles: 800
path: voicelab_cbiz_testset_20220322_techmo
/LUNA.PL /LUNA.PL
/voicelab_cbiz_testset_20220322
outs:
- md5: 3c2b18e1f1f89e4c5ad7b254e472b25e.dir
size: 4803739404
nfiles: 1600
path: voicelab_cbiz_testset_20220322
...@@ -4,6 +4,9 @@ ignore_missing_imports = True ...@@ -4,6 +4,9 @@ ignore_missing_imports = True
[mypy-tensorflow.*] [mypy-tensorflow.*]
ignore_missing_imports = True ignore_missing_imports = True
[mypy-minio.*]
ignore_missing_imports = True
[mypy-srsly.*] [mypy-srsly.*]
ignore_missing_imports = True ignore_missing_imports = True
......
No preview for this file type
...@@ -37,7 +37,7 @@ class AsrWebClient(AsrProcessor): ...@@ -37,7 +37,7 @@ class AsrWebClient(AsrProcessor):
) )
res = requests.post(self._url, files=files, headers=headers, timeout=600) res = requests.post(self._url, files=files, headers=headers, timeout=600)
json_response = res.json() json_response = res.json()
print(f'asr processing result {json_response}') print(f"asr processing result {json_response}")
return json_response return json_response
...@@ -45,11 +45,18 @@ class AsrPathCacheClient(AsrProcessor): ...@@ -45,11 +45,18 @@ class AsrPathCacheClient(AsrProcessor):
cache_path: str cache_path: str
path_to_id: Dict[str, str] path_to_id: Dict[str, str]
def __init__(self, cache_path: str, record_iterator: RecordIdIterator, record_path_provider: RecordPathProvider): def __init__(
self,
cache_path: str,
record_iterator: RecordIdIterator,
record_path_provider: RecordPathProvider,
):
super(AsrPathCacheClient, self).__init__() super(AsrPathCacheClient, self).__init__()
self._cache_path = cache_path self._cache_path = cache_path
self.path_to_id = {record_path_provider.get_path(it): it for it in record_iterator.get_all_records()} self.path_to_id = {
record_path_provider.get_path(it): it for it in record_iterator.get_all_records()
}
def call_recognise(self, file_path: str) -> Dict[str, Any]: def call_recognise(self, file_path: str) -> Dict[str, Any]:
path = Path(self._cache_path).joinpath(f'{self.path_to_id[file_path]}.json') path = Path(self._cache_path).joinpath(f"{self.path_to_id[file_path]}.json")
return json.load(open(path, 'r')) return json.load(open(path, "r"))
...@@ -18,7 +18,7 @@ class ExperimentManager: ...@@ -18,7 +18,7 @@ class ExperimentManager:
experiment_repository: ExperimentRepository, experiment_repository: ExperimentRepository,
record_id_iterator: RecordIdIterator, record_id_iterator: RecordIdIterator,
processing_tasks: List[ProcessingTask], processing_tasks: List[ProcessingTask],
relation_manager_provider: RelationManagerProvider relation_manager_provider: RelationManagerProvider,
): ):
self._experiment_repository = experiment_repository self._experiment_repository = experiment_repository
self._record_id_iterator = record_id_iterator self._record_id_iterator = record_id_iterator
......
...@@ -34,6 +34,6 @@ class ExtensionPathFilter(PathFilter): ...@@ -34,6 +34,6 @@ class ExtensionPathFilter(PathFilter):
""" """
Implementation of searching files with extension. Implementation of searching files with extension.
""" """
path_generator = Path(self._root_directory).glob(f"LUNA.PL/**/*.{self._extension}") path_generator = Path(self._root_directory).glob(f"**/*.{self._extension}")
all_files = [str(it) for it in path_generator] all_files = [str(it) for it in path_generator]
return all_files if self._files_limit is None else all_files[: self._files_limit] return all_files if self._files_limit is None else all_files[: self._files_limit]
...@@ -2,8 +2,6 @@ import json ...@@ -2,8 +2,6 @@ import json
import os import os
from typing import Any, Dict, Optional, Set from typing import Any, Dict, Optional, Set
import pandas as pd
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
......
import io import io
import json import json
from typing import Any, Optional, Set from typing import Any, List, Optional, Set
from minio import Minio from minio import Minio
...@@ -28,10 +28,11 @@ class MinioExperimentRepository(ExperimentRepository): ...@@ -28,10 +28,11 @@ class MinioExperimentRepository(ExperimentRepository):
def update_property_for_key(self, record_id: str, property_name: str, property_value: Any): def update_property_for_key(self, record_id: str, property_name: str, property_value: Any):
path = self._get_file_path(property_name, record_id) path = self._get_file_path(property_name, record_id)
content_bytes = json.dumps(property_value).encode('utf-8') content_bytes = json.dumps(property_value).encode("utf-8")
print(self._bucket_name, path) print(self._bucket_name, path)
self._client.put_object(self._bucket_name, path, io.BytesIO(content_bytes), self._client.put_object(
len(content_bytes)) self._bucket_name, path, io.BytesIO(content_bytes), len(content_bytes)
)
def delete_property_for_key(self, record_id: str, property_name: str): def delete_property_for_key(self, record_id: str, property_name: str):
path = self._get_file_path(property_name, record_id) path = self._get_file_path(property_name, record_id)
...@@ -40,24 +41,32 @@ class MinioExperimentRepository(ExperimentRepository): ...@@ -40,24 +41,32 @@ class MinioExperimentRepository(ExperimentRepository):
def get_property_for_key(self, record_id: str, property_name: str) -> Optional[Any]: def get_property_for_key(self, record_id: str, property_name: str) -> Optional[Any]:
if self.property_exists(record_id, property_name): if self.property_exists(record_id, property_name):
path = self._get_file_path(property_name, record_id) path = self._get_file_path(property_name, record_id)
json_content = self._client.get_object(self._bucket_name, path).read().decode('utf-8') json_content = self._client.get_object(self._bucket_name, path).read().decode("utf-8")
return json.loads(json_content) return json.loads(json_content)
else: else:
return None return None
def get_all_record_ids(self) -> Set[str]: def get_all_record_ids(self) -> Set[str]:
ids = [] ids: List[str] = []
for property_name in self.get_all_properties(): for property_name in self.get_all_properties():
path = f'{self._root_path}{self._experiment_name}/{property_name}/' path = f"{self._root_path}{self._experiment_name}/{property_name}/"
property_ids = set([obj.object_name.split('/')[-1].replace('.json', '') for obj in property_ids = set(
self._client.list_objects(self._bucket_name, path)]) [
obj.object_name.split("/")[-1].replace(".json", "")
for obj in self._client.list_objects(self._bucket_name, path)
]
)
ids.extend(property_ids) ids.extend(property_ids)
return set(ids) return set(ids)
def get_all_properties(self) -> Set[str]: def get_all_properties(self) -> Set[str]:
experiment_path = f'{self._root_path}{self._experiment_name}/' experiment_path = f"{self._root_path}{self._experiment_name}/"
return set([obj.object_name.split('/')[-1] for obj in return set(
self._client.list_objects(self._bucket_name, experiment_path)]) [
obj.object_name.split("/")[-1]
for obj in self._client.list_objects(self._bucket_name, experiment_path)
]
)
def _get_file_path(self, property_name: str, record_id: str) -> str: def _get_file_path(self, property_name: str, record_id: str) -> str:
return f'{self._root_path}{self._experiment_name}/{property_name}/{record_id}.json' return f"{self._root_path}{self._experiment_name}/{property_name}/{record_id}.json"
import json import json
from pathlib import Path from pathlib import Path
from typing import Any, Optional, Set from typing import Any, List, Optional, Set
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
...@@ -24,7 +24,7 @@ class MultiFilesExperimentRepository(ExperimentRepository): ...@@ -24,7 +24,7 @@ class MultiFilesExperimentRepository(ExperimentRepository):
def update_property_for_key(self, record_id: str, property_name: str, property_value: Any): def update_property_for_key(self, record_id: str, property_name: str, property_value: Any):
path = self._get_file_path(property_name, record_id) path = self._get_file_path(property_name, record_id)
path.parent.mkdir(parents=True, exist_ok=True) path.parent.mkdir(parents=True, exist_ok=True)
json.dump(property_value, open(path, 'w')) json.dump(property_value, open(path, "w"))
def delete_property_for_key(self, record_id: str, property_name: str): def delete_property_for_key(self, record_id: str, property_name: str):
self._get_file_path(property_name, record_id).unlink() self._get_file_path(property_name, record_id).unlink()
...@@ -32,16 +32,17 @@ class MultiFilesExperimentRepository(ExperimentRepository): ...@@ -32,16 +32,17 @@ class MultiFilesExperimentRepository(ExperimentRepository):
def get_property_for_key(self, record_id: str, property_name: str) -> Optional[Any]: def get_property_for_key(self, record_id: str, property_name: str) -> Optional[Any]:
if self.property_exists(record_id, property_name): if self.property_exists(record_id, property_name):
path = self._get_file_path(property_name, record_id) path = self._get_file_path(property_name, record_id)
return json.load(open(path, 'r')) return json.load(open(path, "r"))
else: else:
return None return None
def get_all_record_ids(self) -> Set[str]: def get_all_record_ids(self) -> Set[str]:
ids = [] ids: List[str] = []
for property_name in self.get_all_properties(): for property_name in self.get_all_properties():
path = Path(self._root_directory).joinpath(self._experiment_name)\ path = (
.joinpath(property_name) Path(self._root_directory).joinpath(self._experiment_name).joinpath(property_name)
property_ids = set([children.name.replace('.json', '') for children in path.iterdir()]) )
property_ids = set([children.name.replace(".json", "") for children in path.iterdir()])
ids.extend(property_ids) ids.extend(property_ids)
return set(ids) return set(ids)
...@@ -49,13 +50,10 @@ class MultiFilesExperimentRepository(ExperimentRepository): ...@@ -49,13 +50,10 @@ class MultiFilesExperimentRepository(ExperimentRepository):
experiment_path = Path(self._root_directory).joinpath(self._experiment_name) experiment_path = Path(self._root_directory).joinpath(self._experiment_name)
return set([it.name for it in experiment_path.iterdir()]) return set([it.name for it in experiment_path.iterdir()])
def _get_file_path(self, property_name: str, record_id: str) -> Path: def _get_file_path(self, property_name: str, record_id: str) -> Path:
return Path(self._root_directory) \ return (
.joinpath(self._experiment_name) \ Path(self._root_directory)
.joinpath(property_name) \ .joinpath(self._experiment_name)
.joinpath(f'{record_id}.json') .joinpath(property_name)
.joinpath(f"{record_id}.json")
)
if __name__ == '__main__':
print(list(Path('./').iterdir()))
from typing import Any, Optional, Set, List from typing import Any, List, Optional, Set
from sziszapangma.integration.repository.experiment_repository import ExperimentRepository from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
......
...@@ -27,7 +27,12 @@ class GoldTranscriptTask(ProcessingTask): ...@@ -27,7 +27,12 @@ class GoldTranscriptTask(ProcessingTask):
is not None is not None
) )
def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository, relation_manager: RelationManager,): def run_single_process(
self,
record_id: str,
experiment_repository: ExperimentRepository,
relation_manager: RelationManager,
):
experiment_repository.update_property_for_key( experiment_repository.update_property_for_key(
record_id, record_id,
self._gold_transcript_property_name, self._gold_transcript_property_name,
......
...@@ -114,4 +114,3 @@ class FileRelationManager(RelationManager): ...@@ -114,4 +114,3 @@ class FileRelationManager(RelationManager):
def clear_all(self) -> None: def clear_all(self) -> None:
self.items_dict.clear() self.items_dict.clear()
self.relations_dataframe = self.relations_dataframe[0:0] self.relations_dataframe = self.relations_dataframe[0:0]
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment