Skip to content
Snippets Groups Projects
Unverified Commit 942a7ef3 authored by Marcin Wątroba's avatar Marcin Wątroba
Browse files

Add processing

parent eca94342
No related branches found
No related tags found
1 merge request: !13 Change data model
Showing
with 111 additions and 94 deletions
[core] [core]
remote = clarin-biz-asr-benchmarks remote = clarin-biz-asr-benchmarks
autostage = true
['remote "clarin-biz-asr-benchmarks"'] ['remote "clarin-biz-asr-benchmarks"']
url = s3://projects/clarin-biz-asr-benchmark/dvc url = s3://projects/clarin-biz-asr-benchmark/dvc
endpointurl = https://s3.clarin-pl.eu endpointurl = https://s3.clarin-pl.eu
...@@ -111,4 +111,3 @@ debug_run/ ...@@ -111,4 +111,3 @@ debug_run/
publish_to_theliver.sh publish_to_theliver.sh
.idea .idea
/experiment_data
dvc.lock 0 → 100644
schema: '2.0'
stages:
import_luna_to_common_format:
cmd: PYTHONPATH=. python experiment/luna/import_dataset/import_luna.py
deps:
- path: experiment/luna/import_dataset/import_luna.py
md5: d938162187616f7e7390983ecb9e120b
size: 8269
- path: experiment_data/dataset/LUNA.PL
md5: d342155b1871e881797cf7da09d5dc3c.dir
size: 1578358645
nfiles: 4500
outs:
- path: experiment_data/dataset_relation_manager_data/luna
md5: c68722cc69375259a4d1a4b6a2bd4dc3.dir
size: 3016826
nfiles: 10
dvc.yaml 0 → 100644
stages:
import_luna_to_common_format:
cmd: PYTHONPATH=. python experiment/luna/import_dataset/import_luna.py
deps:
- experiment/luna/import_dataset/import_luna.py
- experiment_data/dataset/LUNA.PL
outs:
- experiment_data/dataset_relation_manager_data/luna
# luna_main_pipeline:
# cmd: PYTHONPATH=. python experiment/luna/pipeline/luna_main.py
# deps:
# - experiment_data/dataset_relation_manager_data/luna
# - experiment_data/dataset/LUNA.PL
# outs:
# - experiment_data/pipeline/luna
from typing import Dict, Set
from sziszapangma.integration.path_filter import PathFilter
from sziszapangma.integration.record_id_iterator import RecordIdIterator
from sziszapangma.integration.record_path_provider import RecordPathProvider
from sziszapangma.integration.relation_manager_provider import RelationManagerProvider
from sziszapangma.model.relation_manager import RelationManager, FileRelationManager
class LunaRecordProvider(RecordIdIterator, RecordPathProvider, RelationManagerProvider):
    """Index recording files by record id and expose paths and relation managers.

    The index is built once, in ``__init__``, from the files returned by the
    supplied ``PathFilter``. Record ids are derived from file paths by
    ``_get_id``.
    """

    # Maps record id -> file path exactly as returned by the path filter.
    _path_by_id: Dict[str, str]

    def __init__(self, path_filter: PathFilter):
        """Build the id -> path index from ``path_filter.get_list_of_files()``."""
        self._path_by_id = {
            self._get_id(it): it
            for it in path_filter.get_list_of_files()
        }

    def get_all_records(self) -> Set[str]:
        """Return a new set containing every known record id."""
        return set(self._path_by_id)

    def get_path(self, record_id: str) -> str:
        """Return the file path stored for ``record_id``.

        Raises:
            KeyError: if ``record_id`` is not in the index.
        """
        return self._path_by_id[record_id]

    def get_item_file_path(self, record_id: str, file) -> str:
        """Return the file path stored for ``record_id``.

        NOTE(review): ``file`` is accepted but ignored, so the result is the
        same as ``get_path`` -- confirm this is intended.

        Raises:
            KeyError: if ``record_id`` is not in the index.
        """
        return self._path_by_id[record_id]

    def get_relation_manager(self, record_id: str) -> RelationManager:
        """Return a ``FileRelationManager`` whose files sit next to the recording.

        The two file names are the record path without its last four
        characters, followed by ``_ab_relations.csv`` and ``_ab_items.json``.
        """
        # Dropping four characters assumes a three-letter extension such as '.wav'.
        basic_path = self.get_path(record_id)[:-4]
        return FileRelationManager(f'{basic_path}_ab_relations.csv', f'{basic_path}_ab_items.json')

    @staticmethod
    def _get_id(record_file_path: str) -> str:
        """Derive a record id from a file path.

        A trailing ``.wav`` is removed, then the last six ``/``-separated
        components are joined back with ``/``.
        """
        path = record_file_path
        # Strip only the extension; str.replace would also rewrite any
        # directory name that happens to contain '.wav'.
        if path.endswith('.wav'):
            path = path[:-len('.wav')]
        return '/'.join(path.split('/')[-6:])
from examples.import_dataset.luna.luna_record_provider import LunaRecordProvider
from sziszapangma.model.relation_manager import RelationManager, FileRelationManager
class RelationManagerProvider:
    """Create file-backed relation managers for records of a ``LunaRecordProvider``."""

    # Provider used to resolve a record id to its file path.
    _luna_record_provider: LunaRecordProvider

    def __init__(self, luna_record_provider: LunaRecordProvider):
        """Store the record provider used for path lookups."""
        self._luna_record_provider = luna_record_provider

    def get_relation_manager(self, record_id: str) -> RelationManager:
        """Return a ``FileRelationManager`` for ``record_id``.

        The two file names are the record path without its last four
        characters, followed by ``_relations.csv`` and ``_items.json``.
        """
        record_path = self._luna_record_provider.get_path(record_id)
        stem = record_path[:-4]
        relations_file = stem + '_relations.csv'
        items_file = stem + '_items.json'
        return FileRelationManager(relations_file, items_file)
from lhotse import AudioSource, SupervisionSegment
def load_single_datasource(path: str) -> AudioSource:
    """Wrap ``path`` in an ``AudioSource`` of type ``'file'`` with channel list ``[1]``."""
    source_type = 'file'
    channel_ids = [1]
    return AudioSource(source_type, channel_ids, path)
File moved
File moved
from pprint import pprint
from typing import Tuple, List from typing import Tuple, List
from xml.etree import ElementTree from xml.etree import ElementTree
from examples.import_dataset.luna.luna_record_provider import LunaRecordProvider from experiment.luna.luna_record_provider import LunaRecordProvider
from sziszapangma.integration.path_filter import ExtensionPathFilter from sziszapangma.integration.path_filter import ExtensionPathFilter
from sziszapangma.model.model import Word, SingleAnnotation from sziszapangma.model.model import Word, SingleAnnotation
from sziszapangma.model.model_creators import create_new_word, create_new_single_annotation, \ from sziszapangma.model.model_creators import create_new_word, create_new_single_annotation, \
...@@ -149,6 +148,7 @@ class LunaAdapter: ...@@ -149,6 +148,7 @@ class LunaAdapter:
def import_record(self, record_id: str): def import_record(self, record_id: str):
print(f'record {record_id}') print(f'record {record_id}')
relation_manager = self._record_provider.get_relation_manager(record_id) relation_manager = self._record_provider.get_relation_manager(record_id)
relation_manager.clear_all()
basic_path = self._record_provider.get_path(record_id)[:-4] basic_path = self._record_provider.get_path(record_id)[:-4]
words_path = f"{basic_path}_words.xml" words_path = f"{basic_path}_words.xml"
...@@ -157,27 +157,33 @@ class LunaAdapter: ...@@ -157,27 +157,33 @@ class LunaAdapter:
turn_path = f"{basic_path}_turns.xml" turn_path = f"{basic_path}_turns.xml"
words, single_annotations = self.save_words(words_path, relation_manager) words, single_annotations = self.save_words(words_path, relation_manager)
print('save_words') # print('save_words')
self.read_concepts(words, concept_path, relation_manager) self.read_concepts(words, concept_path, relation_manager)
print('read_concepts') # print('read_concepts')
self.read_chunks(words, chunks_path, relation_manager) self.read_chunks(words, chunks_path, relation_manager)
print('read_chunks') # print('read_chunks')
self.read_turns(words, turn_path, relation_manager) self.read_turns(words, turn_path, relation_manager)
print('read_turns') # print('read_turns')
relation_manager.commit() relation_manager.commit()
if __name__ == "__main__": def main():
luna_directory = '/Users/marcinwatroba/Desktop/LUNA/LUNA.PL' luna_directory = 'experiment_data/dataset/LUNA.PL'
luna_record_provider = LunaRecordProvider(ExtensionPathFilter( luna_record_provider = LunaRecordProvider(
ExtensionPathFilter(
root_directory=luna_directory, root_directory=luna_directory,
extension='wav' extension='wav'
)) ),
for it in list(luna_record_provider.get_all_records())[:1]: relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna'
relation_manager = luna_record_provider.get_relation_manager(it) )
document = [itt for itt in relation_manager.get_all_items() if itt['type'] == 'Document'][0] luna_adapter = LunaAdapter(luna_record_provider)
document_words = [relation_manager.get_item_by_id(item_id) record_ids = list(luna_record_provider.get_all_records())
for item_id in document['word_ids']] index = 0
all_relations = relation_manager.get_all_relations_for_item(document_words[0]['id']) for record_id in record_ids:
ids = [it['second_id'] for it in all_relations if it['second_type'] in ['lemma', 'pos']] index += 1
pprint([relation_manager.get_item_by_id(it) for it in ids]) print(f'{index}/{len(record_ids)}')
luna_adapter.import_record(record_id)
if __name__ == "__main__":
main()
from pathlib import Path
from typing import Dict, Set from typing import Dict, Set
from sziszapangma.integration.path_filter import PathFilter from sziszapangma.integration.path_filter import PathFilter
...@@ -9,8 +10,10 @@ from sziszapangma.model.relation_manager import RelationManager, FileRelationMan ...@@ -9,8 +10,10 @@ from sziszapangma.model.relation_manager import RelationManager, FileRelationMan
class LunaRecordProvider(RecordIdIterator, RecordPathProvider, RelationManagerProvider): class LunaRecordProvider(RecordIdIterator, RecordPathProvider, RelationManagerProvider):
_path_by_id: Dict[str, str] _path_by_id: Dict[str, str]
_relation_manager_root_path: str
def __init__(self, path_filter: PathFilter): def __init__(self, path_filter: PathFilter, relation_manager_root_path: str):
self._relation_manager_root_path = relation_manager_root_path
self._path_by_id = dict({ self._path_by_id = dict({
self._get_id(it): it self._get_id(it): it
for it in path_filter.get_list_of_files() for it in path_filter.get_list_of_files()
...@@ -22,12 +25,13 @@ class LunaRecordProvider(RecordIdIterator, RecordPathProvider, RelationManagerPr ...@@ -22,12 +25,13 @@ class LunaRecordProvider(RecordIdIterator, RecordPathProvider, RelationManagerPr
def get_path(self, record_id: str) -> str: def get_path(self, record_id: str) -> str:
return self._path_by_id[record_id] return self._path_by_id[record_id]
def get_item_file_path(self, record_id: str, file) -> str:
return self._path_by_id[record_id]
def get_relation_manager(self, record_id: str) -> RelationManager: def get_relation_manager(self, record_id: str) -> RelationManager:
basic_path = self.get_path(record_id)[:-4] record_path = Path(self._relation_manager_root_path).joinpath(record_id)
return FileRelationManager(f'{basic_path}_ab_relations.csv', f'{basic_path}_ab_items.json') record_path.mkdir(parents=True, exist_ok=True)
return FileRelationManager(
str(record_path.joinpath('ab_relations.csv')),
str(record_path.joinpath('ab_items.json'))
)
@staticmethod @staticmethod
def _get_id(record_file_path: str) -> str: def _get_id(record_file_path: str) -> str:
......
import json
from pprint import pprint
from experiment.luna.luna_record_provider import LunaRecordProvider
from sziszapangma.integration.path_filter import ExtensionPathFilter
if __name__ == '__main__':
    # Copy each record's cached Techmo ASR JSON from the nested cache layout
    # into a flat directory with one '<record_id>.json' file per record.
    luna_directory = 'experiment_data/dataset/LUNA.PL'
    luna_record_provider = LunaRecordProvider(
        ExtensionPathFilter(
            root_directory=luna_directory,
            extension='wav'
        ),
        relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna'
    )
    # Fetch the ids once so the printed set and the processed set are the same object.
    record_ids = luna_record_provider.get_all_records()
    pprint(record_ids)
    for record_id in record_ids:
        # The source path is rebuilt by turning '__' in the record id back into '/'.
        path = f'experiment_data/cached_asr/LUNA_techmo_asr_cache/{record_id.replace("__", "/")}.wav.techmo.json'
        output_path = f'experiment_data/cached_asr/luna_techmo/{record_id}.json'
        # Context managers close both handles deterministically, so the output
        # is flushed to disk before the next record is processed.
        with open(path, 'r') as source_file:
            raw = json.load(source_file)
        with open(output_path, 'w') as target_file:
            json.dump(raw, target_file)
from typing import List, Dict from typing import List, Dict
# from examples.luna.luna_record_provider import LunaRecordProvider # from experiment.luna.luna_record_provider import LunaRecordProvider
from examples.luna.luna_record_provider import LunaRecordProvider from experiment.luna.luna_record_provider import LunaRecordProvider
from sziszapangma.integration.gold_transcript_processor import GoldTranscriptProcessor from sziszapangma.integration.gold_transcript_processor import GoldTranscriptProcessor
from sziszapangma.model.relation_manager import RelationManager from sziszapangma.model.relation_manager import RelationManager
......
from examples.luna.luna_gold_transcript_processor import LunaGoldTranscriptProcessor from experiment.luna.pipeline.luna_gold_transcript_processor import LunaGoldTranscriptProcessor
from examples.luna.luna_record_provider import LunaRecordProvider from experiment.luna.pipeline.pos_processing.asr_spacy_token_pos_processing_task import \
from examples.luna.pos_processing.asr_spacy_token_pos_processing_task import \
AsrSpacyTokenPosProcessingTask AsrSpacyTokenPosProcessingTask
from examples.luna.pos_processing.gold_transcript_spacy_token_pos_processing_task import \ from experiment.luna.pipeline.pos_processing.gold_transcript_spacy_token_pos_processing_task import \
GoldTranscriptSpacyTokenPosProcessingTask GoldTranscriptSpacyTokenPosProcessingTask
from examples.luna.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask from experiment.luna.pipeline.pos_processing.spacy_pos_wer_processing_task import \
SpacyPosWerProcessingTask
from experiment.luna.luna_record_provider import LunaRecordProvider
from sziszapangma.integration.asr_processor import AsrPathCacheClient from sziszapangma.integration.asr_processor import AsrPathCacheClient
from sziszapangma.integration.experiment_manager import ExperimentManager from sziszapangma.integration.experiment_manager import ExperimentManager
from sziszapangma.integration.path_filter import ExtensionPathFilter from sziszapangma.integration.path_filter import ExtensionPathFilter
...@@ -15,7 +16,7 @@ from sziszapangma.integration.task.asr_task import AsrTask ...@@ -15,7 +16,7 @@ from sziszapangma.integration.task.asr_task import AsrTask
from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
from sziszapangma.integration.task.gold_transcript_task import GoldTranscriptTask from sziszapangma.integration.task.gold_transcript_task import GoldTranscriptTask
LUNA_DIRECTORY = '/Users/marcinwatroba/Desktop/LUNA/LUNA.PL' LUNA_DIRECTORY = 'experiment_data/dataset/LUNA.PL'
GOLD_TRANSCRIPT = 'gold_transcript' GOLD_TRANSCRIPT = 'gold_transcript'
TECHMO_POLISH_ASR = 'techmo_polish_asr' TECHMO_POLISH_ASR = 'techmo_polish_asr'
TECHMO_POLISH_CLASSIC_WER_METRIC = 'techmo_polish_classic_wer_metric' TECHMO_POLISH_CLASSIC_WER_METRIC = 'techmo_polish_classic_wer_metric'
...@@ -27,10 +28,13 @@ POS_METRICS_WER = 'pos_metrics_wer' ...@@ -27,10 +28,13 @@ POS_METRICS_WER = 'pos_metrics_wer'
def run_luna_experiment(experiment_repository: ExperimentRepository): def run_luna_experiment(experiment_repository: ExperimentRepository):
record_provider = LunaRecordProvider(ExtensionPathFilter( record_provider = LunaRecordProvider(
ExtensionPathFilter(
root_directory=LUNA_DIRECTORY, root_directory=LUNA_DIRECTORY,
extension='wav' extension='wav'
)) ),
relation_manager_root_path='experiment_data/dataset_relation_manager_data/luna'
)
experiment_processor = ExperimentManager( experiment_processor = ExperimentManager(
record_id_iterator=record_provider, record_id_iterator=record_provider,
processing_tasks=[ processing_tasks=[
...@@ -84,7 +88,8 @@ def run_luna_experiment(experiment_repository: ExperimentRepository): ...@@ -84,7 +88,8 @@ def run_luna_experiment(experiment_repository: ExperimentRepository):
def example_run(): def example_run():
experiment_repository = MultiFilesExperimentRepository('experiment_data', 'asr_benchmark_luna') experiment_repository = MultiFilesExperimentRepository(
'experiment_data/pipeline', 'asr_benchmark_luna')
run_luna_experiment(experiment_repository) run_luna_experiment(experiment_repository)
......
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any from typing import Any
from examples.luna.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask from experiment.luna.pipeline.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask
@dataclass @dataclass
......
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any from typing import Any
from examples.luna.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask from experiment.luna.pipeline.pos_processing.spacy_token_pos_processing_task import SpacyTokenPosProcessingTask
@dataclass @dataclass
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment