Skip to content
Snippets Groups Projects
Commit c67a89c3 authored by Marcin Wątroba's avatar Marcin Wątroba
Browse files

Finish processing luna

parent 942a7ef3
1 merge request: !13 "Change data model"
Showing
with 496 additions and 37 deletions
......@@ -4,14 +4,35 @@ stages:
cmd: PYTHONPATH=. python experiment/luna/import_dataset/import_luna.py
deps:
- path: experiment/luna/import_dataset/import_luna.py
md5: d938162187616f7e7390983ecb9e120b
size: 8269
md5: f40adccbf0b51094a71b876c9ccad751
size: 8265
- path: experiment_data/dataset/LUNA.PL
md5: d342155b1871e881797cf7da09d5dc3c.dir
size: 1578358645
nfiles: 4500
outs:
- path: experiment_data/dataset_relation_manager_data/luna
md5: c68722cc69375259a4d1a4b6a2bd4dc3.dir
size: 3016826
nfiles: 10
md5: 773f92667e16efd915ec6384d06aa4fb.dir
size: 229007155
nfiles: 1000
luna_main_pipeline:
cmd: "PYTHONPATH=. python -m spacy download pl_core_news_lg\nPYTHONPATH=. python\
\ experiment/luna/pipeline/luna_main.py\n"
deps:
- path: experiment_data/cached_asr/luna_techmo
md5: 033ea7b5434dded73bf869bfdd299462.dir
size: 4256479
nfiles: 500
- path: experiment_data/dataset/LUNA.PL
md5: d342155b1871e881797cf7da09d5dc3c.dir
size: 1578358645
nfiles: 4500
- path: experiment_data/dataset_relation_manager_data/luna
md5: 773f92667e16efd915ec6384d06aa4fb.dir
size: 229007155
nfiles: 1000
outs:
- path: experiment_data/pipeline/asr_benchmark_luna
md5: 2e334734387ab4579b7b5269d5029e81.dir
size: 71627685
nfiles: 4000
stages:
import_luna_to_common_format:
cmd: PYTHONPATH=. python experiment/luna/import_dataset/import_luna.py
deps:
- experiment/luna/import_dataset/import_luna.py
- experiment_data/dataset/LUNA.PL
outs:
- experiment_data/dataset_relation_manager_data/luna
import_luna_to_common_format:
cmd: PYTHONPATH=. python experiment/luna/import_dataset/import_luna.py
deps:
- experiment/luna/import_dataset/import_luna.py
- experiment_data/dataset/LUNA.PL
outs:
- experiment_data/dataset_relation_manager_data/luna
# luna_main_pipeline:
# cmd: PYTHONPATH=. python experiment/luna/pipeline/luna_main.py
# deps:
# - experiment_data/dataset_relation_manager_data/luna
# - experiment_data/dataset/LUNA.PL
# outs:
# - experiment_data/pipeline/luna
luna_main_pipeline:
cmd: |
PYTHONPATH=. python -m spacy download pl_core_news_lg
PYTHONPATH=. python experiment/luna/pipeline/luna_main.py
deps:
- experiment_data/dataset_relation_manager_data/luna
- experiment_data/dataset/LUNA.PL
- experiment_data/cached_asr/luna_techmo
outs:
- experiment_data/pipeline/asr_benchmark_luna
......@@ -47,7 +47,8 @@ def run_luna_experiment(experiment_repository: ExperimentRepository):
AsrTask(
task_name='techmo_polish_task',
# asr_processor=AsrWebClient('http://192.168.0.124:4999/process_asr', 'test1234'),
asr_processor=AsrPathCacheClient(),
asr_processor=AsrPathCacheClient('experiment_data/cached_asr/luna_techmo', record_provider,
record_provider),
asr_property_name=TECHMO_POLISH_ASR,
require_update=False,
record_path_provider=record_provider
......
/asr_benchmark_luna
This diff is collapsed.
......@@ -22,6 +22,7 @@ Flask = "^2.0.1"
Flask-HTTPAuth = "^4.4.0"
minio = "^7.1.6"
dvc = {extras = ["s3"], version = "^2.10.1"}
spacy = "^3.2.4"
[tool.poetry.dev-dependencies]
pytest = "^5.2"
......
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
import json
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, Optional
import requests
from sziszapangma.integration.record_id_iterator import RecordIdIterator
from sziszapangma.integration.record_path_provider import RecordPathProvider
class AsrProcessor(ABC):
@abstractmethod
......@@ -38,19 +42,14 @@ class AsrWebClient(AsrProcessor):
class AsrPathCacheClient(AsrProcessor):
    """ASR processor that serves recognition results from a local JSON cache.

    Instead of calling a recognition service, results are read from
    ``<cache_path>/<record_id>.json``. The record id for an audio file is
    resolved through a path-to-id mapping built once at construction time.
    """

    # Directory that holds one ``<record_id>.json`` file per record.
    _cache_path: str
    # Maps the path returned by the record path provider to its record id.
    path_to_id: Dict[str, str]

    def __init__(
        self,
        cache_path: str,
        record_iterator: RecordIdIterator,
        record_path_provider: RecordPathProvider,
    ):
        """Build the path-to-id index used to locate cached results.

        Args:
            cache_path: Directory containing the cached ``.json`` results.
            record_iterator: Source of the records to index; every value
                yielded by ``get_all_records()`` is used as a record id.
            record_path_provider: Resolves a record id to the file path that
                will later be passed to ``call_recognise``.
        """
        super().__init__()
        self._cache_path = cache_path
        self.path_to_id = {
            record_path_provider.get_path(record_id): record_id
            for record_id in record_iterator.get_all_records()
        }

    def call_recognise(self, file_path: str) -> Dict[str, Any]:
        """Return the cached recognition result for ``file_path``.

        Args:
            file_path: Path of the audio file, exactly as produced by the
                record path provider given to the constructor.

        Returns:
            The parsed content of ``<cache_path>/<record_id>.json``.

        Raises:
            KeyError: If ``file_path`` was not indexed at construction time.
            FileNotFoundError: If no cached result exists for the record.
        """
        record_id = self.path_to_id[file_path]
        cached_result = Path(self._cache_path) / f'{record_id}.json'
        # JSON is UTF-8 by specification; being explicit avoids depending on
        # the platform default encoding. The context manager closes the file.
        with cached_result.open('r', encoding='utf-8') as cached_file:
            return json.load(cached_file)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment