From 9a94c87e64a1083d8964b927853fd029da09c823 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com>
Date: Thu, 5 May 2022 14:08:32 +0200
Subject: [PATCH] Add embedding WER tasks and cached AJN ASR to experiments

---
 docker/ajn_asr/main.py                        |   8 +
 docker/ajn_asr/prepare_docker.sh              |   4 +-
 docker/docker-compose.yml                     |  68 ++++----
 dvc.lock                                      | 161 +++++++++++++-----
 dvc.yaml                                      |  42 +++--
 .../luna/pipeline/dependency_provider.py      |   8 +-
 .../luna/pipeline/luna_ajn_asr_processing.py  |  25 ++-
 .../luna/pipeline/luna_techmo_processing.py   |  17 +-
 experiment/voicelab/voicelab_dependency.py    |   9 +-
 experiment/voicelab/voicelab_pipeline.py      |  96 -----------
 .../voicelab/voicelab_pipeline_ajn_asr.py     |  29 +++-
 .../voicelab/voicelab_pipeline_techmo.py      |  18 +-
 experiment_data/cached_asr/.gitignore         |   1 +
 .../voicelab_cbiz_testset_20220322_ajn.dvc    |   5 +
 .../.gitignore                                |  10 ++
 run_repro_in_background.sh                    |   2 +
 ...ached_embedding_transformer.cpython-38.pyc | Bin 1986 -> 1985 bytes
 .../__pycache__/asr_processor.cpython-38.pyc  | Bin 2865 -> 3719 bytes
 sziszapangma/integration/asr_processor.py     |  26 ++-
 .../multi_files_experiment_repository.py      |   4 +
 .../task/__pycache__/asr_task.cpython-38.pyc  | Bin 2148 -> 2273 bytes
 .../embedding_wer_metrics_task.cpython-38.pyc | Bin 3627 -> 3626 bytes
 sziszapangma/integration/task/asr_task.py     |   3 +
 23 files changed, 324 insertions(+), 212 deletions(-)
 delete mode 100644 experiment/voicelab/voicelab_pipeline.py
 create mode 100644 experiment_data/cached_asr/voicelab_cbiz_testset_20220322_ajn.dvc
 create mode 100755 run_repro_in_background.sh

diff --git a/docker/ajn_asr/main.py b/docker/ajn_asr/main.py
index 150ddde..bf69371 100644
--- a/docker/ajn_asr/main.py
+++ b/docker/ajn_asr/main.py
@@ -7,12 +7,20 @@ from sziszapangma.integration.service_core.asr.asr_result import AsrResult
 
 class SpeechbrainAsrProcessor(AsrBaseProcessor):
 
+    @staticmethod
+    def _process_file_to_correct_format(file_path: str, extension: str):  # resample file_path to 16 kHz in place
+        import subprocess  # local import: the module's import block is outside this hunk
+        temp_file = f'{file_path}.{uuid.uuid4()}.{extension}'  # beside the target: replace stays on one filesystem
+        subprocess.run(['ffmpeg', '-i', file_path, '-ar', '16000', temp_file], check=True)  # argv list, no shell
+        os.replace(temp_file, file_path)
+
     def process_asr(self, audio_file_path: str) -> AsrResult:
         file_tag = str(uuid.uuid4())
         file_extension = audio_file_path.split('.')[-1]
         file_name = f'{file_tag}.{file_extension}'
         result_file_path = f'processing_flask/{file_tag}.txt'
         file_path = f'processing_flask/{file_name}'
+        self._process_file_to_correct_format(audio_file_path, file_extension)
         # create file in /data/uuid.ext
         os.system(f"cp {audio_file_path} /data/{file_path}")
 
diff --git a/docker/ajn_asr/prepare_docker.sh b/docker/ajn_asr/prepare_docker.sh
index c7e687d..711edd2 100755
--- a/docker/ajn_asr/prepare_docker.sh
+++ b/docker/ajn_asr/prepare_docker.sh
@@ -3,5 +3,5 @@
 SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
 
 docker build -t asr-clarin-pl-service "$SCRIPT_DIR"
-docker tag asr-clarin-pl-service gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/asr-clarin-pl-service:1.4
-docker push gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/asr-clarin-pl-service:1.4
+docker tag asr-clarin-pl-service gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/asr-clarin-pl-service:1.5
+docker push gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/asr-clarin-pl-service:1.5
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index d4486f1..b2fbf23 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -18,34 +18,34 @@ services:
     #      - TECHMO_SERVER_URL=156.17.135.34
     #      - AUTH_TOKEN=__example_token__
 
-    transformers-wav2vec2for_ctc:
-        image: docker-registry.theliver.pl/transformers-wav2vec2for_ctc:1.0
-        container_name: transformers-wav2vec2for_ctc
+    #    transformers-wav2vec2for_ctc:
+    #        image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/transformers-wav2vec2for_ctc:1.0
+    #        container_name: transformers-wav2vec2for_ctc
+    #        restart: always
+    #        volumes:
+    #            - /etc/localtime:/etc/localtime:ro
+    #            - ./wav2vec2for_ctc_models:/models
+    #        ports:
+    #            - "5430:5000"
+    #        environment:
+    #            - AUTH_TOKEN=__example_token__
+    #            - MODEL_NAME=jonatasgrosman/wav2vec2-large-xlsr-53-polish
+    #            - SAMPLING_RATE=16000
+
+    embedding_service:
+        image: docker-registry.theliver.pl/embedding_docker:1.0
+        container_name: embeddings_service
         restart: always
-        volumes:
-            - /etc/localtime:/etc/localtime:ro
-            - ./wav2vec2for_ctc_models:/models
         ports:
-            - "5430:5000"
+            - "5003:5000"
         environment:
-            - AUTH_TOKEN=__example_token__
-            - MODEL_NAME=jonatasgrosman/wav2vec2-large-xlsr-53-polish
-            - SAMPLING_RATE=16000
-
-    #  embedding_service:
-    #    image: docker-registry.theliver.pl/embedding_docker:1.0
-    #    container_name: embeddings_service
-    #    restart: always
-    #    ports:
-    #      - 5003:5000
-    #    environment:
-    #      - AUTH_TOKEN=__example_token__
-    #    volumes:
-    #      - /etc/localtime:/etc/localtime:ro
-    #      - ./embedding_models:/models
+            - AUTH_TOKEN=fjsd-mkwe-oius-m9h2  # NOTE(review): looks like a real token committed in plain text - rotate it and supply via env/.env; must match the token passed to WebEmbeddingTransformer
+        volumes:
+            - /etc/localtime:/etc/localtime:ro
+            - ./embedding_models:/models
 
     ajn_asr:
-        image: docker-registry.theliver.pl/asr-clarin-pl-service:1.4
+        image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/asr-clarin-pl-service:1.5
         container_name: ajn_asr
         restart: always
         ports:
@@ -55,14 +55,14 @@ services:
         volumes:
             - /etc/localtime:/etc/localtime:ro
 
-    speechbrain_asr:
-        image: docker-registry.theliver.pl/speechbrain-asr:1.5
-        container_name: speechbrain_asr
-        restart: always
-        ports:
-            - "5432:5000"
-        volumes:
-            - /etc/localtime:/etc/localtime:ro
-            - ./speechbrain_asr_models:/models
-        environment:
-            - AUTH_TOKEN=__example_token__
+#    speechbrain_asr:
+#        image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/speechbrain-asr:1.5
+#        container_name: speechbrain_asr
+#        restart: always
+#        ports:
+#            - "5432:5000"
+#        volumes:
+#            - /etc/localtime:/etc/localtime:ro
+#            - ./speechbrain_asr_models:/models
+#        environment:
+#            - AUTH_TOKEN=__example_token__
diff --git a/dvc.lock b/dvc.lock
index 7a9261d..85522fe 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -52,7 +52,7 @@ stages:
       size: 229007155
       nfiles: 1000
   voicelab_import_to_common_format:
-    cmd: PYTHONPATH=. python experiment/voicelab/import_data.py
+    cmd: PYTHONPATH=. python -u experiment/voicelab/import_data.py
     deps:
     - path: experiment/voicelab/import_data.py
       md5: 41acb98a1517e66c052182fe0a1403ba
@@ -63,7 +63,7 @@ stages:
       nfiles: 1600
     outs:
     - path: experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322
-      md5: 926ef9bab4ce41b9de95f2f3d5ab67a0.dir
+      md5: 4046ea5d80966f0c017b2c4bec0e7c9b.dir
       size: 110711470
       nfiles: 1600
   luna_gold_transcript_processing:
@@ -93,8 +93,12 @@ stages:
     cmd: "PYTHONPATH=. python experiment/luna/pipeline/luna_ajn_asr_processing.py\n"
     deps:
     - path: experiment/luna/pipeline/luna_ajn_asr_processing.py
-      md5: ec7d7b5384f845173d9fb77e9cfa9907
-      size: 2501
+      md5: 2d66cb8890c420b55e8b7eb33ac32ba2
+      size: 3558
+    - path: experiment_data/cached_asr/luna_ajn_polish_asr
+      md5: 620e178854dbcb69f49a608f34573a88.dir
+      size: 6159899
+      nfiles: 494
     - path: experiment_data/dataset/LUNA.PL
       md5: d342155b1871e881797cf7da09d5dc3c.dir
       size: 1578358645
@@ -109,35 +113,43 @@ stages:
       nfiles: 500
     outs:
     - path: experiment_data/pipeline/asr_benchmark_luna/ajn_polish_asr
-      md5: 620e178854dbcb69f49a608f34573a88.dir
-      size: 6159899
-      nfiles: 494
+      md5: fa9d926ae8fd0268c71f19c1d5d39fcf.dir
+      size: 11080541
+      nfiles: 499
     - path: experiment_data/pipeline/asr_benchmark_luna/ajn_spacy
-      md5: 312be284d4ec9e38986048e785fcbbc1.dir
-      size: 6535212
-      nfiles: 494
+      md5: 417d8f07266eb5da9c4bfbf84f3b4eac.dir
+      size: 6579351
+      nfiles: 499
     - path: experiment_data/pipeline/asr_benchmark_luna/pos_ajn_alignment_wer
-      md5: 8ad558edb6a8bd2508a7e25bcf53bf94.dir
-      size: 21936929
-      nfiles: 494
+      md5: 2bf746c412e6bff4071f689d853b106f.dir
+      size: 22061350
+      nfiles: 499
     - path: experiment_data/pipeline/asr_benchmark_luna/pos_ajn_metrics_wer
-      md5: 98c74c5bf87637749eac1ed5ff3393b4.dir
-      size: 16842
-      nfiles: 494
+      md5: 3147413bdfd36ad91c64303e8705951b.dir
+      size: 17002
+      nfiles: 499
     - path: experiment_data/pipeline/asr_benchmark_luna/word_ajn_alignment_wer
-      md5: 1741fff740259398b28bf2a6ba3aec41.dir
-      size: 20671277
-      nfiles: 494
+      md5: 2bb11f8a97cdeb18c557fadb49a6f015.dir
+      size: 25669158
+      nfiles: 499
+    - path: experiment_data/pipeline/asr_benchmark_luna/word_ajn_alignment_wer_embeddings
+      md5: c2824c0c5cf433dbf864ebbdc2fb3cfc.dir
+      size: 44326962
+      nfiles: 500
     - path: experiment_data/pipeline/asr_benchmark_luna/word_ajn_metrics_wer
-      md5: 18605657ff9c7ef3221e27b671a3b4d1.dir
-      size: 16835
-      nfiles: 494
+      md5: c48c74eccf1cfd0768900514d2fcfd1b.dir
+      size: 10527
+      nfiles: 499
+    - path: experiment_data/pipeline/asr_benchmark_luna/word_ajn_metrics_wer_embeddings
+      md5: 98a7edeee3b630e8e301acfc578a8393.dir
+      size: 34869
+      nfiles: 500
   luna_techmo_processing:
     cmd: "PYTHONPATH=. python experiment/luna/pipeline/luna_techmo_processing.py\n"
     deps:
     - path: experiment/luna/pipeline/luna_techmo_processing.py
-      md5: b4d5ad7a0d7fb0714a2dc02cb457e8c9
-      size: 2628
+      md5: 75069cd6e3a61dfaaf49c2bdb1e81976
+      size: 3416
     - path: experiment_data/cached_asr/luna_techmo
       md5: 033ea7b5434dded73bf869bfdd299462.dir
       size: 4256479
@@ -156,7 +168,7 @@ stages:
       nfiles: 500
     outs:
     - path: experiment_data/pipeline/asr_benchmark_luna/pos_techmo_alignment_wer
-      md5: c71539f3889c627a371957958bd0907d.dir
+      md5: 94762d19a853810064afd38319d05a2c.dir
       size: 20897599
       nfiles: 500
     - path: experiment_data/pipeline/asr_benchmark_luna/pos_techmo_metrics_wer
@@ -164,21 +176,29 @@ stages:
       size: 17341
       nfiles: 500
     - path: experiment_data/pipeline/asr_benchmark_luna/techmo_polish_asr
-      md5: acfaec46b2415ed6a64e3a3464d164f8.dir
+      md5: 3787c6a4c7941787253165e2ba760e73.dir
       size: 9697519
       nfiles: 500
     - path: experiment_data/pipeline/asr_benchmark_luna/techmo_spacy
-      md5: e869581816457d1585a7e42d0a18b8b2.dir
+      md5: 337b6bf947ee47cda30b3cc75f954e8e.dir
       size: 6124559
       nfiles: 500
     - path: experiment_data/pipeline/asr_benchmark_luna/word_techmo_alignment_wer
-      md5: 0dabd65b3981d588cd23d943abc6e231.dir
+      md5: afc25d6ad22bed4ded5cb07028bff1cf.dir
       size: 21380796
       nfiles: 500
+    - path: experiment_data/pipeline/asr_benchmark_luna/word_techmo_alignment_wer_embeddings
+      md5: c2824c0c5cf433dbf864ebbdc2fb3cfc.dir
+      size: 44326962
+      nfiles: 500
     - path: experiment_data/pipeline/asr_benchmark_luna/word_techmo_metrics_wer
       md5: 4cfbb2830b280084ece14b1ef815b92a.dir
       size: 17298
       nfiles: 500
+    - path: experiment_data/pipeline/asr_benchmark_luna/word_techmo_metrics_wer_embeddings
+      md5: 98a7edeee3b630e8e301acfc578a8393.dir
+      size: 34869
+      nfiles: 500
   voicelab_gold_transcript_processing:
     cmd: "PYTHONPATH=. python -u experiment/voicelab/voicelab_pipeline_gold_transcript.py\n"
     deps:
@@ -190,24 +210,24 @@ stages:
       size: 4803739404
       nfiles: 1600
     - path: experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322
-      md5: 926ef9bab4ce41b9de95f2f3d5ab67a0.dir
+      md5: 4046ea5d80966f0c017b2c4bec0e7c9b.dir
       size: 110711470
       nfiles: 1600
     outs:
     - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript
-      md5: fb6812b2f3044c0285ee6ee2b21d0523.dir
+      md5: 9edf1e743faa9fc3515790acb6fd8cab.dir
       size: 21846798
       nfiles: 800
     - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript_spacy
-      md5: f2e68dcc8842a15e417ae6f5221a802a.dir
+      md5: c166937f6e8ae9d28412ca1e3e43469e.dir
       size: 26643278
       nfiles: 800
   voicelab_techmo_processing:
     cmd: "PYTHONPATH=. python -u experiment/voicelab/voicelab_pipeline_techmo.py\n"
     deps:
     - path: experiment/voicelab/voicelab_pipeline_techmo.py
-      md5: 23c0869d7cc9f0088870362d669ab82e
-      size: 2685
+      md5: 3d6347486055a11e399beac71ce2f877
+      size: 3479
     - path: experiment_data/cached_asr/voicelab_cbiz_testset_20220322_techmo
       md5: 6c3b356723d562c978f84e733b91f5d0.dir
       size: 17539259
@@ -217,16 +237,16 @@ stages:
       size: 4803739404
       nfiles: 1600
     - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript
-      md5: fb6812b2f3044c0285ee6ee2b21d0523.dir
+      md5: 9edf1e743faa9fc3515790acb6fd8cab.dir
       size: 21846798
       nfiles: 800
     - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript_spacy
-      md5: f2e68dcc8842a15e417ae6f5221a802a.dir
+      md5: c166937f6e8ae9d28412ca1e3e43469e.dir
       size: 26643278
       nfiles: 800
     outs:
     - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_techmo_alignment_wer
-      md5: 8c5f0380ba2891b3e726d647c2863c60.dir
+      md5: a15a7a19f46e329c8b77eeecdda9d7b4.dir
       size: 81650836
       nfiles: 800
     - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_techmo_metrics_wer
@@ -234,18 +254,81 @@ stages:
       size: 27934
       nfiles: 800
     - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/techmo_polish_asr
-      md5: c45e29b08af7bb13cdf54da9655bd96c.dir
+      md5: da32e6fa9d986deddb594cb66e649864.dir
       size: 39158267
       nfiles: 800
     - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/techmo_spacy
-      md5: a39c82666419c2b7791952a1fa116d61.dir
+      md5: cd89a91a33629088ba6fc30ef8427dee.dir
       size: 24482297
       nfiles: 800
     - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_techmo_alignment_wer
-      md5: 72ff86c7cb2e89ac7e04677f532255b2.dir
+      md5: 0b714391682432408d74beee1cd5a14a.dir
       size: 83756423
       nfiles: 800
+    - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_techmo_alignment_wer_embeddings
+      md5: 93d34d82f8536014ddbe0cf0645dd837.dir
+      size: 174322727
+      nfiles: 800
     - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_techmo_metrics_wer
       md5: 2fe3288abe85e4a385e2aefa0e8cad7e.dir
       size: 27780
       nfiles: 800
+    - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_techmo_metrics_wer_embeddings
+      md5: 1fc2985ad4c3cb00d05b1865ad5b22d4.dir
+      size: 56182
+      nfiles: 800
+  voicelab_ajn_processing:
+    cmd: "PYTHONPATH=. python -u experiment/voicelab/voicelab_pipeline_ajn_asr.py\n"
+    deps:
+    - path: experiment/voicelab/voicelab_pipeline_ajn_asr.py
+      md5: 85e8d3d79379e6d5db751e03c5ebae75
+      size: 4161
+    - path: experiment_data/cached_asr/voicelab_cbiz_testset_20220322_ajn
+      md5: 49a38b90f1265a61b90b54f820415011.dir
+      size: 32601414
+      nfiles: 800
+    - path: experiment_data/dataset/voicelab_cbiz_testset_20220322
+      md5: 3c2b18e1f1f89e4c5ad7b254e472b25e.dir
+      size: 4803739404
+      nfiles: 1600
+    - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript
+      md5: 9edf1e743faa9fc3515790acb6fd8cab.dir
+      size: 21846798
+      nfiles: 800
+    - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript_spacy
+      md5: c166937f6e8ae9d28412ca1e3e43469e.dir
+      size: 26643278
+      nfiles: 800
+    outs:
+    - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/ajn_polish_asr
+      md5: 94181d7a0731e8defbdcb4b477ad72a2.dir
+      size: 48470646
+      nfiles: 800
+    - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/ajn_spacy
+      md5: ef8be18b8acca299f9b9542ac8643a87.dir
+      size: 20536889
+      nfiles: 800
+    - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_ajn_alignment_wer
+      md5: b2d3a9872e6016cfde8e6d025bef373b.dir
+      size: 78539613
+      nfiles: 800
+    - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_ajn_metrics_wer
+      md5: d0e1ef5f57de27a2356d2f2050a93349.dir
+      size: 27353
+      nfiles: 800
+    - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_alignment_wer
+      md5: acb5337346e70bed974dfe7ca7947d79.dir
+      size: 104789466
+      nfiles: 800
+    - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_alignment_wer_embeddings
+      md5: 93d34d82f8536014ddbe0cf0645dd837.dir
+      size: 174322727
+      nfiles: 800
+    - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_metrics_wer
+      md5: 903096554a3ea6896c4abaa5e2c71d4c.dir
+      size: 16505
+      nfiles: 800
+    - path: experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_metrics_wer_embeddings
+      md5: 1fc2985ad4c3cb00d05b1865ad5b22d4.dir
+      size: 56182
+      nfiles: 800
diff --git a/dvc.yaml b/dvc.yaml
index da6552d..e2f3e5d 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -35,6 +35,8 @@ stages:
             - experiment_data/pipeline/asr_benchmark_luna/techmo_spacy
             - experiment_data/pipeline/asr_benchmark_luna/pos_techmo_alignment_wer
             - experiment_data/pipeline/asr_benchmark_luna/pos_techmo_metrics_wer
+            - experiment_data/pipeline/asr_benchmark_luna/word_techmo_metrics_wer_embeddings
+            - experiment_data/pipeline/asr_benchmark_luna/word_techmo_alignment_wer_embeddings
 
     luna_ajn_processing:
         cmd: |
@@ -42,6 +44,7 @@ stages:
         deps:
             - experiment/luna/pipeline/luna_ajn_asr_processing.py
             - experiment_data/dataset/LUNA.PL
+            - experiment_data/cached_asr/luna_ajn_polish_asr
             - experiment_data/pipeline/asr_benchmark_luna/gold_transcript
             - experiment_data/pipeline/asr_benchmark_luna/gold_transcript_spacy
         outs:
@@ -51,9 +54,11 @@ stages:
             - experiment_data/pipeline/asr_benchmark_luna/ajn_spacy
             - experiment_data/pipeline/asr_benchmark_luna/pos_ajn_alignment_wer
             - experiment_data/pipeline/asr_benchmark_luna/pos_ajn_metrics_wer
+            - experiment_data/pipeline/asr_benchmark_luna/word_ajn_metrics_wer_embeddings
+            - experiment_data/pipeline/asr_benchmark_luna/word_ajn_alignment_wer_embeddings
 
     voicelab_import_to_common_format:
-        cmd: PYTHONPATH=. python experiment/voicelab/import_data.py
+        cmd: PYTHONPATH=. python -u experiment/voicelab/import_data.py
         deps:
             - experiment/voicelab/import_data.py
             - experiment_data/dataset/voicelab_cbiz_testset_20220322
@@ -87,21 +92,26 @@ stages:
             - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/techmo_spacy
             - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_techmo_alignment_wer
             - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_techmo_metrics_wer
+            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_techmo_metrics_wer_embeddings
+            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_techmo_alignment_wer_embeddings
 
-#    voicelab_ajn_processing:
-#        cmd: |
-#            PYTHONPATH=. python -u experiment/voicelab/voicelab_pipeline_ajn_asr.py
-#        deps:
-#            - experiment/voicelab/voicelab_pipeline_ajn_asr.py
-#            - experiment_data/dataset/voicelab_cbiz_testset_20220322
-#            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript
-#            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript_spacy
-#        outs:
-#            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/ajn_polish_asr
-#            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_metrics_wer
-#            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_alignment_wer
-#            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/ajn_spacy
-#            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_ajn_alignment_wer
-#            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_ajn_metrics_wer
+    voicelab_ajn_processing:
+        cmd: |
+            PYTHONPATH=. python -u experiment/voicelab/voicelab_pipeline_ajn_asr.py
+        deps:
+            - experiment/voicelab/voicelab_pipeline_ajn_asr.py
+            - experiment_data/cached_asr/voicelab_cbiz_testset_20220322_ajn
+            - experiment_data/dataset/voicelab_cbiz_testset_20220322
+            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript
+            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/gold_transcript_spacy
+        outs:
+            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/ajn_polish_asr
+            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_metrics_wer
+            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_alignment_wer
+            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/ajn_spacy
+            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_ajn_alignment_wer
+            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/pos_ajn_metrics_wer
+            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_metrics_wer_embeddings
+            - experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/word_ajn_alignment_wer_embeddings
 
 # concurrent features, multiprocessing
diff --git a/experiment/luna/pipeline/dependency_provider.py b/experiment/luna/pipeline/dependency_provider.py
index 7557f51..78a4e7a 100644
--- a/experiment/luna/pipeline/dependency_provider.py
+++ b/experiment/luna/pipeline/dependency_provider.py
@@ -8,18 +8,22 @@ GOLD_TRANSCRIPT = 'gold_transcript'
 GOLD_TRANSCRIPT_SPACY = 'gold_transcript_spacy'
 
 TECHMO_POLISH_ASR = 'techmo_polish_asr'
-WORD_TECHMO_MERTICS_WER = 'word_techmo_metrics_wer'
+WORD_TECHMO_METRICS_WER = 'word_techmo_metrics_wer'
 WORD_TECHMO_ALIGNMENT_WER = 'word_techmo_alignment_wer'
 TECHMO_SPACY = 'techmo_spacy'
 POS_TECHMO_ALIGNMENT_WER = 'pos_techmo_alignment_wer'
 POS_TECHMO_METRICS_WER = 'pos_techmo_metrics_wer'
+WORD_TECHMO_METRICS_WER_EMBEDDINGS = 'word_techmo_metrics_wer_embeddings'
+WORD_TECHMO_ALIGNMENT_WER_EMBEDDINGS = 'word_techmo_alignment_wer_embeddings'
 
 AJN_POLISH_ASR = 'ajn_polish_asr'
-WORD_AJN_MERTICS_WER = 'word_ajn_metrics_wer'
+WORD_AJN_METRICS_WER = 'word_ajn_metrics_wer'
 WORD_AJN_ALIGNMENT_WER = 'word_ajn_alignment_wer'
 AJN_SPACY = 'ajn_spacy'
 POS_AJN_ALIGNMENT_WER = 'pos_ajn_alignment_wer'
 POS_AJN_METRICS_WER = 'pos_ajn_metrics_wer'
+WORD_AJN_METRICS_WER_EMBEDDINGS = 'word_ajn_metrics_wer_embeddings'
+WORD_AJN_ALIGNMENT_WER_EMBEDDINGS = 'word_ajn_alignment_wer_embeddings'
 
 
 def get_record_provider() -> LunaRecordProvider:
diff --git a/experiment/luna/pipeline/luna_ajn_asr_processing.py b/experiment/luna/pipeline/luna_ajn_asr_processing.py
index 7ad6122..662858d 100644
--- a/experiment/luna/pipeline/luna_ajn_asr_processing.py
+++ b/experiment/luna/pipeline/luna_ajn_asr_processing.py
@@ -1,13 +1,15 @@
 from experiment.luna.pipeline.dependency_provider import get_record_provider, GOLD_TRANSCRIPT, \
     get_multiple_files_repository, \
-    GOLD_TRANSCRIPT_SPACY, AJN_POLISH_ASR, WORD_AJN_MERTICS_WER, WORD_AJN_ALIGNMENT_WER, AJN_SPACY, \
-    POS_AJN_ALIGNMENT_WER, POS_AJN_METRICS_WER
+    GOLD_TRANSCRIPT_SPACY, AJN_POLISH_ASR, WORD_AJN_METRICS_WER, WORD_AJN_ALIGNMENT_WER, AJN_SPACY, \
+    POS_AJN_ALIGNMENT_WER, POS_AJN_METRICS_WER, WORD_AJN_METRICS_WER_EMBEDDINGS, WORD_AJN_ALIGNMENT_WER_EMBEDDINGS
 from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask
 from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
+from sziszapangma.core.transformer.web_embedding_transformer import WebEmbeddingTransformer
 from sziszapangma.integration.experiment_manager import ExperimentManager
 from sziszapangma.integration.task.asr_task import AsrTask
 from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
-from sziszapangma.integration.asr_processor import AsrWebClient
+from sziszapangma.integration.asr_processor import AsrWebClient, MultipleSourcesAsrProcessor, AsrPathCacheClient
+from sziszapangma.integration.task.embedding_wer_metrics_task import EmbeddingWerMetricsTask
 
 
 def run_luna_experiment():
@@ -17,7 +19,11 @@ def run_luna_experiment():
         processing_tasks=[
             AsrTask(
                 task_name='ajn_polish_asr_task',
-                asr_processor=AsrWebClient('http://localhost:5431/process_asr', '__example_token__'),
+                asr_processor=MultipleSourcesAsrProcessor([
+                    AsrPathCacheClient('experiment_data/cached_asr/luna_ajn_polish_asr', record_provider,
+                                       record_provider),
+                    AsrWebClient('http://localhost:5431/process_asr', '__example_token__')
+                ]),
                 asr_property_name=AJN_POLISH_ASR,
                 require_update=False,
                 record_path_provider=record_provider
@@ -26,7 +32,7 @@ def run_luna_experiment():
                 task_name='techmo_word_wer_processing',
                 asr_property_name=AJN_POLISH_ASR,
                 gold_transcript_property_name=GOLD_TRANSCRIPT,
-                metrics_property_name=WORD_AJN_MERTICS_WER,
+                metrics_property_name=WORD_AJN_METRICS_WER,
                 require_update=False,
                 alignment_property_name=WORD_AJN_ALIGNMENT_WER
             ),
@@ -43,6 +49,15 @@ def run_luna_experiment():
                 asr_pos_property_name=AJN_SPACY,
                 pos_alignment_wer=POS_AJN_ALIGNMENT_WER,
                 pos_metrics_wer=POS_AJN_METRICS_WER
+            ),
+            EmbeddingWerMetricsTask(
+                task_name='EmbeddingWerMetricsTask',
+                asr_property_name=AJN_POLISH_ASR,  # score the AJN transcript; the Techmo key here was a copy/paste slip
+                gold_transcript_property_name=GOLD_TRANSCRIPT,
+                metrics_property_name=WORD_AJN_METRICS_WER_EMBEDDINGS,
+                require_update=False,
+                embedding_transformer=WebEmbeddingTransformer('pl', 'http://localhost:5003', 'fjsd-mkwe-oius-m9h2'),
+                alignment_property_name=WORD_AJN_ALIGNMENT_WER_EMBEDDINGS
             )
         ],
         experiment_repository=get_multiple_files_repository(),
diff --git a/experiment/luna/pipeline/luna_techmo_processing.py b/experiment/luna/pipeline/luna_techmo_processing.py
index ee12296..c219440 100644
--- a/experiment/luna/pipeline/luna_techmo_processing.py
+++ b/experiment/luna/pipeline/luna_techmo_processing.py
@@ -1,13 +1,15 @@
 from experiment.luna.pipeline.dependency_provider import get_record_provider, GOLD_TRANSCRIPT, TECHMO_POLISH_ASR, \
     get_multiple_files_repository, \
-    GOLD_TRANSCRIPT_SPACY, POS_TECHMO_ALIGNMENT_WER, POS_TECHMO_METRICS_WER, WORD_TECHMO_MERTICS_WER, \
-    WORD_TECHMO_ALIGNMENT_WER, TECHMO_SPACY
+    GOLD_TRANSCRIPT_SPACY, POS_TECHMO_ALIGNMENT_WER, POS_TECHMO_METRICS_WER, WORD_TECHMO_METRICS_WER, \
+    WORD_TECHMO_ALIGNMENT_WER, TECHMO_SPACY, WORD_TECHMO_METRICS_WER_EMBEDDINGS, WORD_TECHMO_ALIGNMENT_WER_EMBEDDINGS
 from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask
 from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
+from sziszapangma.core.transformer.web_embedding_transformer import WebEmbeddingTransformer
 from sziszapangma.integration.asr_processor import AsrPathCacheClient
 from sziszapangma.integration.experiment_manager import ExperimentManager
 from sziszapangma.integration.task.asr_task import AsrTask
 from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
+from sziszapangma.integration.task.embedding_wer_metrics_task import EmbeddingWerMetricsTask
 
 
 def run_luna_experiment():
@@ -27,7 +29,7 @@ def run_luna_experiment():
                 task_name='techmo_word_wer_processing',
                 asr_property_name=TECHMO_POLISH_ASR,
                 gold_transcript_property_name=GOLD_TRANSCRIPT,
-                metrics_property_name=WORD_TECHMO_MERTICS_WER,
+                metrics_property_name=WORD_TECHMO_METRICS_WER,
                 require_update=False,
                 alignment_property_name=WORD_TECHMO_ALIGNMENT_WER
             ),
@@ -44,6 +46,15 @@ def run_luna_experiment():
                 asr_pos_property_name=TECHMO_SPACY,
                 pos_alignment_wer=POS_TECHMO_ALIGNMENT_WER,
                 pos_metrics_wer=POS_TECHMO_METRICS_WER
+            ),
+            EmbeddingWerMetricsTask(
+                task_name='EmbeddingWerMetricsTask',
+                asr_property_name=TECHMO_POLISH_ASR,  # shared constant instead of a repeated string literal
+                gold_transcript_property_name=GOLD_TRANSCRIPT,
+                metrics_property_name=WORD_TECHMO_METRICS_WER_EMBEDDINGS,
+                require_update=False,
+                embedding_transformer=WebEmbeddingTransformer('pl', 'http://localhost:5003', 'fjsd-mkwe-oius-m9h2'),
+                alignment_property_name=WORD_TECHMO_ALIGNMENT_WER_EMBEDDINGS
             )
         ],
         experiment_repository=get_multiple_files_repository(),
diff --git a/experiment/voicelab/voicelab_dependency.py b/experiment/voicelab/voicelab_dependency.py
index 58642e4..4b528c9 100644
--- a/experiment/voicelab/voicelab_dependency.py
+++ b/experiment/voicelab/voicelab_dependency.py
@@ -8,19 +8,22 @@ GOLD_TRANSCRIPT = 'gold_transcript'
 GOLD_TRANSCRIPT_SPACY = 'gold_transcript_spacy'
 
 TECHMO_POLISH_ASR = 'techmo_polish_asr'
-WORD_TECHMO_MERTICS_WER = 'word_techmo_metrics_wer'
+WORD_TECHMO_METRICS_WER = 'word_techmo_metrics_wer'
 WORD_TECHMO_ALIGNMENT_WER = 'word_techmo_alignment_wer'
 TECHMO_SPACY = 'techmo_spacy'
 POS_TECHMO_ALIGNMENT_WER = 'pos_techmo_alignment_wer'
 POS_TECHMO_METRICS_WER = 'pos_techmo_metrics_wer'
+WORD_TECHMO_METRICS_WER_EMBEDDINGS = 'word_techmo_metrics_wer_embeddings'
+WORD_TECHMO_ALIGNMENT_WER_EMBEDDINGS = 'word_techmo_alignment_wer_embeddings'
 
 AJN_POLISH_ASR = 'ajn_polish_asr'
-WORD_AJN_MERTICS_WER = 'word_ajn_metrics_wer'
+WORD_AJN_METRICS_WER = 'word_ajn_metrics_wer'
 WORD_AJN_ALIGNMENT_WER = 'word_ajn_alignment_wer'
 AJN_SPACY = 'ajn_spacy'
 POS_AJN_ALIGNMENT_WER = 'pos_ajn_alignment_wer'
 POS_AJN_METRICS_WER = 'pos_ajn_metrics_wer'
-
+WORD_AJN_METRICS_WER_EMBEDDINGS = 'word_ajn_metrics_wer_embeddings'
+WORD_AJN_ALIGNMENT_WER_EMBEDDINGS = 'word_ajn_alignment_wer_embeddings'
 
 PIPELINE_DATA_DIRECTORY = 'experiment_data/pipeline'
 EXPERIMENT_NAME = 'asr_benchmark_voicelab_cbiz_testset_20220322'
diff --git a/experiment/voicelab/voicelab_pipeline.py b/experiment/voicelab/voicelab_pipeline.py
deleted file mode 100644
index d58f92e..0000000
--- a/experiment/voicelab/voicelab_pipeline.py
+++ /dev/null
@@ -1,96 +0,0 @@
-from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask
-from experiment.pos_processing.gold_transcript_spacy_token_pos_processing_task import \
-    GoldTranscriptSpacyTokenPosProcessingTask
-from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
-from experiment.voicelab.voicelab_dependency import get_record_provider, get_repository
-from experiment.voicelab.voicelab_gold_transcript_processor import VoicelabGoldTranscriptProcessor
-from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider
-from sziszapangma.integration.asr_processor import AsrPathCacheClient
-from sziszapangma.integration.experiment_manager import ExperimentManager
-from sziszapangma.integration.path_filter import ExtensionPathFilter
-from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
-from sziszapangma.integration.repository.multi_files_experiment_repository import \
-    MultiFilesExperimentRepository
-from sziszapangma.integration.task.asr_task import AsrTask
-from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
-from sziszapangma.integration.task.gold_transcript_task import GoldTranscriptTask
-
-GOLD_TRANSCRIPT = 'gold_transcript'
-TECHMO_POLISH_ASR = 'techmo_polish_asr'
-TECHMO_POLISH_CLASSIC_WER_METRIC = 'techmo_polish_classic_wer_metric'
-TECHMO_POLISH_CLASSIC_ALIGNMENT = 'techmo_polish_classic_alignment'
-TECHMO_SPACY = 'techmo_spacy'
-GOLD_TRANSCRIPT_SPACY = 'gold_transcript_spacy'
-POS_ALIGNMENT_WER = 'pos_alignment_wer'
-POS_METRICS_WER = 'pos_metrics_wer'
-
-DATASET_DIRECTORY = 'experiment_data/dataset/voicelab_cbiz_testset_20220322'
-PIPELINE_DATA_DIRECTORY = 'experiment_data/pipeline'
-EXPERIMENT_NAME = 'asr_benchmark_voicelab_cbiz_testset_20220322'
-RELATION_MANAGER_ROOT_PATH = 'experiment_data/dataset_relation_manager_data/voicelab_cbiz_testset_20220322'
-
-
-def run_voicelab_experiment():
-    record_provider = get_record_provider()
-    experiment_processor = ExperimentManager(
-        record_id_iterator=record_provider,
-        processing_tasks=[
-            GoldTranscriptTask(
-                task_name='gold_transcript_task',
-                gold_transcript_processor=VoicelabGoldTranscriptProcessor(record_provider),
-                gold_transcript_property_name=GOLD_TRANSCRIPT,
-                require_update=False
-            ),
-            AsrTask(
-                task_name='techmo_polish_task',
-                # asr_processor=AsrWebClient('http://192.168.0.124:4999/process_asr', 'test1234'),
-                asr_processor=AsrPathCacheClient(
-                    'experiment_data/cached_asr/voicelab_cbiz_testset_20220322_techmo',
-                    record_provider,
-                    record_provider
-                ),
-                asr_property_name=TECHMO_POLISH_ASR,
-                require_update=False,
-                record_path_provider=record_provider
-            ),
-            ClassicWerMetricTask(
-                task_name='classic_wer_metric_task',
-                asr_property_name=TECHMO_POLISH_ASR,
-                gold_transcript_property_name=GOLD_TRANSCRIPT,
-                metrics_property_name=TECHMO_POLISH_CLASSIC_WER_METRIC,
-                require_update=False,
-                alignment_property_name=TECHMO_POLISH_CLASSIC_ALIGNMENT
-            ),
-            GoldTranscriptSpacyTokenPosProcessingTask(
-                task_name='gold_transcript_spacy_task',
-                input_property_name=GOLD_TRANSCRIPT,
-                spacy_property_name=GOLD_TRANSCRIPT_SPACY,
-                require_update=True
-            ),
-            AsrSpacyTokenPosProcessingTask(
-                task_name='techmo_spacy_task',
-                input_property_name=TECHMO_POLISH_ASR,
-                spacy_property_name=TECHMO_SPACY,
-                require_update=True
-            ),
-            SpacyPosWerProcessingTask(
-                task_name='PosWerProcessor',
-                require_update=False,
-                gold_transcript_pos_property_name=GOLD_TRANSCRIPT_SPACY,
-                asr_pos_property_name=TECHMO_SPACY,
-                pos_alignment_wer=POS_ALIGNMENT_WER,
-                pos_metrics_wer=POS_METRICS_WER
-            )
-        ],
-        experiment_repository=get_repository(),
-        relation_manager_provider=record_provider
-    )
-    experiment_processor.process()
-
-
-def example_run():
-    run_voicelab_experiment()
-
-
-if __name__ == '__main__':
-    example_run()
diff --git a/experiment/voicelab/voicelab_pipeline_ajn_asr.py b/experiment/voicelab/voicelab_pipeline_ajn_asr.py
index d650082..b6c4c2c 100644
--- a/experiment/voicelab/voicelab_pipeline_ajn_asr.py
+++ b/experiment/voicelab/voicelab_pipeline_ajn_asr.py
@@ -3,11 +3,12 @@ from experiment.pos_processing.gold_transcript_spacy_token_pos_processing_task i
     GoldTranscriptSpacyTokenPosProcessingTask
 from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
 from experiment.voicelab.voicelab_dependency import get_record_provider, get_repository, GOLD_TRANSCRIPT, \
-    GOLD_TRANSCRIPT_SPACY, TECHMO_POLISH_ASR, AJN_POLISH_ASR, WORD_AJN_MERTICS_WER, WORD_AJN_ALIGNMENT_WER, AJN_SPACY, \
-    POS_AJN_ALIGNMENT_WER, POS_AJN_METRICS_WER
+    GOLD_TRANSCRIPT_SPACY, TECHMO_POLISH_ASR, AJN_POLISH_ASR, WORD_AJN_METRICS_WER, WORD_AJN_ALIGNMENT_WER, AJN_SPACY, \
+    POS_AJN_ALIGNMENT_WER, POS_AJN_METRICS_WER, WORD_AJN_METRICS_WER_EMBEDDINGS, WORD_AJN_ALIGNMENT_WER_EMBEDDINGS
 from experiment.voicelab.voicelab_gold_transcript_processor import VoicelabGoldTranscriptProcessor
 from experiment.voicelab.voicelab_telco_record_provider import VoicelabTelcoRecordProvider
-from sziszapangma.integration.asr_processor import AsrPathCacheClient, AsrWebClient
+from sziszapangma.core.transformer.web_embedding_transformer import WebEmbeddingTransformer
+from sziszapangma.integration.asr_processor import AsrPathCacheClient, AsrWebClient, MultipleSourcesAsrProcessor
 from sziszapangma.integration.experiment_manager import ExperimentManager
 from sziszapangma.integration.path_filter import ExtensionPathFilter
 from sziszapangma.integration.repository.experiment_repository import ExperimentRepository
@@ -15,6 +16,7 @@ from sziszapangma.integration.repository.multi_files_experiment_repository impor
     MultiFilesExperimentRepository
 from sziszapangma.integration.task.asr_task import AsrTask
 from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
+from sziszapangma.integration.task.embedding_wer_metrics_task import EmbeddingWerMetricsTask
 from sziszapangma.integration.task.gold_transcript_task import GoldTranscriptTask
 
 
@@ -25,7 +27,11 @@ def run_voicelab_experiment():
         processing_tasks=[
             AsrTask(
                 task_name='ajn_polish_asr_task',
-                asr_processor=AsrWebClient('http://localhost:5431/process_asr', '__example_token__'),
+                asr_processor=AsrPathCacheClient(
+                    'experiment_data/cached_asr/voicelab_cbiz_testset_20220322_ajn',
+                    record_provider,
+                    record_provider
+                ),
                 asr_property_name=AJN_POLISH_ASR,
                 require_update=False,
                 record_path_provider=record_provider
@@ -34,7 +40,7 @@ def run_voicelab_experiment():
                 task_name='techmo_word_wer_processing',
                 asr_property_name=AJN_POLISH_ASR,
                 gold_transcript_property_name=GOLD_TRANSCRIPT,
-                metrics_property_name=WORD_AJN_MERTICS_WER,
+                metrics_property_name=WORD_AJN_METRICS_WER,
                 require_update=False,
                 alignment_property_name=WORD_AJN_ALIGNMENT_WER
             ),
@@ -51,6 +57,15 @@ def run_voicelab_experiment():
                 asr_pos_property_name=AJN_SPACY,
                 pos_alignment_wer=POS_AJN_ALIGNMENT_WER,
                 pos_metrics_wer=POS_AJN_METRICS_WER
+            ),
+            EmbeddingWerMetricsTask(
+                task_name='EmbeddingWerMetricsTask',
+                asr_property_name=AJN_POLISH_ASR,  # score the AJN transcript, not the Techmo one
+                gold_transcript_property_name=GOLD_TRANSCRIPT,
+                metrics_property_name=WORD_AJN_METRICS_WER_EMBEDDINGS,
+                require_update=False,
+                embedding_transformer=WebEmbeddingTransformer('pl', 'http://localhost:5003', 'fjsd-mkwe-oius-m9h2'),
+                alignment_property_name=WORD_AJN_ALIGNMENT_WER_EMBEDDINGS
             )
         ],
         experiment_repository=get_repository(),
@@ -64,6 +79,4 @@ def example_run():
 
 
 if __name__ == '__main__':
-    # example_run()
-    path = '/home/marcinwatroba/PWR_ASR/asr-benchmarks/experiment_data/dataset/voicelab_cbiz_testset_20220322/bankowe/cbiz_tc_2.agnt.wav'
-    print(AsrWebClient('http://localhost:5431/process_asr', '__example_token__').call_recognise(path))
+    example_run()
diff --git a/experiment/voicelab/voicelab_pipeline_techmo.py b/experiment/voicelab/voicelab_pipeline_techmo.py
index 03bdd64..c99e09b 100644
--- a/experiment/voicelab/voicelab_pipeline_techmo.py
+++ b/experiment/voicelab/voicelab_pipeline_techmo.py
@@ -1,12 +1,15 @@
 from experiment.pos_processing.asr_spacy_token_pos_processing_task import AsrSpacyTokenPosProcessingTask
 from experiment.pos_processing.spacy_pos_wer_processing_task import SpacyPosWerProcessingTask
 from experiment.voicelab.voicelab_dependency import get_record_provider, get_repository, GOLD_TRANSCRIPT, \
-    GOLD_TRANSCRIPT_SPACY, TECHMO_POLISH_ASR, WORD_TECHMO_MERTICS_WER, WORD_TECHMO_ALIGNMENT_WER, TECHMO_SPACY, \
-    POS_TECHMO_METRICS_WER, POS_TECHMO_ALIGNMENT_WER
+    GOLD_TRANSCRIPT_SPACY, TECHMO_POLISH_ASR, WORD_TECHMO_METRICS_WER, WORD_TECHMO_ALIGNMENT_WER, TECHMO_SPACY, \
+    POS_TECHMO_METRICS_WER, POS_TECHMO_ALIGNMENT_WER, WORD_TECHMO_METRICS_WER_EMBEDDINGS, \
+    WORD_TECHMO_ALIGNMENT_WER_EMBEDDINGS
+from sziszapangma.core.transformer.web_embedding_transformer import WebEmbeddingTransformer
 from sziszapangma.integration.asr_processor import AsrPathCacheClient
 from sziszapangma.integration.experiment_manager import ExperimentManager
 from sziszapangma.integration.task.asr_task import AsrTask
 from sziszapangma.integration.task.classic_wer_metric_task import ClassicWerMetricTask
+from sziszapangma.integration.task.embedding_wer_metrics_task import EmbeddingWerMetricsTask
 
 
 def run_voicelab_experiment():
@@ -29,7 +32,7 @@ def run_voicelab_experiment():
                 task_name='techmo_word_wer_processing',
                 asr_property_name=TECHMO_POLISH_ASR,
                 gold_transcript_property_name=GOLD_TRANSCRIPT,
-                metrics_property_name=WORD_TECHMO_MERTICS_WER,
+                metrics_property_name=WORD_TECHMO_METRICS_WER,
                 require_update=False,
                 alignment_property_name=WORD_TECHMO_ALIGNMENT_WER
             ),
@@ -46,6 +49,15 @@ def run_voicelab_experiment():
                 asr_pos_property_name=TECHMO_SPACY,
                 pos_alignment_wer=POS_TECHMO_ALIGNMENT_WER,
                 pos_metrics_wer=POS_TECHMO_METRICS_WER
+            ),
+            EmbeddingWerMetricsTask(
+                task_name='EmbeddingWerMetricsTask',
+                asr_property_name=TECHMO_POLISH_ASR,  # same property the word-WER task above reads
+                gold_transcript_property_name=GOLD_TRANSCRIPT,
+                metrics_property_name=WORD_TECHMO_METRICS_WER_EMBEDDINGS,
+                require_update=False,
+                embedding_transformer=WebEmbeddingTransformer('pl', 'http://localhost:5003', 'fjsd-mkwe-oius-m9h2'),
+                alignment_property_name=WORD_TECHMO_ALIGNMENT_WER_EMBEDDINGS
             )
         ],
         experiment_repository=get_repository(),
diff --git a/experiment_data/cached_asr/.gitignore b/experiment_data/cached_asr/.gitignore
index 127221d..b8177bc 100644
--- a/experiment_data/cached_asr/.gitignore
+++ b/experiment_data/cached_asr/.gitignore
@@ -1,3 +1,4 @@
 /luna_techmo
 /voicelab_cbiz_testset_20220322_techmo
 /luna_ajn_polish_asr
+/voicelab_cbiz_testset_20220322_ajn
diff --git a/experiment_data/cached_asr/voicelab_cbiz_testset_20220322_ajn.dvc b/experiment_data/cached_asr/voicelab_cbiz_testset_20220322_ajn.dvc
new file mode 100644
index 0000000..107d8c0
--- /dev/null
+++ b/experiment_data/cached_asr/voicelab_cbiz_testset_20220322_ajn.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: 49a38b90f1265a61b90b54f820415011.dir
+  size: 32601414
+  nfiles: 800
+  path: voicelab_cbiz_testset_20220322_ajn
diff --git a/experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/.gitignore b/experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/.gitignore
index 7aff3d6..e07b324 100644
--- a/experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/.gitignore
+++ b/experiment_data/pipeline/asr_benchmark_voicelab_cbiz_testset_20220322/.gitignore
@@ -6,3 +6,13 @@
 /techmo_spacy
 /pos_techmo_alignment_wer
 /pos_techmo_metrics_wer
+/ajn_polish_asr
+/word_ajn_metrics_wer
+/word_ajn_alignment_wer
+/ajn_spacy
+/pos_ajn_alignment_wer
+/pos_ajn_metrics_wer
+/word_techmo_metrics_wer_embeddings
+/word_techmo_alignment_wer_embeddings
+/word_ajn_metrics_wer_embeddings
+/word_ajn_alignment_wer_embeddings
diff --git a/run_repro_in_background.sh b/run_repro_in_background.sh
new file mode 100755
index 0000000..35d3286
--- /dev/null
+++ b/run_repro_in_background.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+dvc repro > repro.log 2>&1 &
diff --git a/sziszapangma/core/transformer/__pycache__/cached_embedding_transformer.cpython-38.pyc b/sziszapangma/core/transformer/__pycache__/cached_embedding_transformer.cpython-38.pyc
index 0ce5df23cfc1255b9563c274caccbf4fb6fc0826..e69f01a8dbd67b79d3cb0386b27827ac6d3021b2 100644
GIT binary patch
delta 55
zcmX@ae~_Oil$V!_0SIiPGLts)yk?Zi(9g)vP1Vm$EK1JID^Dyb%1=tv4+sy6cMJ~N
JEW)&o6#%>f5fA_X

delta 56
zcmX@ee~6zal$V!_0SLS{u_tcidCe%BsUKRLT2!o`n^=^bnOB}zQk0*RsPB?moL!P%
KuvwUC9V-CToe~%T

diff --git a/sziszapangma/integration/__pycache__/asr_processor.cpython-38.pyc b/sziszapangma/integration/__pycache__/asr_processor.cpython-38.pyc
index 79aba6f4c6f20a2b241734f0e21c01cd2656659d..adb286664a8b35871a52844a1c97afdc946c595d 100644
GIT binary patch
delta 1865
zcmaJ>OK%)S5bo}o*`1wz#tz0NwzEVD(ZrF7Ne~ZNAx;dI5NskWD`BN*HJ%>GI^&s5
z_iPkaYZk?kIB~G%f<%yPaX^-GNqzx80K~od0f-Y<4jicJS$l(t7^%Ohs;+)i)z|fO
z;_I2pTDj~H_|{$%>-U|V$~4{GnR{oj=`<ZmIN>xbHjD7L!qR5BS=P2(SlO&LtCXw|
zp6B)>!tH=<8_gQ%1@3_E1U1l)fnMY#&`Vk$1HH^EpjWg$&gm*~tAF764VNin>FPCA
z@z!Os^XvGu33~K}GMC$aqU`IPq#Z|IsLYSrGI{#fJS~=CnW$<=7uYZ7$42_0bPS~;
zKnaF40c^^sD5g8PnRLbY`T9O>xR$bHw-X5QGR!Nx)oMrWq}39Wpo%E~*AO+7UqLtl
zuuoN?)$&76O7PQn&?n8N>*hIHO209G{~Q-8lk3}y;9fV7iBx7MmWke2nHw^Wl+_XK
zC~<Sy0Xn49c*+@EZHPcN%jnkvsS`&s7zT_k`oyvHH|z3X9QES}FCk0-sA{X_MNyo1
zm^i#<9;apzatIaz=1i4`q4+@<9)zP~I*Bg&7|aarDs2FV+O)(bEuAkP4C%MIALv55
zn14pA>7V)O;w3N+(~?fupA=_Mzko2CO8eB@8z8!77V`p{FCyqw`pmac9cJ_=`_f6Q
zp3eXdJ_7^{;2I2hBONciI^e9&z9ZkWZAwX>?OJ<8UP*G?IIupXNq*N(3Vp`SZMKKh
zvl=~X9@E<^`ksF$sCT^NPQ6#tKS*6YcmQORbw!2i2xO0V7eLt?aXUgMfq{tgXlv#&
zXH|IrfgfnLiaDGx!`S2EEQkj*!YTt3aD7H!(1+CoX%U9hZ<0>a3Z%M{J}I2td_jrD
zJTBD5ETDvovH~C0R=ke3(J7;fVm0vMvI2k=t(gp-2K&2K5*OfkOb)K#>@q+=nv|0!
zTO)xHm{TOMI9(jdoIR%8;N~L}5|9sU7=_Q+6f}&-jOTbB><+gf*9CRr<8GL=J7KUI
zcZDCwB`I!*7z7y$<#Y~2xlaX_b?~o(WASn5f@7e4+9xgOP^&~tgDUW%F3Tu|PR|9M
z5eaN7Fr=mt?VbLg+stDN80a4GALe0dv7zXTBp`6I#rkA}ER)+GYysOixoZojSLqwO
zv`_cw7Q0FAIVMTiT}L!1*)l2Vl@g<GB2#aYH*p8^OY;`_6#Q<%$o3cro-j_g=|dW!
z`;geHI%bzOdS{lxus%v#-S?t8l6GjSi@=Wshi9hKp}XUIa9>SV*BUh(W!?9}ZXn)*
zNktDk4DfJ7$`4^qVgn%5q{tQq)T{wPZR#+a_Rby2X8I$W85G44(qi+_-8f^e?nSui
zk%shK=`?e7{jP!^)>tCn9q2z}hCrKpJbPw_x9$t9tuO#ozT>60ol`k|q?>-_EYL#w
y%(=L#9|;yvcdx7|;XW@qRN0H0G4F=KWeOe=8!WsNTB4K8W+hrOYEEsUHu(<`y>rh1

delta 936
zcmZva-D(q25XX1+BRSdZZju^fY*VoHi?wZ<sz`-WC<wh!TB|4$ge7JJrffFh?DoQ1
z=tTu@6p7~n)GP5)eFU$Idg*xqpTK)(PTB^<!2Wmsvy+)QzulM9pDW%2*R?hLt^eA7
z@;p2AYVc|}ck>Zwp%y}9H*LIIQNHIi9hJ*OZqI9apsj0RHe@ffkahF}qgg<o3vKjv
zr+~hQJ|8;h9py_Q+}C_}_~TN=kj7g4Oq#d4?NqYOLE7!dt?26n)FJQSD%?{dZtI7Q
z(%7FwMT%%?Do#uk20Exrdsr%q$Hvc1A2xhTTFK6!!+8ZuBnyIW+)aakpFzv3D89i9
z6rUxTMEL|V7X<C7l_XfF)=<N0V$Zw+dGXQw4OQ{Rn!G?gq`95+V`&X|H%|S`sFymx
zEHykY4y|f&bQbceQ%U@>R`(~!pCT!foJNs_AZW#LKWz~X8lR;{HIfX8MM5~Fd$eM^
z6Gg{ssxi}4qQ=mP=}H)i4GbK;YAFtFObp8<@h$Tf7RAHt7w|-m)$&&{e{^EeVE6J<
z<m)6g@tV!fH8Awe(HR#hy+oq^rN-VMdjw0e>j<lmn}&I@kek_mZ>5<%J=72M5A+cj
z8@pCrp#$z`IlVx>j3U`~zZ(x)=~GVAxKBAV>bF9E5yRuRMTDiP%Kn9mPgd|!--HG=
z*qvER_+3%A&u#q&IiJVc5nRH=b0iA*32VMUx$&4W;ktNdPcZ`IPjFuh-DS8ezPn4n
v1bEl(s9yAqD(EAS<2EWHSdy*u`r%I0Sp{?nEy<t=4wQAKJKz{ayIA=Pb%(9i

diff --git a/sziszapangma/integration/asr_processor.py b/sziszapangma/integration/asr_processor.py
index 56a8786..4d01c10 100644
--- a/sziszapangma/integration/asr_processor.py
+++ b/sziszapangma/integration/asr_processor.py
@@ -1,7 +1,7 @@
 import json
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, List
 
 import requests
 
@@ -59,4 +59,28 @@ class AsrPathCacheClient(AsrProcessor):
 
     def call_recognise(self, file_path: str) -> Dict[str, Any]:
         path = Path(self._cache_path).joinpath(f"{self.path_to_id[file_path]}.json")
+        print(f'cache path {path}')
+        if not path.exists():
+            raise Exception(f"path not exists {path}")
         return json.load(open(path, "r"))
+
+
+class MultipleSourcesAsrProcessor(AsrProcessor):
+    """Fallback chain: return the first non-None result, raise if every processor fails."""
+    processors: List[AsrProcessor]
+
+    def __init__(self, processors: List[AsrProcessor]):
+        self.processors = processors
+
+    def call_recognise(self, file_path: str) -> Dict[str, Any]:
+        last_error: Optional[Exception] = None  # kept so the final error shows the cause
+        for processor in self.processors:
+            try:
+                value = processor.call_recognise(file_path)
+            except Exception as error:  # deliberate best-effort: fall back to next source
+                print(error)
+                last_error = error
+                continue
+            if value is not None:  # a None result counts as a miss, as before
+                return value
+        raise Exception("All processors can not process record") from last_error
diff --git a/sziszapangma/integration/repository/multi_files_experiment_repository.py b/sziszapangma/integration/repository/multi_files_experiment_repository.py
index 2385858..0939495 100644
--- a/sziszapangma/integration/repository/multi_files_experiment_repository.py
+++ b/sziszapangma/integration/repository/multi_files_experiment_repository.py
@@ -23,9 +23,13 @@ class MultiFilesExperimentRepository(ExperimentRepository):
         return self._get_file_path(property_name, record_id).exists()
 
     def update_property_for_key(self, record_id: str, property_name: str, property_value: Any):
+        print(f'update {record_id} {property_name}')
         path = self._get_file_path(property_name, record_id)
+        print(path)
         path.parent.mkdir(parents=True, exist_ok=True)
+        print('created path')
         json.dump(property_value, open(path, "w"))
+        print('value saved')
 
     def delete_property_for_key(self, record_id: str, property_name: str):
         self._get_file_path(property_name, record_id).unlink()
diff --git a/sziszapangma/integration/task/__pycache__/asr_task.cpython-38.pyc b/sziszapangma/integration/task/__pycache__/asr_task.cpython-38.pyc
index a34960b88f1c6e4bbe7fb65ec4b80fc16a9658e7..2ec6d5594835708ca2244cb10668efea26f93dc6 100644
GIT binary patch
delta 277
zcmaDN@KBI9l$V!_0SJyADM)&{k+*@RJ`KpLVaQ@wz*xh$kg=AfgeiqFg(-!(mx&Q1
z&b)x7h6O0j3KVC7iKnopu(dGMu%@s}GPp1_GlEz)3_ukeFcmdS3s`Fyf%@5MSQj!g
zGNdpBGiY-9Rf#60rsWr<DkLZ7<ir=HCg-cB=VcbBhG;V0Vol2_EzYPCNlYtAEkY;&
z%TK<>lE|nv*@D$kw@MVMpeQx5BsD%SwLHE&zbHkMy$Iy7TO8@BCGiD`B^gC>K<?x&
e)_y@TAcKL8hY<qB7<neEu(dI2Y+lW_lMw*Q&`M_j

delta 151
zcmaDT_(Xs=l$V!_0SF#MXD0b><ZWQ_bOQ2f7_t}^FxD_GWUOTYvY8ez*RTNDtSO8s
zOf3vGtSQWr3@!}Kj3AZ`Lk-gcmKsK&I@TK2h0Kf$DGb33nk;^k4OtTzH6~AHbrjTO
sD+1Yaiz7X?B)%ZABx5ooTR*n|$YM4gMjl47$qU)q7}Ymxu<v990GZVyr~m)}

diff --git a/sziszapangma/integration/task/__pycache__/embedding_wer_metrics_task.cpython-38.pyc b/sziszapangma/integration/task/__pycache__/embedding_wer_metrics_task.cpython-38.pyc
index 4918ee5191cb5b9f9f58c552c124e883bb704be7..388f4fe48dd157ff91257c305c0a27b700e0f078 100644
GIT binary patch
delta 66
zcmZ22vr2|1l$V!_0SF#MXC`grdCVe{rk|0Yo2s9iSd^TZSDsi>l%JHS9}pfC?-(4k
UnTJ)IgXxy;<Xm34$>F?L0HahD6951J

delta 67
zcmZ1_vs#8Hl$V!_0SL0bol4xu^O!|8T|cxqwWwG>H?b%=Gp{_cq$ocrQQsxCIJ+dj
VU^6$XHV0Fb?&KU^xyfO?R{-3f790Qo

diff --git a/sziszapangma/integration/task/asr_task.py b/sziszapangma/integration/task/asr_task.py
index bbb60b2..3c8f444 100644
--- a/sziszapangma/integration/task/asr_task.py
+++ b/sziszapangma/integration/task/asr_task.py
@@ -36,8 +36,11 @@ class AsrTask(ProcessingTask):
         relation_manager: RelationManager,
     ) -> None:
         file_record_path = self._record_path_provider.get_path(record_id)
+        print('before call_recognise', flush=True)
         asr_result = self._asr_processor.call_recognise(file_record_path)
+        print('after call_recognise', flush=True)
         asr_result["transcription"] = [create_new_word(it) for it in asr_result["transcription"]]
+        print('after create_new_word', flush=True)
         experiment_repository.update_property_for_key(
             record_id, self._asr_property_name, asr_result
         )
-- 
GitLab