From 13f918e60840254b2e75bfc02a9cb8548e046734 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com> Date: Tue, 26 Jul 2022 13:49:56 +0200 Subject: [PATCH] Fix dep-tags --- dvc.lock | 536 +++++++++--------- dvc.yaml | 12 +- .../spacy_pos_sentence_dep_tag_processor.py | 2 +- .../pipeline/pl_common_voice/.gitignore | 4 + .../pipeline/pl_google_fleurs/.gitignore | 2 + experiment_data/pipeline/pl_luna/.gitignore | 2 + .../pipeline/pl_minds14/.gitignore | 8 + .../alignment/distance_matrix_calculator.py | 7 +- .../__pycache__/wer_calculator.cpython-38.pyc | Bin 2273 -> 2388 bytes sziszapangma/core/wer/wer_calculator.py | 6 +- .../embedding_wer_metrics_task.cpython-38.pyc | Bin 3626 -> 4158 bytes .../task/embedding_wer_metrics_task.py | 12 +- 12 files changed, 312 insertions(+), 279 deletions(-) diff --git a/dvc.lock b/dvc.lock index 0155b5e..b58cc07 100644 --- a/dvc.lock +++ b/dvc.lock @@ -483,7 +483,7 @@ stages: nfiles: 0 outs: - path: experiment_data/dataset_relation_manager_data/pl_minds14 - md5: 6608d45aee735eaf0a387c52d01c9fa8.dir + md5: 40bb7d02cc76d5b1093955b5046cc3c4.dir size: 3545062 nfiles: 1124 pipeline_prepare_relation_manager@0: @@ -580,12 +580,12 @@ stages: size: 0 nfiles: 0 - path: experiment_data/dataset_relation_manager_data/pl_minds14 - md5: 6608d45aee735eaf0a387c52d01c9fa8.dir + md5: 40bb7d02cc76d5b1093955b5046cc3c4.dir size: 3545062 nfiles: 1124 outs: - path: experiment_data/pipeline/pl_minds14/gold_transcript - md5: d2d48495000b3ea7ea6f4212ddb113a5.dir + md5: d4da8b3a8c5f044af1403d70d8f60fcf.dir size: 689374 nfiles: 562 pipeline_gold_transcript@4: @@ -883,7 +883,7 @@ stages: nfiles: 0 outs: - path: experiment_data/pipeline/pl_minds14/ajn__result - md5: 6fd1b042a7cb6d6f200c2eb9a926b7f1.dir + md5: 4094dd4b22895a0a4ce82793cd6a8b0f.dir size: 974727 nfiles: 559 pipeline_asr_result@10: @@ -1015,12 +1015,12 @@ stages: size: 22935 nfiles: 758 - path: experiment_data/pipeline/pl_google_fleurs/ajn__word_wer_embeddings_alignment - md5: 06e64fca6fc73e1cc9e7e86b21eb11f3.dir - size: 8318047 + md5: 54a91c0e615bccc40da99b525cb566bb.dir + size: 7785758 nfiles: 758 - path: experiment_data/pipeline/pl_google_fleurs/ajn__word_wer_embeddings_metrics - md5: 68da1ded61389ae5d866bc762c61363c.dir - size: 33029 + md5: a6c4e1185a8adc98f36b95f421f06c5b.dir + size: 52372 nfiles: 758 pipeline_word_wer@6: cmd: PYTHONPATH=. python experiment/pipeline_process_word_wer.py --dataset=pl_voicelab_cbiz @@ -1159,12 +1159,12 @@ stages: size: 16835 nfiles: 494 - path: experiment_data/pipeline/pl_luna/ajn__word_wer_embeddings_alignment - md5: 6be0a1c035f4a84a9035bfba1458cdac.dir - size: 43099546 + md5: d3c1d515e47c5cb9c71f56e1ae65de29.dir + size: 42557665 nfiles: 494 - path: experiment_data/pipeline/pl_luna/ajn__word_wer_embeddings_metrics - md5: 4f368d2ba1c5a54d5e3ab69a7581549e.dir - size: 19326 + md5: 023357c311e695217fa66463be6f5eb8.dir + size: 33868 nfiles: 494 pipeline_word_wer@10: cmd: PYTHONPATH=. python experiment/pipeline_process_word_wer.py --dataset=pl_google_fleurs @@ -1267,12 +1267,12 @@ stages: size: 209927 nfiles: 8155 - path: experiment_data/pipeline/pl_common_voice/ajn__word_wer_embeddings_alignment - md5: efb06f9897e62077366362b2aab25d8c.dir - size: 36932578 + md5: 51798a8e9b7239c8833274dbf6644d1e.dir + size: 32293142 nfiles: 8155 - path: experiment_data/pipeline/pl_common_voice/ajn__word_wer_embeddings_metrics - md5: a9132386ed7ccffcba68dfa0a1dca7ee.dir - size: 324358 + md5: 63233cd4c3a18f791f228e0e385cdde1.dir + size: 561188 nfiles: 8155 pipeline_word_wer@16: cmd: PYTHONPATH=. python experiment/pipeline_process_word_wer.py --dataset=pl_minds14 @@ -1286,16 +1286,16 @@ stages: size: 0 nfiles: 0 - path: experiment_data/pipeline/pl_minds14/ajn__result - md5: 6fd1b042a7cb6d6f200c2eb9a926b7f1.dir + md5: 4094dd4b22895a0a4ce82793cd6a8b0f.dir size: 974727 nfiles: 559 - path: experiment_data/pipeline/pl_minds14/gold_transcript - md5: d2d48495000b3ea7ea6f4212ddb113a5.dir + md5: d4da8b3a8c5f044af1403d70d8f60fcf.dir size: 689374 nfiles: 562 outs: - path: experiment_data/pipeline/pl_minds14/ajn__word_wer_classic_alignment - md5: c03a478840305afb1eadf4752b3a5678.dir + md5: f5fd8a87dfcbf4e998b9a1d215186921.dir size: 2851613 nfiles: 559 - path: experiment_data/pipeline/pl_minds14/ajn__word_wer_classic_metrics @@ -1303,12 +1303,12 @@ stages: size: 15213 nfiles: 559 - path: experiment_data/pipeline/pl_minds14/ajn__word_wer_embeddings_alignment - md5: ae577cd5886aced83d7de4ba47bb4457.dir - size: 5960904 + md5: fb57500ec3f203fc88bbe20aa877c735.dir + size: 5671751 nfiles: 559 - path: experiment_data/pipeline/pl_minds14/ajn__word_wer_embeddings_metrics - md5: c599599e5935075cd26ac89e0d3b5f1b.dir - size: 22438 + md5: 55349a39a515ae9b11e49dfc98791a8d.dir + size: 38411 nfiles: 559 pipeline_word_wer@11: cmd: PYTHONPATH=. python experiment/pipeline_process_word_wer.py --dataset=pl_luna @@ -3691,33 +3691,33 @@ stages: size: 7147 nfiles: 559 pipeline_spacy_tag_wer@4: - cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_voicelab_cbiz + cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_google_fleurs --asr=google deps: - path: experiment/pipeline_process_spacy_dep_tag_wer.py md5: 83fc16ed68e85cfd89d8d84dc61d6d0f size: 1489 - - path: experiment_data/dataset/pl_voicelab_cbiz - md5: 3c2b18e1f1f89e4c5ad7b254e472b25e.dir - size: 4803739404 - nfiles: 1600 - - path: experiment_data/pipeline/pl_voicelab_cbiz/gold_transcript - md5: ebffd3814a48564f4e33b9a4e0956af3.dir - size: 21846798 - nfiles: 800 - - path: experiment_data/pipeline/pl_voicelab_cbiz/google__result - md5: cb6322c8c0c6d3cf557b93bf52efd0dc.dir - size: 27432599 - nfiles: 799 + - path: experiment_data/dataset/pl_google_fleurs + md5: dfcb8cf40b4a1e1a62f9ada00468cca9.dir + size: 236272072 + nfiles: 758 + - path: experiment_data/pipeline/pl_google_fleurs/gold_transcript + md5: 607f551eca5dabcca0caf31c87bd2ac6.dir + size: 975209 + nfiles: 758 + - path: experiment_data/pipeline/pl_google_fleurs/google__result + md5: 6e0d7eb490eadd8dcc3c5452ba85932b.dir + size: 1377134 + nfiles: 758 outs: - - path: experiment_data/pipeline/pl_voicelab_cbiz/google__spacy_dep_tag_alignment - md5: e2028c14acd625109a465c36ef166e7a.dir - size: 83052124 - nfiles: 799 - - path: experiment_data/pipeline/pl_voicelab_cbiz/google__spacy_dep_tag_metrics - md5: 43d79d47ba1e91e86daf1f66aa18c941.dir - size: 14239 - nfiles: 799 + - path: experiment_data/pipeline/pl_google_fleurs/google__spacy_dep_tag_alignment + md5: 95fcbc37e49ff7f2d5c0e610446f4936.dir + size: 3747833 + nfiles: 758 + - path: experiment_data/pipeline/pl_google_fleurs/google__spacy_dep_tag_metrics + md5: 6ee2469b6f6008337564fd05ad07725c.dir + size: 9422 + nfiles: 758 pipeline_spacy_tag_wer@17: cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_minds14 --asr=techmo @@ -3747,33 +3747,33 @@ stages: size: 6095 nfiles: 562 pipeline_spacy_tag_wer@10: - cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_google_fleurs - --asr=wav2vec2 + cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_luna + --asr=techmo deps: - path: experiment/pipeline_process_spacy_dep_tag_wer.py md5: 83fc16ed68e85cfd89d8d84dc61d6d0f size: 1489 - - path: experiment_data/dataset/pl_google_fleurs - md5: dfcb8cf40b4a1e1a62f9ada00468cca9.dir - size: 236272072 - nfiles: 758 - - path: experiment_data/pipeline/pl_google_fleurs/gold_transcript - md5: 607f551eca5dabcca0caf31c87bd2ac6.dir - size: 975209 - nfiles: 758 - - path: experiment_data/pipeline/pl_google_fleurs/wav2vec2__result - md5: bf9c77e34376bcda73dbdb6afee55c8c.dir - size: 5137721 - nfiles: 758 + - path: experiment_data/dataset/pl_luna + md5: d342155b1871e881797cf7da09d5dc3c.dir + size: 1578358645 + nfiles: 4500 + - path: experiment_data/pipeline/pl_luna/gold_transcript + md5: 4c3e09acb7ffac0ef5b117a38515e3a9.dir + size: 6706925 + nfiles: 500 + - path: experiment_data/pipeline/pl_luna/techmo__result + md5: 0e596570e1502b38588427bc72dcc006.dir + size: 9697519 + nfiles: 500 outs: - - path: experiment_data/pipeline/pl_google_fleurs/wav2vec2__spacy_dep_tag_alignment - md5: 4ca975e9b42db749a368760f5190805b.dir - size: 3737151 - nfiles: 758 - - path: experiment_data/pipeline/pl_google_fleurs/wav2vec2__spacy_dep_tag_metrics - md5: 782cc84e9116281dfc28734b2ae4a5ea.dir - size: 9004 - nfiles: 758 + - path: experiment_data/pipeline/pl_luna/techmo__spacy_dep_tag_alignment + md5: fc7318a6f7511ad1436d71b994cb3aaf.dir + size: 21165688 + nfiles: 500 + - path: experiment_data/pipeline/pl_luna/techmo__spacy_dep_tag_metrics + md5: 238ef4a951d198de3573a67f0fbb2e75.dir + size: 8680 + nfiles: 500 pipeline_spacy_tag_wer@0: cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_common_voice --asr=google @@ -3795,12 +3795,12 @@ stages: nfiles: 8143 outs: - path: experiment_data/pipeline/pl_common_voice/google__spacy_dep_tag_alignment - md5: 38e2f031c443eea54bf86af578d2b79d.dir - size: 18316770 + md5: f043c22d203a7efd123232f1a2a6b4ad.dir + size: 18474982 nfiles: 8143 - path: experiment_data/pipeline/pl_common_voice/google__spacy_dep_tag_metrics - md5: 117611317774e81fb482ba9c71ec806b.dir - size: 97235 + md5: f12ee96b46679884f65c59fef8ce74ea.dir + size: 96519 nfiles: 8143 pipeline_spacy_tag_wer@18: cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_minds14 @@ -3832,7 +3832,7 @@ stages: nfiles: 562 pipeline_spacy_tag_wer@15: cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_minds14 - --asr=google + --asr=wav2vec2 deps: - path: experiment/pipeline_process_spacy_dep_tag_wer.py md5: 83fc16ed68e85cfd89d8d84dc61d6d0f @@ -3842,78 +3842,78 @@ stages: size: 0 nfiles: 0 - path: experiment_data/pipeline/pl_minds14/gold_transcript - md5: d2d48495000b3ea7ea6f4212ddb113a5.dir + md5: d4da8b3a8c5f044af1403d70d8f60fcf.dir size: 689374 nfiles: 562 - - path: experiment_data/pipeline/pl_minds14/google__result - md5: 3f7a79298a5156fd2b023e673326e72f.dir - size: 985004 + - path: experiment_data/pipeline/pl_minds14/wav2vec2__result + md5: 5658da01ecdce39ed99156bbc7f2dc62.dir + size: 3523907 nfiles: 562 outs: - - path: experiment_data/pipeline/pl_minds14/google__spacy_dep_tag_alignment - md5: 715afeb1c31961d4680f8b98ba61d4ad.dir - size: 2659852 + - path: experiment_data/pipeline/pl_minds14/wav2vec2__spacy_dep_tag_alignment + md5: fd108bf3d67c339ebc12a0965a6e4c18.dir + size: 2752423 nfiles: 562 - - path: experiment_data/pipeline/pl_minds14/google__spacy_dep_tag_metrics - md5: aeda105b01366dee65935d3c07fe3444.dir - size: 4970 + - path: experiment_data/pipeline/pl_minds14/wav2vec2__spacy_dep_tag_metrics + md5: 88f7aef65d580d59cdc78610dd98e616.dir + size: 7100 nfiles: 562 pipeline_spacy_tag_wer@6: - cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_voicelab_cbiz + cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_google_fleurs --asr=techmo deps: - path: experiment/pipeline_process_spacy_dep_tag_wer.py md5: 83fc16ed68e85cfd89d8d84dc61d6d0f size: 1489 - - path: experiment_data/dataset/pl_voicelab_cbiz - md5: 3c2b18e1f1f89e4c5ad7b254e472b25e.dir - size: 4803739404 - nfiles: 1600 - - path: experiment_data/pipeline/pl_voicelab_cbiz/gold_transcript - md5: ebffd3814a48564f4e33b9a4e0956af3.dir - size: 21846798 - nfiles: 800 - - path: experiment_data/pipeline/pl_voicelab_cbiz/techmo__result - md5: e544489fc21b6a3e6d4fd68ab8c2c069.dir - size: 39158267 - nfiles: 800 + - path: experiment_data/dataset/pl_google_fleurs + md5: dfcb8cf40b4a1e1a62f9ada00468cca9.dir + size: 236272072 + nfiles: 758 + - path: experiment_data/pipeline/pl_google_fleurs/gold_transcript + md5: 607f551eca5dabcca0caf31c87bd2ac6.dir + size: 975209 + nfiles: 758 + - path: experiment_data/pipeline/pl_google_fleurs/techmo__result + md5: 33c60c2b8bd57c3aedd7161256ad8cfa.dir + size: 1880403 + nfiles: 758 outs: - - path: experiment_data/pipeline/pl_voicelab_cbiz/techmo__spacy_dep_tag_alignment - md5: 19c1d8b3e8704af06e943ba6962cf9ad.dir - size: 81650836 - nfiles: 800 - - path: experiment_data/pipeline/pl_voicelab_cbiz/techmo__spacy_dep_tag_metrics - md5: 11320499f29d2d7bfce68d35fb352b83.dir - size: 14334 - nfiles: 800 + - path: experiment_data/pipeline/pl_google_fleurs/techmo__spacy_dep_tag_alignment + md5: 997d8e36cd023245065af9c1c3db1d72.dir + size: 3743812 + nfiles: 758 + - path: experiment_data/pipeline/pl_google_fleurs/techmo__spacy_dep_tag_metrics + md5: 386a8988937349d2ab69a4a335d0d270.dir + size: 9672 + nfiles: 758 pipeline_spacy_tag_wer@14: - cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_luna - --asr=wav2vec2 + cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_minds14 + --asr=techmo deps: - path: experiment/pipeline_process_spacy_dep_tag_wer.py md5: 83fc16ed68e85cfd89d8d84dc61d6d0f size: 1489 - - path: experiment_data/dataset/pl_luna - md5: d342155b1871e881797cf7da09d5dc3c.dir - size: 1578358645 - nfiles: 4500 - - path: experiment_data/pipeline/pl_luna/gold_transcript - md5: 4c3e09acb7ffac0ef5b117a38515e3a9.dir - size: 6706925 - nfiles: 500 - - path: experiment_data/pipeline/pl_luna/wav2vec2__result - md5: 9c63b061ac7763144bca121e163ee7aa.dir - size: 20658485 - nfiles: 456 + - path: experiment_data/dataset/pl_minds14 + md5: d751713988987e9331980363e24189ce.dir + size: 0 + nfiles: 0 + - path: experiment_data/pipeline/pl_minds14/gold_transcript + md5: d4da8b3a8c5f044af1403d70d8f60fcf.dir + size: 689374 + nfiles: 562 + - path: experiment_data/pipeline/pl_minds14/techmo__result + md5: 4c43636b4773f2bf9a2153ef3393a558.dir + size: 1336305 + nfiles: 562 outs: - - path: experiment_data/pipeline/pl_luna/wav2vec2__spacy_dep_tag_alignment - md5: 6716464936f4f35ba81a43eb2c2f37b0.dir - size: 17967467 - nfiles: 456 - - path: experiment_data/pipeline/pl_luna/wav2vec2__spacy_dep_tag_metrics - md5: 7848ddff997fd231f3857ff30dfd7154.dir - size: 7940 - nfiles: 456 + - path: experiment_data/pipeline/pl_minds14/techmo__spacy_dep_tag_alignment + md5: 66efb4e5647eda2e2ab3116445bdf9b5.dir + size: 2666883 + nfiles: 562 + - path: experiment_data/pipeline/pl_minds14/techmo__spacy_dep_tag_metrics + md5: 15c9ef006daec951119079da2794dcad.dir + size: 6736 + nfiles: 562 pipeline_spacy_tag_wer@16: cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_minds14 --asr=ajn @@ -3943,33 +3943,33 @@ stages: size: 7059 nfiles: 559 pipeline_spacy_tag_wer@5: - cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_voicelab_cbiz + cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_google_fleurs --asr=ajn deps: - path: experiment/pipeline_process_spacy_dep_tag_wer.py md5: 83fc16ed68e85cfd89d8d84dc61d6d0f size: 1489 - - path: experiment_data/dataset/pl_voicelab_cbiz - md5: 3c2b18e1f1f89e4c5ad7b254e472b25e.dir - size: 4803739404 - nfiles: 1600 - - path: experiment_data/pipeline/pl_voicelab_cbiz/ajn__result - md5: 7de1137f44fad26766da0fc309720160.dir - size: 22765926 - nfiles: 800 - - path: experiment_data/pipeline/pl_voicelab_cbiz/gold_transcript - md5: ebffd3814a48564f4e33b9a4e0956af3.dir - size: 21846798 - nfiles: 800 + - path: experiment_data/dataset/pl_google_fleurs + md5: dfcb8cf40b4a1e1a62f9ada00468cca9.dir + size: 236272072 + nfiles: 758 + - path: experiment_data/pipeline/pl_google_fleurs/ajn__result + md5: 545e63a6daf9c46387c1d7d40b85499f.dir + size: 1413262 + nfiles: 758 + - path: experiment_data/pipeline/pl_google_fleurs/gold_transcript + md5: 607f551eca5dabcca0caf31c87bd2ac6.dir + size: 975209 + nfiles: 758 outs: - - path: experiment_data/pipeline/pl_voicelab_cbiz/ajn__spacy_dep_tag_alignment - md5: 3d4a9a912756443a1de46cf91f6e5805.dir - size: 78539613 - nfiles: 800 - - path: experiment_data/pipeline/pl_voicelab_cbiz/ajn__spacy_dep_tag_metrics - md5: 7ec3bb7c838e7f06b8a1dbe7a68faac2.dir - size: 13753 - nfiles: 800 + - path: experiment_data/pipeline/pl_google_fleurs/ajn__spacy_dep_tag_alignment + md5: 21d9be660fc3037a8dd6dc1c93c0499a.dir + size: 3834884 + nfiles: 758 + - path: experiment_data/pipeline/pl_google_fleurs/ajn__spacy_dep_tag_metrics + md5: 27dbce8684441f84dd51327f1ed07e7d.dir + size: 10594 + nfiles: 758 pipeline_spacy_tag_wer@2: cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_common_voice --asr=techmo @@ -3991,12 +3991,12 @@ stages: nfiles: 8136 outs: - path: experiment_data/pipeline/pl_common_voice/techmo__spacy_dep_tag_alignment - md5: 3e1f2b39cd9d82679013649a1ad8d983.dir - size: 18192387 + md5: 7f52c28043e0a7b311bd39e877998834.dir + size: 18352367 nfiles: 8136 - path: experiment_data/pipeline/pl_common_voice/techmo__spacy_dep_tag_metrics - md5: 61c69fcd287051f4ab7d1ffcc68a9aca.dir - size: 96845 + md5: 3d8555026f88adec199bcb380e30a0f0.dir + size: 96139 nfiles: 8136 pipeline_wikineiural_ner@18: cmd: PYTHONPATH=. python experiment/pipeline_process_wikineural_ner.py --dataset=pl_minds14 @@ -4027,33 +4027,33 @@ stages: size: 8602 nfiles: 562 pipeline_spacy_tag_wer@8: - cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_google_fleurs - --asr=ajn + cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_luna + --asr=google deps: - path: experiment/pipeline_process_spacy_dep_tag_wer.py md5: 83fc16ed68e85cfd89d8d84dc61d6d0f size: 1489 - - path: experiment_data/dataset/pl_google_fleurs - md5: dfcb8cf40b4a1e1a62f9ada00468cca9.dir - size: 236272072 - nfiles: 758 - - path: experiment_data/pipeline/pl_google_fleurs/ajn__result - md5: 545e63a6daf9c46387c1d7d40b85499f.dir - size: 1413262 - nfiles: 758 - - path: experiment_data/pipeline/pl_google_fleurs/gold_transcript - md5: 607f551eca5dabcca0caf31c87bd2ac6.dir - size: 975209 - nfiles: 758 + - path: experiment_data/dataset/pl_luna + md5: d342155b1871e881797cf7da09d5dc3c.dir + size: 1578358645 + nfiles: 4500 + - path: experiment_data/pipeline/pl_luna/gold_transcript + md5: 4c3e09acb7ffac0ef5b117a38515e3a9.dir + size: 6706925 + nfiles: 500 + - path: experiment_data/pipeline/pl_luna/google__result + md5: 8e4bf67df4dccd218d4d7c3de69688a4.dir + size: 5346497 + nfiles: 500 outs: - - path: experiment_data/pipeline/pl_google_fleurs/ajn__spacy_dep_tag_alignment - md5: 8ec2e9fc88d4b8ce5032bf809c1c025f.dir - size: 3799802 - nfiles: 758 - - path: experiment_data/pipeline/pl_google_fleurs/ajn__spacy_dep_tag_metrics - md5: 0d95a4abb4a33aef3e242a304fd58698.dir - size: 10569 - nfiles: 758 + - path: experiment_data/pipeline/pl_luna/google__spacy_dep_tag_alignment + md5: 5fc24c54101bce2e858b08f4c47e0667.dir + size: 19568605 + nfiles: 500 + - path: experiment_data/pipeline/pl_luna/google__spacy_dep_tag_metrics + md5: f64735e07b7c460895d1ccf8e4d0884c.dir + size: 8466 + nfiles: 500 pipeline_word_wer@18: cmd: PYTHONPATH=. python experiment/pipeline_process_word_wer.py --dataset=pl_minds14 --asr=wav2vec2 @@ -4091,33 +4091,33 @@ stages: size: 37892 nfiles: 562 pipeline_spacy_tag_wer@12: - cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_luna - --asr=ajn + cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_minds14 + --asr=google deps: - path: experiment/pipeline_process_spacy_dep_tag_wer.py md5: 83fc16ed68e85cfd89d8d84dc61d6d0f size: 1489 - - path: experiment_data/dataset/pl_luna - md5: d342155b1871e881797cf7da09d5dc3c.dir - size: 1578358645 - nfiles: 4500 - - path: experiment_data/pipeline/pl_luna/ajn__result - md5: 653d65e186a7d05958ce3cbef219038c.dir - size: 6159899 - nfiles: 494 - - path: experiment_data/pipeline/pl_luna/gold_transcript - md5: 4c3e09acb7ffac0ef5b117a38515e3a9.dir - size: 6706925 - nfiles: 500 + - path: experiment_data/dataset/pl_minds14 + md5: d751713988987e9331980363e24189ce.dir + size: 0 + nfiles: 0 + - path: experiment_data/pipeline/pl_minds14/gold_transcript + md5: d4da8b3a8c5f044af1403d70d8f60fcf.dir + size: 689374 + nfiles: 562 + - path: experiment_data/pipeline/pl_minds14/google__result + md5: 3f7a79298a5156fd2b023e673326e72f.dir + size: 985004 + nfiles: 562 outs: - - path: experiment_data/pipeline/pl_luna/ajn__spacy_dep_tag_alignment - md5: 034d072825c711a824f1280f4a390f74.dir - size: 21936929 - nfiles: 494 - - path: experiment_data/pipeline/pl_luna/ajn__spacy_dep_tag_metrics - md5: 6c6bb673ea5f64c9d851878c9d8a7c09.dir - size: 8444 - nfiles: 494 + - path: experiment_data/pipeline/pl_minds14/google__spacy_dep_tag_alignment + md5: 76be498f539e45e5650695e29f22b436.dir + size: 2685018 + nfiles: 562 + - path: experiment_data/pipeline/pl_minds14/google__spacy_dep_tag_metrics + md5: 96aea021765fd076fc534e9f09b29037.dir + size: 5117 + nfiles: 562 pipeline_spacy_tag_wer@3: cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_common_voice --asr=ajn @@ -4139,16 +4139,16 @@ stages: nfiles: 8155 outs: - path: experiment_data/pipeline/pl_common_voice/ajn__spacy_dep_tag_alignment - md5: 10af363d90689138f55e3295f562efc4.dir - size: 19159060 + md5: a735ec3c634bbe034cb67f7a54fb0d2f.dir + size: 19294281 nfiles: 8155 - path: experiment_data/pipeline/pl_common_voice/ajn__spacy_dep_tag_metrics - md5: 6094fb960e2eab979ecb33d40a253531.dir - size: 95146 + md5: 516d26ee39867a1166c51edb014ad897.dir + size: 94253 nfiles: 8155 pipeline_spacy_tag_wer@11: cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_luna - --asr=google + --asr=wav2vec2 deps: - path: experiment/pipeline_process_spacy_dep_tag_wer.py md5: 83fc16ed68e85cfd89d8d84dc61d6d0f @@ -4161,47 +4161,47 @@ stages: md5: 4c3e09acb7ffac0ef5b117a38515e3a9.dir size: 6706925 nfiles: 500 - - path: experiment_data/pipeline/pl_luna/google__result - md5: 8e4bf67df4dccd218d4d7c3de69688a4.dir - size: 5346497 - nfiles: 500 + - path: experiment_data/pipeline/pl_luna/wav2vec2__result + md5: 9c63b061ac7763144bca121e163ee7aa.dir + size: 20658485 + nfiles: 456 outs: - - path: experiment_data/pipeline/pl_luna/google__spacy_dep_tag_alignment - md5: 4663cdc1bb88d7d6de3691c734fe0ab6.dir - size: 19342263 - nfiles: 500 - - path: experiment_data/pipeline/pl_luna/google__spacy_dep_tag_metrics - md5: a65dd7d74319da06f4ebaca08cde30ce.dir - size: 8659 - nfiles: 500 + - path: experiment_data/pipeline/pl_luna/wav2vec2__spacy_dep_tag_alignment + md5: 4edb321a6629205a105e76d48ab834ff.dir + size: 18188630 + nfiles: 456 + - path: experiment_data/pipeline/pl_luna/wav2vec2__spacy_dep_tag_metrics + md5: d3afeff3bf782eef2d0d34e8fdebee8f.dir + size: 7784 + nfiles: 456 pipeline_spacy_tag_wer@9: - cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_google_fleurs - --asr=techmo + cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_luna + --asr=ajn deps: - path: experiment/pipeline_process_spacy_dep_tag_wer.py md5: 83fc16ed68e85cfd89d8d84dc61d6d0f size: 1489 - - path: experiment_data/dataset/pl_google_fleurs - md5: dfcb8cf40b4a1e1a62f9ada00468cca9.dir - size: 236272072 - nfiles: 758 - - path: experiment_data/pipeline/pl_google_fleurs/gold_transcript - md5: 607f551eca5dabcca0caf31c87bd2ac6.dir - size: 975209 - nfiles: 758 - - path: experiment_data/pipeline/pl_google_fleurs/techmo__result - md5: 33c60c2b8bd57c3aedd7161256ad8cfa.dir - size: 1880403 - nfiles: 758 + - path: experiment_data/dataset/pl_luna + md5: d342155b1871e881797cf7da09d5dc3c.dir + size: 1578358645 + nfiles: 4500 + - path: experiment_data/pipeline/pl_luna/ajn__result + md5: 653d65e186a7d05958ce3cbef219038c.dir + size: 6159899 + nfiles: 494 + - path: experiment_data/pipeline/pl_luna/gold_transcript + md5: 4c3e09acb7ffac0ef5b117a38515e3a9.dir + size: 6706925 + nfiles: 500 outs: - - path: experiment_data/pipeline/pl_google_fleurs/techmo__spacy_dep_tag_alignment - md5: 18da1eb1ac1485f74337ea502e395b57.dir - size: 3707699 - nfiles: 758 - - path: experiment_data/pipeline/pl_google_fleurs/techmo__spacy_dep_tag_metrics - md5: 9cae08bbb8a6331d06a33dbbb4a16301.dir - size: 9662 - nfiles: 758 + - path: experiment_data/pipeline/pl_luna/ajn__spacy_dep_tag_alignment + md5: 39fb1ccfff7b9b87a8d5606d172cbcc9.dir + size: 22110364 + nfiles: 494 + - path: experiment_data/pipeline/pl_luna/ajn__spacy_dep_tag_metrics + md5: 607486e6c531d49e5a093cc5d0dda949.dir + size: 8366 + nfiles: 494 pipeline_spacy_tag_wer@1: cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_common_voice --asr=wav2vec2 @@ -4223,12 +4223,12 @@ stages: nfiles: 8154 outs: - path: experiment_data/pipeline/pl_common_voice/wav2vec2__spacy_dep_tag_alignment - md5: c46f35654ac42c3ddfd14e0197f36ae5.dir - size: 18349138 + md5: 09fbe03eafa4948e0d3009ef392e9c40.dir + size: 18505763 nfiles: 8154 - path: experiment_data/pipeline/pl_common_voice/wav2vec2__spacy_dep_tag_metrics - md5: 71381fa7fd6c0cdba00f25c17fd6be5e.dir - size: 97426 + md5: d1bc1925fe39ccb98e8bb085a1b1b24f.dir + size: 96041 nfiles: 8154 pipeline_flair_upos@18: cmd: PYTHONPATH=. python experiment/pipeline_process_flair_upos.py --dataset=pl_minds14 @@ -4287,36 +4287,36 @@ stages: size: 6840 nfiles: 562 pipeline_spacy_tag_wer@13: - cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_luna - --asr=techmo + cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_minds14 + --asr=ajn deps: - path: experiment/pipeline_process_spacy_dep_tag_wer.py md5: 83fc16ed68e85cfd89d8d84dc61d6d0f size: 1489 - - path: experiment_data/dataset/pl_luna - md5: d342155b1871e881797cf7da09d5dc3c.dir - size: 1578358645 - nfiles: 4500 - - path: experiment_data/pipeline/pl_luna/gold_transcript - md5: 4c3e09acb7ffac0ef5b117a38515e3a9.dir - size: 6706925 - nfiles: 500 - - path: experiment_data/pipeline/pl_luna/techmo__result - md5: 0e596570e1502b38588427bc72dcc006.dir - size: 9697519 - nfiles: 500 + - path: experiment_data/dataset/pl_minds14 + md5: d751713988987e9331980363e24189ce.dir + size: 0 + nfiles: 0 + - path: experiment_data/pipeline/pl_minds14/ajn__result + md5: 4094dd4b22895a0a4ce82793cd6a8b0f.dir + size: 974727 + nfiles: 559 + - path: experiment_data/pipeline/pl_minds14/gold_transcript + md5: d4da8b3a8c5f044af1403d70d8f60fcf.dir + size: 689374 + nfiles: 562 outs: - - path: experiment_data/pipeline/pl_luna/techmo__spacy_dep_tag_alignment - md5: baefcd5dfadd9c62d6fc71ba0ac31fa9.dir - size: 20897599 - nfiles: 500 - - path: experiment_data/pipeline/pl_luna/techmo__spacy_dep_tag_metrics - md5: 1478707020a96496b50eb732207c290e.dir - size: 8841 - nfiles: 500 + - path: experiment_data/pipeline/pl_minds14/ajn__spacy_dep_tag_alignment + md5: 7635cf48e907c8d2939fa9ef6870cd6a.dir + size: 3200084 + nfiles: 559 + - path: experiment_data/pipeline/pl_minds14/ajn__spacy_dep_tag_metrics + md5: 54a8912bab4cd197f01719d5340ef7e9.dir + size: 6867 + nfiles: 559 pipeline_spacy_tag_wer@7: cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py --dataset=pl_google_fleurs - --asr=google + --asr=wav2vec2 deps: - path: experiment/pipeline_process_spacy_dep_tag_wer.py md5: 83fc16ed68e85cfd89d8d84dc61d6d0f @@ -4329,18 +4329,18 @@ stages: md5: 607f551eca5dabcca0caf31c87bd2ac6.dir size: 975209 nfiles: 758 - - path: experiment_data/pipeline/pl_google_fleurs/google__result - md5: 6e0d7eb490eadd8dcc3c5452ba85932b.dir - size: 1377134 + - path: experiment_data/pipeline/pl_google_fleurs/wav2vec2__result + md5: bf9c77e34376bcda73dbdb6afee55c8c.dir + size: 5137721 nfiles: 758 outs: - - path: experiment_data/pipeline/pl_google_fleurs/google__spacy_dep_tag_alignment - md5: c15aa30b165152fac6813cd092763242.dir - size: 3712618 + - path: experiment_data/pipeline/pl_google_fleurs/wav2vec2__spacy_dep_tag_alignment + md5: 8503661e0ee89ff96690e245f3144807.dir + size: 3772188 nfiles: 758 - - path: experiment_data/pipeline/pl_google_fleurs/google__spacy_dep_tag_metrics - md5: 56dddb48cea2022b91fd4323efd43a8b.dir - size: 9213 + - path: experiment_data/pipeline/pl_google_fleurs/wav2vec2__spacy_dep_tag_metrics + md5: be82f4400be9dd5f01a7a4f73c357b1c.dir + size: 9083 nfiles: 758 pipeline_spacy_ner_wer@18: cmd: PYTHONPATH=. python experiment/pipeline_process_spacy_ner_wer.py --dataset=pl_minds14 diff --git a/dvc.yaml b/dvc.yaml index 09be1b7..4f3541c 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -379,12 +379,12 @@ stages: - dataset: pl_common_voice asr: ajn - - dataset: pl_voicelab_cbiz - asr: google - - dataset: pl_voicelab_cbiz - asr: ajn - - dataset: pl_voicelab_cbiz - asr: techmo +# - dataset: pl_voicelab_cbiz +# asr: google +# - dataset: pl_voicelab_cbiz +# asr: ajn +# - dataset: pl_voicelab_cbiz +# asr: techmo - dataset: pl_google_fleurs asr: google diff --git a/experiment/sentence_wer_processor/spacy_pos_sentence_dep_tag_processor.py b/experiment/sentence_wer_processor/spacy_pos_sentence_dep_tag_processor.py index ca467e0..8a27e66 100644 --- a/experiment/sentence_wer_processor/spacy_pos_sentence_dep_tag_processor.py +++ b/experiment/sentence_wer_processor/spacy_pos_sentence_dep_tag_processor.py @@ -25,4 +25,4 @@ class SpacyDepTagSentenceWerProcessor(SentenceWerProcessor): def process_text(self, text: str) -> List[str]: document = self._nlp(text) - return [token.pos_ for token in document] + return [token.dep_ for token in document] diff --git a/experiment_data/pipeline/pl_common_voice/.gitignore b/experiment_data/pipeline/pl_common_voice/.gitignore index 2fc1cf4..156a70b 100644 --- a/experiment_data/pipeline/pl_common_voice/.gitignore +++ b/experiment_data/pipeline/pl_common_voice/.gitignore @@ -40,3 +40,7 @@ /wav2vec2__spacy_dep_tag_metrics /ajn__spacy_pos_alignment /ajn__spacy_pos_metrics +/google__spacy_dep_tag_alignment +/google__spacy_dep_tag_metrics +/techmo__spacy_dep_tag_alignment +/techmo__spacy_dep_tag_metrics diff --git a/experiment_data/pipeline/pl_google_fleurs/.gitignore b/experiment_data/pipeline/pl_google_fleurs/.gitignore index 5edb9f9..d9d64b9 100644 --- a/experiment_data/pipeline/pl_google_fleurs/.gitignore +++ b/experiment_data/pipeline/pl_google_fleurs/.gitignore @@ -57,3 +57,5 @@ /google__word_wer_embeddings_alignment /google__spacy_dep_tag_alignment /google__spacy_dep_tag_metrics +/wav2vec2__spacy_dep_tag_alignment +/wav2vec2__spacy_dep_tag_metrics diff --git a/experiment_data/pipeline/pl_luna/.gitignore b/experiment_data/pipeline/pl_luna/.gitignore index e67f426..6c913f2 100644 --- a/experiment_data/pipeline/pl_luna/.gitignore +++ b/experiment_data/pipeline/pl_luna/.gitignore @@ -57,3 +57,5 @@ /google__spacy_dep_tag_metrics /techmo__spacy_dep_tag_alignment /techmo__spacy_dep_tag_metrics +/wav2vec2__spacy_dep_tag_alignment +/wav2vec2__spacy_dep_tag_metrics diff --git a/experiment_data/pipeline/pl_minds14/.gitignore b/experiment_data/pipeline/pl_minds14/.gitignore index 05ce9ab..777d1ab 100644 --- a/experiment_data/pipeline/pl_minds14/.gitignore +++ b/experiment_data/pipeline/pl_minds14/.gitignore @@ -49,3 +49,11 @@ /ajn__word_wer_classic_alignment /ajn__word_wer_embeddings_metrics /ajn__word_wer_embeddings_alignment +/wav2vec2__spacy_dep_tag_alignment +/wav2vec2__spacy_dep_tag_metrics +/google__spacy_dep_tag_alignment +/google__spacy_dep_tag_metrics +/techmo__spacy_dep_tag_alignment +/techmo__spacy_dep_tag_metrics +/ajn__spacy_dep_tag_alignment +/ajn__spacy_dep_tag_metrics diff --git a/sziszapangma/core/alignment/distance_matrix_calculator.py b/sziszapangma/core/alignment/distance_matrix_calculator.py index 7fabcb3..44c1031 100644 --- a/sziszapangma/core/alignment/distance_matrix_calculator.py +++ b/sziszapangma/core/alignment/distance_matrix_calculator.py @@ -68,7 +68,12 @@ class CosineDistanceCalculator(DistanceCalculator): raise RuntimeError("array dimensions {} not right".format(a.ndim)) similarity = np.dot(a, b.T) / (a_norm * b_norm) dist = 1.0 - similarity - return float(dist) + # return float(dist) + float_dist = float(dist) + if abs(float_dist) < 0.000001: + return 0.0 + else: + return float_dist def calculate_distance_matrix( self, reference: List[Word], hypothesis: List[Word] diff --git a/sziszapangma/core/wer/__pycache__/wer_calculator.cpython-38.pyc b/sziszapangma/core/wer/__pycache__/wer_calculator.cpython-38.pyc index 800b2cf0b964c1458fee0d3b9aa1ab514e39441d..7eadfc43e8caede32ebe853909383161106dd9d6 100644 GIT binary patch delta 734 zcmZ8fO=}ZD7@nE^n*D6ERxKDcftEIkpeIpkE(!`-5z=1P!;<byK-k@c*$o;<L1@8? zmoVbRgL?ERiod{{H}SL=@$AK);Jh0QHM7h!@4g?;Gw;j%YWxU;ecyMGTD`YF53ZZD zpo!V+a{FFnFvRrK%&f@5NZCe8Gdr?t+f1FzjojL{QZMr(AEUd7QDz?_W^;U`MFG=# zsN>9@ui=NYSDQBNm3*`}8S#%_zG2r^iUM?nCts2Thw=}3efoqKLJfdWzR|X^FTZNL zT~|30tl;0$lmYv;+Qo)~61~Gy^d28#h8a0VQ*;L%;X_iADUtW}ohzP7>R_<~2NA34 zsPsbJOh8}`Ay}srwB={Lzo&_H$npgraKZD0$0^UF)xnczoW)QYC&QvFmI$gKPWovw zPWvT~pK{Su9EGU>=?)I$zOj21Tpx&%=jou<Sf=p*W7v@2jZ1!g7o!;U^Fp}NHYeB4 zC>qrYu!*yC8`_#r_dHm2Ym2V7<tOt&yPkt4E<hYOMQf2+*MzcVHSw0bWsPrG5CZDT z7^K5~IoG-f-u9wuX?|1%Jo(!?=U8i+fWOX)Y|wSr$j8}eQjvUA$%t;rNAxDCHsxd5 rbn9W3>WK(bzM!pctqY|m2D9j;qC44;jZ@x*2T=r|hdn}xLin`5P`{8o delta 676 zcmZuvL2J}N6rMMeOwweNHpSJ7+q%UTx+{WSOBJnxm$nGv!M!YLGAj$|Hq2y|mWqe= zEGz?__1J@mSLqM%2S}m0dGH|k2fP&LWuet#An$!M-}}BdZ<24*-=pwZ5O~BqcRT&f zBYPA!;K}IbJm0cAHjvTAO#3lOdFmXH)Df`HI$r8NC9R2{sA)mT9;`kz$y)2HVsyCp zx%>*gYbZH^!Pg&Y0+Tvo@9KZ;A~l5}(no9wJpF^Mb38=?`jb7=Xc}0M_pnDkz-uUI z3MoAxd*l(>2c26hw|rC30vTeWa?+&IQiaGR65|jdnl@ohpIGsIDz9KwAVp6|F-Sx= z6N3xoW!O#PEE#5TA-b=GylNE6GJ#m0ervDazGmPdHLXO}tNbn+1pnxp+UM<{>T9b! z9w_PQRlaklZfwl4(M?_+V;o_8Wer5t7-thK=wEzozM7fIS>(}EUO%g>G?9MpG~kAQ z=?vd+q|mywd)anejC;-DZWF`#HJx{I41pg_q0R+LIibK$!_Xk0ry;+@B-U)S{F23+ ziKJ#x@?t0N4_-)yi?-VBtKB%)$8KGJaXqcwW&PWoU9Qf9;{SCn-IcBFbeM^IU@8>; M2Q{cs^FodN0cc@_4FCWD diff --git a/sziszapangma/core/wer/wer_calculator.py b/sziszapangma/core/wer/wer_calculator.py index b5cdab3..ac3ce66 100644 --- a/sziszapangma/core/wer/wer_calculator.py +++ b/sziszapangma/core/wer/wer_calculator.py @@ -1,8 +1,11 @@ from abc import ABC from typing import List +import numpy as np + from sziszapangma.core.alignment.alignment_step import AlignmentStep from sziszapangma.core.alignment.alignment_util import AlignmentUtil +from sziszapangma.core.alignment.step_type import StepType from sziszapangma.core.wer.span import Span @@ -27,7 +30,8 @@ class WerCalculator(ABC): steps: List[AlignmentStep], ) -> float: reference_len = AlignmentUtil.get_reference_length(steps) - return sum([step.step_cost for step in steps]) / reference_len + fixed_step_costs = [step.step_cost for step in steps] + return sum(fixed_step_costs) / reference_len def calculate_wer(self, steps: List[AlignmentStep]) -> float: return self._calculate_wer(steps) diff --git a/sziszapangma/integration/task/__pycache__/embedding_wer_metrics_task.cpython-38.pyc b/sziszapangma/integration/task/__pycache__/embedding_wer_metrics_task.cpython-38.pyc index 388f4fe48dd157ff91257c305c0a27b700e0f078..a8a8f78d692af7ebea7c238aeeee2face160921a 100644 GIT binary patch delta 1575 zcmZ{k-D@0G6u|GDotfR8o!PJKc6akp(==8$ZK6^eO%sDut019@wF-+mOy}OjO=ds5 zGqEO22qgg@6d_)G5K`Q{8KF?{!53{G^wk$Z4D;ff5b!S$A3SFkV@z?D`RzUDo_p?@ zbI-Y758ofn-mol#!1KkIzuo+yu#+98_jb;QFA<jUO09CEObH{*^iH)ZjS7sdMwMk7 zHI{1(zD!u26}Aa0@WNK2F~o}3NqyiDoz-ciE;RjM=ck`F^{tLza+fa9ujC>9U?$6R zJYOSyvh1$HaxA~CZqY`8C%DOrtgxG4MK-XlHU>T-DruC!Zz-zCclF9QQu&o`mSBsh zXgj{JyQ0(OB3QHAPK%2oOb888MkElY5GtaANFu6;0YE(!sXq4>MGZBBh#|x<Vgzv- z@d#j_Mrqr2J;#@aYFirv)rfpsyF*9ipW5&=Ze34^aWt<ZCJ>Ju;c*y6Ie(?uwHG_W z7TkrjT$Bs?<xM5hZh_#7^I+prz&{U;6?k8O*UthNWy&@Q$%|x*LRJ#j{$q}&`1RM( zd=#(*rtefYXh1^BNDUS-x}pj-q&v#{DmYLgHMq(@3<T~3r<U+|dtZr^sV5_)8JrB? znDgLJ-A=1JKZ{%F038C%ll6HxooZpR=>=Tayw!yh!EchUrOI=$n>rO!V1CkN@u4OD z47$coA<Fs$x*dAh1~T1Gviwv(XJ8z0ihh^(^)dG#)EHSwivvbOZABLcA?c|B>1m<5 zNB8K4ek5y*)<~>NhGbdqB||+-?kO87kWxod8l-g32ve}u=#S+y$*HLz6Ixr8l2GZH zA%zJmH1`t6<ZNigGFi^`^77;4s~52z_7#zWzsL+a_Ojn;+pOsZ;wey_NRD_KbZ^L7 z>h{^Vj-Ex6|FgRxzfVnDSR*0_s24=$v-D8r0xHfTF3B_L(kAvmG~ha(yXrXsw;iv! z)Nb*1(C-s$jL7<T`W6?F))mcm5LH&Y3|Aa=9G3A4UyJ*w-+9<{k>+~b5%uiJR+z>~ zc{lw^{xXaN7Le$NE0f0f^odR>iq*F5H`_~Iyrau~|2bIUUjY!^pjld_d0LW=F{;p% zylq^eW%;8qyBT+X-1u?NC&4^YZ+1GKz$y`Ah%{mnXG}kEf~MQz!FwGhu;_$|xP-6} zR{@dLZ+ze7o@d*~j_FY}!PXW90NhIn;$M&~Gd`j#bWl?${QipPuGNPlJy`3)CkaIs z%!>RYQ<4udWtxz`WM-)fy634TUpHr{E<Z6RXi|P_&gyZg%*gM|nk-mnbPT;QDLw17 zeACKPLoQj9qZv>}Re!zduRC3*z0`81S{=r{K7}bix9ZfA53F-fV3^0^kFy?MO{qCp Uvg+}9ya{}~^-RU2GM^p%8+Kr1SpWb4 delta 1139 zcmZ9L&ui2`6vs39)oeCDc5BymwU%mibz3b3wV*3itfGR}gZP6L4Q-N^-A$WivTH4* zh*;>wgY{iJC~QGN1yS(ssi+_*c+APG2cds}sPm?!)Mm&h@6EiIym|A@hu+uS)>YHg z1$O<oS($jAS+csxlcj;UkS=$4Mmgu1<?ZJL&+_aA!Lxn)fmH7FGFOCR_9rGS{rV1{ z$OZBcvf`bcl5hByXFZiX+siD-WqVPOg))zR|3O?Oq#fSMU85P6NM*z}gox-sNQfLl zMQlf88H!pWM}9C#J8`Cq;jxm3KIN{`%j_M{Ql60>7*V^2@zq6%?qLtQA8`P&ZvzEp zC3aM+)SbzYI@EVV>J7mo^|&UjCqPLn=n9J)Wq0%)YTBObE3x7UwG>_PNUYRUy3!C@ zax5>Cd6ip=NAd#m+G3%qwA5IQ+m^+7jd^VwUW$2ID-~-jPG7TORU0hCVoQ$+n@GpT zvP6Up--u1_3stja!Nt_&lemvpB5lJ>7>&@WM&XR(RouoaF%buc60y?Qx>x-~P>C9D zI9(sP1YcA4`*<$~*&FB(;{Vpr!)$I0s6ISu;Ht?`v?=N%$xZnUXO^`qawcgwz1}<8 zk8!&(z*<{mbp_$9PiYQT^?aEhES_Xyy5oV{48|M2!<LiIX5B-(PQ$uS8*}_XYW^HQ zamqD<@<<dn@W*M<+Ipx5p_(4c9Kp{YMXVJJpVND<{lBTCoi=Bjs4_DZ@UOXk6b-YC z=oEvX>ck>lgi%z9D2RV$Y5gf1#E=Zayub1vRn_5)ahhad(J0;H;^n&Kl0~&9@<bR0 zRADCVMQDftL>e)|keH6+2IEoW`a$40o0{K;At+92Goa=pRIX06%|jLmA8^~$vl=&l z^MtWtSNS-6F$?g{%n=bjnk5x0W0U8t9@uB?A~K9x$A|*+RzFeUrBzb7jE=z@D-WW* zTg7^lLvYLPfpd0-q~L~qi0JUbE|N5SvkPajvP~uPCiDMEJ24%4&A=ZaEF#*$z&f{e Jh%#YA`Um{}^fdqg diff --git a/sziszapangma/integration/task/embedding_wer_metrics_task.py b/sziszapangma/integration/task/embedding_wer_metrics_task.py index c0f54ad..eefc2fd 100644 --- a/sziszapangma/integration/task/embedding_wer_metrics_task.py +++ b/sziszapangma/integration/task/embedding_wer_metrics_task.py @@ -1,3 +1,5 @@ +from typing import List + from sziszapangma.core.alignment.alignment_embedding_calculator import AlignmentEmbeddingCalculator from sziszapangma.core.alignment.alignment_soft_calculator import AlignmentSoftCalculator from sziszapangma.core.transformer.cached_embedding_transformer import CachedEmbeddingTransformer @@ -7,6 +9,7 @@ from sziszapangma.integration.mapper.alignment_step_mapper import AlignmentStepM from sziszapangma.integration.repository.experiment_repository import ExperimentRepository from sziszapangma.integration.task.processing_task import ProcessingTask from sziszapangma.integration.task.task_util import TaskUtil +from sziszapangma.model.model import Word from sziszapangma.model.relation_manager import RelationManager _SOFT_WER = "soft_wer" @@ -51,6 +54,10 @@ class EmbeddingWerMetricsTask(ProcessingTask): is not None ) + @staticmethod + def filter_empty_words(words: List[Word]) -> List[Word]: + return [it for it in words if len(it['text']) > 0] + def run_single_process( self, record_id: str, @@ -60,8 +67,8 @@ class EmbeddingWerMetricsTask(ProcessingTask): gold_transcript = TaskUtil.get_words_from_record(relation_manager) asr_result = experiment_repository.get_property_for_key(record_id, self._asr_property_name) if gold_transcript is not None and asr_result is not None and "transcription" in asr_result: - gold_transcript_lower = TaskUtil.words_to_lower(gold_transcript) - asr_transcript_lower = TaskUtil.words_to_lower(asr_result["transcription"]) + gold_transcript_lower = self.filter_empty_words(TaskUtil.words_to_lower(gold_transcript)) + asr_transcript_lower = self.filter_empty_words(TaskUtil.words_to_lower(asr_result["transcription"])) soft_alignment = self._alignment_soft_calculator.calculate_alignment( gold_transcript_lower, asr_transcript_lower @@ -80,6 +87,7 @@ class EmbeddingWerMetricsTask(ProcessingTask): ], } wer_results = {"soft_wer": soft_wer, "embedding_wer": embedding_wer} + print(wer_results) experiment_repository.update_property_for_key( record_id, self._alignment_property_name, alignment_results -- GitLab