Skip to content
Snippets Groups Projects
Commit f4762cc2 authored by Marcin Wątroba's avatar Marcin Wątroba
Browse files

Finish experiments for polish

parent 4d5a8bb0
1 merge request: !13 Change data model
...@@ -44,27 +44,27 @@ services: ...@@ -44,27 +44,27 @@ services:
- /etc/localtime:/etc/localtime:ro - /etc/localtime:/etc/localtime:ro
- ./embedding_models:/models - ./embedding_models:/models
ajn_asr: # ajn_asr:
image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/asr-clarin-pl-service:1.5 # image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/asr-clarin-pl-service:1.5
container_name: ajn_asr # container_name: ajn_asr
restart: always # restart: always
ports: # ports:
- "5431:5000" # - "5431:5000"
environment: # environment:
- AUTH_TOKEN=__example_token__ # - AUTH_TOKEN=__example_token__
volumes: # volumes:
- /etc/localtime:/etc/localtime:ro # - /etc/localtime:/etc/localtime:ro
wav2vec2-xls-r-1b-polish: # wav2vec2-xls-r-1b-polish:
image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/wav2vec2-xls-r-1b-polish:1.5 # image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/wav2vec2-xls-r-1b-polish:1.5
container_name: wav2vec2-xls-r-1b-polish # container_name: wav2vec2-xls-r-1b-polish
restart: always # restart: always
ports: # ports:
- "5437:5000" # - "5437:5000"
environment: # environment:
- AUTH_TOKEN=__example_token__ # - AUTH_TOKEN=__example_token__
volumes: # volumes:
- /etc/localtime:/etc/localtime:ro # - /etc/localtime:/etc/localtime:ro
# speechbrain_asr: # speechbrain_asr:
# image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/speechbrain-asr:1.5 # image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/speechbrain-asr:1.5
......
This diff is collapsed.
...@@ -99,6 +99,8 @@ stages: ...@@ -99,6 +99,8 @@ stages:
asr: wav2vec2 asr: wav2vec2
- dataset: pl_common_voice - dataset: pl_common_voice
asr: techmo asr: techmo
- dataset: pl_common_voice
asr: ajn
- dataset: pl_voicelab_cbiz - dataset: pl_voicelab_cbiz
asr: google asr: google
...@@ -154,13 +156,15 @@ stages: ...@@ -154,13 +156,15 @@ stages:
asr: wav2vec2 asr: wav2vec2
- dataset: pl_common_voice - dataset: pl_common_voice
asr: techmo asr: techmo
- dataset: pl_common_voice
asr: ajn
# - dataset: pl_voicelab_cbiz - dataset: pl_voicelab_cbiz
# asr: google asr: google
# - dataset: pl_voicelab_cbiz - dataset: pl_voicelab_cbiz
# asr: ajn asr: ajn
# - dataset: pl_voicelab_cbiz - dataset: pl_voicelab_cbiz
# asr: techmo asr: techmo
- dataset: pl_google_fleurs - dataset: pl_google_fleurs
asr: google asr: google
...@@ -207,13 +211,15 @@ stages: ...@@ -207,13 +211,15 @@ stages:
asr: wav2vec2 asr: wav2vec2
- dataset: pl_common_voice - dataset: pl_common_voice
asr: techmo asr: techmo
- dataset: pl_common_voice
asr: ajn
# - dataset: pl_voicelab_cbiz - dataset: pl_voicelab_cbiz
# asr: google asr: google
# - dataset: pl_voicelab_cbiz - dataset: pl_voicelab_cbiz
# asr: ajn asr: ajn
# - dataset: pl_voicelab_cbiz - dataset: pl_voicelab_cbiz
# asr: techmo asr: techmo
- dataset: pl_google_fleurs - dataset: pl_google_fleurs
asr: google asr: google
...@@ -260,13 +266,15 @@ stages: ...@@ -260,13 +266,15 @@ stages:
asr: wav2vec2 asr: wav2vec2
- dataset: pl_common_voice - dataset: pl_common_voice
asr: techmo asr: techmo
- dataset: pl_common_voice
asr: ajn
# - dataset: pl_voicelab_cbiz - dataset: pl_voicelab_cbiz
# asr: google asr: google
# - dataset: pl_voicelab_cbiz - dataset: pl_voicelab_cbiz
# asr: ajn asr: ajn
# - dataset: pl_voicelab_cbiz - dataset: pl_voicelab_cbiz
# asr: techmo asr: techmo
- dataset: pl_google_fleurs - dataset: pl_google_fleurs
asr: google asr: google
...@@ -313,13 +321,15 @@ stages: ...@@ -313,13 +321,15 @@ stages:
asr: wav2vec2 asr: wav2vec2
- dataset: pl_common_voice - dataset: pl_common_voice
asr: techmo asr: techmo
- dataset: pl_common_voice
asr: ajn
# - dataset: pl_voicelab_cbiz - dataset: pl_voicelab_cbiz
# asr: google asr: google
# - dataset: pl_voicelab_cbiz - dataset: pl_voicelab_cbiz
# asr: ajn asr: ajn
# - dataset: pl_voicelab_cbiz - dataset: pl_voicelab_cbiz
# asr: techmo asr: techmo
- dataset: pl_google_fleurs - dataset: pl_google_fleurs
asr: google asr: google
...@@ -357,3 +367,58 @@ stages: ...@@ -357,3 +367,58 @@ stages:
outs: outs:
- experiment_data/pipeline/${item.dataset}/${item.asr}__flair_upos_alignment - experiment_data/pipeline/${item.dataset}/${item.asr}__flair_upos_alignment
- experiment_data/pipeline/${item.dataset}/${item.asr}__flair_upos_metrics - experiment_data/pipeline/${item.dataset}/${item.asr}__flair_upos_metrics
# Runs experiment/pipeline_process_spacy_dep_tag_wer.py once for every
# (dataset, asr) pair listed under `foreach`.
pipeline_spacy_tag_wer:
  foreach:
    # pl_common_voice
    - dataset: pl_common_voice
      asr: google
    - dataset: pl_common_voice
      asr: wav2vec2
    - dataset: pl_common_voice
      asr: techmo
    - dataset: pl_common_voice
      asr: ajn
    # pl_voicelab_cbiz (no wav2vec2 entry is listed for this dataset)
    - dataset: pl_voicelab_cbiz
      asr: google
    - dataset: pl_voicelab_cbiz
      asr: ajn
    - dataset: pl_voicelab_cbiz
      asr: techmo
    # pl_google_fleurs
    - dataset: pl_google_fleurs
      asr: google
    - dataset: pl_google_fleurs
      asr: ajn
    - dataset: pl_google_fleurs
      asr: techmo
    - dataset: pl_google_fleurs
      asr: wav2vec2
    # pl_luna
    - dataset: pl_luna
      asr: google
    - dataset: pl_luna
      asr: ajn
    - dataset: pl_luna
      asr: techmo
    - dataset: pl_luna
      asr: wav2vec2
    # pl_minds14
    - dataset: pl_minds14
      asr: google
    - dataset: pl_minds14
      asr: ajn
    - dataset: pl_minds14
      asr: techmo
    - dataset: pl_minds14
      asr: wav2vec2
  do:
    # Folded scalar: the lines below are joined with single spaces into one
    # command line.
    cmd: >-
      PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py
      --dataset=${item.dataset}
      --asr=${item.asr}
    deps:
      - experiment/pipeline_process_spacy_dep_tag_wer.py
      - experiment_data/dataset/${item.dataset}
      - experiment_data/pipeline/${item.dataset}/gold_transcript
      - experiment_data/pipeline/${item.dataset}/${item.asr}__result
    outs:
      - experiment_data/pipeline/${item.dataset}/${item.asr}__spacy_dep_tag_alignment
      - experiment_data/pipeline/${item.dataset}/${item.asr}__spacy_dep_tag_metrics
import argparse
from experiment.const_pipeline_names import GOLD_TRANSCRIPT
from experiment.experiment_dependency_provider import get_record_provider, get_repository
from experiment.sentence_wer_processor.spacy_pos_sentence_dep_tag_processor import SpacyDepTagSentenceWerProcessor
from sziszapangma.integration.experiment_manager import ExperimentManager
def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str):
    """Run the spaCy dep-tag sentence WER task for one dataset / ASR pair.

    A single ``SpacyDepTagSentenceWerProcessor`` is built and executed through
    an ``ExperimentManager``.

    Args:
        dataset_name: Forwarded to ``get_record_provider`` and
            ``get_repository``; also embedded in the task name.
        asr_name: Prefix used to build the ASR result, alignment and metrics
            property names, and embedded in the task name.
    """
    provider = get_record_provider(dataset_name)

    # The processor reads `<asr>__result` and the gold transcript, and writes
    # the `<asr>__spacy_dep_tag_*` properties.
    dep_tag_task = SpacyDepTagSentenceWerProcessor(
        model_name='pl_core_news_lg',
        gold_transcript_property_name=GOLD_TRANSCRIPT,
        asr_property_name=f'{asr_name}__result',
        alignment_property_name=f'{asr_name}__spacy_dep_tag_alignment',
        wer_property_name=f'{asr_name}__spacy_dep_tag_metrics',
        task_name=f'SpacyDepTagSentenceWerProcessor___{dataset_name}___{asr_name}',
        require_update=False
    )

    # The same provider object serves as both the record-id iterator and the
    # relation-manager provider.
    ExperimentManager(
        record_id_iterator=provider,
        processing_tasks=[dep_tag_task],
        experiment_repository=get_repository(dataset_name),
        relation_manager_provider=provider
    ).process()
if __name__ == '__main__':
    # Command-line entry point: parse the dataset / ASR names and run the
    # pipeline for that single pair.
    parser = argparse.ArgumentParser(
        description="Run the spaCy dep-tag sentence WER pipeline for one dataset/ASR pair."
    )
    # Both options are mandatory: without them `None` would be passed straight
    # into the pipeline and into the generated property names.
    parser.add_argument("--dataset", required=True, help="dataset name, e.g. pl_common_voice")
    parser.add_argument("--asr", required=True, help="ASR system name, e.g. google")
    args = parser.parse_args()
    run_spacy_pos_wer_pipeline(args.dataset, args.asr)
from abc import ABC
from typing import List, Any
import spacy
from experiment.sentence_wer_processor.sentence_wer_processor import SentenceWerProcessor
class SpacyDepTagSentenceWerProcessor(SentenceWerProcessor):
    """Sentence WER processor that represents text as spaCy dependency labels.

    Each input text is run through a spaCy pipeline and replaced by the
    sequence of per-token dependency relation labels (``token.dep_``).
    """

    # Loaded spaCy pipeline; created once in __init__ and reused for every text.
    _nlp: Any

    def __init__(
        self,
        model_name: str,
        gold_transcript_property_name: str,
        asr_property_name: str,
        alignment_property_name: str,
        wer_property_name: str,
        task_name: str,
        require_update: bool
    ):
        """Initialise the base processor and load the spaCy model.

        Args:
            model_name: Name passed to ``spacy.load``.
            gold_transcript_property_name: Forwarded to the base class.
            asr_property_name: Forwarded to the base class.
            alignment_property_name: Forwarded to the base class.
            wer_property_name: Forwarded to the base class.
            task_name: Forwarded to the base class.
            require_update: Forwarded to the base class.
        """
        super().__init__(gold_transcript_property_name, asr_property_name, alignment_property_name, wer_property_name,
                         task_name, require_update)
        self._nlp = spacy.load(model_name)

    def process_text(self, text: str) -> List[str]:
        """Return the dependency relation label of every token in ``text``.

        Args:
            text: Raw text to analyse with the loaded spaCy pipeline.

        Returns:
            One ``token.dep_`` string per spaCy token, in document order.
        """
        document = self._nlp(text)
        # Use the dependency label, not the coarse POS tag: returning `pos_`
        # here made this processor produce the same sequences as the separate
        # spaCy POS pipeline while storing them under `spacy_dep_tag` names.
        return [token.dep_ for token in document]
...@@ -16,3 +16,27 @@ ...@@ -16,3 +16,27 @@
/wav2vec2__word_wer_embeddings_alignment /wav2vec2__word_wer_embeddings_alignment
/wav2vec2__spacy_ner_alignment /wav2vec2__spacy_ner_alignment
/wav2vec2__spacy_ner_metrics /wav2vec2__spacy_ner_metrics
/techmo__wikineural_ner_alignment
/techmo__wikineural_ner_metrics
/google__wikineural_ner_alignment
/google__wikineural_ner_metrics
/techmo__spacy_pos_alignment
/techmo__spacy_pos_metrics
/wav2vec2__flair_upos_alignment
/wav2vec2__flair_upos_metrics
/techmo__flair_upos_alignment
/techmo__flair_upos_metrics
/ajn__wikineural_ner_alignment
/ajn__wikineural_ner_metrics
/ajn__spacy_dep_tag_alignment
/ajn__spacy_dep_tag_metrics
/ajn__word_wer_classic_metrics
/ajn__word_wer_classic_alignment
/ajn__word_wer_embeddings_metrics
/ajn__word_wer_embeddings_alignment
/ajn__flair_upos_alignment
/ajn__flair_upos_metrics
/wav2vec2__spacy_dep_tag_alignment
/wav2vec2__spacy_dep_tag_metrics
/ajn__spacy_pos_alignment
/ajn__spacy_pos_metrics
...@@ -17,3 +17,43 @@ ...@@ -17,3 +17,43 @@
/wav2vec2__word_wer_embeddings_alignment /wav2vec2__word_wer_embeddings_alignment
/wav2vec2__spacy_ner_alignment /wav2vec2__spacy_ner_alignment
/wav2vec2__spacy_ner_metrics /wav2vec2__spacy_ner_metrics
/ajn__wikineural_ner_alignment
/ajn__wikineural_ner_metrics
/techmo__wikineural_ner_alignment
/techmo__wikineural_ner_metrics
/google__flair_upos_alignment
/google__flair_upos_metrics
/wav2vec2__spacy_pos_alignment
/wav2vec2__spacy_pos_metrics
/ajn__flair_upos_alignment
/ajn__flair_upos_metrics
/ajn__spacy_pos_alignment
/ajn__spacy_pos_metrics
/techmo__flair_upos_alignment
/techmo__flair_upos_metrics
/ajn__spacy_ner_alignment
/ajn__spacy_ner_metrics
/wav2vec2__wikineural_ner_alignment
/wav2vec2__wikineural_ner_metrics
/wav2vec2__flair_upos_alignment
/wav2vec2__flair_upos_metrics
/google__spacy_ner_alignment
/google__spacy_ner_metrics
/techmo__spacy_ner_alignment
/techmo__spacy_ner_metrics
/google__spacy_pos_alignment
/google__spacy_pos_metrics
/techmo__spacy_pos_alignment
/techmo__spacy_pos_metrics
/google__wikineural_ner_alignment
/google__wikineural_ner_metrics
/ajn__spacy_dep_tag_alignment
/ajn__spacy_dep_tag_metrics
/techmo__spacy_dep_tag_alignment
/techmo__spacy_dep_tag_metrics
/google__word_wer_classic_metrics
/google__word_wer_classic_alignment
/google__word_wer_embeddings_metrics
/google__word_wer_embeddings_alignment
/google__spacy_dep_tag_alignment
/google__spacy_dep_tag_metrics
...@@ -19,3 +19,41 @@ ...@@ -19,3 +19,41 @@
/ajn__word_wer_classic_alignment /ajn__word_wer_classic_alignment
/ajn__word_wer_embeddings_metrics /ajn__word_wer_embeddings_metrics
/ajn__word_wer_embeddings_alignment /ajn__word_wer_embeddings_alignment
/wav2vec2__wikineural_ner_alignment
/wav2vec2__wikineural_ner_metrics
/techmo__flair_upos_alignment
/techmo__flair_upos_metrics
/techmo__wikineural_ner_alignment
/techmo__wikineural_ner_metrics
/techmo__spacy_ner_alignment
/techmo__spacy_ner_metrics
/google__wikineural_ner_alignment
/google__wikineural_ner_metrics
/google__flair_upos_alignment
/google__flair_upos_metrics
/ajn__flair_upos_alignment
/ajn__flair_upos_metrics
/google__spacy_ner_alignment
/google__spacy_ner_metrics
/wav2vec2__flair_upos_alignment
/wav2vec2__flair_upos_metrics
/google__spacy_pos_alignment
/google__spacy_pos_metrics
/techmo__spacy_pos_alignment
/techmo__spacy_pos_metrics
/ajn__spacy_ner_alignment
/ajn__spacy_ner_metrics
/ajn__wikineural_ner_alignment
/ajn__wikineural_ner_metrics
/wav2vec2__spacy_pos_alignment
/wav2vec2__spacy_pos_metrics
/ajn__spacy_pos_alignment
/ajn__spacy_pos_metrics
/wav2vec2__spacy_ner_alignment
/wav2vec2__spacy_ner_metrics
/ajn__spacy_dep_tag_alignment
/ajn__spacy_dep_tag_metrics
/google__spacy_dep_tag_alignment
/google__spacy_dep_tag_metrics
/techmo__spacy_dep_tag_alignment
/techmo__spacy_dep_tag_metrics
...@@ -15,3 +15,37 @@ ...@@ -15,3 +15,37 @@
/techmo__spacy_ner_metrics /techmo__spacy_ner_metrics
/ajn__spacy_ner_alignment /ajn__spacy_ner_alignment
/ajn__spacy_ner_metrics /ajn__spacy_ner_metrics
/techmo__flair_upos_alignment
/techmo__flair_upos_metrics
/google__flair_upos_alignment
/google__flair_upos_metrics
/google__spacy_pos_alignment
/google__spacy_pos_metrics
/wav2vec2__flair_upos_alignment
/wav2vec2__flair_upos_metrics
/ajn__wikineural_ner_alignment
/ajn__wikineural_ner_metrics
/wav2vec2__spacy_ner_alignment
/wav2vec2__spacy_ner_metrics
/google__spacy_ner_alignment
/google__spacy_ner_metrics
/techmo__wikineural_ner_alignment
/techmo__wikineural_ner_metrics
/google__wikineural_ner_alignment
/google__wikineural_ner_metrics
/wav2vec2__wikineural_ner_alignment
/wav2vec2__wikineural_ner_metrics
/ajn__flair_upos_alignment
/ajn__flair_upos_metrics
/ajn__spacy_pos_alignment
/ajn__spacy_pos_metrics
/wav2vec2__word_wer_classic_metrics
/wav2vec2__word_wer_classic_alignment
/wav2vec2__word_wer_embeddings_metrics
/wav2vec2__word_wer_embeddings_alignment
/wav2vec2__spacy_pos_alignment
/wav2vec2__spacy_pos_metrics
/ajn__word_wer_classic_metrics
/ajn__word_wer_classic_alignment
/ajn__word_wer_embeddings_metrics
/ajn__word_wer_embeddings_alignment
...@@ -16,3 +16,25 @@ ...@@ -16,3 +16,25 @@
/ajn__word_wer_embeddings_alignment /ajn__word_wer_embeddings_alignment
/ajn__spacy_ner_alignment /ajn__spacy_ner_alignment
/ajn__spacy_ner_metrics /ajn__spacy_ner_metrics
/google__spacy_pos_alignment
/google__spacy_pos_metrics
/techmo__spacy_ner_alignment
/techmo__spacy_ner_metrics
/ajn__flair_upos_alignment
/ajn__flair_upos_metrics
/google__flair_upos_alignment
/google__flair_upos_metrics
/google__spacy_ner_alignment
/google__spacy_ner_metrics
/techmo__spacy_pos_alignment
/techmo__spacy_pos_metrics
/google__wikineural_ner_alignment
/google__wikineural_ner_metrics
/ajn__wikineural_ner_alignment
/ajn__wikineural_ner_metrics
/techmo__wikineural_ner_alignment
/techmo__wikineural_ner_metrics
/ajn__spacy_pos_alignment
/ajn__spacy_pos_metrics
/techmo__flair_upos_alignment
/techmo__flair_upos_metrics
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment