Skip to content
Snippets Groups Projects
Commit f4762cc2 authored by Marcin Wątroba
Browse files

Finish experiments for polish

parent 4d5a8bb0
No related branches found
No related tags found
1 merge request: !13 Change data model
......@@ -44,27 +44,27 @@ services:
- /etc/localtime:/etc/localtime:ro
- ./embedding_models:/models
ajn_asr:
image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/asr-clarin-pl-service:1.5
container_name: ajn_asr
restart: always
ports:
- "5431:5000"
environment:
- AUTH_TOKEN=__example_token__
volumes:
- /etc/localtime:/etc/localtime:ro
# ajn_asr:
# image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/asr-clarin-pl-service:1.5
# container_name: ajn_asr
# restart: always
# ports:
# - "5431:5000"
# environment:
# - AUTH_TOKEN=__example_token__
# volumes:
# - /etc/localtime:/etc/localtime:ro
wav2vec2-xls-r-1b-polish:
image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/wav2vec2-xls-r-1b-polish:1.5
container_name: wav2vec2-xls-r-1b-polish
restart: always
ports:
- "5437:5000"
environment:
- AUTH_TOKEN=__example_token__
volumes:
- /etc/localtime:/etc/localtime:ro
# wav2vec2-xls-r-1b-polish:
# image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/wav2vec2-xls-r-1b-polish:1.5
# container_name: wav2vec2-xls-r-1b-polish
# restart: always
# ports:
# - "5437:5000"
# environment:
# - AUTH_TOKEN=__example_token__
# volumes:
# - /etc/localtime:/etc/localtime:ro
# speechbrain_asr:
# image: gitlab.clarin-pl.eu:5050/clarin-dialog/clarin-dialog/speechbrain-asr:1.5
......
This diff is collapsed.
......@@ -99,6 +99,8 @@ stages:
asr: wav2vec2
- dataset: pl_common_voice
asr: techmo
- dataset: pl_common_voice
asr: ajn
- dataset: pl_voicelab_cbiz
asr: google
......@@ -154,13 +156,15 @@ stages:
asr: wav2vec2
- dataset: pl_common_voice
asr: techmo
- dataset: pl_common_voice
asr: ajn
# - dataset: pl_voicelab_cbiz
# asr: google
# - dataset: pl_voicelab_cbiz
# asr: ajn
# - dataset: pl_voicelab_cbiz
# asr: techmo
- dataset: pl_voicelab_cbiz
asr: google
- dataset: pl_voicelab_cbiz
asr: ajn
- dataset: pl_voicelab_cbiz
asr: techmo
- dataset: pl_google_fleurs
asr: google
......@@ -207,13 +211,15 @@ stages:
asr: wav2vec2
- dataset: pl_common_voice
asr: techmo
- dataset: pl_common_voice
asr: ajn
# - dataset: pl_voicelab_cbiz
# asr: google
# - dataset: pl_voicelab_cbiz
# asr: ajn
# - dataset: pl_voicelab_cbiz
# asr: techmo
- dataset: pl_voicelab_cbiz
asr: google
- dataset: pl_voicelab_cbiz
asr: ajn
- dataset: pl_voicelab_cbiz
asr: techmo
- dataset: pl_google_fleurs
asr: google
......@@ -260,13 +266,15 @@ stages:
asr: wav2vec2
- dataset: pl_common_voice
asr: techmo
- dataset: pl_common_voice
asr: ajn
# - dataset: pl_voicelab_cbiz
# asr: google
# - dataset: pl_voicelab_cbiz
# asr: ajn
# - dataset: pl_voicelab_cbiz
# asr: techmo
- dataset: pl_voicelab_cbiz
asr: google
- dataset: pl_voicelab_cbiz
asr: ajn
- dataset: pl_voicelab_cbiz
asr: techmo
- dataset: pl_google_fleurs
asr: google
......@@ -313,13 +321,15 @@ stages:
asr: wav2vec2
- dataset: pl_common_voice
asr: techmo
- dataset: pl_common_voice
asr: ajn
# - dataset: pl_voicelab_cbiz
# asr: google
# - dataset: pl_voicelab_cbiz
# asr: ajn
# - dataset: pl_voicelab_cbiz
# asr: techmo
- dataset: pl_voicelab_cbiz
asr: google
- dataset: pl_voicelab_cbiz
asr: ajn
- dataset: pl_voicelab_cbiz
asr: techmo
- dataset: pl_google_fleurs
asr: google
......@@ -357,3 +367,58 @@ stages:
outs:
- experiment_data/pipeline/${item.dataset}/${item.asr}__flair_upos_alignment
- experiment_data/pipeline/${item.dataset}/${item.asr}__flair_upos_metrics
pipeline_spacy_tag_wer:
  # One stage instance is generated per (dataset, asr) pair below.
  # Keep the order stable: list-based foreach instances are addressed by index.
  foreach:
    # pl_common_voice
    - dataset: pl_common_voice
      asr: google
    - dataset: pl_common_voice
      asr: wav2vec2
    - dataset: pl_common_voice
      asr: techmo
    - dataset: pl_common_voice
      asr: ajn
    # pl_voicelab_cbiz (no wav2vec2 entry for this dataset)
    - dataset: pl_voicelab_cbiz
      asr: google
    - dataset: pl_voicelab_cbiz
      asr: ajn
    - dataset: pl_voicelab_cbiz
      asr: techmo
    # pl_google_fleurs
    - dataset: pl_google_fleurs
      asr: google
    - dataset: pl_google_fleurs
      asr: ajn
    - dataset: pl_google_fleurs
      asr: techmo
    - dataset: pl_google_fleurs
      asr: wav2vec2
    # pl_luna
    - dataset: pl_luna
      asr: google
    - dataset: pl_luna
      asr: ajn
    - dataset: pl_luna
      asr: techmo
    - dataset: pl_luna
      asr: wav2vec2
    # pl_minds14
    - dataset: pl_minds14
      asr: google
    - dataset: pl_minds14
      asr: ajn
    - dataset: pl_minds14
      asr: techmo
    - dataset: pl_minds14
      asr: wav2vec2
  do:
    # Folded scalar: the two lines below are joined with a single space,
    # producing the same one-line command as before.
    cmd: >-
      PYTHONPATH=. python experiment/pipeline_process_spacy_dep_tag_wer.py
      --dataset=${item.dataset} --asr=${item.asr}
    # Inputs: the script itself, the dataset, the gold transcript and the
    # ASR result for the current pair.
    deps:
      - experiment/pipeline_process_spacy_dep_tag_wer.py
      - experiment_data/dataset/${item.dataset}
      - experiment_data/pipeline/${item.dataset}/gold_transcript
      - experiment_data/pipeline/${item.dataset}/${item.asr}__result
    # Outputs: per-pair alignment and metrics directories.
    outs:
      - experiment_data/pipeline/${item.dataset}/${item.asr}__spacy_dep_tag_alignment
      - experiment_data/pipeline/${item.dataset}/${item.asr}__spacy_dep_tag_metrics
import argparse
from experiment.const_pipeline_names import GOLD_TRANSCRIPT
from experiment.experiment_dependency_provider import get_record_provider, get_repository
from experiment.sentence_wer_processor.spacy_pos_sentence_dep_tag_processor import SpacyDepTagSentenceWerProcessor
from sziszapangma.integration.experiment_manager import ExperimentManager
def run_spacy_pos_wer_pipeline(dataset_name: str, asr_name: str):
    """Run the spaCy dependency-tag WER task for one dataset / ASR pair.

    Despite the ``pos`` in its name, this function wires up a
    ``SpacyDepTagSentenceWerProcessor`` and hands it to an ``ExperimentManager``.

    Args:
        dataset_name: Dataset identifier, used to obtain the record provider and
            the experiment repository.
        asr_name: ASR system identifier, used as the prefix of the
            ``__result``, ``__spacy_dep_tag_alignment`` and
            ``__spacy_dep_tag_metrics`` property names.
    """
    provider = get_record_provider(dataset_name)

    # The single processing task: configured with the gold transcript property,
    # the ASR result property and the two output property names.
    dep_tag_task = SpacyDepTagSentenceWerProcessor(
        model_name='pl_core_news_lg',
        gold_transcript_property_name=GOLD_TRANSCRIPT,
        asr_property_name=f'{asr_name}__result',
        alignment_property_name=f'{asr_name}__spacy_dep_tag_alignment',
        wer_property_name=f'{asr_name}__spacy_dep_tag_metrics',
        task_name=f'SpacyDepTagSentenceWerProcessor___{dataset_name}___{asr_name}',
        require_update=False
    )

    # The same provider object serves as both the record-id iterator and the
    # relation-manager provider.
    manager = ExperimentManager(
        record_id_iterator=provider,
        processing_tasks=[dep_tag_task],
        experiment_repository=get_repository(dataset_name),
        relation_manager_provider=provider
    )
    manager.process()
if __name__ == '__main__':
    # Command-line entry point.
    # Both options are mandatory: if either were omitted argparse would supply
    # None, which would silently turn into property names such as
    # 'None__result' further down the pipeline. Fail fast with a usage error.
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", required=True, help="dataset name, e.g. pl_common_voice")
    parser.add_argument("--asr", required=True, help="ASR system name, e.g. google")
    args = parser.parse_args()
    run_spacy_pos_wer_pipeline(args.dataset, args.asr)
from abc import ABC
from typing import List, Any
import spacy
from experiment.sentence_wer_processor.sentence_wer_processor import SentenceWerProcessor
class SpacyDepTagSentenceWerProcessor(SentenceWerProcessor):
    """Sentence WER processor that compares spaCy dependency labels.

    Each text is run through a spaCy pipeline and replaced by the sequence of
    its tokens' syntactic dependency labels; how those sequences are aligned
    and scored is left to the ``SentenceWerProcessor`` base class.
    """

    # Loaded spaCy language pipeline (created once in __init__).
    _nlp: Any

    def __init__(
        self,
        model_name: str,
        gold_transcript_property_name: str,
        asr_property_name: str,
        alignment_property_name: str,
        wer_property_name: str,
        task_name: str,
        require_update: bool
    ):
        """Forward the property/task settings to the base class and load spaCy.

        Args:
            model_name: Name of the spaCy model passed to ``spacy.load``.
            gold_transcript_property_name: Forwarded to the base class.
            asr_property_name: Forwarded to the base class.
            alignment_property_name: Forwarded to the base class.
            wer_property_name: Forwarded to the base class.
            task_name: Forwarded to the base class.
            require_update: Forwarded to the base class.
        """
        super().__init__(gold_transcript_property_name, asr_property_name, alignment_property_name,
                         wer_property_name, task_name, require_update)
        self._nlp = spacy.load(model_name)

    def process_text(self, text: str) -> List[str]:
        """Return the dependency label of every token in ``text``.

        Uses ``token.dep_`` (the dependency relation). The previous
        ``token.pos_`` made this processor produce the same sequences as a
        plain POS-based one, contradicting the class and property names.
        """
        document = self._nlp(text)
        return [token.dep_ for token in document]
......@@ -16,3 +16,27 @@
/wav2vec2__word_wer_embeddings_alignment
/wav2vec2__spacy_ner_alignment
/wav2vec2__spacy_ner_metrics
/techmo__wikineural_ner_alignment
/techmo__wikineural_ner_metrics
/google__wikineural_ner_alignment
/google__wikineural_ner_metrics
/techmo__spacy_pos_alignment
/techmo__spacy_pos_metrics
/wav2vec2__flair_upos_alignment
/wav2vec2__flair_upos_metrics
/techmo__flair_upos_alignment
/techmo__flair_upos_metrics
/ajn__wikineural_ner_alignment
/ajn__wikineural_ner_metrics
/ajn__spacy_dep_tag_alignment
/ajn__spacy_dep_tag_metrics
/ajn__word_wer_classic_metrics
/ajn__word_wer_classic_alignment
/ajn__word_wer_embeddings_metrics
/ajn__word_wer_embeddings_alignment
/ajn__flair_upos_alignment
/ajn__flair_upos_metrics
/wav2vec2__spacy_dep_tag_alignment
/wav2vec2__spacy_dep_tag_metrics
/ajn__spacy_pos_alignment
/ajn__spacy_pos_metrics
......@@ -17,3 +17,43 @@
/wav2vec2__word_wer_embeddings_alignment
/wav2vec2__spacy_ner_alignment
/wav2vec2__spacy_ner_metrics
/ajn__wikineural_ner_alignment
/ajn__wikineural_ner_metrics
/techmo__wikineural_ner_alignment
/techmo__wikineural_ner_metrics
/google__flair_upos_alignment
/google__flair_upos_metrics
/wav2vec2__spacy_pos_alignment
/wav2vec2__spacy_pos_metrics
/ajn__flair_upos_alignment
/ajn__flair_upos_metrics
/ajn__spacy_pos_alignment
/ajn__spacy_pos_metrics
/techmo__flair_upos_alignment
/techmo__flair_upos_metrics
/ajn__spacy_ner_alignment
/ajn__spacy_ner_metrics
/wav2vec2__wikineural_ner_alignment
/wav2vec2__wikineural_ner_metrics
/wav2vec2__flair_upos_alignment
/wav2vec2__flair_upos_metrics
/google__spacy_ner_alignment
/google__spacy_ner_metrics
/techmo__spacy_ner_alignment
/techmo__spacy_ner_metrics
/google__spacy_pos_alignment
/google__spacy_pos_metrics
/techmo__spacy_pos_alignment
/techmo__spacy_pos_metrics
/google__wikineural_ner_alignment
/google__wikineural_ner_metrics
/ajn__spacy_dep_tag_alignment
/ajn__spacy_dep_tag_metrics
/techmo__spacy_dep_tag_alignment
/techmo__spacy_dep_tag_metrics
/google__word_wer_classic_metrics
/google__word_wer_classic_alignment
/google__word_wer_embeddings_metrics
/google__word_wer_embeddings_alignment
/google__spacy_dep_tag_alignment
/google__spacy_dep_tag_metrics
......@@ -19,3 +19,41 @@
/ajn__word_wer_classic_alignment
/ajn__word_wer_embeddings_metrics
/ajn__word_wer_embeddings_alignment
/wav2vec2__wikineural_ner_alignment
/wav2vec2__wikineural_ner_metrics
/techmo__flair_upos_alignment
/techmo__flair_upos_metrics
/techmo__wikineural_ner_alignment
/techmo__wikineural_ner_metrics
/techmo__spacy_ner_alignment
/techmo__spacy_ner_metrics
/google__wikineural_ner_alignment
/google__wikineural_ner_metrics
/google__flair_upos_alignment
/google__flair_upos_metrics
/ajn__flair_upos_alignment
/ajn__flair_upos_metrics
/google__spacy_ner_alignment
/google__spacy_ner_metrics
/wav2vec2__flair_upos_alignment
/wav2vec2__flair_upos_metrics
/google__spacy_pos_alignment
/google__spacy_pos_metrics
/techmo__spacy_pos_alignment
/techmo__spacy_pos_metrics
/ajn__spacy_ner_alignment
/ajn__spacy_ner_metrics
/ajn__wikineural_ner_alignment
/ajn__wikineural_ner_metrics
/wav2vec2__spacy_pos_alignment
/wav2vec2__spacy_pos_metrics
/ajn__spacy_pos_alignment
/ajn__spacy_pos_metrics
/wav2vec2__spacy_ner_alignment
/wav2vec2__spacy_ner_metrics
/ajn__spacy_dep_tag_alignment
/ajn__spacy_dep_tag_metrics
/google__spacy_dep_tag_alignment
/google__spacy_dep_tag_metrics
/techmo__spacy_dep_tag_alignment
/techmo__spacy_dep_tag_metrics
......@@ -15,3 +15,37 @@
/techmo__spacy_ner_metrics
/ajn__spacy_ner_alignment
/ajn__spacy_ner_metrics
/techmo__flair_upos_alignment
/techmo__flair_upos_metrics
/google__flair_upos_alignment
/google__flair_upos_metrics
/google__spacy_pos_alignment
/google__spacy_pos_metrics
/wav2vec2__flair_upos_alignment
/wav2vec2__flair_upos_metrics
/ajn__wikineural_ner_alignment
/ajn__wikineural_ner_metrics
/wav2vec2__spacy_ner_alignment
/wav2vec2__spacy_ner_metrics
/google__spacy_ner_alignment
/google__spacy_ner_metrics
/techmo__wikineural_ner_alignment
/techmo__wikineural_ner_metrics
/google__wikineural_ner_alignment
/google__wikineural_ner_metrics
/wav2vec2__wikineural_ner_alignment
/wav2vec2__wikineural_ner_metrics
/ajn__flair_upos_alignment
/ajn__flair_upos_metrics
/ajn__spacy_pos_alignment
/ajn__spacy_pos_metrics
/wav2vec2__word_wer_classic_metrics
/wav2vec2__word_wer_classic_alignment
/wav2vec2__word_wer_embeddings_metrics
/wav2vec2__word_wer_embeddings_alignment
/wav2vec2__spacy_pos_alignment
/wav2vec2__spacy_pos_metrics
/ajn__word_wer_classic_metrics
/ajn__word_wer_classic_alignment
/ajn__word_wer_embeddings_metrics
/ajn__word_wer_embeddings_alignment
......@@ -16,3 +16,25 @@
/ajn__word_wer_embeddings_alignment
/ajn__spacy_ner_alignment
/ajn__spacy_ner_metrics
/google__spacy_pos_alignment
/google__spacy_pos_metrics
/techmo__spacy_ner_alignment
/techmo__spacy_ner_metrics
/ajn__flair_upos_alignment
/ajn__flair_upos_metrics
/google__flair_upos_alignment
/google__flair_upos_metrics
/google__spacy_ner_alignment
/google__spacy_ner_metrics
/techmo__spacy_pos_alignment
/techmo__spacy_pos_metrics
/google__wikineural_ner_alignment
/google__wikineural_ner_metrics
/ajn__wikineural_ner_alignment
/ajn__wikineural_ner_metrics
/techmo__wikineural_ner_alignment
/techmo__wikineural_ner_metrics
/ajn__spacy_pos_alignment
/ajn__spacy_pos_metrics
/techmo__flair_upos_alignment
/techmo__flair_upos_metrics
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment