Skip to content
Snippets Groups Projects
Commit 0ef9cb66 authored by Marcin Wątroba
Browse files

Add NeMo processing

parent fb8f9b8f
No related branches found
No related tags found
No related merge requests found
# CI workflow: on every pull request, run the test and lint tasks; once both
# succeed, build the package and publish an alpha pre-release to the
# "clarinpypi" repository.
name: CI
on: pull_request
jobs:
  # Installs the project with Poetry and runs the `poe test` task.
  tests:
    strategy:
      fail-fast: false
      matrix:
        python-version: [ 3.9.6 ]
        poetry-version: [ 1.1.5 ]
        # os: [ ubuntu-20.04, macos-latest, windows-latest ]
        os: [ ubuntu-20.04 ]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install poetry
        uses: abatilo/actions-poetry@v2.0.0
        with:
          poetry-version: ${{ matrix.poetry-version }}
      - name: Install deps
        run: poetry install -vv
      - name: Run tests
        run: poetry run poe test
  # Installs the project with Poetry and runs the `poe check` task.
  lint:
    strategy:
      fail-fast: false
      matrix:
        python-version: [ 3.9.6 ]
        poetry-version: [ 1.1.5 ]
        os: [ ubuntu-20.04 ]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install poetry
        uses: abatilo/actions-poetry@v2.0.0
        with:
          poetry-version: ${{ matrix.poetry-version }}
      - name: Install deps
        run: poetry install -vv
      - name: Check
        run: poetry run poe check
  # Runs only after `tests` and `lint` have both succeeded.
  publish:
    needs:
      - tests
      - lint
    environment: Test deployment
    strategy:
      fail-fast: false
      matrix:
        python-version: [ 3.9.6 ]
        poetry-version: [ 1.1.5 ]
        os: [ ubuntu-20.04 ]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v2
        with:
          # 0 fetches the complete history instead of a shallow clone.
          fetch-depth: 0
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install poetry
        uses: abatilo/actions-poetry@v2.0.0
        with:
          poetry-version: ${{ matrix.poetry-version }}
      - name: Install deps
        run: poetry install -vv
      - name: Build and publish
        # Secrets are handed to the script through environment variables and
        # expanded inside double quotes, so special characters in the
        # credentials cannot be re-interpreted by the shell.
        env:
          PYPI_USER: ${{ secrets.PYPI_USER }}
          PYPI_PASS: ${{ secrets.PYPI_PASS }}
        run: |
          # Suffix the current version with the run number so every run
          # produces a distinct alpha version.
          poetry version "$(poetry version --short)-alpha.${GITHUB_RUN_NUMBER}"
          poetry build
          poetry config repositories.clarinpypi https://pypi.clarin-pl.eu/
          poetry publish -r clarinpypi --username "$PYPI_USER" --password "$PYPI_PASS"
#name: CI
#on: pull_request
#
#jobs:
# tests:
# strategy:
# fail-fast: false
# matrix:
# python-version: [ 3.9.6 ]
# poetry-version: [ 1.1.5 ]
# # os: [ ubuntu-20.04, macos-latest, windows-latest ]
# os: [ ubuntu-20.04 ]
# runs-on: ${{ matrix.os }}
# steps:
# - uses: actions/checkout@v2
# - uses: actions/setup-python@v2
# with:
# python-version: ${{ matrix.python-version }}
# - name: Install poetry
# uses: abatilo/actions-poetry@v2.0.0
# with:
# poetry-version: ${{ matrix.poetry-version }}
# - name: Install deps
# run: poetry install -vv
# - name: Run tests
# run: poetry run poe test
# lint:
# strategy:
# fail-fast: false
# matrix:
# python-version: [ 3.9.6 ]
# poetry-version: [ 1.1.5 ]
# os: [ ubuntu-20.04 ]
# runs-on: ${{ matrix.os }}
# steps:
# - uses: actions/checkout@v2
# - uses: actions/setup-python@v2
# with:
# python-version: ${{ matrix.python-version }}
# - name: Install poetry
# uses: abatilo/actions-poetry@v2.0.0
# with:
# poetry-version: ${{ matrix.poetry-version }}
# - name: Install deps
# run: poetry install -vv
# - name: Check
# run: poetry run poe check
# publish:
# needs:
# - tests
# - lint
# environment: Test deployment
# strategy:
# fail-fast: false
# matrix:
# python-version: [ 3.9.6 ]
# poetry-version: [ 1.1.5 ]
# os: [ ubuntu-20.04 ]
# runs-on: ${{ matrix.os }}
# steps:
# - uses: actions/checkout@v2
# with:
# fetch-depth: 0
# - uses: actions/setup-python@v2
# with:
# python-version: ${{ matrix.python-version }}
# - name: Install poetry
# uses: abatilo/actions-poetry@v2.0.0
# with:
# poetry-version: ${{ matrix.poetry-version }}
# - name: Install deps
# run: poetry install -vv
# - name: Build and publish
# run: |
# poetry version "$(poetry version --short)-alpha.${GITHUB_RUN_NUMBER}"
# poetry build
# poetry config repositories.clarinpypi https://pypi.clarin-pl.eu/
# poetry publish -r clarinpypi --username ${{ secrets.PYPI_USER }} --password ${{ secrets.PYPI_PASS }}
......@@ -4,7 +4,6 @@ from typing import List
from nltk.tokenize import RegexpTokenizer
from sziszapangma.model.model import Word
from sziszapangma.model.model_creators import create_new_word
......@@ -14,7 +13,17 @@ def remove_interpunction(text: str) -> List[str]:
def import_mls():
    """Read the cached per-language JSONL files and print their tokenized words.

    For every language code in the hard-coded list, opens
    ``mls/cache_items_<lang>_voxpopuli.jsonl``, decodes each non-blank line
    as JSON, passes its ``normalized_text`` value through
    ``remove_interpunction`` and builds one word per token with
    ``create_new_word``. The decoded record and the resulting word list are
    printed; nothing is returned.
    """
    for lang in ['de', 'en', 'es', 'fr', 'nl', 'pl']:
        with open(f'mls/cache_items_{lang}_voxpopuli.jsonl', 'r', encoding='utf-8') as f:
            # Iterate the file object rather than ``f.read().splitlines()``:
            # ``str.splitlines`` also breaks on characters such as U+2028 or
            # U+0085, which may occur unescaped inside a JSON string and
            # would cut a record in half. It also avoids loading the whole
            # file into memory.
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                it_dict = json.loads(line)
                text = it_dict['normalized_text']
                tokens = remove_interpunction(text)
                words = [create_new_word(token) for token in tokens]
                pprint(it_dict)
                print(words)
def import_mls():
for it in ['de', 'en', 'es', 'fr', 'nl', 'pl']:
with open(f'mls/cache_items_{it}_voxpopuli.jsonl', 'r') as f:
for line in f.read().splitlines(keepends=False):
......
......@@ -8,9 +8,17 @@ from pika.adapters.blocking_connection import BlockingChannel
# Names of the available pipeline commands.
COMMANDS = ['run_word_wer_classic_pipeline', 'run_word_wer_embedding_pipeline', 'run_spacy_dep_tag_wer_pipeline',
'run_spacy_ner_wer_pipeline', 'run_spacy_pos_wer_pipeline']
# Full list of language codes. NOTE(review): this name is rebound by the
# assignment directly below, so this value is not the effective one.
LANGUAGES = ['nl', 'fr', 'de', 'it', 'pl', 'es', 'en']
# Effective language selection: only 'it' is active; the other codes are
# commented out.
LANGUAGES = [
# 'nl', 'fr', 'de',
'it',
# 'pl', 'es', 'en'
]
# Whisper model size identifiers.
WHISPER_ASR_MODEL = ['tiny', 'base', 'small', 'medium', 'large-v2']
# Full list of dataset names. NOTE(review): rebound by the assignment directly
# below, so this value is not the effective one.
DATASETS = ['google_fleurs', 'minds14', 'voxpopuli']
# Effective dataset selection: only 'voxpopuli' is active; the other names are
# commented out.
DATASETS = [
# 'google_fleurs',
# 'minds14',
'voxpopuli'
]
def get_all_datasets() -> List[str]:
......@@ -89,7 +97,7 @@ def main():
channel = connection.channel()
# add_whisper(channel)
# add_facebook_hf_wav2vec2_asr(channel)
add_facebook_hf_wav2vec2_pipeline(channel)
# add_facebook_hf_wav2vec2_pipeline(channel)
connection.close()
......
......@@ -48,10 +48,10 @@ if __name__ == '__main__':
# import_voxpopuli_dataset('es', 'es_voxpopuli')
# import_voxpopuli_dataset('en', 'en_voxpopuli')
import_from_file('nl')
import_from_file('fr')
import_from_file('de')
# import_from_file('it')
import_from_file('pl')
import_from_file('es')
import_from_file('en')
# import_from_file('nl')
# import_from_file('fr')
# import_from_file('de')
import_from_file('it')
# import_from_file('pl')
# import_from_file('es')
# import_from_file('en')
......@@ -23,7 +23,6 @@ class LoadedRemoteDatasetHelper(DatasetHelper):
return self._experiment_repository.get_all_record_ids_for_property(PropertyHelper.get_gold_transcript_words())
def get_path(self, record_id: str) -> str:
print('get_path')
record_path = Path.home() / f'.cache/asr_benchmark/{self._dataset_name}/{record_id}.wav'
if record_path.exists():
return record_path.as_posix()
......
......@@ -21,7 +21,6 @@ class MinioAudioRecordRepository:
len(open(local_path, 'rb').read()))
def load_file(self, local_path: Path, dataset_name: str, record_id: str):
print('load_file')
record_response: HTTPResponse = self._minio.get_object(self._bucket,
self._get_record_path(dataset_name, record_id))
local_path.parent.mkdir(parents=True, exist_ok=True)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment