From 0ef9cb6660040af88135cc46d2dea94e24a4d96d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com> Date: Sat, 14 Jan 2023 12:53:22 +0100 Subject: [PATCH] Add NeMo processing --- .github/workflows/python-package.yml | 156 +++++++++--------- new_datasets/import_gold_trnascrpt_mls.py | 11 +- new_experiment/add_to_queue_pipeline.py | 14 +- .../dataset_importer/import_voxpopuli.py | 14 +- .../utils/loaded_remote_dataset_helper.py | 1 - .../utils/minio_audio_record_repository.py | 1 - 6 files changed, 106 insertions(+), 91 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 24c7ad7..0c83610 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,78 +1,78 @@ -name: CI -on: pull_request - -jobs: - tests: - strategy: - fail-fast: false - matrix: - python-version: [ 3.9.6 ] - poetry-version: [ 1.1.5 ] - # os: [ ubuntu-20.04, macos-latest, windows-latest ] - os: [ ubuntu-20.04 ] - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install poetry - uses: abatilo/actions-poetry@v2.0.0 - with: - poetry-version: ${{ matrix.poetry-version }} - - name: Install deps - run: poetry install -vv - - name: Run tests - run: poetry run poe test - lint: - strategy: - fail-fast: false - matrix: - python-version: [ 3.9.6 ] - poetry-version: [ 1.1.5 ] - os: [ ubuntu-20.04 ] - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install poetry - uses: abatilo/actions-poetry@v2.0.0 - with: - poetry-version: ${{ matrix.poetry-version }} - - name: Install deps - run: poetry install -vv - - name: Check - run: poetry run poe check - publish: - needs: - - tests - - lint - environment: Test deployment - strategy: - fail-fast: false - matrix: - python-version: [ 3.9.6 ] - poetry-version: [ 1.1.5 ] - os: [ ubuntu-20.04 ] - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install poetry - uses: abatilo/actions-poetry@v2.0.0 - with: - poetry-version: ${{ matrix.poetry-version }} - - name: Install deps - run: poetry install -vv - - name: Build and publish - run: | - poetry version "$(poetry version --short)-alpha.${GITHUB_RUN_NUMBER}" - poetry build - poetry config repositories.clarinpypi https://pypi.clarin-pl.eu/ - poetry publish -r clarinpypi --username ${{ secrets.PYPI_USER }} --password ${{ secrets.PYPI_PASS }} +#name: CI +#on: pull_request +# +#jobs: +# tests: +# strategy: +# fail-fast: false +# matrix: +# python-version: [ 3.9.6 ] +# poetry-version: [ 1.1.5 ] +# # os: [ ubuntu-20.04, macos-latest, windows-latest ] +# os: [ ubuntu-20.04 ] +# runs-on: ${{ matrix.os }} +# steps: +# - uses: actions/checkout@v2 +# - uses: actions/setup-python@v2 +# with: +# python-version: ${{ matrix.python-version }} +# - name: Install poetry +# uses: abatilo/actions-poetry@v2.0.0 +# with: +# poetry-version: ${{ matrix.poetry-version }} +# - name: Install deps +# run: poetry install -vv +# - name: Run tests +# run: poetry run poe test +# lint: +# strategy: +# fail-fast: false +# matrix: +# python-version: [ 3.9.6 ] +# poetry-version: [ 1.1.5 ] +# os: [ ubuntu-20.04 ] +# runs-on: ${{ matrix.os }} +# steps: +# - uses: actions/checkout@v2 +# - uses: actions/setup-python@v2 +# with: +# python-version: ${{ matrix.python-version }} +# - name: Install poetry +# uses: abatilo/actions-poetry@v2.0.0 +# with: +# poetry-version: ${{ matrix.poetry-version }} +# - name: Install deps +# run: poetry install -vv +# - name: Check +# run: poetry run poe check +# publish: +# needs: +# - tests +# - lint +# environment: Test deployment +# strategy: +# fail-fast: false +# matrix: +# python-version: [ 3.9.6 ] +# poetry-version: [ 1.1.5 ] +# os: [ ubuntu-20.04 ] +# runs-on: ${{ matrix.os }} +# steps: +# - uses: actions/checkout@v2 +# with: +# fetch-depth: 0 +# - uses: actions/setup-python@v2 +# with: +# python-version: ${{ matrix.python-version }} +# - name: Install poetry +# uses: abatilo/actions-poetry@v2.0.0 +# with: +# poetry-version: ${{ matrix.poetry-version }} +# - name: Install deps +# run: poetry install -vv +# - name: Build and publish +# run: | +# poetry version "$(poetry version --short)-alpha.${GITHUB_RUN_NUMBER}" +# poetry build +# poetry config repositories.clarinpypi https://pypi.clarin-pl.eu/ +# poetry publish -r clarinpypi --username ${{ secrets.PYPI_USER }} --password ${{ secrets.PYPI_PASS }} diff --git a/new_datasets/import_gold_trnascrpt_mls.py b/new_datasets/import_gold_trnascrpt_mls.py index 344022a..e5af701 100644 --- a/new_datasets/import_gold_trnascrpt_mls.py +++ b/new_datasets/import_gold_trnascrpt_mls.py @@ -4,7 +4,6 @@ from typing import List from nltk.tokenize import RegexpTokenizer -from sziszapangma.model.model import Word from sziszapangma.model.model_creators import create_new_word @@ -14,7 +13,17 @@ def remove_interpunction(text: str) -> List[str]: def import_mls(): + for it in ['de', 'en', 'es', 'fr', 'nl', 'pl']: + with open(f'mls/cache_items_{it}_voxpopuli.jsonl', 'r') as f: + for line in f.read().splitlines(keepends=False): + it_dict = json.loads(line) + text = it_dict['normalized_text'] + tokens = remove_interpunction(text) + words = [create_new_word(it) for it in tokens] + pprint(it_dict) + print(words) +def import_mls(): for it in ['de', 'en', 'es', 'fr', 'nl', 'pl']: with open(f'mls/cache_items_{it}_voxpopuli.jsonl', 'r') as f: for line in f.read().splitlines(keepends=False): diff --git a/new_experiment/add_to_queue_pipeline.py b/new_experiment/add_to_queue_pipeline.py index 55a4f9b..76a5069 100644 --- a/new_experiment/add_to_queue_pipeline.py +++ b/new_experiment/add_to_queue_pipeline.py @@ -8,9 +8,17 @@ from pika.adapters.blocking_connection import BlockingChannel COMMANDS = ['run_word_wer_classic_pipeline', 'run_word_wer_embedding_pipeline', 'run_spacy_dep_tag_wer_pipeline', 'run_spacy_ner_wer_pipeline', 'run_spacy_pos_wer_pipeline'] -LANGUAGES = ['nl', 'fr', 'de', 'it', 'pl', 'es', 'en'] +LANGUAGES = [ + # 'nl', 'fr', 'de', + 'it', + # 'pl', 'es', 'en' +] WHISPER_ASR_MODEL = ['tiny', 'base', 'small', 'medium', 'large-v2'] -DATASETS = ['google_fleurs', 'minds14', 'voxpopuli'] +DATASETS = [ + # 'google_fleurs', + # 'minds14', + 'voxpopuli' +] def get_all_datasets() -> List[str]: @@ -89,7 +97,7 @@ def main(): channel = connection.channel() # add_whisper(channel) # add_facebook_hf_wav2vec2_asr(channel) - add_facebook_hf_wav2vec2_pipeline(channel) + # add_facebook_hf_wav2vec2_pipeline(channel) connection.close() diff --git a/new_experiment/pipeline/dataset_importer/import_voxpopuli.py b/new_experiment/pipeline/dataset_importer/import_voxpopuli.py index 7e4ba81..bcb236c 100644 --- a/new_experiment/pipeline/dataset_importer/import_voxpopuli.py +++ b/new_experiment/pipeline/dataset_importer/import_voxpopuli.py @@ -48,10 +48,10 @@ if __name__ == '__main__': # import_voxpopuli_dataset('es', 'es_voxpopuli') # import_voxpopuli_dataset('en', 'en_voxpopuli') - import_from_file('nl') - import_from_file('fr') - import_from_file('de') - # import_from_file('it') - import_from_file('pl') - import_from_file('es') - import_from_file('en') + # import_from_file('nl') + # import_from_file('fr') + # import_from_file('de') + import_from_file('it') + # import_from_file('pl') + # import_from_file('es') + # import_from_file('en') diff --git a/new_experiment/utils/loaded_remote_dataset_helper.py b/new_experiment/utils/loaded_remote_dataset_helper.py index 74ffdfb..2afae16 100644 --- a/new_experiment/utils/loaded_remote_dataset_helper.py +++ b/new_experiment/utils/loaded_remote_dataset_helper.py @@ -23,7 +23,6 @@ class LoadedRemoteDatasetHelper(DatasetHelper): return self._experiment_repository.get_all_record_ids_for_property(PropertyHelper.get_gold_transcript_words()) def get_path(self, record_id: str) -> str: - print('get_path') record_path = Path.home() / f'.cache/asr_benchmark/{self._dataset_name}/{record_id}.wav' if record_path.exists(): return record_path.as_posix() diff --git a/new_experiment/utils/minio_audio_record_repository.py b/new_experiment/utils/minio_audio_record_repository.py index 366d83d..0c699c8 100644 --- a/new_experiment/utils/minio_audio_record_repository.py +++ b/new_experiment/utils/minio_audio_record_repository.py @@ -21,7 +21,6 @@ class MinioAudioRecordRepository: len(open(local_path, 'rb').read())) def load_file(self, local_path: Path, dataset_name: str, record_id: str): - print('load_file') record_response: HTTPResponse = self._minio.get_object(self._bucket, self._get_record_path(dataset_name, record_id)) local_path.parent.mkdir(parents=True, exist_ok=True) -- GitLab