From 0ef9cb6660040af88135cc46d2dea94e24a4d96d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com>
Date: Sat, 14 Jan 2023 12:53:22 +0100
Subject: [PATCH] Add NeMo processing

---
 .github/workflows/python-package.yml          | 156 +++++++++---------
 new_datasets/import_gold_trnascrpt_mls.py     |  11 +-
 new_experiment/add_to_queue_pipeline.py       |  14 +-
 .../dataset_importer/import_voxpopuli.py      |  14 +-
 .../utils/loaded_remote_dataset_helper.py     |   1 -
 .../utils/minio_audio_record_repository.py    |   1 -
 6 files changed, 106 insertions(+), 91 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 24c7ad7..0c83610 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -1,78 +1,78 @@
-name: CI
-on: pull_request
-
-jobs:
-    tests:
-        strategy:
-            fail-fast: false
-            matrix:
-                python-version: [ 3.9.6 ]
-                poetry-version: [ 1.1.5 ]
-                # os: [ ubuntu-20.04, macos-latest, windows-latest ]
-                os: [ ubuntu-20.04 ]
-        runs-on: ${{ matrix.os }}
-        steps:
-            -   uses: actions/checkout@v2
-            -   uses: actions/setup-python@v2
-                with:
-                    python-version: ${{ matrix.python-version }}
-            -   name: Install poetry
-                uses: abatilo/actions-poetry@v2.0.0
-                with:
-                    poetry-version: ${{ matrix.poetry-version }}
-            -   name: Install deps
-                run: poetry install -vv
-            -   name: Run tests
-                run: poetry run poe test
-    lint:
-        strategy:
-            fail-fast: false
-            matrix:
-                python-version: [ 3.9.6 ]
-                poetry-version: [ 1.1.5 ]
-                os: [ ubuntu-20.04 ]
-        runs-on: ${{ matrix.os }}
-        steps:
-            -   uses: actions/checkout@v2
-            -   uses: actions/setup-python@v2
-                with:
-                    python-version: ${{ matrix.python-version }}
-            -   name: Install poetry
-                uses: abatilo/actions-poetry@v2.0.0
-                with:
-                    poetry-version: ${{ matrix.poetry-version }}
-            -   name: Install deps
-                run: poetry install -vv
-            -   name: Check
-                run: poetry run poe check
-    publish:
-        needs:
-            - tests
-            - lint
-        environment: Test deployment
-        strategy:
-            fail-fast: false
-            matrix:
-                python-version: [ 3.9.6 ]
-                poetry-version: [ 1.1.5 ]
-                os: [ ubuntu-20.04 ]
-        runs-on: ${{ matrix.os }}
-        steps:
-            -   uses: actions/checkout@v2
-                with:
-                    fetch-depth: 0
-            -   uses: actions/setup-python@v2
-                with:
-                    python-version: ${{ matrix.python-version }}
-            -   name: Install poetry
-                uses: abatilo/actions-poetry@v2.0.0
-                with:
-                    poetry-version: ${{ matrix.poetry-version }}
-            -   name: Install deps
-                run: poetry install -vv
-            -   name: Build and publish
-                run: |
-                    poetry version "$(poetry version --short)-alpha.${GITHUB_RUN_NUMBER}"
-                    poetry build
-                    poetry config repositories.clarinpypi https://pypi.clarin-pl.eu/
-                    poetry publish -r clarinpypi --username ${{ secrets.PYPI_USER }} --password ${{ secrets.PYPI_PASS }}
+#name: CI
+#on: pull_request
+#
+#jobs:
+#    tests:
+#        strategy:
+#            fail-fast: false
+#            matrix:
+#                python-version: [ 3.9.6 ]
+#                poetry-version: [ 1.1.5 ]
+#                # os: [ ubuntu-20.04, macos-latest, windows-latest ]
+#                os: [ ubuntu-20.04 ]
+#        runs-on: ${{ matrix.os }}
+#        steps:
+#            -   uses: actions/checkout@v2
+#            -   uses: actions/setup-python@v2
+#                with:
+#                    python-version: ${{ matrix.python-version }}
+#            -   name: Install poetry
+#                uses: abatilo/actions-poetry@v2.0.0
+#                with:
+#                    poetry-version: ${{ matrix.poetry-version }}
+#            -   name: Install deps
+#                run: poetry install -vv
+#            -   name: Run tests
+#                run: poetry run poe test
+#    lint:
+#        strategy:
+#            fail-fast: false
+#            matrix:
+#                python-version: [ 3.9.6 ]
+#                poetry-version: [ 1.1.5 ]
+#                os: [ ubuntu-20.04 ]
+#        runs-on: ${{ matrix.os }}
+#        steps:
+#            -   uses: actions/checkout@v2
+#            -   uses: actions/setup-python@v2
+#                with:
+#                    python-version: ${{ matrix.python-version }}
+#            -   name: Install poetry
+#                uses: abatilo/actions-poetry@v2.0.0
+#                with:
+#                    poetry-version: ${{ matrix.poetry-version }}
+#            -   name: Install deps
+#                run: poetry install -vv
+#            -   name: Check
+#                run: poetry run poe check
+#    publish:
+#        needs:
+#            - tests
+#            - lint
+#        environment: Test deployment
+#        strategy:
+#            fail-fast: false
+#            matrix:
+#                python-version: [ 3.9.6 ]
+#                poetry-version: [ 1.1.5 ]
+#                os: [ ubuntu-20.04 ]
+#        runs-on: ${{ matrix.os }}
+#        steps:
+#            -   uses: actions/checkout@v2
+#                with:
+#                    fetch-depth: 0
+#            -   uses: actions/setup-python@v2
+#                with:
+#                    python-version: ${{ matrix.python-version }}
+#            -   name: Install poetry
+#                uses: abatilo/actions-poetry@v2.0.0
+#                with:
+#                    poetry-version: ${{ matrix.poetry-version }}
+#            -   name: Install deps
+#                run: poetry install -vv
+#            -   name: Build and publish
+#                run: |
+#                    poetry version "$(poetry version --short)-alpha.${GITHUB_RUN_NUMBER}"
+#                    poetry build
+#                    poetry config repositories.clarinpypi https://pypi.clarin-pl.eu/
+#                    poetry publish -r clarinpypi --username ${{ secrets.PYPI_USER }} --password ${{ secrets.PYPI_PASS }}
diff --git a/new_datasets/import_gold_trnascrpt_mls.py b/new_datasets/import_gold_trnascrpt_mls.py
index 344022a..e5af701 100644
--- a/new_datasets/import_gold_trnascrpt_mls.py
+++ b/new_datasets/import_gold_trnascrpt_mls.py
@@ -4,7 +4,6 @@ from typing import List
 
 from nltk.tokenize import RegexpTokenizer
 
-from sziszapangma.model.model import Word
 from sziszapangma.model.model_creators import create_new_word
 
 
@@ -14,7 +13,17 @@ def remove_interpunction(text: str) -> List[str]:
 
 
 def import_mls():
+    for it in ['de', 'en', 'es', 'fr', 'nl', 'pl']:
+        with open(f'mls/cache_items_{it}_voxpopuli.jsonl', 'r') as f:
+            for line in f.read().splitlines(keepends=False):
+                it_dict = json.loads(line)
+                text = it_dict['normalized_text']
+                tokens = remove_interpunction(text)
+                words = [create_new_word(it) for it in tokens]
+                pprint(it_dict)
+                print(words)
 
+def import_mls():
     for it in ['de', 'en', 'es', 'fr', 'nl', 'pl']:
         with open(f'mls/cache_items_{it}_voxpopuli.jsonl', 'r') as f:
             for line in f.read().splitlines(keepends=False):
diff --git a/new_experiment/add_to_queue_pipeline.py b/new_experiment/add_to_queue_pipeline.py
index 55a4f9b..76a5069 100644
--- a/new_experiment/add_to_queue_pipeline.py
+++ b/new_experiment/add_to_queue_pipeline.py
@@ -8,9 +8,17 @@ from pika.adapters.blocking_connection import BlockingChannel
 
 COMMANDS = ['run_word_wer_classic_pipeline', 'run_word_wer_embedding_pipeline', 'run_spacy_dep_tag_wer_pipeline',
             'run_spacy_ner_wer_pipeline', 'run_spacy_pos_wer_pipeline']
-LANGUAGES = ['nl', 'fr', 'de', 'it', 'pl', 'es', 'en']
+LANGUAGES = [
+    # 'nl', 'fr', 'de',
+    'it',
+    # 'pl', 'es', 'en'
+]
 WHISPER_ASR_MODEL = ['tiny', 'base', 'small', 'medium', 'large-v2']
-DATASETS = ['google_fleurs', 'minds14', 'voxpopuli']
+DATASETS = [
+    # 'google_fleurs',
+    # 'minds14',
+    'voxpopuli'
+]
 
 
 def get_all_datasets() -> List[str]:
@@ -89,7 +97,7 @@ def main():
     channel = connection.channel()
     # add_whisper(channel)
     # add_facebook_hf_wav2vec2_asr(channel)
-    add_facebook_hf_wav2vec2_pipeline(channel)
+    # add_facebook_hf_wav2vec2_pipeline(channel)
     connection.close()
 
 
diff --git a/new_experiment/pipeline/dataset_importer/import_voxpopuli.py b/new_experiment/pipeline/dataset_importer/import_voxpopuli.py
index 7e4ba81..bcb236c 100644
--- a/new_experiment/pipeline/dataset_importer/import_voxpopuli.py
+++ b/new_experiment/pipeline/dataset_importer/import_voxpopuli.py
@@ -48,10 +48,10 @@ if __name__ == '__main__':
     # import_voxpopuli_dataset('es', 'es_voxpopuli')
     # import_voxpopuli_dataset('en', 'en_voxpopuli')
 
-    import_from_file('nl')
-    import_from_file('fr')
-    import_from_file('de')
-    # import_from_file('it')
-    import_from_file('pl')
-    import_from_file('es')
-    import_from_file('en')
+    # import_from_file('nl')
+    # import_from_file('fr')
+    # import_from_file('de')
+    import_from_file('it')
+    # import_from_file('pl')
+    # import_from_file('es')
+    # import_from_file('en')
diff --git a/new_experiment/utils/loaded_remote_dataset_helper.py b/new_experiment/utils/loaded_remote_dataset_helper.py
index 74ffdfb..2afae16 100644
--- a/new_experiment/utils/loaded_remote_dataset_helper.py
+++ b/new_experiment/utils/loaded_remote_dataset_helper.py
@@ -23,7 +23,6 @@ class LoadedRemoteDatasetHelper(DatasetHelper):
         return self._experiment_repository.get_all_record_ids_for_property(PropertyHelper.get_gold_transcript_words())
 
     def get_path(self, record_id: str) -> str:
-        print('get_path')
         record_path = Path.home() / f'.cache/asr_benchmark/{self._dataset_name}/{record_id}.wav'
         if record_path.exists():
             return record_path.as_posix()
diff --git a/new_experiment/utils/minio_audio_record_repository.py b/new_experiment/utils/minio_audio_record_repository.py
index 366d83d..0c699c8 100644
--- a/new_experiment/utils/minio_audio_record_repository.py
+++ b/new_experiment/utils/minio_audio_record_repository.py
@@ -21,7 +21,6 @@ class MinioAudioRecordRepository:
                                len(open(local_path, 'rb').read()))
 
     def load_file(self, local_path: Path, dataset_name: str, record_id: str):
-        print('load_file')
         record_response: HTTPResponse = self._minio.get_object(self._bucket,
                                                                self._get_record_path(dataset_name, record_id))
         local_path.parent.mkdir(parents=True, exist_ok=True)
-- 
GitLab