From aa7b7f1ed8072b8fd24b94a4f0976bc2a946ba2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com>
Date: Sat, 14 Jan 2023 13:46:52 +0100
Subject: [PATCH] Add NeMo processing

---
 new_experiment/add_to_queue_pipeline.py                      | 1 +
 new_experiment/hf_asr/import_nvidia_nemo_asr_result.py       | 5 ++++-
 sziszapangma/integration/repository/experiment_repository.py | 4 ++++
 .../integration/repository/mongo_experiment_repository.py    | 3 +++
 4 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/new_experiment/add_to_queue_pipeline.py b/new_experiment/add_to_queue_pipeline.py
index 76a5069..b87ca62 100644
--- a/new_experiment/add_to_queue_pipeline.py
+++ b/new_experiment/add_to_queue_pipeline.py
@@ -99,6 +99,7 @@ def main():
     # add_facebook_hf_wav2vec2_asr(channel)
     # add_facebook_hf_wav2vec2_pipeline(channel)
     connection.close()
+    # Language codes (presumably the languages handled by the NeMo ASR models; confirm): ['de', 'en', 'es', 'fr', 'it']
 
 
 if __name__ == '__main__':
diff --git a/new_experiment/hf_asr/import_nvidia_nemo_asr_result.py b/new_experiment/hf_asr/import_nvidia_nemo_asr_result.py
index 9c23456..32c4e2c 100644
--- a/new_experiment/hf_asr/import_nvidia_nemo_asr_result.py
+++ b/new_experiment/hf_asr/import_nvidia_nemo_asr_result.py
@@ -2,6 +2,7 @@ import json
 from pathlib import Path
 
 from new_experiment.new_dependency_provider import get_experiment_repository
+from new_experiment.utils.property_helper import PropertyHelper
 from sziszapangma.model.model_creators import create_new_word
 
 
@@ -12,6 +13,8 @@ def load_nemo_asr_results(dataset_name: str):
     with open(jsonl_path, 'r') as reader:
         lines = reader.read().splitlines(keepends=False)
+        old_property_name = f'nvidia_stt_{language_code}_conformer_transducer_large'
+        repository.delete_property(old_property_name)  # loop-invariant: drop the old-name property once, not per line
         for json_line in lines:
             parsed_json = json.loads(json_line)
             print(parsed_json['audio_filepath'].split('/')[-1])
             record_id = parsed_json['audio_filepath'].split('/')[-1][:-4]
@@ -22,7 +25,7 @@ def load_nemo_asr_results(dataset_name: str):
                 "full_text": transcript,
                 "words_time_alignment": None
             }
-            property_name = f'nvidia_stt_{language_code}_conformer_transducer_large'
+            property_name = PropertyHelper.asr_result(f'nvidia_stt_{language_code}_conformer_transducer_large')
             repository.update_property_for_key(record_id, property_name, asr_result)
 
 
diff --git a/sziszapangma/integration/repository/experiment_repository.py b/sziszapangma/integration/repository/experiment_repository.py
index 61ddbb3..779933f 100644
--- a/sziszapangma/integration/repository/experiment_repository.py
+++ b/sziszapangma/integration/repository/experiment_repository.py
@@ -41,3 +41,7 @@ class ExperimentRepository(ABC):
     @abstractmethod
     def get_all_values_from_property(self, property_name: str) -> Dict[str, Any]:
         pass
+
+    @abstractmethod
+    def delete_property(self, property_name: str) -> None:  # delete the whole property; the Mongo impl drops its collection
+        pass
diff --git a/sziszapangma/integration/repository/mongo_experiment_repository.py b/sziszapangma/integration/repository/mongo_experiment_repository.py
index 98c2ef3..1ee99a3 100644
--- a/sziszapangma/integration/repository/mongo_experiment_repository.py
+++ b/sziszapangma/integration/repository/mongo_experiment_repository.py
@@ -61,3 +61,6 @@ class MongoExperimentRepository(ExperimentRepository):
 
     def get_all_values_from_property(self, property_name: str) -> Dict[str, Any]:
         return {record[ID]: record[VALUE] for record in self._get_database()[property_name].find()}
+
+    def delete_property(self, property_name: str) -> None:
+        self._get_database().drop_collection(property_name)  # a property is a collection of that name, so this removes all its records
-- 
GitLab