From aa7b7f1ed8072b8fd24b94a4f0976bc2a946ba2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com> Date: Sat, 14 Jan 2023 13:46:52 +0100 Subject: [PATCH] Add NeMo processing --- new_experiment/add_to_queue_pipeline.py | 1 + new_experiment/hf_asr/import_nvidia_nemo_asr_result.py | 5 ++++- sziszapangma/integration/repository/experiment_repository.py | 4 ++++ .../integration/repository/mongo_experiment_repository.py | 3 +++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/new_experiment/add_to_queue_pipeline.py b/new_experiment/add_to_queue_pipeline.py index 76a5069..b87ca62 100644 --- a/new_experiment/add_to_queue_pipeline.py +++ b/new_experiment/add_to_queue_pipeline.py @@ -99,6 +99,7 @@ def main(): # add_facebook_hf_wav2vec2_asr(channel) # add_facebook_hf_wav2vec2_pipeline(channel) connection.close() + # ['de', 'en', 'es', 'fr', 'it'] if __name__ == '__main__': diff --git a/new_experiment/hf_asr/import_nvidia_nemo_asr_result.py b/new_experiment/hf_asr/import_nvidia_nemo_asr_result.py index 9c23456..32c4e2c 100644 --- a/new_experiment/hf_asr/import_nvidia_nemo_asr_result.py +++ b/new_experiment/hf_asr/import_nvidia_nemo_asr_result.py @@ -2,6 +2,7 @@ import json from pathlib import Path from new_experiment.new_dependency_provider import get_experiment_repository +from new_experiment.utils.property_helper import PropertyHelper from sziszapangma.model.model_creators import create_new_word @@ -12,6 +13,8 @@ def load_nemo_asr_results(dataset_name: str): with open(jsonl_path, 'r') as reader: lines = reader.read().splitlines(keepends=False) for json_line in lines: + old_property_name = f'nvidia_stt_{language_code}_conformer_transducer_large' + repository.delete_property(old_property_name) parsed_json = json.loads(json_line) print(parsed_json['audio_filepath'].split('/')[-1]) record_id = parsed_json['audio_filepath'].split('/')[-1][:-4] @@ -22,7 +25,7 @@ def load_nemo_asr_results(dataset_name: str): "full_text": transcript, "words_time_alignment": None } - property_name = f'nvidia_stt_{language_code}_conformer_transducer_large' + property_name = PropertyHelper.asr_result(f'nvidia_stt_{language_code}_conformer_transducer_large') repository.update_property_for_key(record_id, property_name, asr_result) diff --git a/sziszapangma/integration/repository/experiment_repository.py b/sziszapangma/integration/repository/experiment_repository.py index 61ddbb3..779933f 100644 --- a/sziszapangma/integration/repository/experiment_repository.py +++ b/sziszapangma/integration/repository/experiment_repository.py @@ -41,3 +41,7 @@ class ExperimentRepository(ABC): @abstractmethod def get_all_values_from_property(self, property_name: str) -> Dict[str, Any]: pass + + @abstractmethod + def delete_property(self, property_name: str): + pass diff --git a/sziszapangma/integration/repository/mongo_experiment_repository.py b/sziszapangma/integration/repository/mongo_experiment_repository.py index 98c2ef3..1ee99a3 100644 --- a/sziszapangma/integration/repository/mongo_experiment_repository.py +++ b/sziszapangma/integration/repository/mongo_experiment_repository.py @@ -61,3 +61,6 @@ class MongoExperimentRepository(ExperimentRepository): def get_all_values_from_property(self, property_name: str) -> Dict[str, Any]: return {record[ID]: record[VALUE] for record in self._get_database()[property_name].find()} + + def delete_property(self, property_name: str): + self._get_database().drop_collection(property_name) -- GitLab