Skip to content
Snippets Groups Projects
Select Git revision
  • 0ef9cb6660040af88135cc46d2dea94e24a4d96d
  • main default protected
  • change_data_model
  • feature/add_auth_asr_service
  • fix/incorrect_import
  • feature/change_registry_clarin
  • feature/add_base_asr_service
  • feature/add_poetry
  • feature/add_word_ids
  • feature/add_sziszapangma
10 results

import_gold_trnascrpt_mls.py

Blame
  • user avatar
    Marcin Wątroba authored
    0ef9cb66
    History
    import_gold_trnascrpt_mls.py 1.25 KiB
    import json
    from pprint import pprint
    from typing import List
    
    from nltk.tokenize import RegexpTokenizer
    
    from sziszapangma.model.model_creators import create_new_word
    
    
    def remove_interpunction(text: str) -> List[str]:
        """Return the runs of word characters found in ``text``.

        Everything that is not a word character (punctuation, whitespace, ...)
        acts as a separator and is discarded; the tokens keep their original
        order and casing.
        """
        word_pattern = r'\w+'
        return RegexpTokenizer(word_pattern).tokenize(text)
    
    
    def import_mls():
        """Print every cached MLS record together with its tokenized words.

        For each language code in the hard-coded list, the file
        ``mls/cache_items_<lang>_voxpopuli.jsonl`` (relative to the current
        working directory) is read as JSON Lines. For every record the
        ``normalized_text`` field is split with ``remove_interpunction`` and
        each token is passed to ``create_new_word``; the raw record is then
        pretty-printed, followed by the list of created words.

        Raises:
            FileNotFoundError: if one of the cache files does not exist.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
            KeyError: if a record has no ``normalized_text`` key.
        """
        for lang in ['de', 'en', 'es', 'fr', 'nl', 'pl']:
            # Decode explicitly as UTF-8: the transcripts are multilingual and
            # the platform default encoding is not guaranteed to handle them.
            with open(f'mls/cache_items_{lang}_voxpopuli.jsonl', 'r',
                      encoding='utf-8') as f:
                # Iterate the file directly instead of read().splitlines():
                # it streams the data and only breaks on real newlines, whereas
                # splitlines() also splits on characters such as U+2028 that
                # may legally appear unescaped inside a JSON string.
                for line in f:
                    if not line.strip():
                        # Tolerate blank lines (e.g. a trailing empty line).
                        continue
                    record = json.loads(line)
                    tokens = remove_interpunction(record['normalized_text'])
                    words = [create_new_word(token) for token in tokens]
                    pprint(record)
                    print(words)
    
    
    # Run the import only when this file is executed as a script, so that
    # importing the module has no side effects.
    if __name__ == '__main__':
        import_mls()