Select Git revision
import_gold_trnascrpt_mls.py
import_gold_trnascrpt_mls.py 1.25 KiB
import json
from pprint import pprint
from typing import List
from nltk.tokenize import RegexpTokenizer
from sziszapangma.model.model_creators import create_new_word
def remove_interpunction(text: str) -> List[str]:
    """Split *text* into word tokens, discarding punctuation.

    A token is every maximal run of ``\\w`` characters found by NLTK's
    ``RegexpTokenizer``; whatever lies between those runs (punctuation,
    whitespace) is dropped.

    Args:
        text: The string to tokenize.

    Returns:
        The tokens in the order they occur in *text*.
    """
    return RegexpTokenizer(r'\w+').tokenize(text)
def import_mls():
    """Print every cached MLS record together with its tokenized words.

    For each language code in a fixed list, reads
    ``mls/cache_items_<lang>_voxpopuli.jsonl`` (relative to the current
    working directory), parses each non-blank line as a JSON object,
    tokenizes its ``normalized_text`` value with ``remove_interpunction``
    and passes every token to ``create_new_word``. The parsed record is
    pretty-printed and the resulting word list is printed; nothing is
    returned.

    Raises:
        FileNotFoundError: If one of the cache files does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``normalized_text`` key.
    """
    for lang in ['de', 'en', 'es', 'fr', 'nl', 'pl']:
        path = f'mls/cache_items_{lang}_voxpopuli.jsonl'
        # Decode explicitly instead of relying on the platform default, which
        # is not UTF-8 everywhere and would garble non-ASCII transcripts.
        # NOTE(review): assumes the cache files are UTF-8 encoded - confirm.
        with open(path, 'r', encoding='utf-8') as f:
            # Iterate the file object rather than f.read().splitlines():
            # str.splitlines() also breaks on U+2028/U+2029/U+0085, which may
            # occur unescaped inside a JSON string and would cut a record in
            # two. File iteration splits on real newlines only and does not
            # load the whole file into memory.
            for line in f:
                if not line.strip():
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                it_dict = json.loads(line)
                text = it_dict['normalized_text']
                tokens = remove_interpunction(text)
                words = [create_new_word(token) for token in tokens]
                pprint(it_dict)
                print(words)
# Run the import only when this file is executed as a script, not on import.
if __name__ == '__main__':
    import_mls()