From 8e0d8500c895e24d913ee196e19bf6e968284fae Mon Sep 17 00:00:00 2001
From: piotrmp <piotr.m.przybyla@gmail.com>
Date: Tue, 22 Nov 2022 15:52:36 +0100
Subject: [PATCH] Modifications for larger models.

---
 src/lambo/examples/run_pretraining.py     |  4 +-
 src/lambo/learning/model.py               |  8 +--
 src/lambo/learning/model_pretraining.py   |  4 +-
 src/lambo/learning/train.py               |  4 +-
 src/lambo/resources/languages.txt         | 65 ++++++++++++------
 src/lambo/utils/generate_languages_txt.py | 82 +++++++++++++++++++++++
 6 files changed, 138 insertions(+), 29 deletions(-)
 create mode 100644 src/lambo/utils/generate_languages_txt.py

diff --git a/src/lambo/examples/run_pretraining.py b/src/lambo/examples/run_pretraining.py
index ddcb05b..a1a14c5 100644
--- a/src/lambo/examples/run_pretraining.py
+++ b/src/lambo/examples/run_pretraining.py
@@ -26,8 +26,8 @@ if __name__=='__main__':
                  line[0] != '#' and line.split(' ')[1] != '?']
     languages = list(dict.fromkeys(languages))

-    MAX_DOCUMENTS = 10
-    CONTEXT_LEN = 256
+    MAX_DOCUMENTS = 100
+    CONTEXT_LEN = 1024

     for language in languages:
         if (outpath / ('oscar_' + language + '.pth')).exists():
diff --git a/src/lambo/learning/model.py b/src/lambo/learning/model.py
index 6383d3e..9e69067 100644
--- a/src/lambo/learning/model.py
+++ b/src/lambo/learning/model.py
@@ -7,8 +7,8 @@ class LamboNetwork(Module):
     """
     LAMBO neural network model. The network has four layers:

-    * embedding layers for characters, representing each as a 32-long vector,
-    * bidirectional LSTM layer, taking a concatenation of (1) character embedding and (2) one-hot UTF category vector as input and outputting 2*128-long state vector,
+    * embedding layers for characters, representing each as a 32-long vector (or 64-long),
+    * bidirectional LSTM layer, taking a concatenation of (1) character embedding and (2) one-hot UTF category vector as input and outputting 2*128-long state vector (or 2*256),
     * dense linear layer, converting LSTM state vectors to class scores
     * softmax layer, computing probability of eight events for any character:
@@ -34,8 +34,8 @@ class LamboNetwork(Module):
             self.embedding_layer = copy.deepcopy(pretrained.embedding_layer)
             self.lstm_layer = copy.deepcopy(pretrained.lstm_layer)
         else:
-            self.embedding_layer = Embedding(len(dict), 32, dict['<PAD>'])
-            self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim + utf_categories_num, hidden_size=128,
+            self.embedding_layer = Embedding(len(dict), 64, dict['<PAD>'])
+            self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim + utf_categories_num, hidden_size=256,
                                    batch_first=True, bidirectional=True)
         self.linear_layer = Linear(self.lstm_layer.hidden_size * 2, 8)
diff --git a/src/lambo/learning/model_pretraining.py b/src/lambo/learning/model_pretraining.py
index e37d08b..fa07693 100644
--- a/src/lambo/learning/model_pretraining.py
+++ b/src/lambo/learning/model_pretraining.py
@@ -17,8 +17,8 @@ class LamboPretrainingNetwork(Module):
         """
         super(LamboPretrainingNetwork, self).__init__()
         self.max_len = max_len
-        self.embedding_layer = Embedding(len(dict), 32, dict['<PAD>'])
-        self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim + utf_categories_num, hidden_size=128,
+        self.embedding_layer = Embedding(len(dict), 64, dict['<PAD>'])
+        self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim + utf_categories_num, hidden_size=256,
                                 batch_first=True, bidirectional=True)
         self.linear_layer = Linear(self.lstm_layer.hidden_size * 2, len(dict))
diff --git a/src/lambo/learning/train.py b/src/lambo/learning/train.py
index c45c500..f33c88b 100644
--- a/src/lambo/learning/train.py
+++ b/src/lambo/learning/train.py
@@ -114,7 +114,7 @@ def train_new_and_save(model_name, treebank_path, save_path, epochs=10):
     BATCH_SIZE = 32

     print("Initiating the model.")
-    MAX_LEN = 100
+    MAX_LEN = 1024
     dict, train_dataloader, test_dataloader = prepare_dataloaders_withdict([train_doc, dev_doc], [test_doc],
                                                                            MAX_LEN, BATCH_SIZE)

@@ -156,7 +156,7 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat
     train_doc, dev_doc, test_doc = read_treebank(treebank_path, True)

     print("Initiating the model.")
-    MAX_LEN = 100
+    MAX_LEN = 1024
     model = LamboNetwork(MAX_LEN, dict, len(utf_category_dictionary), pretrained=pretrained_model)

     print("Preparing data")
diff --git a/src/lambo/resources/languages.txt b/src/lambo/resources/languages.txt
index ee5981c..9e213ff 100644
--- a/src/lambo/resources/languages.txt
+++ b/src/lambo/resources/languages.txt
@@ -1,17 +1,20 @@
 # Format: <UD training corpus> <ISO 639-1 code (for OSCAR pretraining)> <Language name> <Recommended (chosen by size)>
 UD_Afrikaans-AfriBooms af Afrikaans
-UD_Ancient_Greek-Perseus ? Ancient_Greek
 UD_Ancient_Greek-PROIEL ? Ancient_Greek *
-UD_Arabic-PADT ar Arabic
-UD_Armenian-ArmTDP hy Armenian
-UD_Western_Armenian-ArmTDP hy Western_Armenian
+UD_Ancient_Greek-Perseus ? Ancient_Greek
+UD_Ancient_Hebrew-PTNK ? Ancient_Hebrew
+UD_Arabic-NYUAD ar Arabic
+UD_Arabic-PADT ar Arabic *
+UD_Armenian-ArmTDP hy Armenian *
+UD_Armenian-BSUT hy Armenian
 UD_Basque-BDT eu Basque
 UD_Belarusian-HSE be Belarusian
 UD_Bulgarian-BTB bg Bulgarian
 UD_Catalan-AnCora ca Catalan
-UD_Chinese-GSD zh Chinese *
-UD_Chinese-GSDSimp zh Chinese
+UD_Chinese-GSD zh Chinese
+UD_Chinese-GSDSimp zh Chinese *
 UD_Classical_Chinese-Kyoto ? Classical_Chinese
+UD_Coptic-Scriptorium ? Coptic
 UD_Croatian-SET hr Croatian
 UD_Czech-CAC cs Czech
 UD_Czech-CLTT cs Czech
@@ -21,54 +24,77 @@ UD_Danish-DDT da Danish
 UD_Dutch-Alpino nl Dutch *
 UD_Dutch-LassySmall nl Dutch
 UD_English-Atis en English
+UD_English-ESL en English
 UD_English-EWT en English *
 UD_English-GUM en English
+UD_English-GUMReddit en English
 UD_English-LinES en English
 UD_English-ParTUT en English
 UD_Estonian-EDT et Estonian *
 UD_Estonian-EWT et Estonian
+UD_Faroese-FarPaHC fo Faroese
 UD_Finnish-FTB fi Finnish
 UD_Finnish-TDT fi Finnish *
+UD_French-FTB fr French
 UD_French-GSD fr French *
 UD_French-ParTUT fr French
+UD_French-ParisStories fr French
 UD_French-Rhapsodie fr French
 UD_French-Sequoia fr French
 UD_Galician-CTG gl Galician
 UD_German-GSD de German
 UD_German-HDT de German *
+UD_Gothic-PROIEL ? Gothic
 UD_Greek-GDT el Greek
-UD_Hebrew-HTB he Hebrew
+UD_Hebrew-HTB he Hebrew *
+UD_Hebrew-IAHLTwiki he Hebrew
 UD_Hindi-HDTB hi Hindi
+UD_Hindi_English-HIENCS ? Hindi_English
 UD_Hungarian-Szeged hu Hungarian
+UD_Icelandic-GC is Icelandic
 UD_Icelandic-IcePaHC is Icelandic *
 UD_Icelandic-Modern is Icelandic
 UD_Indonesian-GSD id Indonesian
 UD_Irish-IDT ga Irish
 UD_Italian-ISDT it Italian *
+UD_Italian-MarkIT it Italian
 UD_Italian-ParTUT it Italian
 UD_Italian-PoSTWITA it Italian
 UD_Italian-TWITTIRO it Italian
 UD_Italian-VIT it Italian
-UD_Japanese-GSD ja Japanese *
+UD_Japanese-BCCWJ ja Japanese *
+UD_Japanese-BCCWJLUW ja Japanese
+UD_Japanese-GSD ja Japanese
 UD_Japanese-GSDLUW ja Japanese
-UD_Korean-Kaist ko Korean
+UD_Korean-GSD ko Korean
+UD_Korean-Kaist ko Korean *
 UD_Latin-ITTB la Latin *
 UD_Latin-LLCT la Latin
 UD_Latin-PROIEL la Latin
 UD_Latin-UDante la Latin
 UD_Latvian-LVTB lv Latvian
-UD_Lithuanian-ALKSNIS lt Lithuanian
+UD_Lithuanian-ALKSNIS lt Lithuanian *
+UD_Lithuanian-HSE lt Lithuanian
 UD_Maltese-MUDT mt Maltese
-UD_Norwegian-Bokmaal no Norwegian_Bokmål
-UD_Norwegian-Nynorsk nn Norwegian_Nynorsk *
-UD_Norwegian-NynorskLIA nn Norwegian_Nynorsk
+UD_Marathi-UFAL mr Marathi
+UD_Naija-NSC ? Naija
+UD_Norwegian-Bokmaal no Norwegian
+UD_Norwegian-Nynorsk nn Norwegian *
+UD_Norwegian-NynorskLIA nn Norwegian
+UD_Old_Church_Slavonic-PROIEL ? Old_Church_Slavonic
+UD_Old_East_Slavic-Birchbark ? Old_East_Slavic
+UD_Old_East_Slavic-RNC ? Old_East_Slavic
+UD_Old_East_Slavic-TOROT ? Old_East_Slavic *
 UD_Old_French-SRCMF ? Old_French
 UD_Persian-PerDT fa Persian *
 UD_Persian-Seraji fa Persian
-UD_Polish-PDB pl Polish *
 UD_Polish-LFG pl Polish
+UD_Polish-PDB pl Polish *
+UD_Pomak-Philotis ? Pomak
 UD_Portuguese-Bosque pt Portuguese
-UD_Portuguese-GSD pt Portuguese *
+UD_Portuguese-CINTIL pt Portuguese *
+UD_Portuguese-GSD pt Portuguese
+UD_Portuguese-PetroGold pt Portuguese
 UD_Romanian-Nonstandard ro Romanian *
 UD_Romanian-RRT ro Romanian
 UD_Romanian-SiMoNERo ro Romanian
@@ -83,8 +109,9 @@ UD_Spanish-AnCora es Spanish *
 UD_Spanish-GSD es Spanish
 UD_Swedish-LinES sv Swedish
 UD_Swedish-Talbanken sv Swedish *
+UD_Swedish_Sign_Language-SSLC ? Swedish_Sign_Language
 UD_Tamil-TTB ta Tamil
-UD_Telugu-MTG te Telegu
+UD_Telugu-MTG te Telugu
 UD_Turkish-Atis tr Turkish
 UD_Turkish-BOUN tr Turkish
 UD_Turkish-FrameNet tr Turkish
@@ -92,11 +119,11 @@ UD_Turkish-IMST tr Turkish
 UD_Turkish-Kenet tr Turkish
 UD_Turkish-Penn tr Turkish *
 UD_Turkish-Tourism tr Turkish
+UD_Turkish_German-SAGT ? Turkish_German
 UD_Ukrainian-IU uk Ukrainian
 UD_Urdu-UDTB ur Urdu
 UD_Uyghur-UDT ug Uyghur
 UD_Vietnamese-VTB vi Vietnamese
 UD_Welsh-CCG cy Welsh
-#NKJP_Polish-byName pl Polish
-#NKJP_Polish-byType pl Polish
-#UD_Polish-PDBnoMW pl Polish
\ No newline at end of file
+UD_Western_Armenian-ArmTDP hy Western_Armenian
+UD_Wolof-WTB wo Wolof
\ No newline at end of file
diff --git a/src/lambo/utils/generate_languages_txt.py b/src/lambo/utils/generate_languages_txt.py
new file mode 100644
index 0000000..d3982e4
--- /dev/null
+++ b/src/lambo/utils/generate_languages_txt.py
@@ -0,0 +1,82 @@
+from pathlib import Path
+
+old_languages_txt = ''
+new_ud_treebanks = ''
+
+codedict = {}
+for line in open(old_languages_txt):
+    if line.startswith('#'):
+        continue
+    parts = line.strip().split(' ')
+    lang = parts[2]
+    code = parts[1]
+    codedict[lang] = code
+
+ud11path = Path(new_ud_treebanks)
+
+subdirs = [x for x in ud11path.iterdir() if x.is_dir()]
+subdirs.sort()
+
+sizes = {}
+
+for subdir in subdirs:
+    hasTrain = False
+    hasDev = False
+    hasTest = False
+    trainfile = None
+    for file in subdir.iterdir():
+        if file.name.endswith('train.txt'):
+            hasTrain = True
+            trainfile = file
+        elif file.name.endswith('test.txt'):
+            hasTest = True
+        elif file.name.endswith('dev.txt'):
+            hasDev = True
+    if (not hasTrain) or (not hasTest) or (not hasDev):
+        continue
+    treebank_name = subdir.name
+    language_name = treebank_name[3:].split('-')[0]
+    code = '@@@@@'
+    if language_name in codedict:
+        code = codedict[language_name]
+    if language_name not in sizes:
+        sizes[language_name] = {}
+    sizes[language_name][treebank_name] = trainfile.stat().st_size
+
+for language_name in sizes:
+    maxlen = 0
+    best = None
+    for treebank_name in sizes[language_name]:
+        if sizes[language_name][treebank_name] > maxlen:
+            best = treebank_name
+            maxlen = sizes[language_name][treebank_name]
+    if len(sizes[language_name]) > 1:
+        sizes[language_name]['preferred'] = best
+
+print(
+    "# Format: <UD training corpus> <ISO 639-1 code (for OSCAR pretraining)> <Language name> <Recommended (chosen by size)>")
+for subdir in subdirs:
+    hasTrain = False
+    hasDev = False
+    hasTest = False
+    trainfile = None
+    for file in subdir.iterdir():
+        if file.name.endswith('train.txt'):
+            hasTrain = True
+            trainfile = file
+        elif file.name.endswith('test.txt'):
+            hasTest = True
+        elif file.name.endswith('dev.txt'):
+            hasDev = True
+    if (not hasTrain) or (not hasTest) or (not hasDev):
+        continue
+    treebank_name = subdir.name
+    language_name = treebank_name[3:].split('-')[0]
+    code = '@@@@@'
+    if language_name in codedict:
+        code = codedict[language_name]
+    preferred = ''
+    if 'preferred' in sizes[language_name] and sizes[language_name]['preferred'] == treebank_name:
+        preferred = ' *'
+    print(treebank_name + ' ' + code + ' ' + language_name + preferred)
+
--
GitLab
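Note on the enlarged dimensions: the patch doubles the character-embedding width (32 to 64) and the LSTM hidden size per direction (128 to 256), and raises the training context from 100 to 1024 characters. The standalone sketch below wires up layers of the new sizes to confirm the shapes compose; it is not LAMBO's actual code, and the vocabulary size, padding index and UTF-category count are illustrative assumptions, not values taken from the project.

import torch
from torch.nn import Embedding, LSTM, Linear

# Illustrative assumptions (not LAMBO's real values): dictionary size,
# padding index, and number of one-hot UTF categories.
VOCAB_SIZE = 1000
PAD_IDX = 0
UTF_CATEGORIES = 30
MAX_LEN = 1024  # context length after this patch (previously 100)

embedding = Embedding(VOCAB_SIZE, 64, PAD_IDX)      # was Embedding(..., 32, ...)
lstm = LSTM(input_size=embedding.embedding_dim + UTF_CATEGORIES,
            hidden_size=256,                        # was hidden_size=128
            batch_first=True, bidirectional=True)
linear = Linear(lstm.hidden_size * 2, 8)            # 2*256-long states -> 8 event classes

# Push a dummy batch through to confirm the shapes compose.
chars = torch.randint(0, VOCAB_SIZE, (2, MAX_LEN))  # (batch, sequence) of character ids
cats = torch.zeros(2, MAX_LEN, UTF_CATEGORIES)      # one-hot UTF category features
states, _ = lstm(torch.cat([embedding(chars), cats], dim=-1))
print(linear(states).shape)                         # torch.Size([2, 1024, 8])

Since an LSTM's weight count grows roughly with hidden_size * (input_size + hidden_size), doubling both the embedding width and the hidden size makes the recurrent layer roughly four times larger.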