From 8e0d8500c895e24d913ee196e19bf6e968284fae Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Tue, 22 Nov 2022 15:52:36 +0100 Subject: [PATCH 01/16] Modifications for larger models. --- src/lambo/examples/run_pretraining.py | 4 +- src/lambo/learning/model.py | 8 +-- src/lambo/learning/model_pretraining.py | 4 +- src/lambo/learning/train.py | 4 +- src/lambo/resources/languages.txt | 65 ++++++++++++------ src/lambo/utils/generate_languages_txt.py | 82 +++++++++++++++++++++++ 6 files changed, 138 insertions(+), 29 deletions(-) create mode 100644 src/lambo/utils/generate_languages_txt.py diff --git a/src/lambo/examples/run_pretraining.py b/src/lambo/examples/run_pretraining.py index ddcb05b..a1a14c5 100644 --- a/src/lambo/examples/run_pretraining.py +++ b/src/lambo/examples/run_pretraining.py @@ -26,8 +26,8 @@ if __name__=='__main__': line[0] != '#' and line.split(' ')[1] != '?'] languages = list(dict.fromkeys(languages)) - MAX_DOCUMENTS = 10 - CONTEXT_LEN = 256 + MAX_DOCUMENTS = 100 + CONTEXT_LEN = 1024 for language in languages: if (outpath / ('oscar_' + language + '.pth')).exists(): diff --git a/src/lambo/learning/model.py b/src/lambo/learning/model.py index 6383d3e..9e69067 100644 --- a/src/lambo/learning/model.py +++ b/src/lambo/learning/model.py @@ -7,8 +7,8 @@ class LamboNetwork(Module): """ LAMBO neural network model. The network has four layers: - * embedding layers for characters, representing each as a 32-long vector, - * bidirectional LSTM layer, taking a concatenation of (1) character embedding and (2) one-hot UTF category vector as input and outputting 2*128-long state vector, + * embedding layers for characters, representing each as a 32-long vector (or 64-long), + * bidirectional LSTM layer, taking a concatenation of (1) character embedding and (2) one-hot UTF category vector as input and outputting 2*128-long state vector (or 2*256), * dense linear layer, converting LSTM state vectors to class scores * softmax layer, computing probability of eight events for any character: @@ -34,8 +34,8 @@ class LamboNetwork(Module): self.embedding_layer = copy.deepcopy(pretrained.embedding_layer) self.lstm_layer = copy.deepcopy(pretrained.lstm_layer) else: - self.embedding_layer = Embedding(len(dict), 32, dict['<PAD>']) - self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim + utf_categories_num, hidden_size=128, + self.embedding_layer = Embedding(len(dict), 64, dict['<PAD>']) + self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim + utf_categories_num, hidden_size=256, batch_first=True, bidirectional=True) self.linear_layer = Linear(self.lstm_layer.hidden_size * 2, 8) diff --git a/src/lambo/learning/model_pretraining.py b/src/lambo/learning/model_pretraining.py index e37d08b..fa07693 100644 --- a/src/lambo/learning/model_pretraining.py +++ b/src/lambo/learning/model_pretraining.py @@ -17,8 +17,8 @@ class LamboPretrainingNetwork(Module): """ super(LamboPretrainingNetwork, self).__init__() self.max_len = max_len - self.embedding_layer = Embedding(len(dict), 32, dict['<PAD>']) - self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim + utf_categories_num, hidden_size=128, + self.embedding_layer = Embedding(len(dict), 64, dict['<PAD>']) + self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim + utf_categories_num, hidden_size=256, batch_first=True, bidirectional=True) self.linear_layer = Linear(self.lstm_layer.hidden_size * 2, len(dict)) diff --git a/src/lambo/learning/train.py b/src/lambo/learning/train.py index 
c45c500..f33c88b 100644 --- a/src/lambo/learning/train.py +++ b/src/lambo/learning/train.py @@ -114,7 +114,7 @@ def train_new_and_save(model_name, treebank_path, save_path, epochs=10): BATCH_SIZE = 32 print("Initiating the model.") - MAX_LEN = 100 + MAX_LEN = 1024 dict, train_dataloader, test_dataloader = prepare_dataloaders_withdict([train_doc, dev_doc], [test_doc], MAX_LEN, BATCH_SIZE) @@ -156,7 +156,7 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat train_doc, dev_doc, test_doc = read_treebank(treebank_path, True) print("Initiating the model.") - MAX_LEN = 100 + MAX_LEN = 1024 model = LamboNetwork(MAX_LEN, dict, len(utf_category_dictionary), pretrained=pretrained_model) print("Preparing data") diff --git a/src/lambo/resources/languages.txt b/src/lambo/resources/languages.txt index ee5981c..9e213ff 100644 --- a/src/lambo/resources/languages.txt +++ b/src/lambo/resources/languages.txt @@ -1,17 +1,20 @@ # Format: <UD training corpus> <ISO 639-1 code (for OSCAR pretraining)> <Language name> <Recommended (chosen by size)> UD_Afrikaans-AfriBooms af Afrikaans -UD_Ancient_Greek-Perseus ? Ancient_Greek UD_Ancient_Greek-PROIEL ? Ancient_Greek * -UD_Arabic-PADT ar Arabic -UD_Armenian-ArmTDP hy Armenian -UD_Western_Armenian-ArmTDP hy Western_Armenian +UD_Ancient_Greek-Perseus ? Ancient_Greek +UD_Ancient_Hebrew-PTNK ? Ancient_Hebrew +UD_Arabic-NYUAD ar Arabic +UD_Arabic-PADT ar Arabic * +UD_Armenian-ArmTDP hy Armenian * +UD_Armenian-BSUT hy Armenian UD_Basque-BDT eu Basque UD_Belarusian-HSE be Belarusian UD_Bulgarian-BTB bg Bulgarian UD_Catalan-AnCora ca Catalan -UD_Chinese-GSD zh Chinese * -UD_Chinese-GSDSimp zh Chinese +UD_Chinese-GSD zh Chinese +UD_Chinese-GSDSimp zh Chinese * UD_Classical_Chinese-Kyoto ? Classical_Chinese +UD_Coptic-Scriptorium ? Coptic UD_Croatian-SET hr Croatian UD_Czech-CAC cs Czech UD_Czech-CLTT cs Czech @@ -21,54 +24,77 @@ UD_Danish-DDT da Danish UD_Dutch-Alpino nl Dutch * UD_Dutch-LassySmall nl Dutch UD_English-Atis en English +UD_English-ESL en English UD_English-EWT en English * UD_English-GUM en English +UD_English-GUMReddit en English UD_English-LinES en English UD_English-ParTUT en English UD_Estonian-EDT et Estonian * UD_Estonian-EWT et Estonian +UD_Faroese-FarPaHC fo Faroese UD_Finnish-FTB fi Finnish UD_Finnish-TDT fi Finnish * +UD_French-FTB fr French UD_French-GSD fr French * UD_French-ParTUT fr French +UD_French-ParisStories fr French UD_French-Rhapsodie fr French UD_French-Sequoia fr French UD_Galician-CTG gl Galician UD_German-GSD de German UD_German-HDT de German * +UD_Gothic-PROIEL ? Gothic UD_Greek-GDT el Greek -UD_Hebrew-HTB he Hebrew +UD_Hebrew-HTB he Hebrew * +UD_Hebrew-IAHLTwiki he Hebrew UD_Hindi-HDTB hi Hindi +UD_Hindi_English-HIENCS ? 
Hindi_English UD_Hungarian-Szeged hu Hungarian +UD_Icelandic-GC is Icelandic UD_Icelandic-IcePaHC is Icelandic * UD_Icelandic-Modern is Icelandic UD_Indonesian-GSD id Indonesian UD_Irish-IDT ga Irish UD_Italian-ISDT it Italian * +UD_Italian-MarkIT it Italian UD_Italian-ParTUT it Italian UD_Italian-PoSTWITA it Italian UD_Italian-TWITTIRO it Italian UD_Italian-VIT it Italian -UD_Japanese-GSD ja Japanese * +UD_Japanese-BCCWJ ja Japanese * +UD_Japanese-BCCWJLUW ja Japanese +UD_Japanese-GSD ja Japanese UD_Japanese-GSDLUW ja Japanese -UD_Korean-Kaist ko Korean +UD_Korean-GSD ko Korean +UD_Korean-Kaist ko Korean * UD_Latin-ITTB la Latin * UD_Latin-LLCT la Latin UD_Latin-PROIEL la Latin UD_Latin-UDante la Latin UD_Latvian-LVTB lv Latvian -UD_Lithuanian-ALKSNIS lt Lithuanian +UD_Lithuanian-ALKSNIS lt Lithuanian * +UD_Lithuanian-HSE lt Lithuanian UD_Maltese-MUDT mt Maltese -UD_Norwegian-Bokmaal no Norwegian_Bokmål -UD_Norwegian-Nynorsk nn Norwegian_Nynorsk * -UD_Norwegian-NynorskLIA nn Norwegian_Nynorsk +UD_Marathi-UFAL mr Marathi +UD_Naija-NSC ? Naija +UD_Norwegian-Bokmaal no Norwegian +UD_Norwegian-Nynorsk nn Norwegian * +UD_Norwegian-NynorskLIA nn Norwegian +UD_Old_Church_Slavonic-PROIEL ? Old_Church_Slavonic +UD_Old_East_Slavic-Birchbark ? Old_East_Slavic +UD_Old_East_Slavic-RNC ? Old_East_Slavic +UD_Old_East_Slavic-TOROT ? Old_East_Slavic * UD_Old_French-SRCMF ? Old_French UD_Persian-PerDT fa Persian * UD_Persian-Seraji fa Persian -UD_Polish-PDB pl Polish * UD_Polish-LFG pl Polish +UD_Polish-PDB pl Polish * +UD_Pomak-Philotis ? Pomak UD_Portuguese-Bosque pt Portuguese -UD_Portuguese-GSD pt Portuguese * +UD_Portuguese-CINTIL pt Portuguese * +UD_Portuguese-GSD pt Portuguese +UD_Portuguese-PetroGold pt Portuguese UD_Romanian-Nonstandard ro Romanian * UD_Romanian-RRT ro Romanian UD_Romanian-SiMoNERo ro Romanian @@ -83,8 +109,9 @@ UD_Spanish-AnCora es Spanish * UD_Spanish-GSD es Spanish UD_Swedish-LinES sv Swedish UD_Swedish-Talbanken sv Swedish * +UD_Swedish_Sign_Language-SSLC ? Swedish_Sign_Language UD_Tamil-TTB ta Tamil -UD_Telugu-MTG te Telegu +UD_Telugu-MTG te Telugu UD_Turkish-Atis tr Turkish UD_Turkish-BOUN tr Turkish UD_Turkish-FrameNet tr Turkish @@ -92,11 +119,11 @@ UD_Turkish-IMST tr Turkish UD_Turkish-Kenet tr Turkish UD_Turkish-Penn tr Turkish * UD_Turkish-Tourism tr Turkish +UD_Turkish_German-SAGT ? 
Turkish_German UD_Ukrainian-IU uk Ukrainian UD_Urdu-UDTB ur Urdu UD_Uyghur-UDT ug Uyghur UD_Vietnamese-VTB vi Vietnamese UD_Welsh-CCG cy Welsh -#NKJP_Polish-byName pl Polish -#NKJP_Polish-byType pl Polish -#UD_Polish-PDBnoMW pl Polish \ No newline at end of file +UD_Western_Armenian-ArmTDP hy Western_Armenian +UD_Wolof-WTB wo Wolof \ No newline at end of file diff --git a/src/lambo/utils/generate_languages_txt.py b/src/lambo/utils/generate_languages_txt.py new file mode 100644 index 0000000..d3982e4 --- /dev/null +++ b/src/lambo/utils/generate_languages_txt.py @@ -0,0 +1,82 @@ +from pathlib import Path + +old_languages_txt = '' +new_ud_treebanks = '' + +codedict = {} +for line in open(old_languages_txt): + if line.startswith('#'): + continue + parts = line.strip().split(' ') + lang = parts[2] + code = parts[1] + codedict[lang] = code + +ud11path = Path(new_ud_treebanks) + +subdirs = [x for x in ud11path.iterdir() if x.is_dir()] +subdirs.sort() + +sizes = {} + +for subdir in subdirs: + hasTrain = False + hasDev = False + hasTest = False + trainfile = None + for file in subdir.iterdir(): + if file.name.endswith('train.txt'): + hasTrain = True + trainfile = file + elif file.name.endswith('test.txt'): + hasTest = True + elif file.name.endswith('dev.txt'): + hasDev = True + if (not hasTrain) or (not hasTest) or (not hasDev): + continue + treebank_name = subdir.name + language_name = treebank_name[3:].split('-')[0] + code = '@@@@@' + if language_name in codedict: + code = codedict[language_name] + if language_name not in sizes: + sizes[language_name] = {} + sizes[language_name][treebank_name] = trainfile.stat().st_size + +for language_name in sizes: + maxlen = 0 + best = None + for treebank_name in sizes[language_name]: + if sizes[language_name][treebank_name] > maxlen: + best = treebank_name + maxlen = sizes[language_name][treebank_name] + if len(sizes[language_name]) > 1: + sizes[language_name]['preferred'] = best + +print( + "# Format: <UD training corpus> <ISO 639-1 code (for OSCAR pretraining)> <Language name> <Recommended (chosen by size)>") +for subdir in subdirs: + hasTrain = False + hasDev = False + hasTest = False + trainfile = None + for file in subdir.iterdir(): + if file.name.endswith('train.txt'): + hasTrain = True + trainfile = file + elif file.name.endswith('test.txt'): + hasTest = True + elif file.name.endswith('dev.txt'): + hasDev = True + if (not hasTrain) or (not hasTest) or (not hasDev): + continue + treebank_name = subdir.name + language_name = treebank_name[3:].split('-')[0] + code = '@@@@@' + if language_name in codedict: + code = codedict[language_name] + preferred = '' + if 'preferred' in sizes[language_name] and sizes[language_name]['preferred'] == treebank_name: + preferred = ' *' + print(treebank_name + ' ' + code + ' ' + language_name + preferred) + -- GitLab From 1986518e0d084510e3aca21c88f718c7ebb28f7e Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Wed, 23 Nov 2022 09:54:16 +0100 Subject: [PATCH 02/16] Implemented GPU support. 
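All training, test and pretraining loops now accept a `device` argument: the
model and each batch are moved onto it before the forward pass, and the
example scripts select CUDA when available. A minimal sketch of the pattern
(condensed from src/lambo/learning/train.py; `model` and `dataloader` stand
for any LAMBO network and its DataLoader):

    import torch

    # Prefer a visible GPU, otherwise fall back to CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)  # parameters must sit on the same device as the inputs

    for XY in dataloader:
        # Each batch is a list of input tensors followed by the target tensor.
        XY = [xy.to(device) for xy in XY]
        Xs, Y = XY[:-1], XY[-1]
        pred = model(*Xs)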
--- src/lambo/examples/run_pretraining.py | 13 +++++---- src/lambo/learning/train.py | 38 ++++++++++++++++++--------- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/src/lambo/examples/run_pretraining.py b/src/lambo/examples/run_pretraining.py index a1a14c5..45531bc 100644 --- a/src/lambo/examples/run_pretraining.py +++ b/src/lambo/examples/run_pretraining.py @@ -6,6 +6,7 @@ import importlib_resources as resources from pathlib import Path import torch +import sys from lambo.learning.dictionary import create_dictionary from lambo.learning.model_pretraining import LamboPretrainingNetwork @@ -15,11 +16,13 @@ from lambo.learning.train import pretrain from lambo.utils.oscar import read_jsonl_to_documents, download_archive1_from_oscar if __name__=='__main__': - outpath = Path.home() / 'PATH-TO/models/pretrained/' - tmppath = Path.home() / 'PATH-TO/tmp/tmp.jsonl.gz' + outpath = sys.argv[1] #Path.home() / 'PATH-TO/models/pretrained/' + tmppath = sys.argv[2] #Path.home() / 'PATH-TO/tmp/tmp.jsonl.gz' # These need to be filled ine before running. OSCAR is avaialable on request. - OSCAR_LOGIN = '' - OSCAR_PASSWORD = '' + OSCAR_LOGIN = sys.argv[3] + OSCAR_PASSWORD = sys.argv[4] + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') languages_file_str = resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict') languages = [line.split(' ')[1] for line in languages_file_str.split('\n') if @@ -50,7 +53,7 @@ if __name__=='__main__': Xchars, Xutfs, Xmasks, Yvecs = encode_pretraining([document_train], dict, CONTEXT_LEN) _, train_dataloader, test_dataloader = prepare_dataloaders_pretraining([document_train], [document_test], CONTEXT_LEN, 32, dict) - pretrain(model, train_dataloader, test_dataloader, 1) + pretrain(model, train_dataloader, test_dataloader, 1, device) torch.save(model, outpath / ('oscar_' + language + '.pth')) with open(outpath / ('oscar_' + language + '.dict'), "w") as file1: file1.writelines([x + '\t' + str(dict[x]) + '\n' for x in dict]) diff --git a/src/lambo/learning/train.py b/src/lambo/learning/train.py index f33c88b..a2cfd2c 100644 --- a/src/lambo/learning/train.py +++ b/src/lambo/learning/train.py @@ -9,17 +9,19 @@ from lambo.learning.preprocessing_dict import utf_category_dictionary, prepare_d from lambo.utils.ud_reader import read_treebank -def train_loop(dataloader, model, optimizer): +def train_loop(dataloader, model, optimizer, device='cpu'): """ Training loop. :param dataloader: dataloader with training data :param model: model to be optimised :param optimizer: optimiser used + :param device: the device to use for computation :return: no value returned """ size = len(dataloader.dataset) for batch, XY in enumerate(dataloader): + XY = [xy.to(device) for xy in XY] Xs = XY[:-1] Y = XY[-1] pred = model(*Xs) @@ -32,12 +34,13 @@ def train_loop(dataloader, model, optimizer): print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]") -def test_loop(dataloader, model): +def test_loop(dataloader, model, device='cpu'): """ Test loop. 
:param dataloader: dataloader with test data :param model: model to be tested + :param device: the device to use for computation :return: no value returned """ num_batches = len(dataloader) @@ -46,6 +49,7 @@ def test_loop(dataloader, model): size = [0, 0, 0, 0] with torch.no_grad(): for XY in dataloader: + XY = [xy.to(device) for xy in XY] Xs = XY[:-1] Y = XY[-1] pred = model(*Xs) @@ -63,12 +67,13 @@ def test_loop(dataloader, model): f"Test Error: \n Accuracy chars: {(100 * (correct[0] / size[0])):>5f}%, tokens: {(100 * (correct[1] / size[1])):>5f}%, mwtokens: {(100 * (correct[2] / size[2])):>5f}%, sentences: {(100 * (correct[3] / size[3])):>5f}%, Avg loss: {test_loss:>8f} \n") -def test_loop_pretraining(dataloader, model): +def test_loop_pretraining(dataloader, model, device='cpu'): """ Test loop for pretraining. :param dataloader: dataloader with test data :param model: model to be tested + :param device: the device to use for computation :return: no value returned """ num_batches = len(dataloader) @@ -77,6 +82,7 @@ def test_loop_pretraining(dataloader, model): size = [0, 0] with torch.no_grad(): for XY in dataloader: + XY = [xy.to(device) for xy in XY] Xs = XY[:-1] Y = XY[-1] pred = model(*Xs) @@ -93,7 +99,7 @@ def test_loop_pretraining(dataloader, model): f"Test Error: \n Accuracy nontrivial: {(100 * (correct[0] / size[0])):>5f}%, trivial: {(100 * (correct[1] / size[1])):>5f}%, Avg loss: {test_loss:>8f} \n") -def train_new_and_save(model_name, treebank_path, save_path, epochs=10): +def train_new_and_save(model_name, treebank_path, save_path, epochs=10, device='cpu'): """ Train a new LAMBO model and save it in filesystem. @@ -101,6 +107,7 @@ def train_new_and_save(model_name, treebank_path, save_path, epochs=10): :param treebank_path: path to the treebank training data :param save_path: path to save the generated model :param epochs: number of epochs to run for (default: 10) + :param device: the device to use for computation :return: no value returned """ if model_name not in ['LAMBO-BILSTM']: @@ -120,7 +127,7 @@ def train_new_and_save(model_name, treebank_path, save_path, epochs=10): BATCH_SIZE) model = LamboNetwork(MAX_LEN, dict, len(utf_category_dictionary)) - tune(model, train_dataloader, test_dataloader, epochs) + tune(model, train_dataloader, test_dataloader, epochs, device) print("Saving") torch.save(model, save_path / (treebank_path.name + '.pth')) @@ -128,7 +135,7 @@ def train_new_and_save(model_name, treebank_path, save_path, epochs=10): file1.writelines([x + '\t' + str(dict[x]) + '\n' for x in dict]) -def train_pretrained_and_save(language, treebank_path, save_path, pretrained_path, epochs=10): +def train_pretrained_and_save(language, treebank_path, save_path, pretrained_path, epochs=10, device='cpu'): """ Train a new LAMBO model, staring from pretrained, and save it in filesystem. 
@@ -137,6 +144,7 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat :param save_path: path to save the generated model :param pretrained_path: path to the pretraining models :param epochs: number of epochs to run for (default: 10) + :param device: the device to use for computation :return: no value returned """ print("Loading pretrained model") @@ -166,7 +174,7 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat MAX_LEN, BATCH_SIZE, dict=dict) - tune(model, train_dataloader, test_dataloader, epochs) + tune(model, train_dataloader, test_dataloader, epochs, device) print("Saving") torch.save(model, save_path / (treebank_path.name + '.pth')) @@ -174,7 +182,7 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat file1.writelines([x + '\t' + str(dict[x]) + '\n' for x in dict]) -def tune(model, train_dataloader, test_dataloader, epochs): +def tune(model, train_dataloader, test_dataloader, epochs, device='cpu'): """ Tune an existing LAMBO model with the provided data @@ -182,9 +190,11 @@ def tune(model, train_dataloader, test_dataloader, epochs): :param train_dataloader: dataloader for training data :param test_dataloader: dataloader for test data :param epochs: number of epochs to run for + :param device: the device to use for computation :return: no value returned """ print("Preparing training") + model.to(device) learning_rate = 1e-3 optimizer = Adam(model.parameters(), lr=learning_rate) @@ -192,11 +202,11 @@ def tune(model, train_dataloader, test_dataloader, epochs): test_loop(test_dataloader, model) for t in range(epochs): print(f"Epoch {t + 1}\n-------------------------------") - train_loop(train_dataloader, model, optimizer) - test_loop(test_dataloader, model) + train_loop(train_dataloader, model, optimizer, device) + test_loop(test_dataloader, model, device) -def pretrain(model, train_dataloader, test_dataloader, epochs): +def pretrain(model, train_dataloader, test_dataloader, epochs, device='cpu'): """ Tune an existing LAMBO pretraining model with the provided data @@ -204,9 +214,11 @@ def pretrain(model, train_dataloader, test_dataloader, epochs): :param train_dataloader: dataloader for training data :param test_dataloader: dataloader for test data :param epochs: number of epochs to run for + :param device: the device to use for computation :return: no value returned """ print("Preparing pretraining") + model.to(device) learning_rate = 1e-3 optimizer = Adam(model.parameters(), lr=learning_rate) @@ -214,5 +226,5 @@ def pretrain(model, train_dataloader, test_dataloader, epochs): test_loop_pretraining(test_dataloader, model) for t in range(epochs): print(f"Epoch {t + 1}\n-------------------------------") - train_loop(train_dataloader, model, optimizer) - test_loop_pretraining(test_dataloader, model) + train_loop(train_dataloader, model, optimizer, device) + test_loop_pretraining(test_dataloader, model, device) -- GitLab From 9a7523c4579b0069b05bcbd4d8e5dd3b15128b14 Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Wed, 23 Nov 2022 10:01:46 +0100 Subject: [PATCH 03/16] Bug fix. 
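`sys.argv` yields plain strings, but the script later joins paths with the
`/` operator (e.g. `outpath / ('oscar_' + language + '.pth')`), which is only
defined for pathlib objects; wrapping the arguments in `Path(...)` fixes the
resulting TypeError. Illustration of the failure mode (hypothetical paths,
not taken from the repository):

    from pathlib import Path

    outpath = 'models/pretrained'        # str: 'models/pretrained' / 'x.pth' raises TypeError
    outpath = Path('models/pretrained')  # Path: supports the '/' join operator
    target = outpath / 'oscar_pl.pth'    # -> models/pretrained/oscar_pl.pth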
--- src/lambo/examples/run_pretraining.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lambo/examples/run_pretraining.py b/src/lambo/examples/run_pretraining.py index 45531bc..a6d0173 100644 --- a/src/lambo/examples/run_pretraining.py +++ b/src/lambo/examples/run_pretraining.py @@ -16,8 +16,8 @@ from lambo.learning.train import pretrain from lambo.utils.oscar import read_jsonl_to_documents, download_archive1_from_oscar if __name__=='__main__': - outpath = sys.argv[1] #Path.home() / 'PATH-TO/models/pretrained/' - tmppath = sys.argv[2] #Path.home() / 'PATH-TO/tmp/tmp.jsonl.gz' + outpath = Path(sys.argv[1]) #Path.home() / 'PATH-TO/models/pretrained/' + tmppath = Path(sys.argv[2]) #Path.home() / 'PATH-TO/tmp/tmp.jsonl.gz' # These need to be filled ine before running. OSCAR is avaialable on request. OSCAR_LOGIN = sys.argv[3] OSCAR_PASSWORD = sys.argv[4] -- GitLab From ab07051dd54bd9155a0ee8d7a74df26cf81eb8c0 Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Wed, 23 Nov 2022 10:07:01 +0100 Subject: [PATCH 04/16] Bug fix. --- src/lambo/utils/oscar.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/lambo/utils/oscar.py b/src/lambo/utils/oscar.py index bda4c66..0b588a1 100644 --- a/src/lambo/utils/oscar.py +++ b/src/lambo/utils/oscar.py @@ -4,6 +4,7 @@ Functions used to obtain multilingual corpora from `OSCAR <https://oscar-corpus. import json import random import urllib +import time from urllib.error import HTTPError from lambo.data.document import Document @@ -46,10 +47,11 @@ def download_archive1_from_oscar(language, path, OSCAR_LOGIN, OSCAR_PASSWORD, re return except HTTPError as err: error = err - if i == retry - 1: + if i == retry - 1 or err.code<500: raise error - time = ((i + 1) * (i + 1) * (i + 1) * 15) - print("[Got " + str(error.code) + ", retrying after " + str(time) + " seconds...]") + secs = ((i + 1) * (i + 1) * (i + 1) * 15) + print("[Got " + str(error.code) + ", retrying after " + str(secs) + " seconds...]") + time.sleep(secs) def read_jsonl_to_documents(fileobj, MAX_LEN=3000000): -- GitLab From 11b5840073a827351f19050c8f70a634c9e77be1 Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Wed, 23 Nov 2022 10:13:10 +0100 Subject: [PATCH 05/16] Bug fix. --- src/lambo/learning/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lambo/learning/train.py b/src/lambo/learning/train.py index a2cfd2c..dfab088 100644 --- a/src/lambo/learning/train.py +++ b/src/lambo/learning/train.py @@ -223,7 +223,7 @@ def pretrain(model, train_dataloader, test_dataloader, epochs, device='cpu'): optimizer = Adam(model.parameters(), lr=learning_rate) print("Pretraining") - test_loop_pretraining(test_dataloader, model) + test_loop_pretraining(test_dataloader, model, device) for t in range(epochs): print(f"Epoch {t + 1}\n-------------------------------") train_loop(train_dataloader, model, optimizer, device) -- GitLab From 98383a84f6e5eeb7bdec45922221fce02d904ed4 Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Wed, 23 Nov 2022 12:15:41 +0100 Subject: [PATCH 06/16] Bug fix. 
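The status-code check in the OSCAR download retry loop now reads the caught
exception through the same `error` alias that the `raise` path already uses.
For reference, the retry-with-backoff pattern the helper implements (a
condensed sketch of src/lambo/utils/oscar.py; `fetch()` stands in for the
actual urllib download call):

    import time
    from urllib.error import HTTPError

    for i in range(retry):
        try:
            fetch()
            return
        except HTTPError as err:
            error = err
            # Give up on the last attempt and on client errors (4xx);
            # only server-side errors (5xx) are worth retrying.
            if i == retry - 1 or error.code < 500:
                raise error
            secs = (i + 1) ** 3 * 15  # cubic backoff: 15 s, 120 s, 405 s, ...
            time.sleep(secs)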
--- src/lambo/utils/oscar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lambo/utils/oscar.py b/src/lambo/utils/oscar.py index 0b588a1..20ff83c 100644 --- a/src/lambo/utils/oscar.py +++ b/src/lambo/utils/oscar.py @@ -47,7 +47,7 @@ def download_archive1_from_oscar(language, path, OSCAR_LOGIN, OSCAR_PASSWORD, re return except HTTPError as err: error = err - if i == retry - 1 or err.code<500: + if i == retry - 1 or error.code<500: raise error secs = ((i + 1) * (i + 1) * (i + 1) * 15) print("[Got " + str(error.code) + ", retrying after " + str(secs) + " seconds...]") -- GitLab From ac1070811617b68fdc6d9a80771f84357de05c07 Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Wed, 23 Nov 2022 15:26:09 +0100 Subject: [PATCH 07/16] Added partial execution to pretraining. --- src/lambo/examples/run_pretraining.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/lambo/examples/run_pretraining.py b/src/lambo/examples/run_pretraining.py index a6d0173..6ff74bc 100644 --- a/src/lambo/examples/run_pretraining.py +++ b/src/lambo/examples/run_pretraining.py @@ -15,13 +15,13 @@ from lambo.learning.preprocessing_pretraining import encode_pretraining, prepare from lambo.learning.train import pretrain from lambo.utils.oscar import read_jsonl_to_documents, download_archive1_from_oscar -if __name__=='__main__': - outpath = Path(sys.argv[1]) #Path.home() / 'PATH-TO/models/pretrained/' - tmppath = Path(sys.argv[2]) #Path.home() / 'PATH-TO/tmp/tmp.jsonl.gz' +if __name__ == '__main__': + outpath = Path(sys.argv[1]) # Path.home() / 'PATH-TO/models/pretrained/' + tmppath = Path(sys.argv[2]) # Path.home() / 'PATH-TO/tmp/tmp.jsonl.gz' # These need to be filled ine before running. OSCAR is avaialable on request. OSCAR_LOGIN = sys.argv[3] OSCAR_PASSWORD = sys.argv[4] - + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') languages_file_str = resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict') @@ -32,7 +32,9 @@ if __name__=='__main__': MAX_DOCUMENTS = 100 CONTEXT_LEN = 1024 - for language in languages: + for l, language in enumerate(languages): + if l % 5 != int(sys.argv[5]): + continue if (outpath / ('oscar_' + language + '.pth')).exists(): continue print("Language: " + language) @@ -52,7 +54,8 @@ if __name__=='__main__': print(str(i + 1) + '/' + str(min(len(train_documents), MAX_DOCUMENTS))) Xchars, Xutfs, Xmasks, Yvecs = encode_pretraining([document_train], dict, CONTEXT_LEN) _, train_dataloader, test_dataloader = prepare_dataloaders_pretraining([document_train], - [document_test], CONTEXT_LEN, 32, dict) + [document_test], CONTEXT_LEN, 32, + dict) pretrain(model, train_dataloader, test_dataloader, 1, device) torch.save(model, outpath / ('oscar_' + language + '.pth')) with open(outpath / ('oscar_' + language + '.dict'), "w") as file1: -- GitLab From 7fffa5953255f62644a3614abb4faa874487343c Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Wed, 23 Nov 2022 20:01:25 +0100 Subject: [PATCH 08/16] Added handling for unknown language in OSCAR. 
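Some entries in languages.txt have no corpus in OSCAR, and requesting one
returns HTTP 404. The pretraining script now skips such languages instead of
aborting the whole run, while any other HTTP failure still propagates. The
shape of the guard added around the download (see the diff below):

    try:
        download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD)
    except HTTPError as err:
        if err.code == 404:
            print("Language unavailable in OSCAR. moving on...")
            continue  # skip to the next language in the loop
        else:
            raise err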
--- src/lambo/examples/run_pretraining.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/lambo/examples/run_pretraining.py b/src/lambo/examples/run_pretraining.py index 6ff74bc..2c724f4 100644 --- a/src/lambo/examples/run_pretraining.py +++ b/src/lambo/examples/run_pretraining.py @@ -2,6 +2,8 @@ Script from pretraining models using OSCAR corpora """ import gzip +from urllib.error import HTTPError + import importlib_resources as resources from pathlib import Path @@ -39,7 +41,14 @@ if __name__ == '__main__': continue print("Language: " + language) print("Downloading corpus...") - download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD) + try: + download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD) + except HTTPError as err: + if err.code==404: + print("Language unavailable in OSCAR. moving on...") + continue + else: + raise err with gzip.open(tmppath) as jsonfile: train_documents, test_documents = read_jsonl_to_documents(jsonfile) print("Generated " + str(len(train_documents)) + " documents.") -- GitLab From 88413d8f156e6ee961342ddfcf40b6dbed752927 Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Thu, 24 Nov 2022 08:52:56 +0100 Subject: [PATCH 09/16] Added larger models and GPU support for training. --- src/lambo/examples/run_training.py | 12 +++++++++--- src/lambo/examples/run_training_pretrained.py | 17 +++++++++++------ src/lambo/learning/train.py | 2 +- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/lambo/examples/run_training.py b/src/lambo/examples/run_training.py index 4376970..7bac54d 100644 --- a/src/lambo/examples/run_training.py +++ b/src/lambo/examples/run_training.py @@ -1,14 +1,20 @@ """ Script for training LAMBO models using UD data """ +import sys + import importlib_resources as resources from pathlib import Path +import torch from lambo.learning.train import train_new_and_save if __name__=='__main__': - treebanks = Path.home() / 'PATH-TO/ud-treebanks-v2.9/' - outpath = Path.home() / 'PATH-TO/models/' + treebanks = Path(sys.argv[1]) #Path.home() / 'PATH-TO/ud-treebanks-v2.9/' + outpath = Path(sys.argv[2]) #Path.home() / 'PATH-TO/models/' + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + # Read available languages languages_file_str = resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict') languages = [line.split(' ')[0] for line in languages_file_str.split('\n')] @@ -19,4 +25,4 @@ if __name__=='__main__': continue print(str(i) + '/' + str(len(languages)) + '========== ' + language + ' ==========') inpath = treebanks / language - train_new_and_save('LAMBO-BILSTM', inpath, outpath) + train_new_and_save('LAMBO-BILSTM', inpath, outpath, device) diff --git a/src/lambo/examples/run_training_pretrained.py b/src/lambo/examples/run_training_pretrained.py index f2dc8f2..33c3ea3 100644 --- a/src/lambo/examples/run_training_pretrained.py +++ b/src/lambo/examples/run_training_pretrained.py @@ -1,22 +1,27 @@ """ Script for training LAMBO models using UD data from pretrained """ - +import sys from pathlib import Path import importlib_resources as resources +import torch from lambo.learning.train import train_new_and_save, train_pretrained_and_save if __name__=='__main__': - treebanks = Path.home() / 'PATH-TO/ud-treebanks-v2.9/' - outpath = Path.home() / 'PATH-TO/models/full/' - pretrained_path = Path.home() / 'PATH-TO/models/pretrained/' + treebanks = Path(sys.argv[1]) #Path.home() / 
'PATH-TO/ud-treebanks-v2.9/' + outpath = Path(sys.argv[2]) #Path.home() / 'PATH-TO/models/full/' + pretrained_path = Path(sys.argv[3]) #Path.home() / 'PATH-TO/models/pretrained/' + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') languages_file_str = resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict') lines = [line.strip() for line in languages_file_str.split('\n') if not line[0] == '#'] for i, line in enumerate(lines): + if i % 5 != int(sys.argv[4]): + continue parts = line.split() model = parts[0] language = parts[1] @@ -25,6 +30,6 @@ if __name__=='__main__': print(str(i) + '/' + str(len(lines)) + '========== ' + model + ' ==========') inpath = treebanks / model if language != '?': - train_pretrained_and_save(language, inpath, outpath, pretrained_path) + train_pretrained_and_save(language, inpath, outpath, pretrained_path, device) else: - train_new_and_save('LAMBO-BILSTM', inpath, outpath) + train_new_and_save('LAMBO-BILSTM', inpath, outpath, device) diff --git a/src/lambo/learning/train.py b/src/lambo/learning/train.py index dfab088..203eafb 100644 --- a/src/lambo/learning/train.py +++ b/src/lambo/learning/train.py @@ -199,7 +199,7 @@ def tune(model, train_dataloader, test_dataloader, epochs, device='cpu'): optimizer = Adam(model.parameters(), lr=learning_rate) print("Training") - test_loop(test_dataloader, model) + test_loop(test_dataloader, model, device) for t in range(epochs): print(f"Epoch {t + 1}\n-------------------------------") train_loop(train_dataloader, model, optimizer, device) -- GitLab From 7b6b1b503b1f71a3d92ee48023d651b6b4e9616f Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Thu, 24 Nov 2022 09:03:02 +0100 Subject: [PATCH 10/16] Bug fix. --- src/lambo/examples/run_training.py | 2 +- src/lambo/examples/run_training_pretrained.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lambo/examples/run_training.py b/src/lambo/examples/run_training.py index 7bac54d..34c8b88 100644 --- a/src/lambo/examples/run_training.py +++ b/src/lambo/examples/run_training.py @@ -25,4 +25,4 @@ if __name__=='__main__': continue print(str(i) + '/' + str(len(languages)) + '========== ' + language + ' ==========') inpath = treebanks / language - train_new_and_save('LAMBO-BILSTM', inpath, outpath, device) + train_new_and_save('LAMBO-BILSTM', inpath, outpath, 10, device) diff --git a/src/lambo/examples/run_training_pretrained.py b/src/lambo/examples/run_training_pretrained.py index 33c3ea3..0e7f7ee 100644 --- a/src/lambo/examples/run_training_pretrained.py +++ b/src/lambo/examples/run_training_pretrained.py @@ -30,6 +30,6 @@ if __name__=='__main__': print(str(i) + '/' + str(len(lines)) + '========== ' + model + ' ==========') inpath = treebanks / model if language != '?': - train_pretrained_and_save(language, inpath, outpath, pretrained_path, device) + train_pretrained_and_save(language, inpath, outpath, pretrained_path, 10, device) else: - train_new_and_save('LAMBO-BILSTM', inpath, outpath, device) + train_new_and_save('LAMBO-BILSTM', inpath, outpath, 10, device) -- GitLab From b3fb984ee5bb01ba642d4199e40ab3ced7592f12 Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Thu, 24 Nov 2022 09:07:51 +0100 Subject: [PATCH 11/16] Added fallback for no pretrained model case. 
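Not every language ends up with a pretrained OSCAR model (some are
unavailable, see patch 08), so train_pretrained_and_save() now degrades
gracefully: when the expected checkpoint is missing it delegates to plain
training rather than failing inside torch.load. The check, as added below:

    file_path = pretrained_path / (pretrained_name + '.pth')
    if not file_path.exists():
        print("Pretrained model not found, falling back to training from scratch.")
        return train_new_and_save('LAMBO-BILSTM', treebank_path, save_path, epochs, device)
    pretrained_model = torch.load(file_path)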
--- src/lambo/learning/train.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lambo/learning/train.py b/src/lambo/learning/train.py index 203eafb..13884a1 100644 --- a/src/lambo/learning/train.py +++ b/src/lambo/learning/train.py @@ -149,7 +149,11 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat """ print("Loading pretrained model") pretrained_name = 'oscar_' + language - pretrained_model = torch.load(pretrained_path / (pretrained_name + '.pth')) + file_path = pretrained_path / (pretrained_name + '.pth') + if not file_path.exists(): + print("Pretrained model not found, falling back to training from scratch.") + return train_new_and_save('LAMBO-BILSTM', treebank_path, save_path, epochs, device) + pretrained_model = torch.load(file_path) dict = {} for line in open(pretrained_path / (pretrained_name + '.dict')): if line.strip() == '': -- GitLab From 386953e3263dcf26ed1cf8d87ef47fcf76bacf5a Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Thu, 24 Nov 2022 11:11:00 +0100 Subject: [PATCH 12/16] Small adjustments to job split. --- src/lambo/examples/run_training.py | 4 +++- src/lambo/examples/run_training_pretrained.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lambo/examples/run_training.py b/src/lambo/examples/run_training.py index 34c8b88..9554404 100644 --- a/src/lambo/examples/run_training.py +++ b/src/lambo/examples/run_training.py @@ -17,9 +17,11 @@ if __name__=='__main__': # Read available languages languages_file_str = resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict') - languages = [line.split(' ')[0] for line in languages_file_str.split('\n')] + languages = [line.split(' ')[0] for line in languages_file_str.split('\n') if not line[0] == '#'] for i in range(len(languages)): + if len(sys.argv)>3 and i % 5 != int(sys.argv[3]): + continue language = languages[i] if (outpath / (language + '.pth')).exists(): continue diff --git a/src/lambo/examples/run_training_pretrained.py b/src/lambo/examples/run_training_pretrained.py index 0e7f7ee..6a81eb2 100644 --- a/src/lambo/examples/run_training_pretrained.py +++ b/src/lambo/examples/run_training_pretrained.py @@ -20,7 +20,7 @@ if __name__=='__main__': lines = [line.strip() for line in languages_file_str.split('\n') if not line[0] == '#'] for i, line in enumerate(lines): - if i % 5 != int(sys.argv[4]): + if len(sys.argv)>4 and i % 5 != int(sys.argv[4]): continue parts = line.split() model = parts[0] -- GitLab From 1b0987077ec57d9f678a4b5dcfe58a5b183464e4 Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Thu, 24 Nov 2022 11:13:10 +0100 Subject: [PATCH 13/16] Number of epoch changed. 
--- src/lambo/examples/run_training.py | 2 +- src/lambo/examples/run_training_pretrained.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lambo/examples/run_training.py b/src/lambo/examples/run_training.py index 9554404..5676f36 100644 --- a/src/lambo/examples/run_training.py +++ b/src/lambo/examples/run_training.py @@ -27,4 +27,4 @@ if __name__=='__main__': continue print(str(i) + '/' + str(len(languages)) + '========== ' + language + ' ==========') inpath = treebanks / language - train_new_and_save('LAMBO-BILSTM', inpath, outpath, 10, device) + train_new_and_save('LAMBO-BILSTM', inpath, outpath, 20, device) diff --git a/src/lambo/examples/run_training_pretrained.py b/src/lambo/examples/run_training_pretrained.py index 6a81eb2..9cc6b2b 100644 --- a/src/lambo/examples/run_training_pretrained.py +++ b/src/lambo/examples/run_training_pretrained.py @@ -30,6 +30,6 @@ if __name__=='__main__': print(str(i) + '/' + str(len(lines)) + '========== ' + model + ' ==========') inpath = treebanks / model if language != '?': - train_pretrained_and_save(language, inpath, outpath, pretrained_path, 10, device) + train_pretrained_and_save(language, inpath, outpath, pretrained_path, 20, device) else: - train_new_and_save('LAMBO-BILSTM', inpath, outpath, 10, device) + train_new_and_save('LAMBO-BILSTM', inpath, outpath, 20, device) -- GitLab From 51c310b13b7cb69a11dc05b8688987b85a5a2b25 Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Thu, 24 Nov 2022 15:28:20 +0100 Subject: [PATCH 14/16] Bug fix. --- src/lambo/learning/train.py | 2 +- src/lambo/segmenter/lambo.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lambo/learning/train.py b/src/lambo/learning/train.py index 13884a1..d3f3a06 100644 --- a/src/lambo/learning/train.py +++ b/src/lambo/learning/train.py @@ -153,7 +153,7 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat if not file_path.exists(): print("Pretrained model not found, falling back to training from scratch.") return train_new_and_save('LAMBO-BILSTM', treebank_path, save_path, epochs, device) - pretrained_model = torch.load(file_path) + pretrained_model = torch.load(file_path, map_location=torch.device('cpu')) dict = {} for line in open(pretrained_path / (pretrained_name + '.dict')): if line.strip() == '': diff --git a/src/lambo/segmenter/lambo.py b/src/lambo/segmenter/lambo.py index e2fbc53..a5e46d8 100644 --- a/src/lambo/segmenter/lambo.py +++ b/src/lambo/segmenter/lambo.py @@ -38,7 +38,7 @@ class Lambo(): model_name = Lambo.getDefaultModel(provided_name) dict_path, model_path = download_model(model_name) dict = Lambo.read_dict(dict_path) - model = torch.load(model_path) + model = torch.load(model_path, map_location=torch.device('cpu')) return cls(model, dict) @staticmethod @@ -75,7 +75,7 @@ class Lambo(): :param model_name: model name :return: """ - model = torch.load(model_path / (model_name + '.pth')) + model = torch.load(model_path / (model_name + '.pth'), map_location=torch.device('cpu')) dict = Lambo.read_dict(model_path / (model_name + '.dict')) return cls(model, dict) -- GitLab From bea72f566d8a0a432df994c542017c5c6f37187a Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Thu, 24 Nov 2022 21:38:08 +0100 Subject: [PATCH 15/16] Reduced window size. 
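Two changes: the training context window drops from 1024 back to 256
characters, and evaluation no longer aborts when conll18_ud_eval rejects a
prediction: a UDError now yields zeroed scores for the affected treebank
instead. The guard, condensed from src/lambo/evaluation/evaluate.py below:

    try:
        conll_result = evaluate(gold, pred)
        for category in ['Tokens', 'Words', 'Sentences']:
            result[category] = {'F1': conll_result[category].f1,
                                'precision': conll_result[category].precision,
                                'recall': conll_result[category].recall}
    except UDError:
        for category in ['Tokens', 'Words', 'Sentences']:
            result[category] = {'F1': 0.0, 'precision': 0.0, 'recall': 0.0}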
--- src/lambo/evaluation/evaluate.py | 15 ++++++++++----- src/lambo/learning/train.py | 4 ++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/lambo/evaluation/evaluate.py b/src/lambo/evaluation/evaluate.py index f4bb70c..b86a62f 100644 --- a/src/lambo/evaluation/evaluate.py +++ b/src/lambo/evaluation/evaluate.py @@ -1,4 +1,4 @@ -from lambo.evaluation.conll18_ud_eval import load_conllu, evaluate +from lambo.evaluation.conll18_ud_eval import load_conllu, evaluate, UDError from lambo.utils.printer import print_document_to_conll @@ -19,8 +19,13 @@ def evaluate_segmenter(segmenter, test_text, gold_path, tmp_path): with open(gold_path) as fGold: pred = load_conllu(fPred) gold = load_conllu(fGold) - conll_result = evaluate(gold, pred) - for category in ['Tokens', 'Words', 'Sentences']: - result[category] = {'F1': conll_result[category].f1, 'precision': conll_result[category].precision, - 'recall': conll_result[category].recall} + try: + conll_result = evaluate(gold, pred) + for category in ['Tokens', 'Words', 'Sentences']: + result[category] = {'F1': conll_result[category].f1, 'precision': conll_result[category].precision, + 'recall': conll_result[category].recall} + except UDError as e: + for category in ['Tokens', 'Words', 'Sentences']: + result[category] = {'F1': 0.0, 'precision': 0.0, + 'recall': 0.0} return result diff --git a/src/lambo/learning/train.py b/src/lambo/learning/train.py index d3f3a06..a0efdfc 100644 --- a/src/lambo/learning/train.py +++ b/src/lambo/learning/train.py @@ -121,7 +121,7 @@ def train_new_and_save(model_name, treebank_path, save_path, epochs=10, device=' BATCH_SIZE = 32 print("Initiating the model.") - MAX_LEN = 1024 + MAX_LEN = 256 dict, train_dataloader, test_dataloader = prepare_dataloaders_withdict([train_doc, dev_doc], [test_doc], MAX_LEN, BATCH_SIZE) @@ -168,7 +168,7 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat train_doc, dev_doc, test_doc = read_treebank(treebank_path, True) print("Initiating the model.") - MAX_LEN = 1024 + MAX_LEN = 256 model = LamboNetwork(MAX_LEN, dict, len(utf_category_dictionary), pretrained=pretrained_model) print("Preparing data") -- GitLab From 7f1ed3e26736f141f27c5606be88c90fda5e5b64 Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Fri, 25 Nov 2022 13:15:14 +0100 Subject: [PATCH 16/16] Finalised the UD 2.11 migration. --- src/lambo/utils/download.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lambo/utils/download.py b/src/lambo/utils/download.py index e4fcc0f..0aef6df 100644 --- a/src/lambo/utils/download.py +++ b/src/lambo/utils/download.py @@ -16,8 +16,8 @@ logger = logging.getLogger(__name__) # The types of models available and their subdirectories in the model repository TYPE_TO_PATH = { - "LAMBO_no_pretraining": "vanilla", - "LAMBO": "full"} + "LAMBO_no_pretraining": "vanilla211-s", + "LAMBO": "full211-s"} # The adress of the remote repository _URL = "http://home.ipipan.waw.pl/p.przybyla/lambo/{type}/{treebank}.{extension}" -- GitLab