diff --git a/src/lambo/evaluation/evaluate.py b/src/lambo/evaluation/evaluate.py
index f4bb70c1b2ab071342d8667e99b0ce0b14b47294..b86a62f5449ed1399029a25fae4d6583e3427aee 100644
--- a/src/lambo/evaluation/evaluate.py
+++ b/src/lambo/evaluation/evaluate.py
@@ -1,4 +1,4 @@
-from lambo.evaluation.conll18_ud_eval import load_conllu, evaluate
+from lambo.evaluation.conll18_ud_eval import load_conllu, evaluate, UDError
 
 from lambo.utils.printer import print_document_to_conll
 
@@ -19,8 +19,13 @@ def evaluate_segmenter(segmenter, test_text, gold_path, tmp_path):
         with open(gold_path) as fGold:
             pred = load_conllu(fPred)
             gold = load_conllu(fGold)
-            conll_result = evaluate(gold, pred)
-            for category in ['Tokens', 'Words', 'Sentences']:
-                result[category] = {'F1': conll_result[category].f1, 'precision': conll_result[category].precision,
-                                    'recall': conll_result[category].recall}
+            try:
+                conll_result = evaluate(gold, pred)
+                for category in ['Tokens', 'Words', 'Sentences']:
+                    result[category] = {'F1': conll_result[category].f1, 'precision': conll_result[category].precision,
+                                        'recall': conll_result[category].recall}
+            except UDError as e:
+                for category in ['Tokens', 'Words', 'Sentences']:
+                    result[category] = {'F1': 0.0, 'precision': 0.0,
+                                        'recall': 0.0}
     return result
diff --git a/src/lambo/examples/run_pretraining.py b/src/lambo/examples/run_pretraining.py
index ddcb05bcff000ccd9ec4a1bb09c14510a7acb672..2c724f408b35a8d8f964f4087fdf00fd91ead4ea 100644
--- a/src/lambo/examples/run_pretraining.py
+++ b/src/lambo/examples/run_pretraining.py
@@ -2,10 +2,13 @@
 Script from pretraining models using OSCAR corpora
 """
 import gzip
+from urllib.error import HTTPError
+
 import importlib_resources as resources
 from pathlib import Path
 
 import torch
+import sys
 
 from lambo.learning.dictionary import create_dictionary
 from lambo.learning.model_pretraining import LamboPretrainingNetwork
@@ -14,27 +17,38 @@ from lambo.learning.preprocessing_pretraining import encode_pretraining, prepare
 from lambo.learning.train import pretrain
 from lambo.utils.oscar import read_jsonl_to_documents, download_archive1_from_oscar
 
-if __name__=='__main__':
-    outpath = Path.home() / 'PATH-TO/models/pretrained/'
-    tmppath = Path.home() / 'PATH-TO/tmp/tmp.jsonl.gz'
+if __name__ == '__main__':
+    outpath = Path(sys.argv[1])  # Path.home() / 'PATH-TO/models/pretrained/'
+    tmppath = Path(sys.argv[2])  # Path.home() / 'PATH-TO/tmp/tmp.jsonl.gz'
 
     # These need to be filled ine before running. OSCAR is avaialable on request.
-    OSCAR_LOGIN = ''
-    OSCAR_PASSWORD = ''
+    OSCAR_LOGIN = sys.argv[3]
+    OSCAR_PASSWORD = sys.argv[4]
+
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
     languages_file_str = resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict')
     languages = [line.split(' ')[1] for line in languages_file_str.split('\n') if line[0] != '#' and line.split(' ')[1] != '?']
     languages = list(dict.fromkeys(languages))
 
-    MAX_DOCUMENTS = 10
-    CONTEXT_LEN = 256
+    MAX_DOCUMENTS = 100
+    CONTEXT_LEN = 1024
 
-    for language in languages:
+    for l, language in enumerate(languages):
+        if l % 5 != int(sys.argv[5]):
+            continue
         if (outpath / ('oscar_' + language + '.pth')).exists():
            continue
         print("Language: " + language)
         print("Downloading corpus...")
-        download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD)
+        try:
+            download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD)
+        except HTTPError as err:
+            if err.code==404:
+                print("Language unavailable in OSCAR. moving on...")
+                continue
+            else:
+                raise err
         with gzip.open(tmppath) as jsonfile:
             train_documents, test_documents = read_jsonl_to_documents(jsonfile)
         print("Generated " + str(len(train_documents)) + " documents.")
@@ -49,8 +63,9 @@ if __name__=='__main__':
             print(str(i + 1) + '/' + str(min(len(train_documents), MAX_DOCUMENTS)))
             Xchars, Xutfs, Xmasks, Yvecs = encode_pretraining([document_train], dict, CONTEXT_LEN)
             _, train_dataloader, test_dataloader = prepare_dataloaders_pretraining([document_train],
-                                                                                   [document_test], CONTEXT_LEN, 32, dict)
-            pretrain(model, train_dataloader, test_dataloader, 1)
+                                                                                   [document_test], CONTEXT_LEN, 32,
+                                                                                   dict)
+            pretrain(model, train_dataloader, test_dataloader, 1, device)
         torch.save(model, outpath / ('oscar_' + language + '.pth'))
         with open(outpath / ('oscar_' + language + '.dict'), "w") as file1:
             file1.writelines([x + '\t' + str(dict[x]) + '\n' for x in dict])
diff --git a/src/lambo/examples/run_training.py b/src/lambo/examples/run_training.py
index 437697088ee16c13d73084f16547208394f8f7c8..5676f365d2049fce86349966c1882fd59d3c6f6e 100644
--- a/src/lambo/examples/run_training.py
+++ b/src/lambo/examples/run_training.py
@@ -1,22 +1,30 @@
 """
 Script for training LAMBO models using UD data
 """
+import sys
+
 import importlib_resources as resources
 from pathlib import Path
+import torch
 
 from lambo.learning.train import train_new_and_save
 
 if __name__=='__main__':
-    treebanks = Path.home() / 'PATH-TO/ud-treebanks-v2.9/'
-    outpath = Path.home() / 'PATH-TO/models/'
+    treebanks = Path(sys.argv[1]) #Path.home() / 'PATH-TO/ud-treebanks-v2.9/'
+    outpath = Path(sys.argv[2]) #Path.home() / 'PATH-TO/models/'
+
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+    # Read available languages
     languages_file_str = resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict')
-    languages = [line.split(' ')[0] for line in languages_file_str.split('\n')]
+    languages = [line.split(' ')[0] for line in languages_file_str.split('\n') if not line[0] == '#']
 
     for i in range(len(languages)):
+        if len(sys.argv)>3 and i % 5 != int(sys.argv[3]):
+            continue
         language = languages[i]
         if (outpath / (language + '.pth')).exists():
             continue
         print(str(i) + '/' + str(len(languages)) + '========== ' + language + ' ==========')
         inpath = treebanks / language
-        train_new_and_save('LAMBO-BILSTM', inpath, outpath)
+        train_new_and_save('LAMBO-BILSTM', inpath, outpath, 20, device)
diff --git a/src/lambo/examples/run_training_pretrained.py b/src/lambo/examples/run_training_pretrained.py
index f2dc8f2d5723cf4eaafc971fdee1c83c5559ade6..9cc6b2bb0be8df0aafe09032d4b503458d5b2bc3 100644
--- a/src/lambo/examples/run_training_pretrained.py
+++ b/src/lambo/examples/run_training_pretrained.py
@@ -1,22 +1,27 @@
 """
 Script for training LAMBO models using UD data from pretrained
 """
-
+import sys
 from pathlib import Path
 
 import importlib_resources as resources
+import torch
 
 from lambo.learning.train import train_new_and_save, train_pretrained_and_save
 
 if __name__=='__main__':
-    treebanks = Path.home() / 'PATH-TO/ud-treebanks-v2.9/'
-    outpath = Path.home() / 'PATH-TO/models/full/'
-    pretrained_path = Path.home() / 'PATH-TO/models/pretrained/'
+    treebanks = Path(sys.argv[1]) #Path.home() / 'PATH-TO/ud-treebanks-v2.9/'
+    outpath = Path(sys.argv[2]) #Path.home() / 'PATH-TO/models/full/'
+    pretrained_path = Path(sys.argv[3]) #Path.home() / 'PATH-TO/models/pretrained/'
+
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
     languages_file_str = resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict')
     lines = [line.strip() for line in languages_file_str.split('\n') if not line[0] == '#']
 
     for i, line in enumerate(lines):
+        if len(sys.argv)>4 and i % 5 != int(sys.argv[4]):
+            continue
         parts = line.split()
         model = parts[0]
         language = parts[1]
@@ -25,6 +30,6 @@ if __name__=='__main__':
         print(str(i) + '/' + str(len(lines)) + '========== ' + model + ' ==========')
         inpath = treebanks / model
         if language != '?':
-            train_pretrained_and_save(language, inpath, outpath, pretrained_path)
+            train_pretrained_and_save(language, inpath, outpath, pretrained_path, 20, device)
         else:
-            train_new_and_save('LAMBO-BILSTM', inpath, outpath)
+            train_new_and_save('LAMBO-BILSTM', inpath, outpath, 20, device)
diff --git a/src/lambo/learning/model.py b/src/lambo/learning/model.py
index 6383d3e18935d882717b035f3aa68b58781bd811..9e69067e5d6f234ea9766b76488284c2faa411a6 100644
--- a/src/lambo/learning/model.py
+++ b/src/lambo/learning/model.py
@@ -7,8 +7,8 @@ class LamboNetwork(Module):
     """
     LAMBO neural network model. The network has four layers:
 
-    * embedding layers for characters, representing each as a 32-long vector,
-    * bidirectional LSTM layer, taking a concatenation of (1) character embedding and (2) one-hot UTF category vector as input and outputting 2*128-long state vector,
+    * embedding layers for characters, representing each as a 32-long vector (or 64-long),
+    * bidirectional LSTM layer, taking a concatenation of (1) character embedding and (2) one-hot UTF category vector as input and outputting 2*128-long state vector (or 2*256),
     * dense linear layer, converting LSTM state vectors to class scores
     * softmax layer, computing probability of eight events for any character:
 
@@ -34,8 +34,8 @@ class LamboNetwork(Module):
             self.embedding_layer = copy.deepcopy(pretrained.embedding_layer)
             self.lstm_layer = copy.deepcopy(pretrained.lstm_layer)
         else:
-            self.embedding_layer = Embedding(len(dict), 32, dict['<PAD>'])
-            self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim + utf_categories_num, hidden_size=128,
+            self.embedding_layer = Embedding(len(dict), 64, dict['<PAD>'])
+            self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim + utf_categories_num, hidden_size=256,
                                    batch_first=True, bidirectional=True)
         self.linear_layer = Linear(self.lstm_layer.hidden_size * 2, 8)
diff --git a/src/lambo/learning/model_pretraining.py b/src/lambo/learning/model_pretraining.py
index e37d08b31f17302371445a02a5008f5127ef3514..fa07693ac27e8bb36868fdafad8eae472e89d323 100644
--- a/src/lambo/learning/model_pretraining.py
+++ b/src/lambo/learning/model_pretraining.py
@@ -17,8 +17,8 @@ class LamboPretrainingNetwork(Module):
         """
         super(LamboPretrainingNetwork, self).__init__()
         self.max_len = max_len
-        self.embedding_layer = Embedding(len(dict), 32, dict['<PAD>'])
-        self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim + utf_categories_num, hidden_size=128,
+        self.embedding_layer = Embedding(len(dict), 64, dict['<PAD>'])
+        self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim + utf_categories_num, hidden_size=256,
                                batch_first=True, bidirectional=True)
         self.linear_layer = Linear(self.lstm_layer.hidden_size * 2, len(dict))
diff --git a/src/lambo/learning/train.py b/src/lambo/learning/train.py
index c45c5004ec991e7ac99d701151910e7ef99c2f5b..a0efdfcda1515a67c67d6e19b5922fd816694c43 100644
--- a/src/lambo/learning/train.py
+++ b/src/lambo/learning/train.py
@@ -9,17 +9,19 @@ from lambo.learning.preprocessing_dict import utf_category_dictionary, prepare_d
 from lambo.utils.ud_reader import read_treebank
 
 
-def train_loop(dataloader, model, optimizer):
+def train_loop(dataloader, model, optimizer, device='cpu'):
     """
     Training loop.
 
     :param dataloader: dataloader with training data
     :param model: model to be optimised
     :param optimizer: optimiser used
+    :param device: the device to use for computation
     :return: no value returned
     """
     size = len(dataloader.dataset)
     for batch, XY in enumerate(dataloader):
+        XY = [xy.to(device) for xy in XY]
         Xs = XY[:-1]
         Y = XY[-1]
         pred = model(*Xs)
@@ -32,12 +34,13 @@ def train_loop(dataloader, model, optimizer):
         print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
 
 
-def test_loop(dataloader, model):
+def test_loop(dataloader, model, device='cpu'):
     """
     Test loop.
 
     :param dataloader: dataloader with test data
     :param model: model to be tested
+    :param device: the device to use for computation
     :return: no value returned
     """
     num_batches = len(dataloader)
@@ -46,6 +49,7 @@ def test_loop(dataloader, model):
     size = [0, 0, 0, 0]
     with torch.no_grad():
         for XY in dataloader:
+            XY = [xy.to(device) for xy in XY]
             Xs = XY[:-1]
             Y = XY[-1]
             pred = model(*Xs)
@@ -63,12 +67,13 @@ def test_loop(dataloader, model):
         f"Test Error: \n Accuracy chars: {(100 * (correct[0] / size[0])):>5f}%, tokens: {(100 * (correct[1] / size[1])):>5f}%, mwtokens: {(100 * (correct[2] / size[2])):>5f}%, sentences: {(100 * (correct[3] / size[3])):>5f}%, Avg loss: {test_loss:>8f} \n")
 
 
-def test_loop_pretraining(dataloader, model):
+def test_loop_pretraining(dataloader, model, device='cpu'):
     """
     Test loop for pretraining.
 
     :param dataloader: dataloader with test data
     :param model: model to be tested
+    :param device: the device to use for computation
     :return: no value returned
     """
     num_batches = len(dataloader)
@@ -77,6 +82,7 @@ def test_loop_pretraining(dataloader, model):
     size = [0, 0]
     with torch.no_grad():
         for XY in dataloader:
+            XY = [xy.to(device) for xy in XY]
             Xs = XY[:-1]
             Y = XY[-1]
             pred = model(*Xs)
@@ -93,7 +99,7 @@ def test_loop_pretraining(dataloader, model):
         f"Test Error: \n Accuracy nontrivial: {(100 * (correct[0] / size[0])):>5f}%, trivial: {(100 * (correct[1] / size[1])):>5f}%, Avg loss: {test_loss:>8f} \n")
 
 
-def train_new_and_save(model_name, treebank_path, save_path, epochs=10):
+def train_new_and_save(model_name, treebank_path, save_path, epochs=10, device='cpu'):
     """
     Train a new LAMBO model and save it in filesystem.
 
@@ -101,6 +107,7 @@ def train_new_and_save(model_name, treebank_path, save_path, epochs=10):
     :param treebank_path: path to the treebank training data
     :param save_path: path to save the generated model
     :param epochs: number of epochs to run for (default: 10)
+    :param device: the device to use for computation
     :return: no value returned
     """
     if model_name not in ['LAMBO-BILSTM']:
@@ -114,13 +121,13 @@ def train_new_and_save(model_name, treebank_path, save_path, epochs=10):
     BATCH_SIZE = 32
 
     print("Initiating the model.")
-    MAX_LEN = 100
+    MAX_LEN = 256
     dict, train_dataloader, test_dataloader = prepare_dataloaders_withdict([train_doc, dev_doc], [test_doc],
                                                                            MAX_LEN,
                                                                            BATCH_SIZE)
     model = LamboNetwork(MAX_LEN, dict, len(utf_category_dictionary))
 
-    tune(model, train_dataloader, test_dataloader, epochs)
+    tune(model, train_dataloader, test_dataloader, epochs, device)
 
     print("Saving")
     torch.save(model, save_path / (treebank_path.name + '.pth'))
@@ -128,7 +135,7 @@ def train_new_and_save(model_name, treebank_path, save_path, epochs=10):
         file1.writelines([x + '\t' + str(dict[x]) + '\n' for x in dict])
 
 
-def train_pretrained_and_save(language, treebank_path, save_path, pretrained_path, epochs=10):
+def train_pretrained_and_save(language, treebank_path, save_path, pretrained_path, epochs=10, device='cpu'):
     """
     Train a new LAMBO model, staring from pretrained, and save it in filesystem.
 
@@ -137,11 +144,16 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat
     :param save_path: path to save the generated model
     :param pretrained_path: path to the pretraining models
     :param epochs: number of epochs to run for (default: 10)
+    :param device: the device to use for computation
     :return: no value returned
     """
     print("Loading pretrained model")
     pretrained_name = 'oscar_' + language
-    pretrained_model = torch.load(pretrained_path / (pretrained_name + '.pth'))
+    file_path = pretrained_path / (pretrained_name + '.pth')
+    if not file_path.exists():
+        print("Pretrained model not found, falling back to training from scratch.")
+        return train_new_and_save('LAMBO-BILSTM', treebank_path, save_path, epochs, device)
+    pretrained_model = torch.load(file_path, map_location=torch.device('cpu'))
     dict = {}
     for line in open(pretrained_path / (pretrained_name + '.dict')):
         if line.strip() == '':
@@ -156,7 +168,7 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat
     train_doc, dev_doc, test_doc = read_treebank(treebank_path, True)
 
     print("Initiating the model.")
-    MAX_LEN = 100
+    MAX_LEN = 256
     model = LamboNetwork(MAX_LEN, dict, len(utf_category_dictionary), pretrained=pretrained_model)
 
     print("Preparing data")
@@ -166,7 +178,7 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat
                                                                            MAX_LEN,
                                                                            BATCH_SIZE,
                                                                            dict=dict)
-    tune(model, train_dataloader, test_dataloader, epochs)
+    tune(model, train_dataloader, test_dataloader, epochs, device)
 
     print("Saving")
     torch.save(model, save_path / (treebank_path.name + '.pth'))
@@ -174,7 +186,7 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat
         file1.writelines([x + '\t' + str(dict[x]) + '\n' for x in dict])
 
 
-def tune(model, train_dataloader, test_dataloader, epochs):
+def tune(model, train_dataloader, test_dataloader, epochs, device='cpu'):
     """
     Tune an existing LAMBO model with the provided data
 
@@ -182,21 +194,23 @@
     :param model: model to be tuned
     :param train_dataloader: dataloader for training data
     :param test_dataloader: dataloader for test data
     :param epochs: number of epochs to run for
+    :param device: the device to use for computation
     :return: no value returned
     """
     print("Preparing training")
+    model.to(device)
     learning_rate = 1e-3
     optimizer = Adam(model.parameters(), lr=learning_rate)
 
     print("Training")
-    test_loop(test_dataloader, model)
+    test_loop(test_dataloader, model, device)
     for t in range(epochs):
         print(f"Epoch {t + 1}\n-------------------------------")
-        train_loop(train_dataloader, model, optimizer)
-        test_loop(test_dataloader, model)
+        train_loop(train_dataloader, model, optimizer, device)
+        test_loop(test_dataloader, model, device)
 
 
-def pretrain(model, train_dataloader, test_dataloader, epochs):
+def pretrain(model, train_dataloader, test_dataloader, epochs, device='cpu'):
     """
     Tune an existing LAMBO pretraining model with the provided data
 
@@ -204,15 +218,17 @@
     :param train_dataloader: dataloader for training data
     :param test_dataloader: dataloader for test data
     :param epochs: number of epochs to run for
+    :param device: the device to use for computation
     :return: no value returned
     """
     print("Preparing pretraining")
+    model.to(device)
     learning_rate = 1e-3
     optimizer = Adam(model.parameters(), lr=learning_rate)
 
     print("Pretraining")
-    test_loop_pretraining(test_dataloader, model)
+    test_loop_pretraining(test_dataloader, model, device)
     for t in range(epochs):
         print(f"Epoch {t + 1}\n-------------------------------")
-        train_loop(train_dataloader, model, optimizer)
-        test_loop_pretraining(test_dataloader, model)
+        train_loop(train_dataloader, model, optimizer, device)
+        test_loop_pretraining(test_dataloader, model, device)
diff --git a/src/lambo/resources/languages.txt b/src/lambo/resources/languages.txt
index ee5981c581a089db20f31bff248fc74cdfd23460..9e213ff758500adbb9cc64f1850feed0dc739dc6 100644
--- a/src/lambo/resources/languages.txt
+++ b/src/lambo/resources/languages.txt
@@ -1,17 +1,20 @@
 # Format: <UD training corpus> <ISO 639-1 code (for OSCAR pretraining)> <Language name> <Recommended (chosen by size)>
 UD_Afrikaans-AfriBooms af Afrikaans
-UD_Ancient_Greek-Perseus ? Ancient_Greek
 UD_Ancient_Greek-PROIEL ? Ancient_Greek *
-UD_Arabic-PADT ar Arabic
-UD_Armenian-ArmTDP hy Armenian
-UD_Western_Armenian-ArmTDP hy Western_Armenian
+UD_Ancient_Greek-Perseus ? Ancient_Greek
+UD_Ancient_Hebrew-PTNK ? Ancient_Hebrew
+UD_Arabic-NYUAD ar Arabic
+UD_Arabic-PADT ar Arabic *
+UD_Armenian-ArmTDP hy Armenian *
+UD_Armenian-BSUT hy Armenian
 UD_Basque-BDT eu Basque
 UD_Belarusian-HSE be Belarusian
 UD_Bulgarian-BTB bg Bulgarian
 UD_Catalan-AnCora ca Catalan
-UD_Chinese-GSD zh Chinese *
-UD_Chinese-GSDSimp zh Chinese
+UD_Chinese-GSD zh Chinese
+UD_Chinese-GSDSimp zh Chinese *
 UD_Classical_Chinese-Kyoto ? Classical_Chinese
+UD_Coptic-Scriptorium ? Coptic
 UD_Croatian-SET hr Croatian
 UD_Czech-CAC cs Czech
 UD_Czech-CLTT cs Czech
@@ -21,54 +24,77 @@ UD_Danish-DDT da Danish
 UD_Dutch-Alpino nl Dutch *
 UD_Dutch-LassySmall nl Dutch
 UD_English-Atis en English
+UD_English-ESL en English
 UD_English-EWT en English *
 UD_English-GUM en English
+UD_English-GUMReddit en English
 UD_English-LinES en English
 UD_English-ParTUT en English
 UD_Estonian-EDT et Estonian *
 UD_Estonian-EWT et Estonian
+UD_Faroese-FarPaHC fo Faroese
 UD_Finnish-FTB fi Finnish
 UD_Finnish-TDT fi Finnish *
+UD_French-FTB fr French
 UD_French-GSD fr French *
 UD_French-ParTUT fr French
+UD_French-ParisStories fr French
 UD_French-Rhapsodie fr French
 UD_French-Sequoia fr French
 UD_Galician-CTG gl Galician
 UD_German-GSD de German
 UD_German-HDT de German *
+UD_Gothic-PROIEL ? Gothic
 UD_Greek-GDT el Greek
-UD_Hebrew-HTB he Hebrew
+UD_Hebrew-HTB he Hebrew *
+UD_Hebrew-IAHLTwiki he Hebrew
 UD_Hindi-HDTB hi Hindi
+UD_Hindi_English-HIENCS ? Hindi_English
 UD_Hungarian-Szeged hu Hungarian
+UD_Icelandic-GC is Icelandic
 UD_Icelandic-IcePaHC is Icelandic *
 UD_Icelandic-Modern is Icelandic
 UD_Indonesian-GSD id Indonesian
 UD_Irish-IDT ga Irish
 UD_Italian-ISDT it Italian *
+UD_Italian-MarkIT it Italian
 UD_Italian-ParTUT it Italian
 UD_Italian-PoSTWITA it Italian
 UD_Italian-TWITTIRO it Italian
 UD_Italian-VIT it Italian
-UD_Japanese-GSD ja Japanese *
+UD_Japanese-BCCWJ ja Japanese *
+UD_Japanese-BCCWJLUW ja Japanese
+UD_Japanese-GSD ja Japanese
 UD_Japanese-GSDLUW ja Japanese
-UD_Korean-Kaist ko Korean
+UD_Korean-GSD ko Korean
+UD_Korean-Kaist ko Korean *
 UD_Latin-ITTB la Latin *
 UD_Latin-LLCT la Latin
 UD_Latin-PROIEL la Latin
 UD_Latin-UDante la Latin
 UD_Latvian-LVTB lv Latvian
-UD_Lithuanian-ALKSNIS lt Lithuanian
+UD_Lithuanian-ALKSNIS lt Lithuanian *
+UD_Lithuanian-HSE lt Lithuanian
 UD_Maltese-MUDT mt Maltese
-UD_Norwegian-Bokmaal no Norwegian_Bokmål
-UD_Norwegian-Nynorsk nn Norwegian_Nynorsk *
-UD_Norwegian-NynorskLIA nn Norwegian_Nynorsk
+UD_Marathi-UFAL mr Marathi
+UD_Naija-NSC ? Naija
+UD_Norwegian-Bokmaal no Norwegian
+UD_Norwegian-Nynorsk nn Norwegian *
+UD_Norwegian-NynorskLIA nn Norwegian
+UD_Old_Church_Slavonic-PROIEL ? Old_Church_Slavonic
+UD_Old_East_Slavic-Birchbark ? Old_East_Slavic
+UD_Old_East_Slavic-RNC ? Old_East_Slavic
+UD_Old_East_Slavic-TOROT ? Old_East_Slavic *
 UD_Old_French-SRCMF ? Old_French
 UD_Persian-PerDT fa Persian *
 UD_Persian-Seraji fa Persian
-UD_Polish-PDB pl Polish *
 UD_Polish-LFG pl Polish
+UD_Polish-PDB pl Polish *
+UD_Pomak-Philotis ? Pomak
 UD_Portuguese-Bosque pt Portuguese
-UD_Portuguese-GSD pt Portuguese *
+UD_Portuguese-CINTIL pt Portuguese *
+UD_Portuguese-GSD pt Portuguese
+UD_Portuguese-PetroGold pt Portuguese
 UD_Romanian-Nonstandard ro Romanian *
 UD_Romanian-RRT ro Romanian
 UD_Romanian-SiMoNERo ro Romanian
@@ -83,8 +109,9 @@ UD_Spanish-AnCora es Spanish *
 UD_Spanish-GSD es Spanish
 UD_Swedish-LinES sv Swedish
 UD_Swedish-Talbanken sv Swedish *
+UD_Swedish_Sign_Language-SSLC ? Swedish_Sign_Language
 UD_Tamil-TTB ta Tamil
-UD_Telugu-MTG te Telegu
+UD_Telugu-MTG te Telugu
 UD_Turkish-Atis tr Turkish
 UD_Turkish-BOUN tr Turkish
 UD_Turkish-FrameNet tr Turkish
@@ -92,11 +119,11 @@ UD_Turkish-IMST tr Turkish
 UD_Turkish-Kenet tr Turkish
 UD_Turkish-Penn tr Turkish *
 UD_Turkish-Tourism tr Turkish
+UD_Turkish_German-SAGT ? Turkish_German
 UD_Ukrainian-IU uk Ukrainian
 UD_Urdu-UDTB ur Urdu
 UD_Uyghur-UDT ug Uyghur
 UD_Vietnamese-VTB vi Vietnamese
 UD_Welsh-CCG cy Welsh
-#NKJP_Polish-byName pl Polish
-#NKJP_Polish-byType pl Polish
-#UD_Polish-PDBnoMW pl Polish
\ No newline at end of file
+UD_Western_Armenian-ArmTDP hy Western_Armenian
+UD_Wolof-WTB wo Wolof
\ No newline at end of file
diff --git a/src/lambo/segmenter/lambo.py b/src/lambo/segmenter/lambo.py
index e2fbc538b5438324d0bf20726b77bfc5485ff8bd..a5e46d80fb95182f972d877d1ed4e903aacc79cd 100644
--- a/src/lambo/segmenter/lambo.py
+++ b/src/lambo/segmenter/lambo.py
@@ -38,7 +38,7 @@ class Lambo():
             model_name = Lambo.getDefaultModel(provided_name)
         dict_path, model_path = download_model(model_name)
         dict = Lambo.read_dict(dict_path)
-        model = torch.load(model_path)
+        model = torch.load(model_path, map_location=torch.device('cpu'))
         return cls(model, dict)
 
     @staticmethod
@@ -75,7 +75,7 @@ class Lambo():
         :param model_name: model name
         :return:
         """
-        model = torch.load(model_path / (model_name + '.pth'))
+        model = torch.load(model_path / (model_name + '.pth'), map_location=torch.device('cpu'))
         dict = Lambo.read_dict(model_path / (model_name + '.dict'))
         return cls(model, dict)
diff --git a/src/lambo/utils/download.py b/src/lambo/utils/download.py
index e4fcc0f8f17f0d8763f7ce9cbec6fc1a1487a447..0aef6dfb767ec19e3711e32cfcab7031a604a626 100644
--- a/src/lambo/utils/download.py
+++ b/src/lambo/utils/download.py
@@ -16,8 +16,8 @@ logger = logging.getLogger(__name__)
 
 # The types of models available and their subdirectories in the model repository
 TYPE_TO_PATH = {
-    "LAMBO_no_pretraining": "vanilla",
-    "LAMBO": "full"}
+    "LAMBO_no_pretraining": "vanilla211-s",
+    "LAMBO": "full211-s"}
 
 # The adress of the remote repository
 _URL = "http://home.ipipan.waw.pl/p.przybyla/lambo/{type}/{treebank}.{extension}"
diff --git a/src/lambo/utils/generate_languages_txt.py b/src/lambo/utils/generate_languages_txt.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3982e4ca1af327c65c264a5138f19ca8a403840
--- /dev/null
+++ b/src/lambo/utils/generate_languages_txt.py
@@ -0,0 +1,82 @@
+from pathlib import Path
+
+old_languages_txt = ''
+new_ud_treebanks = ''
+
+codedict = {}
+for line in open(old_languages_txt):
+    if line.startswith('#'):
+        continue
+    parts = line.strip().split(' ')
+    lang = parts[2]
+    code = parts[1]
+    codedict[lang] = code
+
+ud11path = Path(new_ud_treebanks)
+
+subdirs = [x for x in ud11path.iterdir() if x.is_dir()]
+subdirs.sort()
+
+sizes = {}
+
+for subdir in subdirs:
+    hasTrain = False
+    hasDev = False
+    hasTest = False
+    trainfile = None
+    for file in subdir.iterdir():
+        if file.name.endswith('train.txt'):
+            hasTrain = True
+            trainfile = file
+        elif file.name.endswith('test.txt'):
+            hasTest = True
+        elif file.name.endswith('dev.txt'):
+            hasDev = True
+    if (not hasTrain) or (not hasTest) or (not hasDev):
+        continue
+    treebank_name = subdir.name
+    language_name = treebank_name[3:].split('-')[0]
+    code = '@@@@@'
+    if language_name in codedict:
+        code = codedict[language_name]
+    if language_name not in sizes:
+        sizes[language_name] = {}
+    sizes[language_name][treebank_name] = trainfile.stat().st_size
+
+for language_name in sizes:
+    maxlen = 0
+    best = None
+    for treebank_name in sizes[language_name]:
+        if sizes[language_name][treebank_name] > maxlen:
+            best = treebank_name
+            maxlen = sizes[language_name][treebank_name]
+    if len(sizes[language_name]) > 1:
+        sizes[language_name]['preferred'] = best
+
+print(
+    "# Format: <UD training corpus> <ISO 639-1 code (for OSCAR pretraining)> <Language name> <Recommended (chosen by size)>")
+for subdir in subdirs:
+    hasTrain = False
+    hasDev = False
+    hasTest = False
+    trainfile = None
+    for file in subdir.iterdir():
+        if file.name.endswith('train.txt'):
+            hasTrain = True
+            trainfile = file
+        elif file.name.endswith('test.txt'):
+            hasTest = True
+        elif file.name.endswith('dev.txt'):
+            hasDev = True
+    if (not hasTrain) or (not hasTest) or (not hasDev):
+        continue
+    treebank_name = subdir.name
+    language_name = treebank_name[3:].split('-')[0]
+    code = '@@@@@'
+    if language_name in codedict:
+        code = codedict[language_name]
+    preferred = ''
+    if 'preferred' in sizes[language_name] and sizes[language_name]['preferred'] == treebank_name:
+        preferred = ' *'
+    print(treebank_name + ' ' + code + ' ' + language_name + preferred)
+
diff --git a/src/lambo/utils/oscar.py b/src/lambo/utils/oscar.py
index bda4c66da81d3ef2176c112d04c21a0016d3608c..20ff83cdba6f3701212d76a435a6ed08db38bf1e 100644
--- a/src/lambo/utils/oscar.py
+++ b/src/lambo/utils/oscar.py
@@ -4,6 +4,7 @@ Functions used to obtain multilingual corpora from `OSCAR <https://oscar-corpus.
 import json
 import random
 import urllib
+import time
 from urllib.error import HTTPError
 
 from lambo.data.document import Document
@@ -46,10 +47,11 @@ def download_archive1_from_oscar(language, path, OSCAR_LOGIN, OSCAR_PASSWORD, re
             return
         except HTTPError as err:
             error = err
-            if i == retry - 1:
+            if i == retry - 1 or error.code<500:
                 raise error
-            time = ((i + 1) * (i + 1) * (i + 1) * 15)
-            print("[Got " + str(error.code) + ", retrying after " + str(time) + " seconds...]")
+            secs = ((i + 1) * (i + 1) * (i + 1) * 15)
+            print("[Got " + str(error.code) + ", retrying after " + str(secs) + " seconds...]")
+            time.sleep(secs)
 
 
 def read_jsonl_to_documents(fileobj, MAX_LEN=3000000):