diff --git a/src/lambo/examples/run_training_pretrained.py b/src/lambo/examples/run_training_pretrained.py
index 471251a0fea5a57548d514b6c3dcc45bc2a06e1e..69f5cf082d46c275afb684b96d4dd2f6bcddb220 100644
--- a/src/lambo/examples/run_training_pretrained.py
+++ b/src/lambo/examples/run_training_pretrained.py
@@ -9,8 +9,11 @@ import torch
 
 from lambo.learning.train import train_new_and_save, train_pretrained_and_save
 
+EPOCHS = 20
+SOW_UNKNOWNS = False
+
 if __name__=='__main__':
-    treebanks = Path(sys.argv[1]) #Path.home() / 'PATH-TO/ud-treebanks-v2.9/'
+    treebanks = Path(sys.argv[1]) #Path.home() / 'PATH-TO/ud-treebanks-v2.11/'
     outpath = Path(sys.argv[2]) #Path.home() / 'PATH-TO/models/full/'
     pretrained_path = Path(sys.argv[3]) #Path.home() / 'PATH-TO/models/pretrained/'
     
@@ -30,6 +33,6 @@ if __name__=='__main__':
         print(str(i) + '/' + str(len(lines)) + '========== ' + model + ' ==========')
         inpath = treebanks / model
         if language != '?':
-            train_pretrained_and_save(language, inpath, outpath, pretrained_path, 20, device)
+            train_pretrained_and_save(language, inpath, outpath, pretrained_path, EPOCHS, sow_unknowns=SOW_UNKNOWNS, device=device)
         else:
-            train_new_and_save('LAMBO-BILSTM', inpath, outpath, 20, device)
+            train_new_and_save('LAMBO-BILSTM', inpath, outpath, EPOCHS, device=device)
diff --git a/src/lambo/examples/run_tuning.py b/src/lambo/examples/run_tuning.py
index fc1d97903219b83c45e0ae1a7d8143d6177ccb39..9a344925c99d463b2be49be626c716007f97efd5 100644
--- a/src/lambo/examples/run_tuning.py
+++ b/src/lambo/examples/run_tuning.py
@@ -26,7 +26,7 @@ if __name__=='__main__':
     # Prepare data
     tuning_doc = read_document(tuningpath, False)
     _, train_dataloader, test_dataloader = prepare_dataloaders_withdict([tuning_doc], [tuning_doc], lambo.model.max_len, 32,
-                                                                        lambo.dict)
+                                                                        False, lambo.dict)
     
     # Tune
     tune(lambo.model, train_dataloader, test_dataloader, 3)
diff --git a/src/lambo/examples/run_usage.py b/src/lambo/examples/run_usage.py
index 905f032163d0a2d0a705d53937715693db1a6556..2f33f5a5f607b8d190128bce0ddd142594ce942b 100644
--- a/src/lambo/examples/run_usage.py
+++ b/src/lambo/examples/run_usage.py
@@ -2,14 +2,15 @@
 Short demo on using LAMBO
 """
 from lambo.segmenter.lambo import Lambo
+import pathlib
 
 
 if __name__ == '__main__':
-    # Load the recommended model for Polish
-    lambo = Lambo.get('Polish')
+    # Load a locally trained Polish model from the filesystem
+    lambo = Lambo.from_path(pathlib.Path.home() / 'data' / 'lambo' / 'models' / 'withunk', 'UD_Polish-PDB', False)
 
-    # Provide text, including pauses (``(yy)``), emojis and turn markers (``<turn>``).
-    text = "Ciemny i jasny (yy) pies biegają 🏴w płytkiej w🅾️dzie... obok 🏴kamienistej😂 plaży.\n\n 😆 To jest następne zdanie <turn>to byłaby następna tura."
+    # Provide text to be segmented.
+    text = "Poza Japonią, począwszy od cesarza Shōwa, cesarzy często nazywano ich imionami, zarówno za życia, jak po śmierci."
 
     # Perform segmentation
     document = lambo.segment(text)
diff --git a/src/lambo/learning/preprocessing_dict.py b/src/lambo/learning/preprocessing_dict.py
index 0443ab03b87603aa220ef2b6097155f1f270ef78..f4ac9769a06fd4bab9c4dbb1f42c17ab5b2e0116 100644
--- a/src/lambo/learning/preprocessing_dict.py
+++ b/src/lambo/learning/preprocessing_dict.py
@@ -3,7 +3,7 @@ Functions for preprocessing the data for the main LAMBO model. 
""" import unicodedata -import torch +import torch, random from torch.utils.data import TensorDataset, DataLoader from lambo.learning.dictionary import create_dictionary @@ -41,6 +41,9 @@ utf_category_dictionary = { 'Cn': 29 } +UNKNOWN_RATIO = 0.01 +random.seed(1) + def character_to_utf_feature(char): """ @@ -54,13 +57,14 @@ def character_to_utf_feature(char): return result -def encode_training_dict(documents, dictionary, maximum_length, finalise_all_tokens=True): +def encode_training_dict(documents, dictionary, maximum_length, sow_unknowns, finalise_all_tokens=True): """ Encode documents as neural network inputs :param documents: list of documents :param dictionary: character dictionary :param maximum_length: maximum length of network input + :param sow_unknowns: whether to randomly mask some characters as unknown :param finalise_all_tokens: whether every token should be properly encoded (default True, do not change) :return: a triple of network input/output tensors: character encodings, UTF representations, true catagories """ @@ -71,6 +75,12 @@ def encode_training_dict(documents, dictionary, maximum_length, finalise_all_tok offset = 0 for turn in document.turns: Xchar = [dictionary[char] if char in dictionary else dictionary['<UNK>'] for char in turn.text] + if sow_unknowns: + pre_unknowns = sum([x == dictionary['<UNK>'] for x in Xchar]) + Xchar = [dictionary['<UNK>'] if random.random() < UNKNOWN_RATIO else xchar for xchar in Xchar] + post_unknowns = sum([x == dictionary['<UNK>'] for x in Xchar]) + print("Sown unknowns: from every " + str(len(Xchar) / pre_unknowns) + " character to every " + str( + len(Xchar) / post_unknowns) + " character.") Xchar += [dictionary['<PAD>']] * (maximum_length - (len(Xchar) % maximum_length)) Xutf = [character_to_utf_feature(char) for char in turn.text] Xutf += [character_to_utf_feature('\u0000')] * (maximum_length - (len(Xutf) % maximum_length)) @@ -115,7 +125,7 @@ def prepare_test_withdict(text, dictionary, maximum_length): return torch.Tensor(Xchars).to(torch.int64), torch.Tensor(Xutfs).to(torch.int64) -def prepare_dataloaders_withdict(train_docs, test_docs, max_len, batch_size, dict=None): +def prepare_dataloaders_withdict(train_docs, test_docs, max_len, batch_size, sow_unknowns, dict=None): """ Prapare Pytorch dataloaders for the documents. 
@@ -123,19 +134,20 @@ def prepare_dataloaders_withdict(train_docs, test_docs, max_len, batch_size, dic
     :param test_docs: list of test documents
     :param max_len: maximum length of network input
     :param batch_size: batch size
+    :param sow_unknowns: whether to randomly mask some characters as unknown
     :param dict: character dictionary (or None, if to be created)
     :return: a triple with character dictionary, train dataloader and test dataloader
     """
     if dict is None:
         dict = create_dictionary([doc.text for doc in train_docs])
-    train_X_char, train_X_utf, train_Y = encode_training_dict(train_docs, dict, max_len)
+    train_X_char, train_X_utf, train_Y = encode_training_dict(train_docs, dict, max_len, sow_unknowns)
     train_X_char = torch.Tensor(train_X_char).to(torch.int64)
     train_X_utf = torch.Tensor(train_X_utf).to(torch.int64)
     train_Y = torch.Tensor(train_Y).to(torch.int64)
     train_dataset = TensorDataset(train_X_char, train_X_utf, train_Y)
     train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
 
-    test_X_char, test_X_utf, test_Y = encode_training_dict(test_docs, dict, max_len)
+    test_X_char, test_X_utf, test_Y = encode_training_dict(test_docs, dict, max_len, False)
     test_X_char = torch.Tensor(test_X_char).to(torch.int64)
     test_X_utf = torch.Tensor(test_X_utf).to(torch.int64)
     test_Y = torch.Tensor(test_Y).to(torch.int64)
diff --git a/src/lambo/learning/train.py b/src/lambo/learning/train.py
index acde36fb3acc9d5070c8bcfcdc0a38285f4a517c..32e9f9e242540b71989b5cac0ead22d6156d20aa 100644
--- a/src/lambo/learning/train.py
+++ b/src/lambo/learning/train.py
@@ -55,7 +55,7 @@ def test_loop(dataloader, model, device=torch.device('cpu')):
             Y = XY[-1]
             pred = model(*Xs)
             test_loss += model.compute_loss(pred, Y, Xs).item()
-            if len(pred.shape)==4:
+            if len(pred.shape) == 4:
                 # Predicting character types (segmentation)
                 for i in range(pred.shape[2]):
                     A = pred[:, :, i, :].argmax(2)
@@ -64,17 +64,17 @@ def test_loop(dataloader, model, device=torch.device('cpu')):
                     equals = (A == B)[nontrivial].type(torch.float)
                     correct[i] += equals.sum().item()
                     size[i] += torch.numel(equals)
-            elif len(pred.shape)==3:
-                # Predictiong characters (subword prediction)
+            elif len(pred.shape) == 3:
+                # Predicting characters (subword prediction)
                 A = pred.argmax(2)
                 B = Y
                 nontrivial = torch.nonzero(Y, as_tuple=True)
                 equals = (A == B)[nontrivial].type(torch.float)
-                #equals = (A==B).type(torch.float)
+                # equals = (A==B).type(torch.float)
                 correct[0] += equals.sum().item()
                 size[0] += torch.numel(equals)
                 pass
-    
+
     test_loss /= num_batches
     size = [s if s > 0 else 1 for s in size]
     print(
@@ -113,14 +113,15 @@
         f"Test Error: \n Accuracy nontrivial: {(100 * (correct[0] / size[0])):>5f}%, trivial: {(100 * (correct[1] / size[1])):>5f}%, Avg loss: {test_loss:>8f} \n")
 
 
-def train_new_and_save(model_name, treebank_path, save_path, epochs=10, device=torch.device('cpu')):
+def train_new_and_save(model_name, treebank_path, save_path, epochs, sow_unknowns=False, device=torch.device('cpu')):
     """
     Train a new LAMBO model and save it in filesystem. 
     :param model_name: type of model trained, currently only ``LAMBO-BILSTM`` is supported
     :param treebank_path: path to the treebank training data
     :param save_path: path to save the generated model
-    :param epochs: number of epochs to run for (default: 10)
+    :param epochs: number of epochs to run for
+    :param sow_unknowns: whether to randomly mask some characters as unknown
     :param device: the device to use for computation
     :return: no value returned
     """
@@ -138,7 +139,7 @@ def train_new_and_save(model_name, treebank_path, save_path, epochs=10, device=t
     MAX_LEN = 256
     dict, train_dataloader, test_dataloader = prepare_dataloaders_withdict([train_doc, dev_doc], [test_doc], MAX_LEN,
-                                                                           BATCH_SIZE)
+                                                                           BATCH_SIZE, sow_unknowns)
     model = LamboNetwork(MAX_LEN, dict, len(utf_category_dictionary))
 
     tune(model, train_dataloader, test_dataloader, epochs, device)
@@ -149,7 +150,8 @@ def train_new_and_save(model_name, treebank_path, save_path, epochs=10, device=t
         file1.writelines([x + '\t' + str(dict[x]) + '\n' for x in dict])
 
 
-def train_pretrained_and_save(language, treebank_path, save_path, pretrained_path, epochs=10, device=torch.device('cpu')):
+def train_pretrained_and_save(language, treebank_path, save_path, pretrained_path, epochs, sow_unknowns=False,
+                              device=torch.device('cpu')):
     """
-    Train a new LAMBO model, staring from pretrained, and save it in filesystem.
+    Train a new LAMBO model, starting from a pretrained one, and save it in filesystem.
 
@@ -157,7 +159,8 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat
     :param treebank_path: path to the treebank training data
     :param save_path: path to save the generated model
     :param pretrained_path: path to the pretraining models
-    :param epochs: number of epochs to run for (default: 10)
+    :param epochs: number of epochs to run for
+    :param sow_unknowns: whether to randomly mask some characters as unknown
     :param device: the device to use for computation
     :return: no value returned
     """
@@ -182,7 +185,7 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat
 
     dict, train_dataloader, test_dataloader = prepare_dataloaders_withdict([train_doc, dev_doc], [test_doc], MAX_LEN,
-                                                                           BATCH_SIZE, dict=dict)
+                                                                           BATCH_SIZE, sow_unknowns, dict=dict)
 
     tune(model, train_dataloader, test_dataloader, epochs, device)
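
Note on the new augmentation (reviewer comment, not part of the patch): with `sow_unknowns` enabled, roughly `UNKNOWN_RATIO` (1%) of the encoded training characters are replaced by the dictionary's `<UNK>` index, so the segmenter learns to cope with characters missing from its dictionary; the test dataloader is always built with `sow_unknowns=False`, so evaluation stays on clean text. Below is a minimal self-contained sketch of the same idea. `mask_unknowns` and `UNK_ID` are illustrative names, not LAMBO API, and the report is guarded against the division by zero the unguarded `print` could hit when a turn contains no `<UNK>` characters at all:

```python
import random

UNKNOWN_RATIO = 0.01  # fraction of characters to mask, mirroring the patch
UNK_ID = 1            # hypothetical dictionary index of the '<UNK>' symbol


def mask_unknowns(char_ids, unk_id=UNK_ID, ratio=UNKNOWN_RATIO):
    """Randomly replace encoded characters with unk_id (training-time only)."""
    pre = sum(c == unk_id for c in char_ids)
    masked = [unk_id if random.random() < ratio else c for c in char_ids]
    post = sum(c == unk_id for c in masked)
    if post:  # guard: nothing may have been masked in a short sequence
        print("Sown unknowns: %d -> %d of %d characters." % (pre, post, len(masked)))
    return masked


random.seed(1)  # the patch seeds the module-level RNG the same way
encoded = [random.randrange(2, 100) for _ in range(1000)]  # toy encoded turn
masked = mask_unknowns(encoded)
```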
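For callers, the change means `epochs` is now a required argument and unknown-sowing is opt-in. A hedged invocation sketch in the spirit of run_training_pretrained.py; the directory layout and the 'pl' language code are placeholders, not values taken from the patch:

```python
from pathlib import Path

import torch

from lambo.learning.train import train_pretrained_and_save

# Placeholder locations; point these at local treebanks and model folders.
treebanks = Path.home() / 'data' / 'ud-treebanks-v2.11'
outpath = Path.home() / 'data' / 'models' / 'full'
pretrained_path = Path.home() / 'data' / 'models' / 'pretrained'

# Passing sow_unknowns by keyword keeps the boolean's meaning explicit.
train_pretrained_and_save('pl', treebanks / 'UD_Polish-PDB', outpath, pretrained_path,
                          20, sow_unknowns=True, device=torch.device('cpu'))
```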