Commit f5151bf6 authored by piotrmp

Added the ability to randomly sow unknown characters.

parent 4b9f0c3a
Pipeline #16986 passed with stage in 34 seconds
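The change threads a new sow_unknowns flag through LAMBO's training pipeline: when enabled, each training character id is replaced with the dictionary's <UNK> id with a small fixed probability (UNKNOWN_RATIO = 0.01), so the segmenter learns to cope with characters it has never seen. A minimal standalone sketch of the idea (the helper name and toy ids are illustrative, not part of the commit):

import random

UNKNOWN_RATIO = 0.01  # chance of masking any single character

def sow_unknowns_into(char_ids, unk_id):
    # Replace each character id with unk_id at the given ratio.
    return [unk_id if random.random() < UNKNOWN_RATIO else c for c in char_ids]

random.seed(1)
print(sow_unknowns_into([1, 2, 3, 1, 2, 3], unk_id=0))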
@@ -9,8 +9,11 @@ import torch
 from lambo.learning.train import train_new_and_save, train_pretrained_and_save
+EPOCHS = 20
+SOW_UNKNOWNS = False
 if __name__=='__main__':
-    treebanks = Path(sys.argv[1]) #Path.home() / 'PATH-TO/ud-treebanks-v2.9/'
+    treebanks = Path(sys.argv[1]) #Path.home() / 'PATH-TO/ud-treebanks-v2.11/'
     outpath = Path(sys.argv[2]) #Path.home() / 'PATH-TO/models/full/'
     pretrained_path = Path(sys.argv[3]) #Path.home() / 'PATH-TO/models/pretrained/'
@@ -30,6 +33,6 @@ if __name__=='__main__':
         print(str(i) + '/' + str(len(lines)) + '========== ' + model + ' ==========')
         inpath = treebanks / model
         if language != '?':
-            train_pretrained_and_save(language, inpath, outpath, pretrained_path, 20, device)
+            train_pretrained_and_save(language, inpath, outpath, pretrained_path, EPOCHS, sow_unknowns=SOW_UNKNOWNS, device=device)
         else:
-            train_new_and_save('LAMBO-BILSTM', inpath, outpath, 20, device)
+            train_new_and_save('LAMBO-BILSTM', inpath, outpath, EPOCHS, device=device)
@@ -26,7 +26,7 @@ if __name__=='__main__':
     # Prepare data
     tuning_doc = read_document(tuningpath, False)
     _, train_dataloader, test_dataloader = prepare_dataloaders_withdict([tuning_doc], [tuning_doc], lambo.model.max_len, 32,
-                                                                        lambo.dict)
+                                                                        False, lambo.dict)
     # Tune
     tune(lambo.model, train_dataloader, test_dataloader, 3)
@@ -2,14 +2,15 @@
 Short demo on using LAMBO
 """
 from lambo.segmenter.lambo import Lambo
+import pathlib
 if __name__ == '__main__':
     # Load the recommended model for Polish
-    lambo = Lambo.get('Polish')
+    lambo = Lambo.from_path(pathlib.Path.home() / 'data' / 'lambo' / 'models' / 'withunk', 'UD_Polish-PDB', False)
     # Provide text, including pauses (``(yy)``), emojis and turn markers (``<turn>``).
-    text = "Ciemny i jasny (yy) pies biegają 🏴w płytkiej w🅾️dzie... obok 🏴󠁧󠁢󠁷󠁬󠁳󠁿kamienistej😂 plaży.\n\n 😆 To jest następne zdanie <turn>to byłaby następna tura."
+    text = "Poza Japonią, począwszy od cesarza Shōwa, cesarzy często nazywano ich imionami, zarówno za życia, jak po śmierci."
     # Perform segmentation
     document = lambo.segment(text)
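For reference, one way to inspect the demo's result; the turns/sentences/tokens traversal follows LAMBO's documented output structure (only document.turns and turn.text are confirmed by the diff itself; adjust if the API differs):

for turn in document.turns:
    for sentence in turn.sentences:
        # Print the token strings of each recognised sentence.
        print([token.text for token in sentence.tokens])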
@@ -3,7 +3,7 @@ Functions for preprocessing the data for the main LAMBO model.
 """
 import unicodedata
-import torch
+import torch, random
 from torch.utils.data import TensorDataset, DataLoader
 from lambo.learning.dictionary import create_dictionary
@@ -41,6 +41,9 @@ utf_category_dictionary = {
     'Cn': 29
 }
+UNKNOWN_RATIO = 0.01
+random.seed(1)
 def character_to_utf_feature(char):
     """
@@ -54,13 +57,14 @@ def character_to_utf_feature(char):
     return result
-def encode_training_dict(documents, dictionary, maximum_length, finalise_all_tokens=True):
+def encode_training_dict(documents, dictionary, maximum_length, sow_unknowns, finalise_all_tokens=True):
     """
     Encode documents as neural network inputs
     :param documents: list of documents
     :param dictionary: character dictionary
     :param maximum_length: maximum length of network input
+    :param sow_unknowns: whether to randomly mask some characters as unknown
     :param finalise_all_tokens: whether every token should be properly encoded (default True, do not change)
     :return: a triple of network input/output tensors: character encodings, UTF representations, true categories
     """
@@ -71,6 +75,12 @@ def encode_training_dict(documents, dictionary, maximum_length, finalise_all_tok
         offset = 0
         for turn in document.turns:
             Xchar = [dictionary[char] if char in dictionary else dictionary['<UNK>'] for char in turn.text]
+            if sow_unknowns:
+                pre_unknowns = sum([x == dictionary['<UNK>'] for x in Xchar])
+                Xchar = [dictionary['<UNK>'] if random.random() < UNKNOWN_RATIO else xchar for xchar in Xchar]
+                post_unknowns = sum([x == dictionary['<UNK>'] for x in Xchar])
+                print("Sown unknowns: from every " + str(len(Xchar) / pre_unknowns) + " character to every " + str(
+                    len(Xchar) / post_unknowns) + " character.")
             Xchar += [dictionary['<PAD>']] * (maximum_length - (len(Xchar) % maximum_length))
             Xutf = [character_to_utf_feature(char) for char in turn.text]
             Xutf += [character_to_utf_feature('\u0000')] * (maximum_length - (len(Xutf) % maximum_length))
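One caveat in the added block: the diagnostic print divides by pre_unknowns and post_unknowns, so a turn containing no unknown characters before or after sowing would raise ZeroDivisionError. A hedged variant with a guard (same behaviour otherwise; the guard is an addition, not part of the commit):

if sow_unknowns:
    unk = dictionary['<UNK>']
    pre_unknowns = sum(x == unk for x in Xchar)
    # Mask each character id as unknown with probability UNKNOWN_RATIO.
    Xchar = [unk if random.random() < UNKNOWN_RATIO else x for x in Xchar]
    post_unknowns = sum(x == unk for x in Xchar)
    if pre_unknowns > 0 and post_unknowns > 0:  # guard: avoid division by zero
        print("Sown unknowns: from every %.1f to every %.1f characters."
              % (len(Xchar) / pre_unknowns, len(Xchar) / post_unknowns))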
@@ -115,7 +125,7 @@ def prepare_test_withdict(text, dictionary, maximum_length):
     return torch.Tensor(Xchars).to(torch.int64), torch.Tensor(Xutfs).to(torch.int64)
-def prepare_dataloaders_withdict(train_docs, test_docs, max_len, batch_size, dict=None):
+def prepare_dataloaders_withdict(train_docs, test_docs, max_len, batch_size, sow_unknowns, dict=None):
     """
     Prepare PyTorch dataloaders for the documents.
@@ -123,19 +133,20 @@
     :param test_docs: list of test documents
     :param max_len: maximum length of network input
     :param batch_size: batch size
+    :param sow_unknowns: whether to randomly mask some characters as unknown
     :param dict: character dictionary (or None, if to be created)
     :return: a triple with character dictionary, train dataloader and test dataloader
     """
     if dict is None:
         dict = create_dictionary([doc.text for doc in train_docs])
-    train_X_char, train_X_utf, train_Y = encode_training_dict(train_docs, dict, max_len)
+    train_X_char, train_X_utf, train_Y = encode_training_dict(train_docs, dict, max_len, sow_unknowns)
     train_X_char = torch.Tensor(train_X_char).to(torch.int64)
     train_X_utf = torch.Tensor(train_X_utf).to(torch.int64)
     train_Y = torch.Tensor(train_Y).to(torch.int64)
     train_dataset = TensorDataset(train_X_char, train_X_utf, train_Y)
     train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
-    test_X_char, test_X_utf, test_Y = encode_training_dict(test_docs, dict, max_len)
+    test_X_char, test_X_utf, test_Y = encode_training_dict(test_docs, dict, max_len, False)
     test_X_char = torch.Tensor(test_X_char).to(torch.int64)
     test_X_utf = torch.Tensor(test_X_utf).to(torch.int64)
     test_Y = torch.Tensor(test_Y).to(torch.int64)
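Note the asymmetry: the flag reaches only the training encoding, while the test documents are always encoded with sow_unknowns=False, so evaluation sees the data unmasked. A hypothetical call (the document variables are assumed to be already read in):

# max_len=256, batch_size=32, sow_unknowns=True
dictionary, train_dl, test_dl = prepare_dataloaders_withdict(
    [train_doc, dev_doc],  # training documents, ~1% of characters masked
    [test_doc],            # test documents, never masked
    256, 32, True)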
@@ -55,7 +55,7 @@ def test_loop(dataloader, model, device=torch.device('cpu')):
             Y = XY[-1]
             pred = model(*Xs)
             test_loss += model.compute_loss(pred, Y, Xs).item()
-            if len(pred.shape)==4:
+            if len(pred.shape) == 4:
                 # Predicting character types (segmentation)
                 for i in range(pred.shape[2]):
                     A = pred[:, :, i, :].argmax(2)
@@ -64,17 +64,17 @@ def test_loop(dataloader, model, device=torch.device('cpu')):
                     equals = (A == B)[nontrivial].type(torch.float)
                     correct[i] += equals.sum().item()
                     size[i] += torch.numel(equals)
-            elif len(pred.shape)==3:
+            elif len(pred.shape) == 3:
                 # Predicting characters (subword prediction)
                 A = pred.argmax(2)
                 B = Y
                 nontrivial = torch.nonzero(Y, as_tuple=True)
                 equals = (A == B)[nontrivial].type(torch.float)
-                #equals = (A==B).type(torch.float)
+                # equals = (A==B).type(torch.float)
                 correct[0] += equals.sum().item()
                 size[0] += torch.numel(equals)
             pass
     test_loss /= num_batches
     size = [s if s > 0 else 1 for s in size]
     print(
@@ -113,14 +113,15 @@ def test_loop_pretraining(dataloader, model, device=torch.device('cpu')):
         f"Test Error: \n Accuracy nontrivial: {(100 * (correct[0] / size[0])):>5f}%, trivial: {(100 * (correct[1] / size[1])):>5f}%, Avg loss: {test_loss:>8f} \n")
-def train_new_and_save(model_name, treebank_path, save_path, epochs=10, device=torch.device('cpu')):
+def train_new_and_save(model_name, treebank_path, save_path, epochs, sow_unknowns=False, device=torch.device('cpu')):
     """
     Train a new LAMBO model and save it in the filesystem.
     :param model_name: type of model trained, currently only ``LAMBO-BILSTM`` is supported
     :param treebank_path: path to the treebank training data
     :param save_path: path to save the generated model
-    :param epochs: number of epochs to run for (default: 10)
+    :param epochs: number of epochs to run for
+    :param sow_unknowns: whether to randomly mask some characters as unknown
     :param device: the device to use for computation
     :return: no value returned
     """
@@ -138,7 +139,7 @@ def train_new_and_save(model_name, treebank_path, save_path, epochs=10, device=t
     MAX_LEN = 256
     dict, train_dataloader, test_dataloader = prepare_dataloaders_withdict([train_doc, dev_doc], [test_doc],
                                                                            MAX_LEN,
-                                                                           BATCH_SIZE)
+                                                                           BATCH_SIZE, sow_unknowns)
     model = LamboNetwork(MAX_LEN, dict, len(utf_category_dictionary))
     tune(model, train_dataloader, test_dataloader, epochs, device)
@@ -149,7 +150,8 @@ def train_new_and_save(model_name, treebank_path, save_path, epochs=10, device=t
     file1.writelines([x + '\t' + str(dict[x]) + '\n' for x in dict])
-def train_pretrained_and_save(language, treebank_path, save_path, pretrained_path, epochs=10, device=torch.device('cpu')):
+def train_pretrained_and_save(language, treebank_path, save_path, pretrained_path, epochs, sow_unknowns=False,
+                              device=torch.device('cpu')):
     """
     Train a new LAMBO model, starting from a pretrained one, and save it in the filesystem.
@@ -157,7 +159,8 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat
     :param treebank_path: path to the treebank training data
     :param save_path: path to save the generated model
     :param pretrained_path: path to the pretraining models
-    :param epochs: number of epochs to run for (default: 10)
+    :param epochs: number of epochs to run for
+    :param sow_unknowns: whether to randomly mask some characters as unknown
     :param device: the device to use for computation
     :return: no value returned
     """
@@ -182,7 +185,7 @@ def train_pretrained_and_save(language, treebank_path, save_path, pretrained_pat
     dict, train_dataloader, test_dataloader = prepare_dataloaders_withdict([train_doc, dev_doc], [test_doc],
                                                                            MAX_LEN,
-                                                                           BATCH_SIZE, dict=dict)
+                                                                           BATCH_SIZE, sow_unknowns, dict=dict)
     tune(model, train_dataloader, test_dataloader, epochs, device)
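Putting it together, a training run that sows unknowns might look like the following (the paths and language are placeholders; the epoch count mirrors the EPOCHS constant added to the training script above):

from pathlib import Path
import torch
from lambo.learning.train import train_pretrained_and_save

train_pretrained_and_save('Polish',
                          Path('ud-treebanks-v2.11') / 'UD_Polish-PDB',  # treebank_path (placeholder)
                          Path('models') / 'full',                       # save_path (placeholder)
                          Path('models') / 'pretrained',                 # pretrained_path (placeholder)
                          20, sow_unknowns=True,
                          device=torch.device('cpu'))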