Commit 183cf56d authored by piotrmp

Subword splitting implementation.

parent 055715ab
1 merge request: !2 Multiword generation
@@ -25,7 +25,7 @@ class LamboNetwork(Module):
        :param max_len: maximum length of an input sequence,
        :param dict: character dictionary
        :param utf_categories_num: number of UTF categories
-       :param pretrained: either ``None`` (for new models) or an instance of ``LamboPretrainingModel`` (if using pretraining data)
+       :param pretrained: either ``None`` (for new models) or an instance of ``LamboPretrainingNetwork`` (if using pretraining data)
        """
        super(LamboNetwork, self).__init__()
        self.max_len = max_len
......
@@ -54,13 +54,26 @@ def test_loop(dataloader, model, device='cpu'):
            Y = XY[-1]
            pred = model(*Xs)
            test_loss += model.compute_loss(pred, Y, Xs).item()
-           for i in range(pred.shape[2]):
-               A = pred[:, :, i, :].argmax(2)
-               B = Y[:, :, i]
-               nontrivial = torch.nonzero(A + B, as_tuple=True)
-               equals = (A == B)[nontrivial].type(torch.float)
-               correct[i] += equals.sum().item()
-               size[i] += torch.numel(equals)
+           if len(pred.shape) == 4:
+               # Predicting character types (segmentation)
+               for i in range(pred.shape[2]):
+                   A = pred[:, :, i, :].argmax(2)
+                   B = Y[:, :, i]
+                   nontrivial = torch.nonzero(A + B, as_tuple=True)
+                   equals = (A == B)[nontrivial].type(torch.float)
+                   correct[i] += equals.sum().item()
+                   size[i] += torch.numel(equals)
+           elif len(pred.shape) == 3:
+               # Predicting characters (subword prediction)
+               A = pred.argmax(2)
+               B = Y
+               nontrivial = torch.nonzero(Y, as_tuple=True)
+               equals = (A == B)[nontrivial].type(torch.float)
+               correct[0] += equals.sum().item()
+               size[0] += torch.numel(equals)
test_loss /= num_batches
size = [s if s > 0 else 1 for s in size]
print(
......
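The accuracy computation added to test_loop above branches on the rank of the prediction tensor: segmentation models emit BxLxCxV scores (one score vector per character per decision type), while the new subword model emits BxLxV scores (one score vector per output character). A minimal, self-contained sketch of the two argmax reductions, with invented shapes (not part of the commit):

import torch

B, L, C, V = 2, 8, 3, 10          # invented: batch, length, decision types, vocabulary size
pred4 = torch.randn(B, L, C, V)   # segmentation network output
pred3 = torch.randn(B, L, V)      # subword network output

# Rank-4 case: one argmax per decision slice, as in the loop above
for i in range(pred4.shape[2]):
    A = pred4[:, :, i, :].argmax(2)
    assert A.shape == (B, L)

# Rank-3 case: a single argmax over the vocabulary dimension
A = pred3.argmax(2)
assert A.shape == (B, L)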
@@ -98,14 +98,22 @@ class Lambo():
:return: character dictionary
"""
dict = {}
-       for line in open(dict_path):
-           if line.strip() == '':
-               continue
-           parts = line.split('\t')
+       prevEmpty = False
+       chunks = dict_path.read_bytes().decode('utf-8').split('\n')
+       for chunk in chunks:
+           if chunk == '':
+               prevEmpty = True
+               continue
+           parts = chunk.split('\t')
            if len(parts) == 3 and parts[0] == '' and parts[1] == '':
                # TAB character
                parts = ['\t', parts[2]]
+           if parts[0] == '' and prevEmpty:
+               parts[0] = '\n'
            if parts[0] in dict:
                print("WARNING: duplicated key in dictionary")
            dict[parts[0]] = int(parts[1])
+           prevEmpty = False
return dict
@staticmethod
......
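For reference, the new parsing logic above implies a dictionary file with one <character>\t<index> entry per line, where a TAB key is stored as two empty fields and a newline key as an empty field preceded by an empty line. A small sketch with an invented file (real dictionaries also contain entries such as <PAD> and <UNK>); the reader shown in the hunk is called as Lambo.read_dict in the splitter further below:

from pathlib import Path
from tempfile import TemporaryDirectory
from lambo.segmenter.lambo import Lambo

with TemporaryDirectory() as tmp:
    dict_path = Path(tmp) / 'toy.dict'
    # 'a' -> 2, 'b' -> 3, TAB -> 4, newline -> 5 (invented indices)
    dict_path.write_bytes('a\t2\nb\t3\n\t\t4\n\n\t5\n'.encode('utf-8'))
    print(Lambo.read_dict(dict_path))
    # expected: {'a': 2, 'b': 3, '\t': 4, '\n': 5}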
import copy
import torch
from torch.nn import Embedding, LSTM, Linear, LogSoftmax, NLLLoss, Module
class LamboSubwordNetwork(Module):
"""
    LAMBO subword neural network model. The network has five layers:
    * an embedding layer for characters, representing each as a 64-long vector,
    * a bidirectional LSTM layer, taking a character embedding as input and outputting a 2*64-long state vector,
    * a dense linear layer, converting LSTM state vectors to the 64-dimensional embedding space,
    * an inverted embedding layer, converting back to characters using the same matrix as the embedding,
    * a softmax layer, computing the probability of each character.
"""
def __init__(self, max_len, dict, pretrained=None):
"""
Create a LAMBO subword neural network.
:param max_len: maximum length of an input sequence,
:param dict: character dictionary
:param pretrained: either ``None`` (for new models) or an instance of ``LamboNetwork`` (if using pretraining data)
"""
super(LamboSubwordNetwork, self).__init__()
self.max_len = max_len
self.dict = dict
if pretrained is not None:
# Copy the weights of the embedding of pretraining model
self.embedding_layer = Embedding.from_pretrained(pretrained.embedding_layer.weight, freeze=False, padding_idx=None)
else:
self.embedding_layer = Embedding(len(dict), 64, dict['<PAD>'])
self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim, hidden_size=64,batch_first=True,bidirectional=True)
self.linear_layer = Linear(self.lstm_layer.hidden_size * 2, self.embedding_layer.embedding_dim)
self.softmax_layer = LogSoftmax(2)
self.loss_fn = NLLLoss()
def forward(self, x_char):
"""
Computation of the network output.
:param x_char: a tensor of BxL character indices,
:return: a tensor of BxLxV class scores
Where B = batch size, L = maximum sequence length, V = number of words in the dictionary
"""
embedded = self.embedding_layer(x_char)
hidden = self.lstm_layer(embedded)[0]
reduced = self.linear_layer(hidden)
# Computing inverted embedding as a cosine similarity score of the transformed representation and original embeddings
scores = self.inverted_embedding(reduced, self.embedding_layer)
probabilities = self.softmax_layer(scores)
return probabilities
@staticmethod
def inverted_embedding(input, embedding_layer):
# Normalise both matrices
input_normalised = torch.nn.functional.normalize(input, dim=2)
weights_normalised = torch.nn.functional.normalize(embedding_layer.weight.data, dim=1)
# Dot product of normalised vectors equals cosine similarity
scores = torch.matmul(input_normalised, torch.transpose(weights_normalised, 0, 1))
return scores
def compute_loss(self, pred, true, Xs):
"""
        Compute cross-entropy loss.
        :param pred: tensor with predicted character probabilities
        :param true: tensor with true classes
:param Xs: (not used)
:return: loss value
"""
pred = torch.reshape(pred, (-1, len(self.dict)))
true = torch.reshape(true, (-1,))
output = self.loss_fn(pred, true)
return output
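A minimal usage sketch for the class above, with an invented toy dictionary (a real model reuses the dictionary of the pretrained segmenter), checking the BxLxV output shape, the loss computation, and that the inverted embedding really is a cosine similarity; this is illustration only, not part of the commit:

import torch
from lambo.subwords.model import LamboSubwordNetwork

toy_dict = {'<PAD>': 0, '<UNK>': 1, 'a': 2, 'b': 3, 'c': 4}   # invented
net = LamboSubwordNetwork(max_len=8, dict=toy_dict)

x = torch.randint(0, len(toy_dict), (2, 8))   # BxL character indices
y = torch.randint(0, len(toy_dict), (2, 8))   # BxL target character indices
pred = net(x)
assert pred.shape == (2, 8, len(toy_dict))    # BxLxV log-probabilities
loss = net.compute_loss(pred, y, None)        # scalar negative log-likelihood
print(float(loss))

# Inverted embedding check: dot products of L2-normalised vectors equal cosine similarities
reduced = torch.randn(2, 8, net.embedding_layer.embedding_dim)
scores = LamboSubwordNetwork.inverted_embedding(reduced, net.embedding_layer)
reference = torch.nn.functional.cosine_similarity(reduced[0, 0], net.embedding_layer.weight[3], dim=0)
assert torch.allclose(scores[0, 0, 3], reference, atol=1e-5)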
import random
import torch
from torch.utils.data import TensorDataset, DataLoader
def encode_test(text, dictionary, maximum_length):
    """
    Encode a single token text as character indices, padded with ``<PAD>`` up to ``maximum_length``.
    """
    Xchar = [dictionary[char] if char in dictionary else dictionary['<UNK>'] for char in text]
    Xchar += [dictionary['<PAD>']] * (maximum_length - (len(Xchar) % maximum_length))
    Xchar = Xchar[:maximum_length]
    return (torch.Tensor([Xchar]).to(torch.int64),)
def encode_subwords(documents, dictionary, maximum_length):
"""
Encode subwords as neural network inputs and outputs
:param documents: list of documents
:param dictionary: character dictionary
:param maximum_length: maximum length of network input and output
    :return: a pair of lists: input character encodings and target character encodings (subword characters separated by ``<PAD>``)
"""
tokenCount = 0
multiwordCount=0
for document in documents:
for turn in document.turns:
for sentence in turn.sentences:
for token in sentence.tokens:
tokenCount+=1
if token.is_multi_word:
multiwordCount+=1
thrs = multiwordCount / tokenCount
Xchars = []
Ychars = []
for document in documents:
for turn in document.turns:
for sentence in turn.sentences:
for token in sentence.tokens:
r=random.random()
if token.is_multi_word or r < thrs:
Xchar = [dictionary[char] if char in dictionary else dictionary['<UNK>'] for char in token.text]
Xchar += [dictionary['<PAD>']] * (maximum_length - (len(Xchar) % maximum_length))
Xchar = Xchar[:maximum_length]
subwords=token.subwords
if len(subwords)==0:
subwords=[token.text]
Ychar=[]
for subword in subwords:
if len(Ychar)!=0:
Ychar += [dictionary['<PAD>']]
Ychar+= [dictionary[char] if char in dictionary else dictionary['<UNK>'] for char in
subword]
Ychar += [dictionary['<PAD>']] * (maximum_length - (len(Ychar) % maximum_length))
Ychar=Ychar[:maximum_length]
Xchars+= [Xchar]
Ychars+= [Ychar]
return Xchars, Ychars
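To make the target encoding above concrete: the characters of consecutive subwords are concatenated with single <PAD> indices as separators, and the result is padded to the maximum length. A hand-worked sketch with an invented toy dictionary, for a Spanish-style multi-word token 'del' split into 'de' + 'el' (illustration only):

# Invented toy dictionary; real encodings use the segmenter's dictionary
dictionary = {'<PAD>': 0, '<UNK>': 1, 'd': 2, 'e': 3, 'l': 4}
maximum_length = 8
subwords = ['de', 'el']

Ychar = []
for subword in subwords:
    if len(Ychar) != 0:
        Ychar += [dictionary['<PAD>']]   # separator between subwords
    Ychar += [dictionary[char] if char in dictionary else dictionary['<UNK>'] for char in subword]
Ychar += [dictionary['<PAD>']] * (maximum_length - (len(Ychar) % maximum_length))
Ychar = Ychar[:maximum_length]
print(Ychar)   # [2, 3, 0, 3, 4, 0, 0, 0] -> 'd', 'e', <PAD>, 'e', 'l', then padding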
def prepare_subwords_dataloaders(train_docs, test_docs, max_len, batch_size, dict):
"""
    Prepare PyTorch dataloaders for the documents.
:param train_docs: list of training documents
:param test_docs: list of test documents
:param max_len: maximum length of network input
:param batch_size: batch size
    :param dict: character dictionary
    :return: a pair with train dataloader and test dataloader (or ``None, None`` if there is not enough data)
"""
train_X_char, train_Y = encode_subwords(train_docs, dict, max_len)
if len(train_X_char)<256:
return None, None
train_X_char = torch.Tensor(train_X_char).to(torch.int64)
train_Y = torch.Tensor(train_Y).to(torch.int64)
train_dataset = TensorDataset(train_X_char, train_Y)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_X_char, test_Y = encode_subwords(test_docs, dict, max_len)
if len(test_X_char)<64:
return None, None
test_X_char = torch.Tensor(test_X_char).to(torch.int64)
test_Y = torch.Tensor(test_Y).to(torch.int64)
test_dataset = TensorDataset(test_X_char, test_Y)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
return train_dataloader, test_dataloader
import torch
from lambo.segmenter.lambo import Lambo
from lambo.subwords.preprocessing import encode_test
class LamboSplitter():
    """
    LAMBO subword splitter, applying a trained subword model to divide tokens into subwords.
    """
@classmethod
def from_path(cls, model_path, model_name):
"""
Obtain a LAMBO subword splitter by reading a model from a given path.
:param model_path: directory including the model files
:param model_name: model name
        :return: a new LamboSplitter instance
"""
model = torch.load(model_path / (model_name + '_subwords.pth'), map_location=torch.device('cpu'))
dict = Lambo.read_dict(model_path / (model_name + '.dict'))
return cls(model, dict)
def __init__(self, model, dict):
"""
Create a new LAMBO subword splitter from a given model and dictionary.
:param model: prediction Pytorch model
:param dict: dictionary
"""
self.model = model
self.dict = dict
self.inv_dict = {dict[key]: key for key in dict}
    def split(self, token_text):
        """
        Split a token's text into subwords based on the model's predictions.
        :param token_text: text of the token to split
        :return: list of subwords (a single-element list with the whole token if no valid split is found)
        """
if len(token_text)>=self.model.max_len:
return [token_text]
Xs = encode_test(token_text, self.dict, self.model.max_len)
with torch.no_grad():
Y = self.model(*Xs)
codes= Y.argmax(2).numpy()[0]
decisions = [ self.inv_dict[code] for code in codes]
result = ['']
for char in decisions:
if len(char)==1:
result[-1]+=char
elif char=='<PAD>':
if result[-1]=='':
break
result.append('')
else:
return [token_text]
result = [subword for subword in result if subword!='']
if len(result)==0:
return [token_text]
return result
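A usage sketch for the splitter above; the model path and name are placeholders for any trained LAMBO model directory containing the matching _subwords.pth and .dict files, and the module path lambo.subwords.splitter is assumed to be where this class lives:

from pathlib import Path
from lambo.subwords.splitter import LamboSplitter

model_path = Path('models')   # placeholder directory with <model_name>_subwords.pth and <model_name>.dict
model_name = 'my_model'       # placeholder model name

splitter = LamboSplitter.from_path(model_path, model_name)
print(splitter.split('del'))  # e.g. ['de', 'el'] if the model predicts a split, otherwise ['del']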
from lambo.learning.train import tune
from lambo.subwords.model import LamboSubwordNetwork
from lambo.subwords.preprocessing import prepare_subwords_dataloaders
from lambo.utils.ud_reader import read_treebank
import torch
def train_subwords_and_save(model_name, treebank_path, save_path, lambo_segmenter, epochs=20, device='cpu'):
"""
Train a new LAMBO subwords model and save it in filesystem.
:param model_name: type of model trained, currently only ``LAMBO-BILSTM`` is supported
:param treebank_path: path to the treebank training data
:param save_path: path to save the generated model
:param lambo_segmenter: LAMBO segmenter to base on
    :param epochs: number of epochs to run for (default: 20)
:param device: the device to use for computation
:return: no value returned
"""
if model_name not in ['LAMBO-BILSTM']:
print(" Unrecognised model name: " + model_name)
return
print("Reading data.")
train_doc, dev_doc, test_doc = read_treebank(treebank_path, True)
print("Preparing data")
BATCH_SIZE = 32
print("Initiating the model.")
MAX_LEN = 32
train_dataloader, test_dataloader = prepare_subwords_dataloaders([train_doc, dev_doc], [test_doc],
MAX_LEN,
BATCH_SIZE,lambo_segmenter.dict)
if train_dataloader is None:
print("Not enough data to train, moving on.")
return
model = LamboSubwordNetwork(MAX_LEN, lambo_segmenter.dict, lambo_segmenter.model)
tune(model, train_dataloader, test_dataloader, epochs, device)
print("Saving")
torch.save(model, save_path / (treebank_path.name + '_subwords.pth'))
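Finally, a sketch of how the function above might be invoked end to end; the paths are placeholders, the module path lambo.subwords.train is assumed, and obtaining the base segmenter via Lambo.get is an assumed convenience API (any object exposing .dict and .model would work):

from pathlib import Path
from lambo.segmenter.lambo import Lambo
from lambo.subwords.train import train_subwords_and_save

treebank_path = Path('data/UD_Spanish-AnCora')   # placeholder UD treebank directory
save_path = Path('models')                       # placeholder output directory

lambo_segmenter = Lambo.get('Spanish')           # assumed API for obtaining a ready segmenter

train_subwords_and_save('LAMBO-BILSTM', treebank_path, save_path, lambo_segmenter,
                        epochs=20, device='cpu')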