Commit 183cf56d authored by piotrmp

Subword splitting implementation.

parent 055715ab
1 merge request: !2 Multiword generation
@@ -25,7 +25,7 @@ class LamboNetwork(Module):
        :param max_len: maximum length of an input sequence,
        :param dict: character dictionary
        :param utf_categories_num: number of UTF categories
-       :param pretrained: either ``None`` (for new models) or an instance of ``LamboPretrainingModel`` (if using pretraining data)
+       :param pretrained: either ``None`` (for new models) or an instance of ``LamboPretrainingNetwork`` (if using pretraining data)
        """
        super(LamboNetwork, self).__init__()
        self.max_len = max_len
......
@@ -54,13 +54,26 @@ def test_loop(dataloader, model, device='cpu'):
            Y = XY[-1]
            pred = model(*Xs)
            test_loss += model.compute_loss(pred, Y, Xs).item()
-           for i in range(pred.shape[2]):
-               A = pred[:, :, i, :].argmax(2)
-               B = Y[:, :, i]
-               nontrivial = torch.nonzero(A + B, as_tuple=True)
-               equals = (A == B)[nontrivial].type(torch.float)
-               correct[i] += equals.sum().item()
-               size[i] += torch.numel(equals)
+           if len(pred.shape) == 4:
+               # Predicting character types (segmentation)
+               for i in range(pred.shape[2]):
+                   A = pred[:, :, i, :].argmax(2)
+                   B = Y[:, :, i]
+                   nontrivial = torch.nonzero(A + B, as_tuple=True)
+                   equals = (A == B)[nontrivial].type(torch.float)
+                   correct[i] += equals.sum().item()
+                   size[i] += torch.numel(equals)
+           elif len(pred.shape) == 3:
+               # Predicting characters (subword prediction)
+               A = pred.argmax(2)
+               B = Y
+               nontrivial = torch.nonzero(Y, as_tuple=True)
+               equals = (A == B)[nontrivial].type(torch.float)
+               correct[0] += equals.sum().item()
+               size[0] += torch.numel(equals)
test_loss /= num_batches
size = [s if s > 0 else 1 for s in size]
print(
......
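The accuracy computation added to test_loop above branches on the rank of the prediction tensor: segmentation models emit BxLxCxV scores (one score vector per character per decision type), while the new subword model emits BxLxV scores (one score vector per output character). A minimal, self-contained sketch of the two argmax reductions, with invented shapes (not part of the commit):

import torch

B, L, C, V = 2, 8, 3, 10          # invented: batch, length, decision types, vocabulary size
pred4 = torch.randn(B, L, C, V)   # segmentation network output
pred3 = torch.randn(B, L, V)      # subword network output

# Rank-4 case: one argmax per decision slice, as in the loop above
for i in range(pred4.shape[2]):
    A = pred4[:, :, i, :].argmax(2)
    assert A.shape == (B, L)

# Rank-3 case: a single argmax over the vocabulary dimension
A = pred3.argmax(2)
assert A.shape == (B, L)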
@@ -98,14 +98,22 @@ class Lambo():
:return: character dictionary
"""
dict = {}
-       for line in open(dict_path):
-           if line.strip() == '':
-               continue
-           parts = line.split('\t')
+       prevEmpty = False
+       chunks = dict_path.read_bytes().decode('utf-8').split('\n')
+       for chunk in chunks:
+           if chunk == '':
+               prevEmpty = True
+               continue
+           parts = chunk.split('\t')
            if len(parts) == 3 and parts[0] == '' and parts[1] == '':
                # TAB character
                parts = ['\t', parts[2]]
+           if parts[0] == '' and prevEmpty:
+               parts[0] = '\n'
            if parts[0] in dict:
                print("WARNING: duplicated key in dictionary")
            dict[parts[0]] = int(parts[1])
+           prevEmpty = False
return dict
@staticmethod
......
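For reference, the new parsing logic above implies a dictionary file with one <character>\t<index> entry per line, where a TAB key is stored as two empty fields and a newline key as an empty field preceded by an empty line. A small sketch with an invented file (real dictionaries also contain entries such as <PAD> and <UNK>); the reader shown in the hunk is called as Lambo.read_dict in the splitter further below:

from pathlib import Path
from tempfile import TemporaryDirectory
from lambo.segmenter.lambo import Lambo

with TemporaryDirectory() as tmp:
    dict_path = Path(tmp) / 'toy.dict'
    # 'a' -> 2, 'b' -> 3, TAB -> 4, newline -> 5 (invented indices)
    dict_path.write_bytes('a\t2\nb\t3\n\t\t4\n\n\t5\n'.encode('utf-8'))
    print(Lambo.read_dict(dict_path))
    # expected: {'a': 2, 'b': 3, '\t': 4, '\n': 5}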
import copy
import torch
from torch.nn import Embedding, LSTM, Linear, LogSoftmax, NLLLoss, Module
class LamboSubwordNetwork(Module):
"""
    LAMBO subword neural network model. The network has five layers:
    * an embedding layer for characters, representing each as a 64-long vector,
    * a bidirectional LSTM layer, taking a character embedding as input and outputting a 2*64-long state vector,
    * a dense linear layer, converting LSTM state vectors to the 64-dimensional embedding space,
    * an inverted embedding layer, converting back to characters using the same matrix as the embedding,
    * a softmax layer, computing the probability of each character.
"""
def __init__(self, max_len, dict, pretrained=None):
"""
Create a LAMBO subword neural network.
:param max_len: maximum length of an input sequence,
:param dict: character dictionary
:param pretrained: either ``None`` (for new models) or an instance of ``LamboNetwork`` (if using pretraining data)
"""
super(LamboSubwordNetwork, self).__init__()
self.max_len = max_len
self.dict = dict
if pretrained is not None:
# Copy the weights of the embedding of pretraining model
self.embedding_layer = Embedding.from_pretrained(pretrained.embedding_layer.weight, freeze=False, padding_idx=None)
else:
self.embedding_layer = Embedding(len(dict), 64, dict['<PAD>'])
self.lstm_layer = LSTM(input_size=self.embedding_layer.embedding_dim, hidden_size=64,batch_first=True,bidirectional=True)
self.linear_layer = Linear(self.lstm_layer.hidden_size * 2, self.embedding_layer.embedding_dim)
self.softmax_layer = LogSoftmax(2)
self.loss_fn = NLLLoss()
def forward(self, x_char):
"""
Computation of the network output.
:param x_char: a tensor of BxL character indices,
:return: a tensor of BxLxV class scores
Where B = batch size, L = maximum sequence length, V = number of words in the dictionary
"""
embedded = self.embedding_layer(x_char)
hidden = self.lstm_layer(embedded)[0]
reduced = self.linear_layer(hidden)
# Computing inverted embedding as a cosine similarity score of the transformed representation and original embeddings
scores = self.inverted_embedding(reduced, self.embedding_layer)
probabilities = self.softmax_layer(scores)
return probabilities
@staticmethod
def inverted_embedding(input, embedding_layer):
# Normalise both matrices
input_normalised = torch.nn.functional.normalize(input, dim=2)
weights_normalised = torch.nn.functional.normalize(embedding_layer.weight.data, dim=1)
# Dot product of normalised vectors equals cosine similarity
scores = torch.matmul(input_normalised, torch.transpose(weights_normalised, 0, 1))
return scores
def compute_loss(self, pred, true, Xs):
"""
        Compute cross-entropy loss.
        :param pred: tensor with predicted character probabilities
        :param true: tensor with true classes
:param Xs: (not used)
:return: loss value
"""
pred = torch.reshape(pred, (-1, len(self.dict)))
true = torch.reshape(true, (-1,))
output = self.loss_fn(pred, true)
return output
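A minimal usage sketch for the class above, with an invented toy dictionary (a real model reuses the dictionary of the pretrained segmenter), checking the BxLxV output shape, the loss computation, and that the inverted embedding really is a cosine similarity; this is illustration only, not part of the commit:

import torch
from lambo.subwords.model import LamboSubwordNetwork

toy_dict = {'<PAD>': 0, '<UNK>': 1, 'a': 2, 'b': 3, 'c': 4}   # invented
net = LamboSubwordNetwork(max_len=8, dict=toy_dict)

x = torch.randint(0, len(toy_dict), (2, 8))   # BxL character indices
y = torch.randint(0, len(toy_dict), (2, 8))   # BxL target character indices
pred = net(x)
assert pred.shape == (2, 8, len(toy_dict))    # BxLxV log-probabilities
loss = net.compute_loss(pred, y, None)        # scalar negative log-likelihood
print(float(loss))

# Inverted embedding check: dot products of L2-normalised vectors equal cosine similarities
reduced = torch.randn(2, 8, net.embedding_layer.embedding_dim)
scores = LamboSubwordNetwork.inverted_embedding(reduced, net.embedding_layer)
reference = torch.nn.functional.cosine_similarity(reduced[0, 0], net.embedding_layer.weight[3], dim=0)
assert torch.allclose(scores[0, 0, 3], reference, atol=1e-5)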
import random
import torch
from torch.utils.data import TensorDataset, DataLoader
def encode_test(text, dictionary, maximum_length):
    """
    Encode a single token text as character indices, padded with ``<PAD>`` up to ``maximum_length``.
    """
    Xchar = [dictionary[char] if char in dictionary else dictionary['<UNK>'] for char in text]
    Xchar += [dictionary['<PAD>']] * (maximum_length - (len(Xchar) % maximum_length))
    Xchar = Xchar[:maximum_length]
    return (torch.Tensor([Xchar]).to(torch.int64),)
def encode_subwords(documents, dictionary, maximum_length):
"""
Encode subwords as neural network inputs and outputs
:param documents: list of documents
:param dictionary: character dictionary
:param maximum_length: maximum length of network input and output
    :return: a pair of lists: input character encodings and target character encodings (subword characters separated by ``<PAD>``)
"""
tokenCount = 0
multiwordCount=0
for document in documents:
for turn in document.turns:
for sentence in turn.sentences:
for token in sentence.tokens:
tokenCount+=1
if token.is_multi_word:
multiwordCount+=1
thrs = multiwordCount / tokenCount
Xchars = []
Ychars = []
for document in documents:
for turn in document.turns:
for sentence in turn.sentences:
for token in sentence.tokens:
r=random.random()
if token.is_multi_word or r < thrs:
Xchar = [dictionary[char] if char in dictionary else dictionary['<UNK>'] for char in token.text]
Xchar += [dictionary['<PAD>']] * (maximum_length - (len(Xchar) % maximum_length))
Xchar = Xchar[:maximum_length]
subwords=token.subwords
if len(subwords)==0:
subwords=[token.text]
Ychar=[]
for subword in subwords:
if len(Ychar)!=0:
Ychar += [dictionary['<PAD>']]
Ychar+= [dictionary[char] if char in dictionary else dictionary['<UNK>'] for char in
subword]
Ychar += [dictionary['<PAD>']] * (maximum_length - (len(Ychar) % maximum_length))
Ychar=Ychar[:maximum_length]
Xchars+= [Xchar]
Ychars+= [Ychar]
return Xchars, Ychars
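To make the target encoding above concrete: the characters of consecutive subwords are concatenated with single <PAD> indices as separators, and the result is padded to the maximum length. A hand-worked sketch with an invented toy dictionary, for a Spanish-style multi-word token 'del' split into 'de' + 'el' (illustration only):

# Invented toy dictionary; real encodings use the segmenter's dictionary
dictionary = {'<PAD>': 0, '<UNK>': 1, 'd': 2, 'e': 3, 'l': 4}
maximum_length = 8
subwords = ['de', 'el']

Ychar = []
for subword in subwords:
    if len(Ychar) != 0:
        Ychar += [dictionary['<PAD>']]   # separator between subwords
    Ychar += [dictionary[char] if char in dictionary else dictionary['<UNK>'] for char in subword]
Ychar += [dictionary['<PAD>']] * (maximum_length - (len(Ychar) % maximum_length))
Ychar = Ychar[:maximum_length]
print(Ychar)   # [2, 3, 0, 3, 4, 0, 0, 0] -> 'd', 'e', <PAD>, 'e', 'l', then padding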
def prepare_subwords_dataloaders(train_docs, test_docs, max_len, batch_size, dict):
"""
    Prepare PyTorch dataloaders for the documents.
:param train_docs: list of training documents
:param test_docs: list of test documents
:param max_len: maximum length of network input
:param batch_size: batch size
    :param dict: character dictionary
    :return: a pair with train dataloader and test dataloader (or ``None, None`` if there is not enough data)
"""
train_X_char, train_Y = encode_subwords(train_docs, dict, max_len)
if len(train_X_char)<256:
return None, None
train_X_char = torch.Tensor(train_X_char).to(torch.int64)
train_Y = torch.Tensor(train_Y).to(torch.int64)
train_dataset = TensorDataset(train_X_char, train_Y)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_X_char, test_Y = encode_subwords(test_docs, dict, max_len)
if len(test_X_char)<64:
return None, None
test_X_char = torch.Tensor(test_X_char).to(torch.int64)
test_Y = torch.Tensor(test_Y).to(torch.int64)
test_dataset = TensorDataset(test_X_char, test_Y)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
return train_dataloader, test_dataloader
import torch
from lambo.segmenter.lambo import Lambo
from lambo.subwords.preprocessing import encode_test
class LamboSplitter():
    """
    LAMBO subword splitter, applying a trained subword model to divide tokens into subwords.
    """
@classmethod
def from_path(cls, model_path, model_name):
"""
Obtain a LAMBO subword splitter by reading a model from a given path.
:param model_path: directory including the model files
:param model_name: model name
        :return: a new LamboSplitter instance
"""
model = torch.load(model_path / (model_name + '_subwords.pth'), map_location=torch.device('cpu'))
dict = Lambo.read_dict(model_path / (model_name + '.dict'))
return cls(model, dict)
def __init__(self, model, dict):
"""
Create a new LAMBO subword splitter from a given model and dictionary.
:param model: prediction Pytorch model
:param dict: dictionary
"""
self.model = model
self.dict = dict
self.inv_dict = {dict[key]: key for key in dict}
    def split(self, token_text):
        """
        Split a token's text into subwords based on the model's predictions.
        :param token_text: text of the token to split
        :return: list of subwords (a single-element list with the whole token if no valid split is found)
        """
if len(token_text)>=self.model.max_len:
return [token_text]
Xs = encode_test(token_text, self.dict, self.model.max_len)
with torch.no_grad():
Y = self.model(*Xs)
codes= Y.argmax(2).numpy()[0]
decisions = [ self.inv_dict[code] for code in codes]
result = ['']
for char in decisions:
if len(char)==1:
result[-1]+=char
elif char=='<PAD>':
if result[-1]=='':
break
result.append('')
else:
return [token_text]
result = [subword for subword in result if subword!='']
if len(result)==0:
return [token_text]
return result
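A usage sketch for the splitter above; the model path and name are placeholders for any trained LAMBO model directory containing the matching _subwords.pth and .dict files, and the module path lambo.subwords.splitter is assumed to be where this class lives:

from pathlib import Path
from lambo.subwords.splitter import LamboSplitter

model_path = Path('models')   # placeholder directory with <model_name>_subwords.pth and <model_name>.dict
model_name = 'my_model'       # placeholder model name

splitter = LamboSplitter.from_path(model_path, model_name)
print(splitter.split('del'))  # e.g. ['de', 'el'] if the model predicts a split, otherwise ['del']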
from lambo.learning.train import tune
from lambo.subwords.model import LamboSubwordNetwork
from lambo.subwords.preprocessing import prepare_subwords_dataloaders
from lambo.utils.ud_reader import read_treebank
import torch
def train_subwords_and_save(model_name, treebank_path, save_path, lambo_segmenter, epochs=20, device='cpu'):
"""
Train a new LAMBO subwords model and save it in filesystem.
:param model_name: type of model trained, currently only ``LAMBO-BILSTM`` is supported
:param treebank_path: path to the treebank training data
:param save_path: path to save the generated model
:param lambo_segmenter: LAMBO segmenter to base on
    :param epochs: number of epochs to run for (default: 20)
:param device: the device to use for computation
:return: no value returned
"""
if model_name not in ['LAMBO-BILSTM']:
print(" Unrecognised model name: " + model_name)
return
print("Reading data.")
train_doc, dev_doc, test_doc = read_treebank(treebank_path, True)
print("Preparing data")
BATCH_SIZE = 32
print("Initiating the model.")
MAX_LEN = 32
train_dataloader, test_dataloader = prepare_subwords_dataloaders([train_doc, dev_doc], [test_doc],
MAX_LEN,
BATCH_SIZE,lambo_segmenter.dict)
if train_dataloader is None:
print("Not enough data to train, moving on.")
return
model = LamboSubwordNetwork(MAX_LEN, lambo_segmenter.dict, lambo_segmenter.model)
tune(model, train_dataloader, test_dataloader, epochs, device)
print("Saving")
torch.save(model, save_path / (treebank_path.name + '_subwords.pth'))
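Finally, a sketch of how the function above might be invoked end to end; the paths are placeholders, the module path lambo.subwords.train is assumed, and obtaining the base segmenter via Lambo.get is an assumed convenience API (any object exposing .dict and .model would work):

from pathlib import Path
from lambo.segmenter.lambo import Lambo
from lambo.subwords.train import train_subwords_and_save

treebank_path = Path('data/UD_Spanish-AnCora')   # placeholder UD treebank directory
save_path = Path('models')                       # placeholder output directory

lambo_segmenter = Lambo.get('Spanish')           # assumed API for obtaining a ready segmenter

train_subwords_and_save('LAMBO-BILSTM', treebank_path, save_path, lambo_segmenter,
                        epochs=20, device='cpu')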