Skip to content
Snippets Groups Projects
Commit 9fcf1532 authored by piotrmp's avatar piotrmp
Browse files

Subword splitting implementation.

parent 183cf56d
Branches
Tags
1 merge request!2Multiword generation
"""
Script for training LAMBO subword splitting models using UD data and segmenters loaded from pretrained LAMBO models
"""
import sys
from pathlib import Path
import importlib_resources as resources
import torch
from lambo.learning.train import train_new_and_save, train_pretrained_and_save
from lambo.segmenter.lambo import Lambo
from lambo.subwords.train import train_subwords_and_save
def parse_languages_file(text):
    """Return the usable entries from the text of the languages file.

    Every line is stripped of surrounding whitespace; blank lines and lines
    whose first non-blank character is ``#`` are dropped. Filtering after
    stripping means an empty line (e.g. the one produced by a trailing
    newline) no longer raises ``IndexError``.

    :param text: full content of the languages file as a single string
    :return: list of stripped, non-empty, non-comment lines in file order
    """
    stripped = (line.strip() for line in text.split('\n'))
    return [line for line in stripped if line and not line.startswith('#')]


if __name__ == '__main__':
    # Input treebanks, output directory for subword models, and the directory
    # holding the segmentation models loaded below.
    treebanks = Path.home() / 'data/lambo/ud-treebanks-v2.11/'
    outpath = Path.home() / 'data/lambo/models/subword/'
    segmenting_path = Path.home() / 'data/lambo/models/full211-s/'
    # NOTE(review): device is selected here but never passed to the training
    # call below -- confirm whether train_subwords_and_save should receive it.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    languages_file_str = resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict')
    lines = parse_languages_file(languages_file_str)

    for i, line in enumerate(lines):
        # Optional sharding of the language list across parallel runs:
        # if len(sys.argv)>4 and i % 5 != int(sys.argv[4]):
        #     continue
        # The first whitespace-separated field of an entry is the model name.
        model = line.split()[0]
        # Skip models whose output file is already present in outpath.
        if (outpath / (model + '_subwords.pth')).exists():
            continue
        print(str(i) + '/' + str(len(lines)) + '========== ' + model + ' ==========')
        inpath = treebanks / model
        segmenter = Lambo.from_path(segmenting_path, model)
        train_subwords_and_save('LAMBO-BILSTM', inpath, outpath, segmenter, epochs=20)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment