# run_training_splitting.py

"""
Script for training LAMBO subword splitting models on UD treebank data,
using pretrained LAMBO segmentation models.
"""
import sys
from pathlib import Path

import importlib_resources as resources
import torch

from lambo.learning.train import train_new_and_save, train_pretrained_and_save
from lambo.segmenter.lambo import Lambo
from lambo.subwords.train import train_subwords_and_save

def parse_language_lines(text):
    """Return the usable entries of a languages listing.

    The text is split into lines; blank lines and lines whose first
    non-whitespace character is ``#`` are dropped, and the remaining lines
    are returned stripped of surrounding whitespace, in their original order.

    :param text: full contents of the languages file
    :return: list of stripped, non-empty, non-comment lines
    """
    stripped_lines = (line.strip() for line in text.splitlines())
    # Blank lines (e.g. the one produced by a trailing newline) must be
    # filtered out here: indexing into them would raise IndexError.
    return [line for line in stripped_lines if line and not line.startswith('#')]


def main():
    """Train and save a subword splitting model for every listed treebank.

    For each entry of ``languages.txt`` (package ``lambo.resources``) the
    first whitespace-separated token is taken as the model name. Models whose
    ``<model>_subwords.pth`` file already exists in the output directory are
    skipped, so an interrupted run can simply be restarted.
    """
    treebanks = Path.home() / 'data/lambo/ud-treebanks-v2.11/'
    outpath = Path.home() / 'data/lambo/models/subword/'
    segmenting_path = Path.home() / 'data/lambo/models/full211-s/'

    # NOTE(review): no device is passed to the calls below, so loading and
    # training use whatever device those helpers pick by default.
    languages_file_str = resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict')
    lines = parse_language_lines(languages_file_str)
    total = len(lines)

    for i, line in enumerate(lines):
        # Optional manual sharding (disabled): handle only every 5th entry,
        # with the shard index taken from the command line.
        #if len(sys.argv)>4 and i % 5 != int(sys.argv[4]):
        #    continue
        model = line.split()[0]
        if (outpath / (model + '_subwords.pth')).exists():
            # Already trained in a previous run.
            continue
        print(f'{i}/{total}========== {model} ==========')
        inpath = treebanks / model
        # The pretrained segmenter for the same model name is handed to the
        # subword trainer.
        segmenter = Lambo.from_path(segmenting_path, model)
        train_subwords_and_save('LAMBO-BILSTM', inpath, outpath, segmenter, epochs=20)


if __name__ == '__main__':
    main()