Subword splitting implementation.

9fcf1532 · piotrmp · 183cf56d · 9fcf1532
Commit 9fcf1532 authored Dec 13, 2022 by piotrmp
--- a/src/lambo/examples/run_training_splitting.py
+++ b/src/lambo/examples/run_training_splitting.py
+"""
+Script for training LAMBO subword splitting models using UD data and pretrained segmentation models
+"""
+import sys
+from pathlib import Path
+import importlib_resources as resources
+import torch
+from lambo.learning.train import train_new_and_save, train_pretrained_and_save
+from lambo.segmenter.lambo import Lambo
+from lambo.subwords.train import train_subwords_and_save
+if __name__=='__main__':
+    # Train one subword-splitting model for every entry listed in the packaged
+    # languages.txt resource. Entries for which '<model>_subwords.pth' already
+    # exists in outpath are skipped, so re-running the script does not redo them.
+    treebanks = Path.home() / 'data/lambo/ud-treebanks-v2.11/'
+    outpath = Path.home() / 'data/lambo/models/subword/'
+    segmenting_path = Path.home() / 'data/lambo/models/full211-s/'
+    # NOTE(review): device is selected here but is not passed to any call below.
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    languages_file_str = resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict')
+    # Strip first, then filter: the previous check indexed line[0] on the raw line,
+    # which raised IndexError on blank lines (e.g. the empty string produced by a
+    # trailing newline) and let indented '#' comment lines through.
+    stripped_lines = (line.strip() for line in languages_file_str.split('\n'))
+    lines = [line for line in stripped_lines if line and not line.startswith('#')]
+    for i, line in enumerate(lines):
+        # Disabled sharding switch: when enabled, only entries whose index modulo 5
+        # equals int(sys.argv[4]) are processed.
+        #if len(sys.argv)>4 and i % 5 != int(sys.argv[4]):
+        #    continue
+        parts = line.split()
+        # The first whitespace-separated field is used both as the treebank
+        # directory name and as the model name.
+        model = parts[0]
+        if (outpath / (model + '_subwords.pth')).exists():
+            continue
+        print(str(i) + '/' + str(len(lines)) + '========== ' + model + ' ==========')
+        inpath = treebanks / model
+        # Load the previously trained segmenter for this model from segmenting_path.
+        segmenter = Lambo.from_path(segmenting_path, model)
+        train_subwords_and_save('LAMBO-BILSTM', inpath, outpath, segmenter, epochs=20)