Skip to content
Snippets Groups Projects
Select Git revision
  • 187515c26c6022a9a20fb1e6c6e653b8f4578c7e
  • master default protected
  • vertical_relations
  • lu_without_semantic_frames
  • hierarchy
  • additional-unification-filters
  • v0.1.1
  • v0.1.0
  • v0.0.9
  • v0.0.8
  • v0.0.7
  • v0.0.6
  • v0.0.5
  • v0.0.4
  • v0.0.3
  • v0.0.2
  • v0.0.1
17 results

__init__.py

Blame
  • run_training_splitting.py 1.36 KiB
    """
    Script for training LAMBO subword splitting models on UD treebank data, using previously trained LAMBO segmentation models
    """
    import sys
    from pathlib import Path
    
    import importlib_resources as resources
    import torch
    
    from lambo.learning.train import train_new_and_save, train_pretrained_and_save
    from lambo.segmenter.lambo import Lambo
    from lambo.subwords.train import train_subwords_and_save
    
    if __name__ == '__main__':
        # Entry point: for every model listed in the packaged languages.txt,
        # load its segmenter and train a subword splitting model on the matching
        # treebank directory, skipping models whose output file already exists.
        treebanks = Path.home() / 'data/lambo/ud-treebanks-v2.11/'
        outpath = Path.home() / 'data/lambo/models/subword/'
        segmenting_path = Path.home() / 'data/lambo/models/full211-s/'

        # NOTE(review): `device` is computed but never passed to the training
        # call below -- confirm whether train_subwords_and_save accepts a device
        # argument and should receive this one.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        languages_file_str = resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict')
        # Strip before filtering so that blank lines (e.g. the one produced by a
        # trailing newline) and indented comments are skipped instead of raising
        # IndexError or being mistaken for a model name.
        stripped_lines = (raw_line.strip() for raw_line in languages_file_str.splitlines())
        lines = [line for line in stripped_lines if line and not line.startswith('#')]

        for i, line in enumerate(lines):
            # Disabled sharding switch: process only every 5th entry, selected
            # by the 4th command-line argument.
            #if len(sys.argv)>4 and i % 5 != int(sys.argv[4]):
            #    continue
            # The model name is the first whitespace-separated field of the line.
            model = line.split()[0]
            if (outpath / (model + '_subwords.pth')).exists():
                # Already trained in a previous run; do not retrain.
                continue
            print(f'{i}/{len(lines)}========== {model} ==========')
            inpath = treebanks / model
            segmenter = Lambo.from_path(segmenting_path, model)
            train_subwords_and_save('LAMBO-BILSTM', inpath, outpath, segmenter, epochs=20)