Skip to content
Snippets Groups Projects
Select Git revision
  • 75ef7daa3ce44abc7b7a716a6570e9e44420a442
  • master default protected
  • fix-words-ann
  • wccl-rules-migration
  • develop
5 results

logicalpredicate.cpp

Blame
  • parfolds.py 2.43 KiB
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    
    # Copyright (C) 2012 Adam Radziszewski.
    # This program is free software; you can redistribute and/or modify it
    # under the terms of the GNU Lesser General Public License as published by the Free
    # Software Foundation; either version 3 of the License, or (at your option)
    # any later version.
    #
    # This program is distributed in the hope that it will be useful, but
    # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    # or FITNESS FOR A PARTICULAR PURPOSE.
    #
    # See the LICENCE and COPYING files for more details
    
    # Usage/help text passed to OptionParser(usage=...) in go(); optparse
    # replaces "%prog" with the name of the running script.
    descr = """%prog [options] INPUT OUTDIR
    
    Generates paragraph-wise folds."""
    
    from optparse import OptionParser
    import sys, codecs, os
    import corpus2
    
    def go():
    	"""Split the input corpus into chunk-wise cross-validation folds.

    	Command-line arguments (see the module-level usage string ``descr``):
    	INPUT is the corpus to read and OUTDIR is the directory where the fold
    	files are written. For K folds, files ``test01.xml`` .. ``testKK.xml``
    	and ``train01.xml`` .. ``trainKK.xml`` are created inside OUTDIR.

    	Chunks (paragraphs) are dealt out round-robin: the i-th chunk read
    	(counting from 0) is written to the test file of fold ``i % K`` and to
    	the training files of every other fold.

    	Exits with status 1 when INPUT/OUTDIR are not both given, and via
    	``parser.error`` (usage message, status 2) when the requested number
    	of folds is smaller than 1.
    	"""
    	parser = OptionParser(usage=descr)
    	parser.add_option('-i', '--input-format', type='string', action='store',
    		dest='input_format', default='xces',
    		help='set the input format; default: xces')
    	parser.add_option('-o', '--output-format', type='string', action='store',
    		dest='output_format', default='xces',
    		help='set the output format; default: xces')
    	parser.add_option('-t', '--tagset', type='string', action='store',
    		dest='tagset', default='nkjp',
    		help='set the tagset used in input; default: nkjp')
    	# The default is a real int so it matches the declared option type.
    	parser.add_option('-f', '--num-folds', type='int', action='store',
    		dest='num_folds', default=10,
    		help='set the number of folds (default: 10)')

    	(options, args) = parser.parse_args()
    	if len(args) != 2:
    		# Diagnostics belong on stderr so they never mix with piped output.
    		sys.stderr.write('Need to provide input and output.\n')
    		sys.stderr.write('See --help for details.\n\n')
    		sys.exit(1)

    	num_folds = options.num_folds
    	if num_folds < 1:
    		# Without this check, 0 folds crashes on fold_test[0] and a negative
    		# count silently creates no writers before failing the same way.
    		parser.error('number of folds must be at least 1 (got %d)' % num_folds)

    	fn_input, fold_dir = args

    	tagset = corpus2.get_named_tagset(options.tagset)
    	rdr = corpus2.TokenReader.create_path_reader(
    		options.input_format, tagset, fn_input)

    	def open_writers(prefix):
    		# One writer per fold; file names are numbered from 01.
    		return [corpus2.TokenWriter.create_path_writer(
    				options.output_format,
    				os.path.join(fold_dir, '%s%02d.xml' % (prefix, num + 1)),
    				tagset)
    				for num in range(num_folds)]

    	fold_test = open_writers('test')
    	fold_train = open_writers('train')

    	fold_now = 0
    	while True:
    		par = rdr.get_next_chunk()
    		if not par:
    			# A false-y chunk is treated as the end of the input.
    			break
    		# The chunk is test material for the current fold only...
    		fold_test[fold_now].write_chunk(par)
    		# ...and training material for every other fold.
    		for other_num, train_writer in enumerate(fold_train):
    			if other_num != fold_now:
    				train_writer.write_chunk(par)
    		fold_now = (fold_now + 1) % num_folds

    	# Finish the test writers first, then the training writers.
    	for w in fold_test + fold_train:
    		w.finish()
    
    if __name__ == "__main__":
    	# Run the fold generator only when executed as a script, not on import.
    	go()