Skip to content
Snippets Groups Projects
Commit 380e5964 authored by Adam Radziszewski's avatar Adam Radziszewski
Browse files

script for generating xval folds

parent 6e2603f2
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/python
# -*- coding: utf-8 -*-
descr = """%prog [options] INPUT OUTDIR
Generates paragraph-wise folds."""
from optparse import OptionParser
import sys, codecs, os
import corpus2
def go():
parser = OptionParser(usage=descr)
parser.add_option('-i', '--input-format', type='string', action='store',
dest='input_format', default='xces',
help='set the input format; default: xces')
parser.add_option('-o', '--output-format', type='string', action='store',
dest='output_format', default='xces',
help='set the output format; default: xces')
parser.add_option('-t', '--tagset', type='string', action='store',
dest='tagset', default='nkjp',
help='set the tagset used in input; default: nkjp')
parser.add_option('-f', '--num-folds', type='int', action='store',
dest='num_folds', default='10',
help='set the number of folds (default: 10)')
(options, args) = parser.parse_args()
if len(args) != 2:
print 'Need to provide input and output.'
print 'See --help for details.'
print
sys.exit(1)
fold_nums = range(options.num_folds)
fn_input, fold_dir = args
tagset = corpus2.get_named_tagset(options.tagset)
rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input)
fold_test = [corpus2.TokenWriter.create_path_writer(
options.output_format,
os.path.join(fold_dir, 'test%02d.xml' % (num + 1)), tagset)
for num in fold_nums]
fold_train = [corpus2.TokenWriter.create_path_writer(
options.output_format,
os.path.join(fold_dir, 'train%02d.xml' % (num + 1)), tagset)
for num in fold_nums]
first = True
fold_now = 0
while True:
par = rdr.get_next_chunk()
if not par:
break
fold_test[fold_now].write_chunk(par)
for other_num in fold_nums:
if other_num != fold_now:
fold_train[other_num].write_chunk(par)
fold_now = (fold_now + 1) % options.num_folds
for w in fold_test: w.finish()
for w in fold_train: w.finish()
if __name__ == '__main__':
go()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment