Skip to content
Snippets Groups Projects
Commit 380e5964 authored by Adam Radziszewski's avatar Adam Radziszewski
Browse files

script for generating xval folds

parent 6e2603f2
No related merge requests found
#!/usr/bin/python
# -*- coding: utf-8 -*-
descr = """%prog [options] INPUT OUTDIR
Generates paragraph-wise folds."""
from optparse import OptionParser
import sys, codecs, os
import corpus2
def go():
parser = OptionParser(usage=descr)
parser.add_option('-i', '--input-format', type='string', action='store',
dest='input_format', default='xces',
help='set the input format; default: xces')
parser.add_option('-o', '--output-format', type='string', action='store',
dest='output_format', default='xces',
help='set the output format; default: xces')
parser.add_option('-t', '--tagset', type='string', action='store',
dest='tagset', default='nkjp',
help='set the tagset used in input; default: nkjp')
parser.add_option('-f', '--num-folds', type='int', action='store',
dest='num_folds', default='10',
help='set the number of folds (default: 10)')
(options, args) = parser.parse_args()
if len(args) != 2:
print 'Need to provide input and output.'
print 'See --help for details.'
print
sys.exit(1)
fold_nums = range(options.num_folds)
fn_input, fold_dir = args
tagset = corpus2.get_named_tagset(options.tagset)
rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input)
fold_test = [corpus2.TokenWriter.create_path_writer(
options.output_format,
os.path.join(fold_dir, 'test%02d.xml' % (num + 1)), tagset)
for num in fold_nums]
fold_train = [corpus2.TokenWriter.create_path_writer(
options.output_format,
os.path.join(fold_dir, 'train%02d.xml' % (num + 1)), tagset)
for num in fold_nums]
first = True
fold_now = 0
while True:
par = rdr.get_next_chunk()
if not par:
break
fold_test[fold_now].write_chunk(par)
for other_num in fold_nums:
if other_num != fold_now:
fold_train[other_num].write_chunk(par)
fold_now = (fold_now + 1) % options.num_folds
for w in fold_test: w.finish()
for w in fold_train: w.finish()
if __name__ == '__main__':
go()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment