# parfolds.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

# Copyright (C) 2012 Adam Radziszewski.
# This program is free software; you can redistribute and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the LICENCE and COPYING files for more details

# Usage banner handed to OptionParser in go(); optparse expands %prog to the
# script name when it prints usage/help.
descr = (
	'%prog [options] INPUT OUTDIR\n'
	'\n'
	'Generates paragraph-wise folds.'
)

from optparse import OptionParser
import sys, codecs, os
import corpus2

def go():
	"""Split a corpus into paragraph-wise cross-validation folds.

	Reads INPUT chunk by chunk (one chunk per paragraph) and deals the
	chunks round-robin over the folds: every chunk is written to the test
	part of exactly one fold and to the training part of all the other
	folds. Output files are created in OUTDIR as testNN.xml / trainNN.xml,
	with NN counted from 01. OUTDIR is created if it does not exist yet.

	Arguments are taken from the command line (sys.argv). The process exits
	with status 1 when INPUT and OUTDIR are not both given, and with
	status 2 (through parser.error) when the number of folds is below 1.
	"""
	parser = OptionParser(usage=descr)
	parser.add_option('-i', '--input-format', type='string', action='store',
		dest='input_format', default='xces',
		help='set the input format; default: xces')
	parser.add_option('-o', '--output-format', type='string', action='store',
		dest='output_format', default='xces',
		help='set the output format; default: xces')
	parser.add_option('-t', '--tagset', type='string', action='store',
		dest='tagset', default='nkjp',
		help='set the tagset used in input; default: nkjp')
	parser.add_option('-f', '--num-folds', type='int', action='store',
		dest='num_folds', default=10,
		help='set the number of folds (default: 10)')

	(options, args) = parser.parse_args()
	if len(args) != 2:
		# Diagnostics go to stderr so they never mix with regular output.
		sys.stderr.write('Need to provide input and output.\n')
		sys.stderr.write('See --help for details.\n')
		sys.stderr.write('\n')
		sys.exit(1)
	if options.num_folds < 1:
		# With no folds the writer lists below would be empty and the first
		# paragraph would fail with an IndexError; reject the value up front.
		parser.error('number of folds must be at least 1')

	fold_nums = range(options.num_folds)
	fn_input, fold_dir = args

	tagset = corpus2.get_named_tagset(options.tagset)
	rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input)

	# Create the output directory only after the reader has been opened, so
	# that a bad input does not leave an empty directory behind. An empty
	# OUTDIR string means "current directory" and needs no creation.
	if fold_dir and not os.path.isdir(fold_dir):
		os.makedirs(fold_dir)

	fold_test = [corpus2.TokenWriter.create_path_writer(
			options.output_format,
			os.path.join(fold_dir, 'test%02d.xml' % (num + 1)), tagset)
			for num in fold_nums]
	fold_train = [corpus2.TokenWriter.create_path_writer(
			options.output_format,
			os.path.join(fold_dir, 'train%02d.xml' % (num + 1)), tagset)
			for num in fold_nums]

	# Index of the fold whose test part receives the next paragraph.
	fold_now = 0
	while True:
		par = rdr.get_next_chunk()
		if not par:
			# A false-valued chunk marks the end of the input.
			break
		fold_test[fold_now].write_chunk(par)
		# The same paragraph is training material for every other fold.
		for other_num, train_writer in enumerate(fold_train):
			if other_num != fold_now:
				train_writer.write_chunk(par)

		fold_now = (fold_now + 1) % options.num_folds

	# Finish the test writers first, then the training writers.
	for w in fold_test + fold_train:
		w.finish()

# Script entry point: run the fold generator only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
	go()