From 831e4113a4a74ef93f39a0d4c90f50e06df6f201 Mon Sep 17 00:00:00 2001
From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl>
Date: Thu, 11 Jul 2013 14:41:30 +0200
Subject: [PATCH] parfolds script now shuffles sentences prior to
 redistribution

---
 utils/parfolds.py | 49 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 6 deletions(-)

diff --git a/utils/parfolds.py b/utils/parfolds.py
index e49b8e0..9f4d6e9 100755
--- a/utils/parfolds.py
+++ b/utils/parfolds.py
@@ -15,10 +15,19 @@
 
 descr = """%prog [options] INPUT OUTDIR
 
-Generates paragraph-wise folds."""
+Generates paragraph-wise folds by reading given input file (one possibly large
+file containing the whole corpus). The folds are written into OUTDIR.
+You may override the default seedword with any string you like using -s option.
+Seedword's hash will be used as seed for pseudo-random number generator, which
+in turn controlls the way input corpus paragraphs are shuffled before being
+dispatched into folds. If you want to be able to reproduce your experiments
+later, you may want to write down the seedword and include it in a log/readme
+file where you store your folds and experimental results obtained
+(along with corpus version etc.).
+"""
 
 from optparse import OptionParser
-import sys, codecs, os
+import sys, codecs, os, random
 import corpus2
 
 def go():
@@ -35,10 +44,16 @@ def go():
 	parser.add_option('-f', '--num-folds', type='int', action='store',
 		dest='num_folds', default='10',
 		help='set the number of folds (default: 10)')
+	parser.add_option('-s', '--seed-word', type='string', action='store',
+		dest='seedword', default='korpus',
+		help='set the seedword; default: korpus')
+	parser.add_option('-v', '--verbose', action='store_true',
+		dest='verbose', default=False,
+		help='verbose mode')
 	
 	(options, args) = parser.parse_args()
 	if len(args) != 2:
-		print 'Need to provide input and output.'
+		print 'Need to provide input file and output dir.'
 		print 'See --help for details.'
 		print
 		sys.exit(1)
@@ -47,6 +62,26 @@ def go():
 	fn_input, fold_dir = args
 	
 	tagset = corpus2.get_named_tagset(options.tagset)
+	# count paragraphs in input
+	if options.verbose:
+		sys.stderr.write('Counting paragraphs... ')
+	rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input)
+	num_pars = 0
+	while True:
+		par = rdr.get_next_chunk()
+		if not par:
+			break
+		num_pars += 1
+	del rdr
+	if options.verbose:
+		sys.stderr.write('%d\n' % num_pars)
+	# prepare index -- where to send ith paragraph
+	rnd = random.Random(options.seedword)
+	fold_of_par = [(par_idx % options.num_folds) for par_idx in xrange(num_pars)]
+	rnd.shuffle(fold_of_par)
+	# now the real run
+	if options.verbose:
+		sys.stderr.write('Generating folds...\n')
 	rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input)
 	fold_test = [corpus2.TokenWriter.create_path_writer(
 			options.output_format,
@@ -57,18 +92,20 @@ def go():
 			os.path.join(fold_dir, 'train%02d.xml' % (num + 1)), tagset)
 			for num in fold_nums]
 	first = True
-	fold_now = 0
+	par_now = 0
 	while True:
 		par = rdr.get_next_chunk()
 		if not par:
 			break
+		fold_now = fold_of_par[par_now]
 		fold_test[fold_now].write_chunk(par)
 		for other_num in fold_nums:
 			if other_num != fold_now:
 				fold_train[other_num].write_chunk(par)
 		
-		fold_now = (fold_now + 1) % options.num_folds
-		
+		#fold_now = (fold_now + 1) % options.num_folds
+		par_now += 1
+	del rdr
 	for w in fold_test: w.finish()
 	for w in fold_train: w.finish()
 
-- 
GitLab