From 831e4113a4a74ef93f39a0d4c90f50e06df6f201 Mon Sep 17 00:00:00 2001 From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl> Date: Thu, 11 Jul 2013 14:41:30 +0200 Subject: [PATCH] parfolds script now shuffles sentences prior to redistribution --- utils/parfolds.py | 49 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/utils/parfolds.py b/utils/parfolds.py index e49b8e0..9f4d6e9 100755 --- a/utils/parfolds.py +++ b/utils/parfolds.py @@ -15,10 +15,19 @@ descr = """%prog [options] INPUT OUTDIR -Generates paragraph-wise folds.""" +Generates paragraph-wise folds by reading given input file (one possibly large +file containing the whole corpus). The folds are written into OUTDIR. +You may override the default seedword with any string you like using -s option. +Seedword's hash will be used as seed for pseudo-random number generator, which +in turn controlls the way input corpus paragraphs are shuffled before being +dispatched into folds. If you want to be able to reproduce your experiments +later, you may want to write down the seedword and include it in a log/readme +file where you store your folds and experimental results obtained +(along with corpus version etc.). +""" from optparse import OptionParser -import sys, codecs, os +import sys, codecs, os, random import corpus2 def go(): @@ -35,10 +44,16 @@ def go(): parser.add_option('-f', '--num-folds', type='int', action='store', dest='num_folds', default='10', help='set the number of folds (default: 10)') + parser.add_option('-s', '--seed-word', type='string', action='store', + dest='seedword', default='korpus', + help='set the seedword; default: korpus') + parser.add_option('-v', '--verbose', action='store_true', + dest='verbose', default=False, + help='verbose mode') (options, args) = parser.parse_args() if len(args) != 2: - print 'Need to provide input and output.' + print 'Need to provide input file and output dir.' print 'See --help for details.' print sys.exit(1) @@ -47,6 +62,26 @@ def go(): fn_input, fold_dir = args tagset = corpus2.get_named_tagset(options.tagset) + # count paragraphs in input + if options.verbose: + sys.stderr.write('Counting paragraphs... ') + rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input) + num_pars = 0 + while True: + par = rdr.get_next_chunk() + if not par: + break + num_pars += 1 + del rdr + if options.verbose: + sys.stderr.write('%d\n' % num_pars) + # prepare index -- where to send ith paragraph + rnd = random.Random(options.seedword) + fold_of_par = [(par_idx % options.num_folds) for par_idx in xrange(num_pars)] + rnd.shuffle(fold_of_par) + # now the real run + if options.verbose: + sys.stderr.write('Generating folds...\n') rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input) fold_test = [corpus2.TokenWriter.create_path_writer( options.output_format, @@ -57,18 +92,20 @@ def go(): os.path.join(fold_dir, 'train%02d.xml' % (num + 1)), tagset) for num in fold_nums] first = True - fold_now = 0 + par_now = 0 while True: par = rdr.get_next_chunk() if not par: break + fold_now = fold_of_par[par_now] fold_test[fold_now].write_chunk(par) for other_num in fold_nums: if other_num != fold_now: fold_train[other_num].write_chunk(par) - fold_now = (fold_now + 1) % options.num_folds - + #fold_now = (fold_now + 1) % options.num_folds + par_now += 1 + del rdr for w in fold_test: w.finish() for w in fold_train: w.finish() -- GitLab