Skip to content
Snippets Groups Projects
Commit 831e4113 authored by Adam Radziszewski's avatar Adam Radziszewski
Browse files

parfolds script now shuffles paragraphs prior to redistribution into folds

parent 036c4433
Branches
No related merge requests found
...@@ -15,10 +15,19 @@ ...@@ -15,10 +15,19 @@
descr = """%prog [options] INPUT OUTDIR descr = """%prog [options] INPUT OUTDIR
Generates paragraph-wise folds.""" Generates paragraph-wise folds by reading given input file (one possibly large
file containing the whole corpus). The folds are written into OUTDIR.
You may override the default seedword with any string you like using the -s option.
The seedword's hash will be used as the seed for the pseudo-random number generator, which
in turn controls the way input corpus paragraphs are shuffled before being
dispatched into folds. If you want to be able to reproduce your experiments
later, you may want to write down the seedword and include it in a log/readme
file where you store your folds and the experimental results obtained
(along with the corpus version etc.).
"""
from optparse import OptionParser from optparse import OptionParser
import sys, codecs, os import sys, codecs, os, random
import corpus2 import corpus2
def go(): def go():
...@@ -35,10 +44,16 @@ def go(): ...@@ -35,10 +44,16 @@ def go():
parser.add_option('-f', '--num-folds', type='int', action='store', parser.add_option('-f', '--num-folds', type='int', action='store',
dest='num_folds', default='10', dest='num_folds', default='10',
help='set the number of folds (default: 10)') help='set the number of folds (default: 10)')
parser.add_option('-s', '--seed-word', type='string', action='store',
dest='seedword', default='korpus',
help='set the seedword; default: korpus')
parser.add_option('-v', '--verbose', action='store_true',
dest='verbose', default=False,
help='verbose mode')
(options, args) = parser.parse_args() (options, args) = parser.parse_args()
if len(args) != 2: if len(args) != 2:
print 'Need to provide input and output.' print 'Need to provide input file and output dir.'
print 'See --help for details.' print 'See --help for details.'
print print
sys.exit(1) sys.exit(1)
...@@ -47,6 +62,26 @@ def go(): ...@@ -47,6 +62,26 @@ def go():
fn_input, fold_dir = args fn_input, fold_dir = args
tagset = corpus2.get_named_tagset(options.tagset) tagset = corpus2.get_named_tagset(options.tagset)
# count paragraphs in input
if options.verbose:
sys.stderr.write('Counting paragraphs... ')
rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input)
num_pars = 0
while True:
par = rdr.get_next_chunk()
if not par:
break
num_pars += 1
del rdr
if options.verbose:
sys.stderr.write('%d\n' % num_pars)
# prepare index -- where to send ith paragraph
rnd = random.Random(options.seedword)
fold_of_par = [(par_idx % options.num_folds) for par_idx in xrange(num_pars)]
rnd.shuffle(fold_of_par)
# now the real run
if options.verbose:
sys.stderr.write('Generating folds...\n')
rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input) rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input)
fold_test = [corpus2.TokenWriter.create_path_writer( fold_test = [corpus2.TokenWriter.create_path_writer(
options.output_format, options.output_format,
...@@ -57,18 +92,20 @@ def go(): ...@@ -57,18 +92,20 @@ def go():
os.path.join(fold_dir, 'train%02d.xml' % (num + 1)), tagset) os.path.join(fold_dir, 'train%02d.xml' % (num + 1)), tagset)
for num in fold_nums] for num in fold_nums]
first = True first = True
fold_now = 0 par_now = 0
while True: while True:
par = rdr.get_next_chunk() par = rdr.get_next_chunk()
if not par: if not par:
break break
fold_now = fold_of_par[par_now]
fold_test[fold_now].write_chunk(par) fold_test[fold_now].write_chunk(par)
for other_num in fold_nums: for other_num in fold_nums:
if other_num != fold_now: if other_num != fold_now:
fold_train[other_num].write_chunk(par) fold_train[other_num].write_chunk(par)
fold_now = (fold_now + 1) % options.num_folds #fold_now = (fold_now + 1) % options.num_folds
par_now += 1
del rdr
for w in fold_test: w.finish() for w in fold_test: w.finish()
for w in fold_train: w.finish() for w in fold_train: w.finish()
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment