#!/usr/bin/python
# -*- coding: utf-8 -*-
descr = """%prog [options] INPUT OUTPUT

Reads input and saves as plain text. By default, paragraphs are separated with
two newlines, sentence division is not marked."""

from optparse import OptionParser
import codecs
import sys

import corpus2


def _write_plain_text(rdr, out, sent_sep, par_sep):
    """Copy all chunks from a corpus2 reader to ``out`` as plain text.

    rdr      -- reader object; ``get_next_chunk()`` is called until it returns
                a false value.
    out      -- writable text stream receiving unicode strings.
    sent_sep -- string written after every sentence (may be empty).
    par_sep  -- string written after every chunk/paragraph (may be empty).

    A single space is written before a token when ``tok.after_space()`` is
    true, except for the first token following a non-empty separator (and the
    very first token of the output).
    """
    # True while the next token must not be preceded by a space.
    suppress_space = True
    while True:
        par = rdr.get_next_chunk()
        if not par:
            break
        if par_sep:
            # A non-empty paragraph separator was (or will be) written before
            # this paragraph, so skip the leading space of its first token.
            suppress_space = True
        for sent in par.sentences():
            if sent_sep:
                # Same reasoning for a non-empty sentence separator.
                suppress_space = True
            for tok in sent.tokens():
                if tok.after_space() and not suppress_space:
                    out.write(' ')
                # NOTE: relies on the Python 2 ``unicode`` builtin.
                out.write(unicode(tok.orth()))
                suppress_space = False
            out.write(sent_sep)
        out.write(par_sep)


def go():
    """Parse the command line and convert INPUT to plain text in OUTPUT.

    Expects exactly two positional arguments (input and output path);
    otherwise prints a short message to stdout and exits with status 1.
    The output file is written as UTF-8.
    """
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='nkjp',
        help='set the tagset used in input; default: nkjp')
    parser.add_option('-s', '--sent-sep', type='string', action='store',
        dest='sent_sep', default='',
        help='set the sentence separator; default: (empty)')
    parser.add_option('-p', '--par-sep', type='string', action='store',
        dest='par_sep', default='\n\n',
        help='set the paragraph separator; default: (two newlines)')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        # Written via sys.stdout.write so the same three lines reach stdout
        # without depending on the print statement.
        sys.stdout.write('Need to provide input and output.\n')
        sys.stdout.write('See --help for details.\n')
        sys.stdout.write('\n')
        sys.exit(1)

    fn_input, fn_output = args

    # Set up the reader before opening the output: if the tagset or the input
    # cannot be loaded and an exception is raised here, the output file has
    # not been created or truncated yet.
    tagset = corpus2.get_named_tagset(options.tagset)
    rdr = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, fn_input)

    with codecs.open(fn_output, 'wb', 'utf-8') as out:
        _write_plain_text(rdr, out, options.sent_sep, options.par_sep)


if __name__ == '__main__':
    go()