#!/usr/bin/python
# -*- coding: utf-8 -*-

# Copyright (C) 2012 Adam Radziszewski.
# This program is free software; you can redistribute and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the LICENCE and COPYING files for more details

# Usage text handed to OptionParser; %prog is expanded by optparse.
descr = """%prog [options] INPUT OUTPUT
Reads input and saves as plain text. By default, paragraphs are separated with
two newlines, sentence division is not marked."""

from optparse import OptionParser
import sys
import codecs

import corpus2


def go():
    """Convert a corpus file to plain text, driven by command-line arguments.

    Parses the options and the two positional arguments (INPUT and OUTPUT),
    reads INPUT chunk by chunk through a corpus2 token reader and writes the
    token orthographic forms to OUTPUT, encoded as UTF-8. The paragraph
    separator is written after every chunk, including the last one.

    Exits with status 1 (after printing a short message) when the number of
    positional arguments is not exactly two.
    """
    parser = OptionParser(usage=descr)
    parser.add_option(
        '-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option(
        '-t', '--tagset', type='string', action='store',
        dest='tagset', default='nkjp',
        help='set the tagset used in input; default: nkjp')
    parser.add_option(
        '-p', '--par-sep', type='string', action='store',
        dest='par_sep', default='\n\n',
        help='set the paragraph separator; default: (two newlines)')
    parser.add_option(
        '--ignore-ns-sent', action='store_true', default=False,
        dest='ignore_ns_sent',
        help='ignore no-space markers on sent boundaries')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # Single-argument print calls produce the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('Need to provide input and output.')
        print('See --help for details.')
        print('')
        sys.exit(1)

    fn_input, fn_output = args

    # Set up the reader before opening the output file: opening with 'wb'
    # truncates OUTPUT, so if either call below fails we do not want to have
    # already emptied an existing file.
    tagset = corpus2.get_named_tagset(options.tagset)
    rdr = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, fn_input)

    with codecs.open(fn_output, 'wb', 'utf-8') as out:
        while True:
            par = rdr.get_next_chunk()
            if not par:
                # The reader signals end of input with a false value.
                break
            parfirst = True
            for sent in par.sentences():
                sentfirst = True
                for tok in sent.tokens():
                    # The first token of a paragraph is never preceded by a
                    # space. Any other token gets one when tok.after_space()
                    # is true, or -- with --ignore-ns-sent -- whenever it is
                    # the first token of its sentence.
                    if not parfirst and (
                            (sentfirst and options.ignore_ns_sent)
                            or tok.after_space()):
                        out.write(' ')
                    out.write(unicode(tok.orth()))
                    sentfirst = False
                    parfirst = False
            out.write(options.par_sep)


if __name__ == '__main__':
    go()