#!/usr/bin/python
# -*- coding: utf-8 -*-
# Usage/description text handed to OptionParser; optparse substitutes the
# program name for %prog when the help is rendered.
descr = (
	'%prog [options] INPUT OUTPUT\n'
	'\n'
	'Reads input and saves as plain text. By default, paragraphs are separated with\n'
	'two newlines, sentence division is not marked.'
)

from optparse import OptionParser
import sys, codecs
import corpus2

def go():
	"""Convert a corpus file to plain text, driven by command-line arguments.

	Parses the command line for two positional arguments (INPUT and OUTPUT)
	plus the input format, tagset and separator options, reads INPUT chunk by
	chunk through a corpus2 token reader and writes the token orths to OUTPUT
	encoded as UTF-8.

	Inside the output a single space is written before every token whose
	``after_space()`` is true. The sentence separator is written after each
	sentence and the paragraph separator after each chunk (including the last
	one). A non-empty separator suppresses the space before the token that
	follows it.

	Exits with status 1 (after printing a short message) when the number of
	positional arguments is not exactly two.
	"""
	parser = OptionParser(usage=descr)
	parser.add_option('-i', '--input-format', type='string', action='store',
		dest='input_format', default='xces',
		help='set the input format; default: xces')
	parser.add_option('-t', '--tagset', type='string', action='store',
		dest='tagset', default='nkjp',
		help='set the tagset used in input; default: nkjp')
	parser.add_option('-s', '--sent-sep', type='string', action='store',
		dest='sent_sep', default='',
		help='set the sentence separator; default: (empty)')
	parser.add_option('-p', '--par-sep', type='string', action='store',
		dest='par_sep', default='\n\n',
		help='set the paragraph separator; default: (two newlines)')
	(options, args) = parser.parse_args()
	if len(args) != 2:
		# Parenthesised single-argument form: prints exactly the same text
		# with the Python 2 print statement this script is written for.
		print('Need to provide input and output.')
		print('See --help for details.')
		print('')
		sys.exit(1)

	fn_input, fn_output = args

	# Set up the tagset and reader before touching the output, so that any
	# failure here happens before the output file is created or truncated.
	tagset = corpus2.get_named_tagset(options.tagset)
	rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input)

	with codecs.open(fn_output, 'wb', 'utf-8') as out:
		# True when the next token must not be preceded by a space: at the
		# very start of the output and right after a non-empty separator.
		first = True
		while True:
			par = rdr.get_next_chunk()
			if not par:
				break
			if options.par_sep:
				first = True # if non-empty par separator, skip pre-spaces
			for sent in par.sentences():
				if options.sent_sep:
					first = True # if non-empty sent sep, skip pre-spaces
				for tok in sent.tokens():
					if not first and tok.after_space():
						out.write(' ')
					out.write(unicode(tok.orth()))
					first = False
				out.write(options.sent_sep)
			out.write(options.par_sep)

# Run the conversion only when executed as a script, not when imported.
if "__main__" == __name__:
	go()