From 84404f957ef2c231f2bf768460b853bd806621c1 Mon Sep 17 00:00:00 2001
From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl>
Date: Thu, 19 Jan 2012 12:12:38 +0100
Subject: [PATCH] helper script for text extraction

---
 utils/corptext.py | 57 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100755 utils/corptext.py

diff --git a/utils/corptext.py b/utils/corptext.py
new file mode 100755
index 0000000..36a67d7
--- /dev/null
+++ b/utils/corptext.py
@@ -0,0 +1,57 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+descr = """%prog [options] INPUT OUTPUT
+
+Reads input and saves as plain text. By default, paragraphs are separated with
+two newlines, sentence division is not marked.""" # usage text handed to OptionParser in go()
+
+from optparse import OptionParser
+import sys, codecs
+import corpus2
+
+def go():
+	"""Parse the command line, read the INPUT corpus and write its text to OUTPUT (UTF-8)."""
+	parser = OptionParser(usage=descr)
+	parser.add_option('-i', '--input-format', type='string', action='store',
+		dest='input_format', default='xces',
+		help='set the input format; default: xces')
+	parser.add_option('-t', '--tagset', type='string', action='store',
+		dest='tagset', default='nkjp',
+		help='set the tagset used in input; default: nkjp')
+	parser.add_option('-s', '--sent-sep', type='string', action='store',
+		dest='sent_sep', default='',
+		help='set the sentence separator; default: (empty)')
+	parser.add_option('-p', '--par-sep', type='string', action='store',
+		dest='par_sep', default='\n\n',
+		help='set the paragraph separator; default: (two newlines)')
+	(options, args) = parser.parse_args()
+	if len(args) != 2:
+		print 'Need to provide input and output.'
+		print 'See --help for details.'
+		print
+		sys.exit(1)
+	fn_input, fn_output = args
+	# Set up the reader before opening (and truncating) the output, so a failure here leaves it untouched.
+	tagset = corpus2.get_named_tagset(options.tagset)
+	rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input)
+	with codecs.open(fn_output, 'wb', 'utf-8') as out:
+		first = True # while True, the next token is written without a preceding space
+		while True:
+			par = rdr.get_next_chunk()
+			if not par:
+				break # reader returned nothing more: end of input
+			if options.par_sep:
+				first = True # if non-empty par separator, skip pre-spaces
+			for sent in par.sentences():
+				if options.sent_sep:
+					first = True # if non-empty sent sep, skip pre-spaces
+				for tok in sent.tokens():
+					if not first and tok.after_space():
+						out.write(' ')
+					out.write(unicode(tok.orth()))
+					first = False
+				out.write(options.sent_sep)
+			out.write(options.par_sep)
+
+if __name__ == '__main__': # run the converter only when executed as a script
+	go()
-- 
GitLab