From 653af81d3f9d480d4a0e17cf0875f5c925aca4a1 Mon Sep 17 00:00:00 2001
From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl>
Date: Sun, 31 Jul 2011 12:19:59 +0200
Subject: [PATCH] example Python code

---
 doc/wccl-run.py | 70 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100755 doc/wccl-run.py

diff --git a/doc/wccl-run.py b/doc/wccl-run.py
new file mode 100755
index 0000000..e82398d
--- /dev/null
+++ b/doc/wccl-run.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import sys
+from optparse import OptionParser
+import ctypes
+sys.setdlopenflags(sys.getdlopenflags() | ctypes.RTLD_GLOBAL) # NOTE(review): set before the imports below; presumably lets the corpus2 and wccl extension modules share symbols -- confirm
+import corpus2, wccl
+# Usage text for OptionParser (see go()). NOTE(review): go() also accepts .ccl files and operator strings as arguments.
+descr = """%prog [options] CORPUSFILE
+Mimics (simplified) functionality of wccl-run.
+This script is a demo of the Python API."""
+
+def chunks(rdr):
+	"""Generate the successive chunks obtained from the given reader."""
+	current = rdr.get_next_chunk()
+	# a falsy result from the reader marks the end of input
+	while current:
+		yield current
+		current = rdr.get_next_chunk()
+
+def iter_sent(sent):
+	"""Yield a context positioned at each successive token of a sentence."""
+	ctx = wccl.SentenceContext(sent)
+	ctx.goto_start()
+	while True:
+		if not ctx.is_current_inside():
+			break
+		yield ctx # NOTE: the very same object every time; changing its state affects the walk
+		ctx.advance()
+
+def go():
+	"""Command-line entry point. Positional arguments ending in .xml are read
+	as corpus files, those ending in .ccl as WCCL files, anything else as an
+	operator string; prints tab-separated operator values for each token."""
+	parser = OptionParser(usage=descr)
+	parser.add_option('-i', '--input-format', type='string', action='store',
+		dest='input_format', default='xces',
+		help='set the input format; default: xces')
+	parser.add_option('-t', '--tagset', type='string', action='store',
+		dest='tagset', default='kipi',
+		help='set the tagset used in input; default: kipi')
+	(options, args) = parser.parse_args()
+	ts = corpus2.get_named_tagset(options.tagset)
+	p = wccl.Parser(ts)
+	ops = [] # (name, op) pairs
+	infiles = []
+	for arg in args:
+		if arg.endswith('.xml'):
+			infiles.append(arg)
+		elif arg.endswith('.ccl'):
+			f = p.parseWcclFileFromPath(arg)
+			ops.extend(f.gen_all_op_pairs())
+		else:
+			# store the parsed operator (not the raw string): base_apply is called on it
+			ops.append((arg, p.parseAnyOperator(arg)))
+	if ops and infiles:
+		for fname in infiles:
+			rdr = corpus2.TokenReader.create_path_reader(options.input_format, ts, fname)
+			for chunk in chunks(rdr):
+				# dump op names
+				print '\t'.join(name for (name, _) in ops)
+				# iterate and dump values
+				for sent in chunk.sentences():
+					for con in iter_sent(sent):
+						print '\t'.join(op.base_apply(con).to_string(ts) for (_, op) in ops)
+
+
+
+if __name__ == '__main__': # run the demo only when executed as a script
+	go()
-- 
GitLab