From 9fae38f031036216dac655c628c357cb8a8c409c Mon Sep 17 00:00:00 2001
From: Adam Radziszewski <>
Date: Thu, 18 Oct 2012 21:52:15 +0200
Subject: [PATCH] working iobber_txt, todo: add possibility to turn off chunker
 and default model names

 iobber/ | 80 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 60 insertions(+), 20 deletions(-)
 mode change 100644 => 100755 iobber/

diff --git a/iobber/ b/iobber/
old mode 100644
new mode 100755
index b0bc022..19346d8
--- a/iobber/
+++ b/iobber/
@@ -15,6 +15,8 @@
 import sys
 from optparse import OptionParser
+import corpus2
 from wcrft import tagger
 from wcrft import corpio as tagger_io
@@ -23,8 +25,28 @@ import chunker
 #ioformats = corpio.format_help
 #ioformats = '\nINFO: formats: txt premorph; require installed Maca and Morfeusz' + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl')
-descr = """%prog [options] TODO
-""" + ioformats
+descr = """%prog [options] [input_files]
+IOBBER, configurable chunker.
+(C) 2012, Wroclaw University of Technology
+Processes input files through the tagger (WCRFT, must be installed) and the chunker.
+By default, input is assumed to be plain text (UTF-8) and output is written in the CCL format.
+Use -c to set IOBBER (chunker) config and -C to set IOBBER trained model.
+Use -w to set WCRFT (tagger) config and -W to set WCRFT trained model.
+The default values for -c and -w are recommended, but you may need to set trained model
+directories (-C and -W).
+Use -O to specify output path (by default will write to stdout).
+Use - to process stdin to stdout.
+When processing multiple files, either give the filenames directly as arguments,
+or use --batch and provide a filename to a list of paths. Either way, each file
+will be chunked and the output written to FILENAME.chunked.
 def lines(pathfilename):
@@ -90,24 +112,42 @@ def go():
 	else: # multiple paths as args
 		inputs = files
 		outputs = [path + '.tag' for path in inputs]
-	tagr.load_model()
-	chunkr.load_model()
-	for in_path, out_path in zip(inputs, outputs):
-		if in_path and options.verbose:
-			sys.stderr.write('Processing %s...\n' % in_path)
-		reader = tagger_io.get_reader(in_path, self.tagset, input_format, self.maca_config)
-		writer = tagger_io.get_writer(out_path, self.tagset, output_format)
-		while True:
-			par = reader.get_next_chunk() # here `chunk' denotes paragraph
-			if not par:
-				break # end of input
-			# process each sentence separately
-			for sent in chunk.sentences():
-				# preserve_ambiguity = False
-				self.disambiguate_sentence(sent)
-				# TODO: chunk it actually
-			# save tagged paragraph
-			writer.write_chunk(par)
+	if inputs:
+		tagr.load_model()
+		chunkr.load_model()
+		assert (
+				==, ('Tagger and chunker config must'
+						+ 'operate on the same tagset: %s v. %s' % (,
+		for in_path, out_path in zip(inputs, outputs):
+			if in_path and options.verbose:
+				sys.stderr.write('Processing %s...\n' % in_path)
+			reader = tagger_io.get_reader(in_path, tagr.tagset, options.input_format, tagr.maca_config)
+			writer = tagger_io.get_writer(out_path, tagr.tagset, options.output_format)
+			while True:
+				par = reader.get_next_chunk() # here `chunk' denotes paragraph
+				if not par:
+					break # end of input
+				# prepare new paragraph: chunker will need AnnotatedSentence objects
+				new_par = corpus2.Chunk()
+				for attr_key in par.attributes():
+					new_par.set_attr(attr_key, par.get_attribute(attr_key))
+				for sent in par.sentences():
+					# let it be wrapped and cloned if necessary
+					# this will make sure that later wrapping will
+					# not cause any further cloning, so references to new_sent
+					# and new_asent will be referring to the same underlying obj
+					new_asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
+					new_sent = corpus2.AnnotatedSentence.cast_as_sentence(new_asent)
+					# preserve_ambiguity = False
+					tagr.disambiguate_sentence(new_sent)
+					chunkr.tag_sentence(new_sent)
+					# create a new paragraph with the new sentence
+					new_par.append(new_sent)
+				# save tagged paragraph
+				writer.write_chunk(new_par)
+	else:
+		sys.stderr.write('Nothing to do. See --help\n')
 if __name__ == '__main__':