working iobber_txt, todo: add possibility to turn off chunker and default model names

9fae38f0 · Adam Radziszewski · 5a22c1f2 · 9fae38f0
Commit 9fae38f0 authored Oct 18, 2012 by Adam Radziszewski
--- a/iobber/iobber_txt.py
+++ b/iobber/iobber_txt.py
@@ -15,6 +15,8 @@
 import sys
 from optparse import OptionParser
+import corpus2
 from wcrft import tagger
 from wcrft import corpio as tagger_io
@@ -23,8 +25,28 @@ import chunker
 #ioformats = corpio.format_help
 #ioformats = '\nINFO: formats: txt premorph; require installed Maca and Morfeusz' + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl')
-descr = """%prog [options] TODO
+descr = """%prog [options] [input_files]
-""" + ioformats
+IOBBER, configurable chunker.
+(C) 2012, Wroclaw University of Technology
+Processes input files through the tagger (WCRFT, must be installed) and the chunker.
+By default input is assumed to be plain text (UTF-8) and output is written in the CCL format.
+Use -c to set IOBBER (chunker) config and -C to set IOBBER trained model.
+Use -w to set WCRFT (tagger) config and -W to set WCRFT trained model.
+The default values for -c and -w are recommended, but you may need to set trained model
+directories (-C and -W).
+Use -O to specify output path (by default will write to stdout).
+Use - to process stdin to stdout.
+When processing multiple files, either give the filenames directly as arguments,
+or use --batch and provide a filename to a list of paths. Either way, each file
+will be chunked and the output written to FILENAME.chunked.
+""" 
 def lines(pathfilename):
@@ -90,24 +112,42 @@ def go():
 	else: # multiple paths as args
 		inputs = files
 		outputs = [path + '.tag' for path in inputs]
+	if inputs:
 		tagr.load_model()
 		chunkr.load_model()
+		assert (tagr.tagset.name()
+				== chunkr.tagset.name()), ('Tagger and chunker config must'
+						+ 'operate on the same tagset: %s v. %s' % (tagr.tagset.name(),
+							chunkr.tagset.name()))
 		for in_path, out_path in zip(inputs, outputs):
 			if in_path and options.verbose:
 				sys.stderr.write('Processing %s...\n' % in_path)
-		reader = tagger_io.get_reader(in_path, self.tagset, input_format, self.maca_config)
+			reader = tagger_io.get_reader(in_path, tagr.tagset, options.input_format, tagr.maca_config)
-		writer = tagger_io.get_writer(out_path, self.tagset, output_format)
+			writer = tagger_io.get_writer(out_path, tagr.tagset, options.output_format)
 			while True:
 				par = reader.get_next_chunk() # here `chunk' denotes paragraph
 				if not par:
 					break # end of input
-			# process each sentence separately
+				# prepare new paragraph: chunker will need AnnotatedSentence objects
-			for sent in chunk.sentences():
+				new_par = corpus2.Chunk()
+				for attr_key in par.attributes():
+					new_par.set_attr(attr_key, par.get_attribute(attr_key))
+				for sent in par.sentences():
+					# let it be wrapped and cloned if necessary
+					# this will make sure that later wrapping will
+					# not cause any further cloning, so references to new_sent
+					# and new_asent will be referring to the same underlying obj
+					new_asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
+					new_sent = corpus2.AnnotatedSentence.cast_as_sentence(new_asent)
 					# preserve_ambiguity = False
-				self.disambiguate_sentence(sent)
+					tagr.disambiguate_sentence(new_sent)
-				# TODO: chunk it actually
+					chunkr.tag_sentence(new_sent)
+					# create a new paragraph with the new sentence
+					new_par.append(new_sent)
 				# save tagged paragraph
-			writer.write_chunk(par)
+				writer.write_chunk(new_par)
+	else:
+		sys.stderr.write('Nothing to do. See --help\n')
 if __name__ == '__main__':
 	go()