Merge branch 'master' of nlp.pwr.wroc.pl:iobber

1769ac2b · jezozwierzak · 26075b92 · 9fae38f0 · 1769ac2b
Commit 1769ac2b authored Oct 19, 2012 by jezozwierzak
--- a/iobber/iobber_txt.py
+++ b/iobber/iobber_txt.py
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# This file is part of IOBBER
+# Copyright (C) 2011 Adam Radziszewski, Paweł Orłowicz.
+# IOBBER is free software; you can redistribute and/or modify it
+# under the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# IOBBER is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the LICENCE and COPYING files for more details
+import sys
+from optparse import OptionParser
+import corpus2
+from wcrft import tagger
+from wcrft import corpio as tagger_io
+import chunker
+#ioformats = corpio.format_help
+#ioformats = '\nINFO: formats: txt premorph; require installed Maca and Morfeusz' + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl')
# Usage/help text handed to OptionParser (as ``usage=``) by go(); optparse
# expands %prog to the script name.
# NOTE: the output suffix documented in the last paragraph must stay in sync
# with the suffix that go() appends to input paths in multi-file/batch mode.
descr = """%prog [options] [input_files]
IOBBER, configurable chunker.
(C) 2012, Wroclaw University of Technology
Processes input files through the tagger (WCRFT, must be installed) and the chunker.
By default input is assumed to be plain text (UTF-8) and writes output in the CCL format.
Use -c to set IOBBER (chunker) config and -C to set IOBBER trained model.
Use -w to set WCRFT (tagger) config and -W to set WCRFT trained model.
The default values for -c and -w are recommended, but you may need to set trained model
directories (-C and -W).
Use -O to specify output path (by default will write to stdout).
Use - to process stdin to stdout.
When processing multiple files, either give the filenames directly as arguments,
or use --batch and provide a filename to a list of paths. Either way, each file
will be chunked and the output written to FILENAME.tag.
"""
def lines(pathfilename):
	"""Return the non-blank lines of a text file, each stripped of surrounding whitespace.

	pathfilename -- path of the file to read.

	Lines that are empty after stripping are left out; the remaining lines
	are returned in file order as a list of strings.
	"""
	kept = []
	with open(pathfilename) as handle:
		for raw in handle:
			text = raw.strip()
			if text:
				kept.append(text)
	return kept
def _build_option_parser():
	"""Create the OptionParser that declares every command-line switch of the script."""
	parser = OptionParser(usage=descr)
	parser.add_option('-i', '--input-format', type='string', action='store',
		dest='input_format', default='txt',
		help='set the input format; default: txt')
	parser.add_option('-o', '--output-format', type='string', action='store',
		dest='output_format', default='ccl',
		help='set the output format; default: ccl')
	parser.add_option('-O', '--output-file', type='string', action='store',
		dest='out_path', default='',
		help='set output filename (do not write to stdout)')
	parser.add_option('-c', '--chunker-config', type='string', action='store',
		dest='chunker_config', default='kpwr.ini',
		help='use given chunker config (default: kpwr.ini)')
	parser.add_option('-C', '--chunker-model', type='string', action='store',
		dest='chunker_dir', default='',
		help='read chunker trained model from the given dir')
	parser.add_option('-w', '--tagger-config', type='string', action='store',
		dest='tagger_config', default='nkjp.ini',
		help='use given tagger (wcrft) config (default: nkjp.ini)')
	parser.add_option('-W', '--tagger-model', type='string', action='store',
		dest='tagger_dir', default='',
		help='read tagger (wcrft) trained model from the given dir')
	parser.add_option('-m', '--maca-config', type='string', action='store',
		dest='maca_config', default='',
		help='override maca config file specified in tagger config')
	parser.add_option('--batch', action='store_true',
		help='treat arguments as lists of paths to files to tag')
	parser.add_option('-v', '--verbose', action='store_true',
		dest='verbose', default=False,
		help='write additional info to stderr')
	return parser


def _resolve_io_paths(options, files):
	"""Turn the positional arguments into parallel lists of input and output paths.

	options -- parsed option values (reads ``batch`` and ``out_path``).
	files -- positional command-line arguments.

	Returns an ``(inputs, outputs)`` pair of equal-length lists. For the
	single ``-`` argument both lists hold ``None`` (the stdin-to-stdout
	case); both lists are empty when there is nothing to process.
	"""
	if options.batch:
		# every argument names a file that lists the actual input paths
		inputs = []
		for list_path in files:
			inputs.extend(lines(list_path))
		return inputs, [path + '.tag' for path in inputs]
	if len(files) == 1:
		if files[0] == '-':
			return [None], [None]
		# a single regular path: honour -O (its default is the empty string)
		return [files[0]], [options.out_path]
	# zero or several paths given directly as arguments
	inputs = list(files)
	return inputs, [path + '.tag' for path in inputs]


def _process_file(tagr, chunkr, in_path, out_path, options):
	"""Read one input paragraph by paragraph, tag and chunk it, write the result."""
	reader = tagger_io.get_reader(in_path, tagr.tagset, options.input_format, tagr.maca_config)
	writer = tagger_io.get_writer(out_path, tagr.tagset, options.output_format)
	while True:
		par = reader.get_next_chunk() # here `chunk' denotes paragraph
		if not par:
			break # end of input
		# build a fresh paragraph: the chunker needs AnnotatedSentence objects
		new_par = corpus2.Chunk()
		for attr_key in par.attributes():
			# NOTE(review): the getter used here is get_attribute; confirm that
			# set_attr (rather than set_attribute) is the setter corpus2.Chunk exposes.
			new_par.set_attr(attr_key, par.get_attribute(attr_key))
		for sent in par.sentences():
			# Wrap (and clone if necessary) up front, so that later wrapping
			# causes no further cloning and new_sent / new_asent keep
			# referring to the same underlying object.
			new_asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
			new_sent = corpus2.AnnotatedSentence.cast_as_sentence(new_asent)
			tagr.disambiguate_sentence(new_sent)
			chunkr.tag_sentence(new_sent)
			new_par.append(new_sent)
		# save the tagged and chunked paragraph
		writer.write_chunk(new_par)


def go():
	"""Command-line entry point: tag and chunk the inputs named on the command line.

	Parses ``sys.argv`` with optparse, creates the tagger and the chunker from
	the selected configs/model directories, works out which inputs map to
	which outputs, and processes each input in turn. When there is nothing
	to process, a hint is written to stderr and the function returns.

	Raises AssertionError if the tagger and the chunker are configured with
	different tagsets.
	"""
	parser = _build_option_parser()
	(options, args) = parser.parse_args()

	chunkr = chunker.Chunker(options.chunker_config, options.chunker_dir)
	tagr = tagger.Tagger(options.tagger_config, options.tagger_dir)
	if options.maca_config != '':
		tagr.maca_config = options.maca_config
	# TODO option not to use chunker

	inputs, outputs = _resolve_io_paths(options, args)
	if not inputs:
		sys.stderr.write('Nothing to do. See --help\n')
		return

	tagr.load_model()
	chunkr.load_model()

	# Raised explicitly (not via an assert statement) so the check is still
	# performed under ``python -O``; the exception type is kept unchanged.
	tagger_tagset = tagr.tagset.name()
	chunker_tagset = chunkr.tagset.name()
	if tagger_tagset != chunker_tagset:
		raise AssertionError(
			'Tagger and chunker config must '
			'operate on the same tagset: %s v. %s' % (tagger_tagset, chunker_tagset))

	for in_path, out_path in zip(inputs, outputs):
		if in_path and options.verbose:
			sys.stderr.write('Processing %s...\n' % in_path)
		_process_file(tagr, chunkr, in_path, out_path, options)
# Script entry point: run the command-line interface only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
	go()