initial (incomplete) iobber_txt script to tag&chunk

5a22c1f2 · Adam Radziszewski · 5c47d551 · 5a22c1f2
Commit 5a22c1f2 authored 12 years ago by Adam Radziszewski
--- a/iobber/iobber_txt.py
+++ b/iobber/iobber_txt.py
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# This file is part of IOBBER
+# Copyright (C) 2011 Adam Radziszewski, Paweł Orłowicz.
+# IOBBER is free software; you can redistribute and/or modify it
+# under the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# IOBBER is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the LICENCE and COPYING files for more details
+import sys
+from optparse import OptionParser
+from wcrft import tagger
+from wcrft import corpio as tagger_io
+import chunker
+# TODO: restore the I/O format help in the usage text. The two lines below are
+# kept for reference; note that the corpio module is imported in this file
+# under the name `tagger_io`, so `corpio` itself is not a bound name here.
+#ioformats = corpio.format_help
+#ioformats = '\nINFO: formats: txt premorph; require installed Maca and Morfeusz' + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl')
+# Usage text handed to OptionParser. It must not reference `ioformats` while
+# the definitions above are commented out: doing so raises NameError as soon
+# as the module is imported or run.
+descr = """%prog [options] TODO
+"""
+def lines(pathfilename):
+	"""Return the non-blank lines of the given file as a list.
+
+	Each returned line has leading and trailing whitespace removed; lines
+	that are empty after stripping are skipped.
+	"""
+	kept = []
+	with open(pathfilename) as handle:
+		for raw in handle:
+			text = raw.strip()
+			if text:
+				kept.append(text)
+	return kept
+def go():
+	"""Command-line entry point: tag the inputs and write the results.
+
+	How inputs are paired with outputs depends on the arguments:
+	  * --batch: each argument is a file listing input paths (one per
+	    line, blank lines ignored); every input is written to
+	    '<input>.tag'.
+	  * exactly one argument: '-' means stdin to stdout; any other value
+	    is an input path whose output goes to the -O path (empty by
+	    default -- presumably treated as stdout by the writer factory;
+	    confirm against wcrft.corpio.get_writer).
+	  * otherwise: each argument is an input path written to
+	    '<input>.tag'. The -O option is not used in this mode.
+
+	Chunking itself is not performed yet (see the TODO in the loop):
+	sentences are only disambiguated by the tagger.
+	"""
+	parser = OptionParser(usage=descr)
+	parser.add_option('-i', '--input-format', type='string', action='store',
+		dest='input_format', default='txt',
+		help='set the input format; default: txt')
+	parser.add_option('-o', '--output-format', type='string', action='store',
+		dest='output_format', default='ccl',
+		help='set the output format; default: ccl')
+	parser.add_option('-O', '--output-file', type='string', action='store',
+		dest='out_path', default='',
+		help='set output filename (do not write to stdout)')
+	parser.add_option('-c', '--chunker-config', type='string', action='store',
+		dest='chunker_config', default='kpwr.ini',
+		help='use given chunker config (default: kpwr.ini)')
+	parser.add_option('-C', '--chunker-model', type='string', action='store',
+		dest='chunker_dir', default='',
+		help='read chunker trained model from the given dir')
+	parser.add_option('-w', '--tagger-config', type='string', action='store',
+		dest='tagger_config', default='nkjp.ini',
+		help='use given tagger (wcrft) config (default: nkjp.ini)')
+	parser.add_option('-W', '--tagger-model', type='string', action='store',
+		dest='tagger_dir', default='',
+		help='read tagger (wcrft) trained model from the given dir')
+	parser.add_option('-m', '--maca-config', type='string', action='store',
+		dest='maca_config', default='',
+		help='override maca config file specified in tagger config')
+	# dest/default spelled out so the option always yields a real boolean
+	parser.add_option('--batch', action='store_true',
+		dest='batch', default=False,
+		help='treat arguments as lists of paths to files to tag')
+	parser.add_option('-v', '--verbose', action='store_true',
+		dest='verbose', default=False,
+		help='write additional info to stderr')
+	(options, args) = parser.parse_args()
+	files = args
+	chunkr = chunker.Chunker(options.chunker_config, options.chunker_dir)
+	tagr = tagger.Tagger(options.tagger_config, options.tagger_dir)
+	if options.maca_config != '':
+		# command-line value takes precedence over the tagger config
+		tagr.maca_config = options.maca_config
+	# TODO option not to use chunker
+	# tag and chunk
+	inputs = []
+	outputs = []
+	if options.batch: # read each arg as input path list
+		for pathpath in files:
+			inputs.extend(lines(pathpath))
+		outputs = [path + '.tag' for path in inputs]
+	elif len(files) == 1:
+		if files[0] == '-': # stdin to stdout
+			inputs.append(None)
+			outputs.append(None)
+		else:
+			inputs.append(files[0])
+			outputs.append(options.out_path)
+	else: # multiple paths as args
+		inputs = files
+		outputs = [path + '.tag' for path in inputs]
+	tagr.load_model()
+	chunkr.load_model()
+	for in_path, out_path in zip(inputs, outputs):
+		if in_path and options.verbose:
+			sys.stderr.write('Processing %s...\n' % in_path)
+		# Tagset and Maca config come from the tagger object, formats from
+		# the command line; this function has no `self` and no bare
+		# `input_format`/`output_format` names.
+		reader = tagger_io.get_reader(
+			in_path, tagr.tagset, options.input_format, tagr.maca_config)
+		writer = tagger_io.get_writer(
+			out_path, tagr.tagset, options.output_format)
+		while True:
+			par = reader.get_next_chunk() # here `chunk' denotes paragraph
+			if not par:
+				break # end of input
+			# process each sentence of the paragraph just read
+			for sent in par.sentences():
+				# preserve_ambiguity = False
+				tagr.disambiguate_sentence(sent)
+				# TODO: chunk it actually
+			# save tagged paragraph
+			writer.write_chunk(par)
+# Run the command-line entry point only when executed as a script,
+# not when the module is imported.
+if __name__ == '__main__':
+	go()