iobber_txt.py
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    # This file is part of IOBBER
    # Copyright (C) 2011 Adam Radziszewski, Paweł Orłowicz.
    # IOBBER is free software; you can redistribute and/or modify it
    # under the terms of the GNU Lesser General Public License as published by the Free
    # Software Foundation; either version 3 of the License, or (at your option)
    # any later version.
    #
    # IOBBER is distributed in the hope that it will be useful, but
    # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    # or FITNESS FOR A PARTICULAR PURPOSE.
    #
    # See the LICENCE and COPYING files for more details
    import sys
    from optparse import OptionParser
    
    import corpus2
    
    from wcrft import tagger
    from wcrft import corpio as tagger_io
    
    import chunker
    
    #ioformats = corpio.format_help
    #ioformats = '\nINFO: formats: txt premorph; require installed Maca and Morfeusz' + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl')
    
    descr = """%prog [options] [input_files]
    
    IOBBER, configurable chunker.
    (C) 2012, Wroclaw University of Technology
    
    Processes input files through the tagger (WCRFT, which must be installed) and the chunker.
    By default, input is assumed to be plain text (UTF-8) and output is written in the CCL format.
    
    Use -c to set IOBBER (chunker) config and -C to set IOBBER trained model.
    Use -w to set WCRFT (tagger) config and -W to set WCRFT trained model.
    
    The default values for -c and -w are recommended, but you may need to set trained model
    directories (-C and -W).
    
    Use -O to specify the output path (by default, output is written to stdout).
    Use - to process stdin to stdout.
    
    When processing multiple files, either give the filenames directly as arguments,
    or use --batch and provide the name of a file containing a list of paths. Either
    way, each file will be chunked and its output written to FILENAME.tag.
    
    """
    
    
    def lines(pathfilename):
    	"""Return the non-empty lines of the given file, stripped of whitespace."""
    	with open(pathfilename) as f:
    		return [line.strip() for line in f if line.strip()]
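    # e.g. for a batch file whose lines are 'corpus/a.txt' and 'corpus/b.txt'
    # (illustrative names), lines() returns ['corpus/a.txt', 'corpus/b.txt'].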
    
    def go():
    	parser = OptionParser(usage=descr)
    	parser.add_option('-i', '--input-format', type='string', action='store',
    		dest='input_format', default='txt',
    		help='set the input format; default: txt')
    	parser.add_option('-o', '--output-format', type='string', action='store',
    		dest='output_format', default='ccl',
    		help='set the output format; default: ccl')
    	parser.add_option('-O', '--output-file', type='string', action='store',
    		dest='out_path', default='',
    		help='set output filename (do not write to stdout)')
    	parser.add_option('--no-chunk', action='store_false',
    		dest='shall_chunk', default=True,
    		help='don\'t run the chunker, only the tagger')
    	parser.add_option('-c', '--chunker-config', type='string', action='store',
    		dest='chunker_config', default='kpwr.ini',
    		help='use given chunker config (default: kpwr.ini)')
    	parser.add_option('-C', '--chunker-model', type='string', action='store',
    		dest='chunker_dir', default='model-kpwr11-H',
    		help='read chunker trained model from the given dir (default: model-kpwr11-H)')
    	parser.add_option('-w', '--tagger-config', type='string', action='store',
    		dest='tagger_config', default='nkjp_s2.ini',
    		help='use given tagger (wcrft) config (default: nkjp_s2.ini)')
    	parser.add_option('-W', '--tagger-model', type='string', action='store',
    		dest='tagger_dir', default='model_nkjp10_wcrft_s2',
    		help='read tagger (wcrft) trained model from the given dir (default: model_nkjp10_wcrft_s2)')
    	parser.add_option('-m', '--maca-config', type='string', action='store',
    		dest='maca_config', default='',
    		help='override maca config file specified in tagger config')
    	parser.add_option('--batch', action='store_true',
    		help='treat arguments as lists of paths to files to tag')
    	parser.add_option('-v', '--verbose', action='store_true',
    		dest='verbose', default=False,
    		help='write additional info to stderr')
    	(options, args) = parser.parse_args()
    	
    	files = args
    	
    	main(files, options.tagger_config, options.tagger_dir, options.shall_chunk,
    		options.chunker_config, options.chunker_dir, options.maca_config,
    		options.batch, options.out_path, options.verbose,
    		options.input_format, options.output_format)
    
    def main(files, tagger_config, tagger_dir, shall_chunk,
    		chunker_config, chunker_dir, maca_config,
    		batch, out_path, verbose,
    		input_format, output_format):
    
    	tagr = tagger.Tagger(tagger_config, tagger_dir)
    	if shall_chunk:
    		chunkr = chunker.Chunker(chunker_config, chunker_dir)
    
    	if maca_config != '':
    		tagr.maca_config = maca_config
    
    	# tag and chunk
    	inputs = []
    	outputs = []
    	if batch: # read each arg as input path list
    		for pathpath in files:
    			inputs.extend(lines(pathpath))
    		outputs = [path + '.tag' for path in inputs]
    	elif len(files) == 1:
    		if files[0] == '-': # stdin to stdout
    			inputs.append(None)
    			outputs.append(None)
    		else:
    			inputs.append(files[0])
    			outputs.append(out_path)
    	else: # multiple paths as args
    		inputs = files
    		outputs = [path + '.tag' for path in inputs]
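    	# e.g. files = ['a.txt', 'b.txt'] (names illustrative) maps to
    	# inputs = ['a.txt', 'b.txt'] and outputs = ['a.txt.tag', 'b.txt.tag']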
    	if inputs:
    		tagr.load_model()
    		if shall_chunk:
    			chunkr.load_model()
    			assert (tagr.tagset.name()
    					== chunkr.tagset.name()), ('Tagger and chunker configs must '
    							+ 'operate on the same tagset: %s v. %s' % (tagr.tagset.name(),
    								chunkr.tagset.name()))
    		for in_path, out_path in zip(inputs, outputs):
    			if in_path and verbose:
    				sys.stderr.write('Processing %s...\n' % in_path)
    			reader = tagger_io.get_reader(in_path, tagr.tagset, input_format, tagr.maca_config)
    			writer = tagger_io.get_writer(out_path, tagr.tagset, output_format)
    			while True:
    				par = reader.get_next_chunk() # here `chunk' denotes paragraph
    				if not par:
    					break # end of input
    				# prepare new paragraph: chunker will need AnnotatedSentence objects
    				new_par = corpus2.Chunk()
    				for attr_key in par.attributes():
    					new_par.set_attribute(
    						attr_key, par.get_attribute(attr_key))
    
    				for sent in par.sentences():
    					# let it be wrapped and cloned if necessary
    					# this will make sure that later wrapping will
    					# not cause any further cloning, so references to new_sent
    					# and new_asent will be referring to the same underlying obj
    					new_asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
    					new_sent = corpus2.AnnotatedSentence.cast_as_sentence(new_asent)
    					# preserve_ambiguity = False
    					tagr.tag_sentence(new_sent, False)
    					if shall_chunk:
    						chunkr.tag_sentence(new_sent)
    					# create a new paragraph with the new sentence
    					new_par.append(new_sent)
    				# save tagged paragraph
    				writer.write_chunk(new_par)
    			del reader
    			writer.finish()
    			del writer
    	else:
    		sys.stderr.write('Nothing to do. See --help\n')
    
    
    if __name__ == '__main__':
    	go()
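    
    # A minimal sketch of calling the pipeline programmatically rather than via
    # the command line (file names are illustrative; the config/model names are
    # the module defaults above and may need adjusting for a local installation):
    #
    #   import iobber_txt
    #   iobber_txt.main(['input.txt'], 'nkjp_s2.ini', 'model_nkjp10_wcrft_s2',
    #   	True, 'kpwr.ini', 'model-kpwr11-H', '', False, 'out.xml', False,
    #   	'txt', 'ccl')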