#!/usr/bin/python
# -*- coding: utf-8 -*-

# Copyright (C) 2012 Adam Radziszewski. Part of IOBBER.
# This program is free software; you can redistribute and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the LICENCE and COPYING files for more details

import sys
from optparse import OptionParser

import chunker
import corpio


# Usage/description text handed to OptionParser(usage=...) and printed by
# --help; optparse expands %prog to the program name. The format help
# provided by corpio is appended at the end.
descr = """%prog [options] CONFIGFILE [INPUT...]

IOBBER, configurable chunker.
(C) 2012, Wroclaw University of Technology

Chunks input file(s) using the selected configuration. Use -d to specify where to
look for a trained chunker model (or where to store a model when training).

Use -O to specify output path (by default will write to stdout).
Use - to chunk stdin to stdout.

When chunking multiple files, either give the filenames directly as arguments,
or use --batch and provide a filename to a list of paths. Either way, each file
will be chunked and the output written to FILENAME.chunked.

Training (--train) requires a configuration to use and a filename of the training
material. Trained model will be stored in DATA_DIR.
""" + corpio.format_help


def lines(pathfilename):
	"""Return the non-blank lines of a text file, stripped of whitespace.

	Each line of the file at pathfilename is stripped of leading and
	trailing whitespace; lines that are empty after stripping are skipped.
	The remaining lines are returned as a list, in file order.
	"""
	entries = []
	with open(pathfilename) as listing:
		for raw in listing:
			entry = raw.strip()
			if entry:
				entries.append(entry)
	return entries

def go():
	"""Parse the command line, then either train the chunker or chunk input.

	Positional arguments are CONFIGFILE followed by at least one input.

	With --train exactly one input (the training file) is expected; it is
	passed to Chunker.train_and_save together with the input format.

	Otherwise the model is loaded and every input is passed to
	Chunker.tag_input. Input/output paths are paired as follows:
	  * --batch: each argument is a file listing input paths, one per line;
	    every listed path is written to PATH.chunked,
	  * a single '-' argument: None is passed for both paths (stdin to
	    stdout, as described in the usage text),
	  * a single path: the output path is the value of -O (an empty string
	    when -O is not given),
	  * several paths: each one is written to PATH.chunked.

	Exits with status 1, after writing a message to stderr, when the config
	file or the input is missing, or when --train is not given exactly one
	training file.
	"""
	parser = OptionParser(usage=descr)
	parser.add_option('-i', '--input-format', type='string', action='store',
		dest='input_format', default='ccl',
		help='set the input format; default: ccl')
	parser.add_option('-o', '--output-format', type='string', action='store',
		dest='output_format', default='ccl',
		help='set the output format; default: ccl')
	parser.add_option('-O', '--output-file', type='string', action='store',
		dest='out_path', default='',
		help='set output filename (do not write to stdout)')
	parser.add_option('-d', '--data-dir', type='string', action='store',
		dest='data_dir', default='',
		help='assume WCCL and trained model to sit in the given dir')
	# Note the inverted flag: --sent-only switches preserve_chunks OFF.
	parser.add_option('--sent-only', action='store_false',
		dest='preserve_chunks', default=True,
		help='process sentences, ignoring division into paragraphs')
	parser.add_option('-v', '--verbose', action='store_true',
		dest='verbose', default=False,
		help='verbose mode')
	parser.add_option('--train', action='store_true',
		dest='is_training', help='train the chunker')
	parser.add_option('--batch', action='store_true',
		help='treat arguments as lists of paths to files to process')
	(options, args) = parser.parse_args()

	# Both the config file and at least one input are mandatory: without an
	# input there is nothing to train on and nothing to chunk.
	if len(args) < 2:
		sys.stderr.write('You need to provide a config file and specify input.\n')
		sys.stderr.write('See %s --help\n' % sys.argv[0])
		sys.exit(1)
	config_path = args[0]
	files = args[1:]

	# Explicit validation instead of an assert (asserts are stripped when
	# running under `python -O`); done before the chunker is built so that a
	# bad invocation fails fast.
	if options.is_training and len(files) != 1:
		sys.stderr.write('Training requires exactly one training file.\n')
		sys.stderr.write('See %s --help\n' % sys.argv[0])
		sys.exit(1)

	tagr = chunker.Chunker(config_path, options.data_dir,
		verbose = options.verbose)

	if options.is_training:
		# chunker training
		tagr.train_and_save(files[0], options.input_format)
	else:
		# normal chunker performance
		inputs = []
		outputs = []
		if options.batch: # read each arg as input path list
			for pathpath in files:
				inputs.extend(lines(pathpath))
			outputs = [path + '.chunked' for path in inputs]
		elif len(files) == 1:
			if files[0] == '-': # stdin to stdout
				inputs.append(None)
				outputs.append(None)
			else:
				inputs.append(files[0])
				outputs.append(options.out_path)
		else: # multiple paths as args
			inputs = files
			outputs = [path + '.chunked' for path in inputs]
		tagr.load_model()
		for in_path, out_path in zip(inputs, outputs):
			# in_path is None in stdin mode, so there is no name to report
			if in_path and options.verbose:
				sys.stderr.write('Processing %s...\n' % in_path)
			tagr.tag_input(in_path, out_path,
				options.input_format, options.output_format,
				options.preserve_chunks)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
	go()
