#!/usr/bin/python
# -*- coding: utf-8 -*-

# Copyright (C) 2012 Adam Radziszewski.
# This program is free software; you can redistribute and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the LICENCE and COPYING files for more details

descr = """%prog [options] CHUNKED REF CHAN_NAME

Reads two chunk-annotated corpora: CHUNKED (chunker output) and REF
(reference annotation / gold standard). Outputs precision, recall and
F-measure values for the following settings:
1. chunk recognition (a hit requires exactly the same token span)
2. chunk + head recognition (as above, plus the head placed on the same token)
3. head recognition alone (only head positions are compared)

NOTE: this script treats a discontinuous chunk as one whole annotation.
"""


from optparse import OptionParser
import sys
import corpus2

class Stats:
	def __init__(self):
		self.ch_chunks = 0
		self.ref_chunks = 0
		self.chunk_hits = 0
		self.head_hits = 0
		self.both_hits = 0
	
	def update(self, ch_annots, ref_annots):
		self.ch_chunks += len(ch_annots)
		self.ref_chunks += len(ref_annots)
		# index annotations by their left border (lowest token index);
		# if two annotations share a left border, only one of them is kept
		ch = dict((min(ann.indices), ann) for ann in ch_annots)
		ref = dict((min(ann.indices), ann) for ann in ref_annots)
		maybe_hits = set(ch).intersection(ref)
		for idx in maybe_hits:
			if list(ch[idx].indices) == list(ref[idx].indices):
				self.chunk_hits += 1
				if ch[idx].head_index == ref[idx].head_index:
					self.both_hits += 1
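		# Example (illustrative values): a chunker annotation over tokens
		# {3, 4} with head 3 against a reference annotation over {3, 4} with
		# head 4 counts as a chunk hit but not as a chunk+head hit; its head
		# also misses in the head-only comparison below.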
		# now compare head indices only
		ch = set(ann.head_index for ann in ch_annots)
		ref = set(ann.head_index for ann in ref_annots)
		self.head_hits += len(ch.intersection(ref))
	
	def dump_prf(self, name, hits):
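		"""Print name plus tab-separated precision, recall and F (percent).

		Precision relates hits to the chunker output, recall to the
		reference. Illustrative numbers: 6 hits against 10 chunker chunks
		and 8 reference chunks give P = 60.00, R = 75.00, F = 66.67.
		"""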
		p = 0.0 if self.ch_chunks == 0 else 100.0 * hits / self.ch_chunks
		r = 0.0 if self.ref_chunks == 0 else 100.0 * hits / self.ref_chunks
		f = 0.0 if p + r == 0.0 else 2.0 * p * r / (p + r)
		print '%s\t%.2f\t%.2f\t%.2f' % (name, p, r, f)
	
	def dump(self, verbosity=2):
		if verbosity > 1:
			print 'CHU chunks\t%d' % self.ch_chunks
			print 'REF chunks\t%d' % self.ref_chunks
			print 'Chunk hits\t%d' % self.chunk_hits
			print 'Head  hits\t%d' % self.head_hits
			print 'Ch+Hd hits\t%d' % self.both_hits
		if verbosity > 0:
			self.dump_prf('Chunk P,R,F', self.chunk_hits)
			self.dump_prf('Heads P,R,F', self.head_hits)
			self.dump_prf('Ch+Hd P,R,F', self.both_hits)


def get_annots(sent, chan_name):
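	"""Return the annotations from the given channel of the sentence.

	Each annotation exposes .indices (the token positions it spans) and
	.head_index (the position of its head token); an empty list is
	returned when the sentence has no such channel.
	"""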
	# wrap the sentence as an AnnotatedSentence
	annots = []
	asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
	if asent.has_channel(chan_name):
		chan = asent.get_channel(chan_name)
		ann_vec = chan.make_annotation_vector()
		for ann in ann_vec:
			assert ann.head_index in ann.indices
			annots.append(ann)
	return annots


def go():
	parser = OptionParser(usage=descr)
	parser.add_option('-i', '--input-format', type='string', action='store',
		dest='input_format', default='ccl',
		help='set the input format; default: ccl')
	parser.add_option('-t', '--tagset', type='string', action='store',
		dest='tagset', default='nkjp',
		help='set the tagset used in input; default: nkjp')
	parser.add_option('-q', '--quiet', action='store_false',
		default=True, dest='verbose',
		help='suppress raw counts, print only P/R/F rows')
	(options, args) = parser.parse_args()
	
	if len(args) != 3:
		sys.stderr.write('Expected three arguments: CHUNKED REF CHAN_NAME. See --help\n')
		sys.exit(1)
	
	ch_path, ref_path, chan_name = args
	tagset = corpus2.get_named_tagset(options.tagset)
	ch_rdr = corpus2.TokenReader.create_path_reader(
			options.input_format, tagset, ch_path)
	ref_rdr = corpus2.TokenReader.create_path_reader(
			options.input_format, tagset, ref_path)
	
	stats = Stats()
	
	while True:
		# iterate over paragraphs (note that they are called "chunks" here)
		ref_chunk = ref_rdr.get_next_chunk()
		ch_chunk = ch_rdr.get_next_chunk()
		assert (not ref_chunk) == (not ch_chunk), 'corpora of different length'
		
		if not ref_chunk:
			break # end of input
		
		# process each sentence separately
		for ch_sent, ref_sent in zip(ch_chunk.sentences(), ref_chunk.sentences()):
			assert ch_sent.size() == ref_sent.size(), 'sentences of different length'
			ch_annots = get_annots(ch_sent, chan_name)
			ref_annots = get_annots(ref_sent, chan_name)
			stats.update(ch_annots, ref_annots)
	stats.dump(int(options.verbose) + 1)

if __name__ == '__main__':
	go()