Skip to content
Snippets Groups Projects
Commit f8acea67 authored by Adam Radziszewski's avatar Adam Radziszewski
Browse files

add evaluation script for chunker

parent 34932f89
Branches
No related merge requests found
#!/usr/bin/python
# -*- coding: utf-8 -*-
descr = """%prog [options] CHUNKED REF CHAN_NAME
Reads the two chunk-annotated corpora: CHUNKED (chunker output) and REF
(reference annotation / gold standard). Outputs precision and recall values
for the following settings:
1. chunk recognition (counting as hit when exactly same tokens)
2. chunk + head recognition (as above + heads placed on the same tokens)
3. head recognition alone (only head position is compared)
NOTE: this script treats discontinuous chunks as whole annotations.
"""
from optparse import OptionParser
import sys
import corpus2
class Stats:
def __init__(self):
self.ch_chunks = 0
self.ref_chunks = 0
self.chunk_hits = 0
self.head_hits = 0
self.both_hits = 0
def update(self, ch_annots, ref_annots):
self.ch_chunks += len(ch_annots)
self.ref_chunks += len(ref_annots)
# sort by left border
ch = dict([(min(ann.indices), ann) for ann in ch_annots])
ref = dict([(min(ann.indices), ann) for ann in ref_annots])
ch_idx = ref_idx = 0
maybe_hits = set(ch).intersection(ref)
for idx in maybe_hits:
if list(ch[idx].indices) == list(ref[idx].indices):
self.chunk_hits += 1
if ch[idx].head_index == ref[idx].head_index:
self.both_hits += 1
# now compare head indices only
ch = set(ann.head_index for ann in ch_annots)
ref = set(ann.head_index for ann in ref_annots)
self.head_hits += len(ch.intersection(ref))
def dump_prf(self, name, hits):
p = 0.0 if self.ch_chunks == 0 else 100.0 * hits / self.ch_chunks
r = 0.0 if self.ref_chunks == 0 else 100.0 * hits / self.ref_chunks
f = 0.0 if p + r == 0.0 else 2.0 * p * r / (p + r)
print '%s\t%.2f\t%.2f\t%.2f' % (name, p, r, f)
def dump(self, verbosity = 2):
if verbosity > 1:
print 'CHU chunks\t%d' % self.ch_chunks
print 'REF chunks\t%d' % self.ref_chunks
print 'Chunk hits\t%d' % self.chunk_hits
print 'Head hits\t%d' % self.head_hits
print 'Ch+Hd hits\t%d' % self.both_hits
if verbosity > 0:
self.dump_prf('Chunk P,R,F', self.chunk_hits)
self.dump_prf('Heads P,R,F', self.head_hits)
self.dump_prf('Ch+Hd P,R,F', self.both_hits)
def get_annots(sent, chan_name):
# wrap the sentence as an AnnotatedSentence
annots = []
asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
if asent.has_channel(chan_name):
chan = asent.get_channel(chan_name)
ann_vec = chan.make_annotation_vector()
for ann in ann_vec:
assert ann.head_index in ann.indices
annots.append(ann)
return annots
def go():
parser = OptionParser(usage=descr)
parser.add_option('-i', '--input-format', type='string', action='store',
dest='input_format', default='ccl',
help='set the input format; default: ccl')
parser.add_option('-t', '--tagset', type='string', action='store',
dest='tagset', default='nkjp',
help='set the tagset used in input; default: nkjp')
parser.add_option('-q', '--quiet', action='store_false',
default=True, dest='verbose')
(options, args) = parser.parse_args()
if len(args) != 3:
sys.stderr.write('No args. See --help\n')
sys.exit(1)
ch_path, ref_path, chan_name = args
tagset = corpus2.get_named_tagset(options.tagset)
ch_rdr = corpus2.TokenReader.create_path_reader(
options.input_format, tagset, ch_path)
ref_rdr = corpus2.TokenReader.create_path_reader(
options.input_format, tagset, ref_path)
stats = Stats()
while True:
# iterate over paragraphs (note that they are called "chunks" here)
ref_chunk = ref_rdr.get_next_chunk()
ch_chunk = ch_rdr.get_next_chunk()
assert (not ref_chunk) == (not ch_chunk), 'corpora of different length'
if not ref_chunk:
break # end of input
# process each sentence separately
for ch_sent, ref_sent in zip(ch_chunk.sentences(), ref_chunk.sentences()):
assert ch_sent.size() == ref_sent.size()
ch_annots = get_annots(ch_sent, chan_name)
ref_annots = get_annots(ref_sent, chan_name)
stats.update(ch_annots, ref_annots)
stats.dump(int(options.verbose) + 1)
if __name__ == '__main__':
go()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment