diff --git a/iobber/chunker.py b/iobber/chunker.py index 3b705f3aad8fd2b85e9aa7e86529f3a04aac4586..0a2040c4970d3d6d78159c4bc68277429e8d0c6c 100644 --- a/iobber/chunker.py +++ b/iobber/chunker.py @@ -164,6 +164,8 @@ class Chunker: else: non_O_chan = chan_name non_O_tag = there_iob + if chan.is_head_at(tok_idx): + non_O_chan += '-H' # B-NP, I-VP etc. or O class_label = 'O' if non_O_chan is None else '%s-%s' % (non_O_tag, non_O_chan) # generate training example and store to file @@ -219,13 +221,22 @@ class Chunker: decsn = classify.classify_token(model, tok_idx) non_O_chan = None non_O_tag = 'O' + is_head = None if decsn != 'O': - non_O_tag, non_O_chan = decsn.split('-') + decsn_array = decsn.split('-') + if len(decsn_array) == 2: + non_O_tag, non_O_chan = decsn_array + elif len(decsn_array) == 3: + non_O_tag, non_O_chan, is_head = decsn_array + else: + raise IOError('Unexpected label returned from classifier: ' + decsn) for chan_name in chans: chan = asent.get_channel(chan_name) # TODO: rename the from_string in corpus2 and fix it here tag_to_set = 'O' if chan_name != non_O_chan else non_O_tag chan.set_iob_at(tok_idx, corpus2.from_string(tag_to_set)) + if tag_to_set != 'O' and is_head: + chan.set_head_at(tok_idx, True) # switch back to segments for chan_name in chans: chan = asent.get_channel(chan_name)