diff --git a/iobber/chunker.py b/iobber/chunker.py
index 16ed3437d37209634a326148d0ee6c1a2a75018e..27643d001f5dd1cc903e5c82d73fecefb04ddcee 100644
--- a/iobber/chunker.py
+++ b/iobber/chunker.py
@@ -28,8 +28,11 @@
 from operator import itemgetter as ig
 import corpio, config, classify
 
-def get_channels(conf):
-    return conf.get(config.S_GLOBAL, config.O_CHANNELS).split(',')
+def get_layers(conf):
+    layers = [(k, v.split(',')) for (k, v) in conf.items(config.S_LAYERS)]
+    for (_layer_name, chans) in layers:
+        assert all('-' not in chan for chan in chans), 'hyphens are not allowed in channel names'
+    return layers
 
 def is_input_tagged(conf):
     return conf.getboolean(config.S_GLOBAL, config.O_TAGGED)
@@ -53,7 +56,9 @@ class Stats:
 
 class Chunker:
     """The CRF-based chunker. The chunker may add annotations to multiple
-    channels during one run.
+    channels during one run, as specified in layer definitions.
+    Layers are applied sequentially. A layer defines a set of channels
+    that are dealt with at a time.
 
     TODO doc
     """
@@ -80,29 +85,31 @@
         # list of WCCL operators describing the features used
         # NOTE: dynamic lexicon generation is currently not supported
         # to make it possible, move op loading to load_model and train_and_save
-        self.channels = get_channels(self.conf)
+        # layers -- list of layer names
+        # layer_channels -- list of channel lists per layer
+        self.layers, self.layer_channels = zip(*get_layers(self.conf))
         # list of per-channel op lists
-        self.chan_ops = corpio.get_wccl_ops(
+        self.layer_ops = corpio.get_wccl_ops(
             self.conf, self.model_name,
-            self.conf_dir, self.data_dir, self.channels)
-        self.chan_models = None # chan_name -> trained classifier
+            self.conf_dir, self.data_dir, self.layers)
+        self.layer_models = None # layer_name -> trained classifier
         self.stats = Stats()
 
     def load_model(self):
-        self.chan_models = {}
-        for chan_name in self.channels:
-            self.chan_models[chan_name] = classify.load(
-                self.conf, self.model_name, self.data_dir, chan_name)
+        self.layer_models = {}
+        for layer in self.layers:
+            self.layer_models[layer] = classify.load(
+                self.conf, self.model_name, self.data_dir, layer)
 
     def train_and_save(self, in_path, input_format):
         """Trains the tagger and stores the model to files beginning
         with model_name."""
-        self.chan_models = None # forget any previously trained model
+        self.layer_models = None # forget any previously trained model
         if self.verbose:
             sys.stderr.write('Generating training data...\n')
-        # open files for storing training examples for each channel
+        # open files for storing training examples for each layer
         tr_files = classify.open_tr_files(
-            self.model_name, self.data_dir, self.channels)
+            self.model_name, self.data_dir, self.layers)
 
         # set-up the reader and gather feature values for subsequent sentences
         reader = corpio.get_reader(
@@ -116,26 +123,41 @@
             # wrap the sentence as an AnnotatedSentence
             asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
 
-            # iterate over channels
-            for chan_idx, chan_name in enumerate(self.channels):
-                # ensure the channel is there and switch to IOB2 representation
-                if not asent.has_channel(chan_name):
-                    asent.create_channel(chan_name)
-                chan = asent.get_channel(chan_name)
-                chan.make_iob_from_segments()
+            # iterate over layers
+            for layer_idx, layer in enumerate(self.layers):
+                chans = self.layer_channels[layer_idx]
+                for chan_name in chans:
+                    # ensure the channel is there and switch to IOB2 representation
+                    if not asent.has_channel(chan_name):
+                        asent.create_channel(chan_name)
+                    chan = asent.get_channel(chan_name)
+                    chan.make_iob_from_segments()
                 # prepare WCCL context
                 con = corpio.create_context(sent)
                 # get file for storing training data
-                tr_file = tr_files[chan_name]
+                tr_file = tr_files[layer]
                 # iterate over each sentence token
                 for tok_idx, tok in enumerate(sent.tokens()):
                     con.set_position(tok_idx) # for WCCL ops
                     feat_vals = [op.base_apply(con)
                         .to_compact_string(self.tagset).decode('utf-8')
-                        for op in self.chan_ops[chan_idx]]
-                    # get IOB2 tag as string
+                        for op in self.layer_ops[layer_idx]]
+                    # get IOB2 tags as strings, find non-O IOB2 tag or mark it as O
                     # TODO: rename the to_string in corpus2 and fix it here
-                    class_label = corpus2.to_string(chan.get_iob_at(tok_idx))
+                    non_O_chan = None
+                    non_O_tag = 'O'
+                    for chan_name in chans:
+                        chan = asent.get_channel(chan_name)
+                        there_iob = corpus2.to_string(chan.get_iob_at(tok_idx))
+                        if there_iob != 'O':
+                            if non_O_chan is not None:
+                                sys.stderr.write(
+                                    'WARNING: overlapping phrases in sentence %s\n' % unicode(asent.id()))
+                            else:
+                                non_O_chan = chan_name
+                                non_O_tag = there_iob
+                    # B-NP, I-VP etc. or O
+                    class_label = 'O' if non_O_chan is None else '%s-%s' % (non_O_tag, non_O_chan)
                     # generate training example and store to file
                     classify.write_example(tr_file, feat_vals, class_label)
                 classify.write_end_of_sent(tr_file)
@@ -146,13 +168,13 @@
 
         classify.close_tr_files(tr_files)
 
-        # train the classifier for each channel
-        for chan_name in self.channels:
+        # train the classifier for each layer
+        for layer in self.layers:
             if self.verbose:
-                sys.stderr.write('Training classifier for %s... ' % chan_name)
+                sys.stderr.write('Training classifier for %s... ' % layer)
             classify.train_and_save(
                 self.conf, self.model_name,
-                self.conf_dir, self.data_dir, chan_name)
+                self.conf_dir, self.data_dir, layer)
             if self.verbose:
                 sys.stderr.write('done!\n')
         self.stats.dump()
@@ -162,16 +184,18 @@
         # wrap the sentence as an AnnotatedSentence
         asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
 
-        # iterate over channels
-        for chan_idx, chan_name in enumerate(self.channels):
-            # get model for chan_name
-            model = self.chan_models[chan_name]
+        # iterate over layers
+        for layer_idx, layer in enumerate(self.layers):
+            # get model for current layer
+            model = self.layer_models[layer]
             if model is not None:
-                # ensure the channel is there and switch to IOB2 representation
-                if not asent.has_channel(chan_name):
-                    asent.create_channel(chan_name)
-                chan = asent.get_channel(chan_name)
-                chan.make_iob_from_segments()
+                chans = self.layer_channels[layer_idx]
+                for chan_name in chans:
+                    # ensure the channel is there and switch to IOB2 representation
+                    if not asent.has_channel(chan_name):
+                        asent.create_channel(chan_name)
+                    chan = asent.get_channel(chan_name)
+                    chan.make_iob_from_segments()
                 # prepare WCCL context and feed the sentence features
                 con = corpio.create_context(sent)
                 classify.open_sent(model)
@@ -180,16 +204,24 @@
                     con.set_position(tok_idx)
                     feat_vals = [op.base_apply(con)
                         .to_compact_string(self.tagset).decode('utf-8')
-                        for op in self.chan_ops[chan_idx]]
+                        for op in self.layer_ops[layer_idx]]
                     classify.eat_token(model, feat_vals)
                 classify.close_sent(model)
                 for tok_idx, tok in enumerate(sent.tokens()):
                     decsn = classify.classify_token(model, tok_idx)
-                    assert decsn in ['I', 'O', 'B']
-                    # TODO: rename the from_string in corpus2 and fix it here
-                    chan.set_iob_at(tok_idx, corpus2.from_string(decsn))
+                    non_O_chan = None
+                    non_O_tag = 'O'
+                    if decsn != 'O':
+                        non_O_tag, non_O_chan = decsn.split('-')
+                    for chan_name in chans:
+                        chan = asent.get_channel(chan_name)
+                        # TODO: rename the from_string in corpus2 and fix it here
+                        tag_to_set = 'O' if chan_name != non_O_chan else non_O_tag
+                        chan.set_iob_at(tok_idx, corpus2.from_string(tag_to_set))
                 # switch back to segments
-                chan.make_segments_from_iob()
+                for chan_name in chans:
+                    chan = asent.get_channel(chan_name)
+                    chan.make_segments_from_iob()
 
         self.stats.num_sents += 1
         self.stats.num_toks += sent.tokens().size()
diff --git a/iobber/config.py b/iobber/config.py
index 7ffb19e4f6920bbc7471e1cc0f15773d023b7109..80a599ec93dae400b37bb59738f0b69c4684c9e5 100644
--- a/iobber/config.py
+++ b/iobber/config.py
@@ -18,6 +18,8 @@
 O_TAGSET = 'tagset'
 O_CHANNELS = 'channels' # comma separated list of channel/phrase names???
 O_TAGGED = 'tagged' # yes/no: shall we expect tagged (yes) or ambiguous (no) input
 
+S_LAYERS = 'layers'
+    # WCCL file section defining ops for all layers
 
 DEFAULT_OPS = 'default'