From f5c56e898dd472e9822175e2ebab45c6cd9a5b4e Mon Sep 17 00:00:00 2001 From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl> Date: Wed, 18 Apr 2012 15:09:29 +0200 Subject: [PATCH] fix iobber chunker: write sentence boundaries to training files as expected by crf_learn --- iobber/chunker.py | 1 + iobber/classify.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/iobber/chunker.py b/iobber/chunker.py index 616029b..16ed343 100644 --- a/iobber/chunker.py +++ b/iobber/chunker.py @@ -138,6 +138,7 @@ class Chunker: class_label = corpus2.to_string(chan.get_iob_at(tok_idx)) # generate training example and store to file classify.write_example(tr_file, feat_vals, class_label) + classify.write_end_of_sent(tr_file) self.stats.num_sents += 1 self.stats.num_toks += sent.tokens().size() diff --git a/iobber/classify.py b/iobber/classify.py index f70a776..bce6a1d 100644 --- a/iobber/classify.py +++ b/iobber/classify.py @@ -38,6 +38,10 @@ def write_example(tr_file, feat_vals, class_label): tr_file.write(class_label) tr_file.write('\n') +def write_end_of_sent(tr_file): + """Writes end-of-sentence marker to the training file.""" + tr_file.write('\n') + def train_and_save(conf, model_name, config_dir, data_dir, chan_name): """Trains a CRF classifier for the given chan_name. The trained model is saved to filenames (generated using model_name and conf).""" -- GitLab