From 19b27f9370f8a32769b2c16521ec66ecb5a60ee9 Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Fri, 11 Feb 2011 14:38:45 +0100 Subject: [PATCH] make wcclrules process a chunk at a time, add progress info --- wcclrules/main.cpp | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/wcclrules/main.cpp b/wcclrules/main.cpp index 2006968..40936af 100644 --- a/wcclrules/main.cpp +++ b/wcclrules/main.cpp @@ -7,6 +7,8 @@ #include <libwccl/parser/Parser.h> #include <libwccl/ops/rulesequence.h> #include <libcorpus2/tagsetmanager.h> +#include <libcorpus2/util/tokentimer.h> + #include <boost/bind.hpp> #include <boost/algorithm/string.hpp> @@ -20,6 +22,7 @@ namespace { bool quiet = false; + bool progress = false; struct options { bool first; @@ -39,7 +42,10 @@ bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::Ru ret = parser.parseRuleSequence(is); if (ret) { - std::cerr << ret->size() << "\n"; + if (!quiet) { + std::cerr << "Loaded " << ret->size() << " rule(s) from " + << filename << "\n"; + } std::copy(ret->begin(), ret->end(), std::back_inserter(rules)); return true; } else { @@ -69,17 +75,27 @@ bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::Ru void do_stream(boost::shared_ptr<Corpus2::TokenWriter> writer, const Corpus2::Tagset& tagset, Wccl::RuleSequence& rules, std::istream& is, const options& opts) { - Corpus2::XcesReader xr(tagset, is); - Corpus2::Sentence::Ptr s; - while ((s = xr.get_next_sentence())) { - if (opts.until_done) { - rules.execute_until_done(s, opts.until_done_iterations); - } else { - rules.execute_once(s); + Corpus2::XcesReader reader(tagset, is); + Corpus2::TokenTimer& timer = Corpus2::global_timer(); + while (boost::shared_ptr<Corpus2::Chunk> c = reader.get_next_chunk()) { + foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) { + if (opts.until_done) { + rules.execute_until_done(s, opts.until_done_iterations); + } else { + rules.execute_once(s); + } + timer.count_sentence(*s); + if (progress) { + timer.check_slice(); + } + if (opts.first) break; } - writer->write_sentence(*s); + writer->write_chunk(*c); if (opts.first) break; } + if (progress) { + timer.stats(); + } } @@ -110,6 +126,8 @@ int main(int argc, char** argv) "Files to load, looking at the extension to determine type\n") ("output-format,o", value(&output_format)->default_value("xces"), writers_help.c_str()) + ("progress,p", value(&progress)->zero_tokens(), + "Show progress info") ("quiet,q", value(&quiet)->zero_tokens(), "Suppress messages\n") ("until-done,u", value(&opts.until_done)->zero_tokens(), @@ -165,8 +183,11 @@ int main(int argc, char** argv) } } if (!rules.empty()) { + Corpus2::TokenTimer& timer = Corpus2::global_timer(); + timer.register_signal_handler(); boost::shared_ptr<Corpus2::TokenWriter> writer; - writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset)); foreach (const std::string& f, corpora_files) { + writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset)); + foreach (const std::string& f, corpora_files) { std::ifstream ifs(f.c_str()); if (ifs.good()) { do_stream(writer, tagset, rules, ifs, opts); -- GitLab