Skip to content
Snippets Groups Projects
Commit 19b27f93 authored by ilor
Browse files

make wcclrules process a chunk at a time, add progress info

parent c25d539a
Branches
No related merge requests found
......@@ -7,6 +7,8 @@
#include <libwccl/parser/Parser.h>
#include <libwccl/ops/rulesequence.h>
#include <libcorpus2/tagsetmanager.h>
#include <libcorpus2/util/tokentimer.h>
#include <boost/bind.hpp>
#include <boost/algorithm/string.hpp>
......@@ -20,6 +22,7 @@
namespace {
bool quiet = false;
bool progress = false;
struct options {
bool first;
......@@ -39,7 +42,10 @@ bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::Ru
ret = parser.parseRuleSequence(is);
if (ret) {
std::cerr << ret->size() << "\n";
if (!quiet) {
std::cerr << "Loaded " << ret->size() << " rule(s) from "
<< filename << "\n";
}
std::copy(ret->begin(), ret->end(), std::back_inserter(rules));
return true;
} else {
......@@ -69,17 +75,27 @@ bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::Ru
/**
 * Apply a rule sequence to an XCES input stream, one chunk at a time.
 *
 * Each chunk is read from the stream, every sentence in it is run through
 * the rules, and the chunk is then written out as a whole.
 *
 * @param writer  token writer that receives each processed chunk
 * @param tagset  tagset used to construct the XCES reader
 * @param rules   rule sequence executed on every sentence
 * @param is      input stream holding the XCES data
 * @param opts    processing options: `until_done` selects repeated
 *                execution (bounded by `until_done_iterations`) instead of
 *                a single pass, `first` stops after the first sentence
 */
void do_stream(boost::shared_ptr<Corpus2::TokenWriter> writer, const Corpus2::Tagset& tagset, Wccl::RuleSequence& rules,
	std::istream& is, const options& opts)
{
	Corpus2::XcesReader reader(tagset, is);
	Corpus2::TokenTimer& timer = Corpus2::global_timer();
	while (boost::shared_ptr<Corpus2::Chunk> c = reader.get_next_chunk()) {
		foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) {
			if (opts.until_done) {
				rules.execute_until_done(s, opts.until_done_iterations);
			} else {
				rules.execute_once(s);
			}
			// Sentences are always counted; only the reporting is optional.
			timer.count_sentence(*s);
			if (progress) {
				// NOTE(review): presumably emits periodic progress output -- confirm
				// against Corpus2::TokenTimer.
				timer.check_slice();
			}
			if (opts.first) break;
		}
		// The chunk is written whole, so in `first` mode the remaining
		// sentences of the first chunk are still emitted, just unprocessed.
		writer->write_chunk(*c);
		if (opts.first) break;
	}
	if (progress) {
		timer.stats();
	}
}
......@@ -110,6 +126,8 @@ int main(int argc, char** argv)
"Files to load, looking at the extension to determine type\n")
("output-format,o", value(&output_format)->default_value("xces"),
writers_help.c_str())
("progress,p", value(&progress)->zero_tokens(),
"Show progress info")
("quiet,q", value(&quiet)->zero_tokens(),
"Suppress messages\n")
("until-done,u", value(&opts.until_done)->zero_tokens(),
......@@ -165,8 +183,11 @@ int main(int argc, char** argv)
}
}
if (!rules.empty()) {
Corpus2::TokenTimer& timer = Corpus2::global_timer();
timer.register_signal_handler();
boost::shared_ptr<Corpus2::TokenWriter> writer;
writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset)); foreach (const std::string& f, corpora_files) {
writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset));
foreach (const std::string& f, corpora_files) {
std::ifstream ifs(f.c_str());
if (ifs.good()) {
do_stream(writer, tagset, rules, ifs, opts);
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment