From 967dd4d1c1b5d25d4b62910c5dcbc8b89b78ac7e Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Thu, 12 May 2011 13:05:08 +0200 Subject: [PATCH] wccl-run upgrade: simple filtering, sentence # output, output aggregation --- wccl-apps/wccl-run.cpp | 138 +++++++++++++++++++++++++++++++++-------- 1 file changed, 113 insertions(+), 25 deletions(-) diff --git a/wccl-apps/wccl-run.cpp b/wccl-apps/wccl-run.cpp index 2a20c37..fb2d552 100644 --- a/wccl-apps/wccl-run.cpp +++ b/wccl-apps/wccl-run.cpp @@ -26,6 +26,7 @@ namespace { bool output_orths = true; bool output_variables = false; bool global_numbering = false; + bool sentence_indices = true; bool output_header = true; bool in_sentence_numbering = true; } @@ -45,7 +46,7 @@ class Runner public: Runner(const Corpus2::Tagset& tagset) : tagset_(tagset), parser_(tagset_), token_idx(0), progress_(false), - search_path_(".") + search_path_("."), want_header_(true), aggregate_output_(false) { } @@ -57,31 +58,52 @@ public: } } + void set_aggregate_output(bool v) { aggregate_output_ = v; } + bool load_more_operators(const std::string &filename); bool load_operator_string(const std::string &op_string); + size_t size() const { + return ops_.size() + (filter_op_ ? 
1 : 0); + } + const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& operators() const { return ops_; } - void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence); + void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence, + std::vector< std::vector< UnicodeString > >& outputs, int sidx); + void run(boost::shared_ptr<Corpus2::TokenReader> , bool first); - void output_tabular(const std::vector< std::vector< UnicodeString > > outputs); + void output_tabular(const std::vector< std::vector< UnicodeString > >& outputs); void set_search_path(const std::string& path) { search_path_ = path; } + void set_filter_op(const std::string op_name, const std::string& op_value) { + filter_op_name_ = op_name; + filter_op_value_ = op_value; + } + private: + void do_operator_variables(const boost::shared_ptr<Wccl::FunctionalOperator>& op, + std::vector<UnicodeString>& out, bool variables); + const Corpus2::Tagset& tagset_; Wccl::Parser parser_; std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > ops_; + boost::shared_ptr< Wccl::FunctionalOperator > filter_op_; std::vector< std::string > op_names_; int token_idx; bool progress_; std::string search_path_; + std::string filter_op_name_; + std::string filter_op_value_; + bool want_header_; + bool aggregate_output_; }; bool Runner::load_more_operators(const std::string& filename) @@ -96,11 +118,16 @@ bool Runner::load_more_operators(const std::string& filename) retOp = parser_.parseWcclFile(is, search_path_); if (retOp) { boost::filesystem::path p(filename); - std::string prefix = p.stem() + ":"; + std::string prefix = ""; //p.stem() + ":"; Wccl::UntypedOpSequence::name_op_v_t pairs = retOp->gen_all_op_pairs(); foreach (const Wccl::UntypedOpSequence::name_op_pair_t v, pairs) { - op_names_.push_back(prefix + v.first); - ops_.push_back(v.second); + std::string opname = v.first; + if (opname == filter_op_name_) { + filter_op_ = v.second; + } else { + op_names_.push_back(opname); + 
ops_.push_back(v.second); + } } return true; } else { @@ -148,26 +175,48 @@ bool Runner::load_operator_string(const std::string& op_string) return false; } -void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence) +void Runner::do_operator_variables(const boost::shared_ptr<Wccl::FunctionalOperator>& op, + std::vector<UnicodeString>& out, bool variables) { - std::cerr << "dos"; - Wccl::SentenceContext sc(sentence); - std::vector< std::vector< UnicodeString > > outputs; + if (variables) { + foreach (const std::string& varname, op->valid_variable_names()) { + out.push_back((*op)[varname].to_string_u(tagset_)); + } + } +} +void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence, + std::vector< std::vector< UnicodeString > >& outputs, int sidx) +{ + Wccl::SentenceContext sc(sentence); streamsave sv(std::cout); - if (output_header) { + if (output_header && want_header_) { outputs.resize(outputs.size() + 1); std::vector< UnicodeString >& out = outputs.back(); if (global_numbering) { out.push_back(UnicodeString::fromUTF8("##")); } + if (sentence_indices) { + out.push_back(UnicodeString::fromUTF8("S#")); + } if (in_sentence_numbering) { out.push_back(UnicodeString::fromUTF8("#")); } if (output_orths) { out.push_back(UnicodeString::fromUTF8("orth")); } + if (filter_op_) { + out.push_back(UnicodeString::fromUTF8(filter_op_name_)); + if (output_variables) { + boost::shared_ptr<Wccl::FunctionalOperator> o = filter_op_; + foreach (const std::string& varname, o->valid_variable_names()) { + const Wccl::Value& value = (*o)[varname]; + std::string label = "(" + filter_op_name_ + ")" + value.make_var_repr(varname); + out.push_back(UnicodeString::fromUTF8(label)); + } + } + } for (size_t i = 0; i < op_names_.size(); ++i) { out.push_back(UnicodeString::fromUTF8(op_names_[i])); if (output_variables) { @@ -181,36 +230,48 @@ void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence) } } for (size_t i = 0; i < 
sentence->size(); ++i) { + ++token_idx; + sc.set_position(i); + UnicodeString vstr; + if (filter_op_) { + boost::shared_ptr<const Wccl::Value> v = filter_op_->base_apply(sc); + vstr = v->to_string_u(tagset_); + std::string uvstr = PwrNlp::to_utf8(vstr); + if (uvstr != filter_op_value_) { + continue; + } else { + } + } outputs.resize(outputs.size() + 1); std::vector< UnicodeString >& out = outputs.back(); - ++token_idx; if (global_numbering) { out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(token_idx))); } + if (sentence_indices) { + out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(sidx))); + } if (in_sentence_numbering) { out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(i + 1))); } if (output_orths) { out.push_back(sentence->tokens()[i]->orth()); } + if (filter_op_) { + out.push_back(vstr); + do_operator_variables(filter_op_, out, output_variables); + } - sc.set_position(i); foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) { boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc); UnicodeString vstr = v->to_string_u(tagset_); out.push_back(vstr); - if (output_variables) { - foreach (const std::string& varname, o->valid_variable_names()) { - out.push_back((*o)[varname].to_string_u(tagset_)); - } - } + do_operator_variables(o, out, output_variables); } } - output_tabular(outputs); } -void Runner::output_tabular(const std::vector<std::vector<UnicodeString> > outputs) +void Runner::output_tabular(const std::vector<std::vector<UnicodeString> >& outputs) { std::vector<int> lengths(outputs[0].size()); foreach (const std::vector< UnicodeString >& line, outputs) { @@ -238,27 +299,42 @@ void Runner::output_tabular(const std::vector<std::vector<UnicodeString> > outpu void Runner::run(boost::shared_ptr<Corpus2::TokenReader> reader, bool first) { + std::vector<std::vector<UnicodeString> > outputs; Corpus2::Sentence::Ptr s; Corpus2::TokenTimer& timer = Corpus2::global_timer(); + int sidx = 
0; while ((s = reader->get_next_sentence())) { - do_sentence(s); + do_sentence(s, outputs, sidx++); timer.count_sentence(*s); + if (aggregate_output_) { + want_header_ = false; + } + if (!outputs.empty() && !aggregate_output_) { + output_tabular(outputs); + } if (progress_) { timer.check_slice(); } - std::cout << "\n"; + if (!filter_op_) { + std::cout << "\n"; + } if (first) break; } + if (!outputs.empty() && aggregate_output_) { + output_tabular(outputs); + } } int main(int argc, char** argv) { std::string tagset_load = "kipi"; + std::string filter_op_name, filter_op_value; bool first = false, progress = false; std::string input_format; std::string search_path; std::vector<std::string> corpora_files, files, operator_strings; bool corpus_stdin = false; + bool aggregate = false; using boost::program_options::value; std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " "); std::string readers_help = "Input format, any of: " + readers + "\n"; @@ -289,12 +365,20 @@ int main(int argc, char** argv) "Output in-sentence token counts") ("global-counts,g", value(&global_numbering), "Output global counts") + ("output-sentence-indices,H", value(&sentence_indices), + "Output sentence indices") ("output-orths,O", value(&output_orths), "Output token orths") ("output-variables,V", value(&output_variables), "Output operator variables") ("output-header,H", value(&output_header), "Output table header") + ("filter-operator,F", value(&filter_op_name), + "Filter operator name") + ("filter-value", value(&filter_op_value)->default_value("True"), + "Filter operator expected valye") + ("aggregate-output,A", value(&aggregate), + "Aggregate output (prettier, slower)") ("progress,p", value(&progress)->zero_tokens(), "Show progress info") ("help,h", "Show help") @@ -342,20 +426,24 @@ int main(int argc, char** argv) if (!search_path.empty()) { runner.set_search_path(search_path); } + if (!filter_op_name.empty()) { + 
runner.set_filter_op(filter_op_name, filter_op_value); + } + runner.set_aggregate_output(aggregate); foreach (const std::string& f, operator_strings) { if (boost::algorithm::ends_with(f, ".ccl")) { - size_t sz = runner.operators().size(); + size_t sz = runner.size(); if (!runner.load_more_operators(f)) { std::cerr << "Warning: error while parsing " << f << "\n"; } - if (runner.operators().size() == sz) { + if (runner.size() == sz) { std::cerr << "Warning: no operators loaded from " << f << "\n"; } } else { runner.load_operator_string(f); } } - if (!runner.operators().empty()) { + if (runner.size() > 0) { foreach (const std::string& f, corpora_files) { runner.run(Corpus2::TokenReader::create_path_reader( input_format, tagset, f), first); -- GitLab