Skip to content
Snippets Groups Projects
Commit 6a53ef86 authored by ilor
Browse files

Update wcclrun with a variable dump mode, a table header, and some cleanups

parent 85ec635b
No related merge requests found
#include <cstdlib>
#include <cstdio>
#include <fstream>
#include <iomanip>
......@@ -11,7 +12,9 @@
#include <boost/algorithm/string.hpp>
#include <boost/make_shared.hpp>
#include <boost/program_options.hpp>
#include <boost/filesystem.hpp>
#include <libcorpus2/io/xcesreader.h>
#include <boost/lexical_cast.hpp>
#include <antlr/NoViableAltException.hpp>
#include <antlr/MismatchedTokenException.hpp>
......@@ -20,6 +23,7 @@ namespace {
bool quiet = false;
bool tabs = false;
bool output_orths = true;
bool output_variables = false;
bool global_numbering = false;
bool in_sentence_numbering = true;
}
......@@ -54,6 +58,7 @@ private:
const Corpus2::Tagset& tagset_;
Wccl::Parser parser_;
std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > ops_;
std::vector< std::string > op_names_;
int token_idx;
};
......@@ -68,6 +73,8 @@ bool Runner::load_more_operators(const std::string& filename)
retOp = parser_.parseAnyOperator(is);
if (retOp) {
boost::filesystem::path p(filename);
op_names_.push_back(p.stem());
ops_.push_back(retOp);
return true;
} else {
......@@ -97,51 +104,76 @@ bool Runner::load_more_operators(const std::string& filename)
void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence)
{
Wccl::SentenceContext sc(sentence);
std::vector< std::vector< UnicodeString > > outputs(sentence->size());
std::vector<int> lengths(ops_.size() + 1, 0);
std::vector< std::vector< UnicodeString > > outputs;
streamsave sv(std::cout);
for (size_t i = 0; i < sentence->size(); ++i) {
sc.set_position(i);
UnicodeString orth = sentence->tokens()[i]->orth();
outputs[i].push_back(orth);
lengths[0] = std::max(lengths[0], orth.length());
int li = 1;
foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) {
boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc);
UnicodeString vstr = v->to_string_u(tagset_);
lengths[li] = std::max(lengths[li], vstr.length());
++li;
outputs[i].push_back(vstr);
if (1) { //header
outputs.resize(outputs.size() + 1);
std::vector< UnicodeString >& out = outputs.back();
if (global_numbering) {
out.push_back(UnicodeString::fromUTF8("##"));
}
if (in_sentence_numbering) {
out.push_back(UnicodeString::fromUTF8("#"));
}
if (output_orths) {
out.push_back(UnicodeString::fromUTF8("orth"));
}
for (size_t i = 0; i < op_names_.size(); ++i) {
out.push_back(UnicodeString::fromUTF8(op_names_[i]));
if (output_variables) {
boost::shared_ptr<Wccl::FunctionalOperator> o = ops_[i];
foreach (const std::string& varname, o->valid_variable_names()) {
const Wccl::Value& value = (*o)[varname];
std::string label = "(" + op_names_[i] + ")" + value.make_var_repr(varname);
out.push_back(UnicodeString::fromUTF8(label));
}
}
}
}
for (size_t i = 0; i < sentence->size(); ++i) {
outputs.resize(outputs.size() + 1);
std::vector< UnicodeString >& out = outputs.back();
++token_idx;
if (global_numbering) {
if (tabs) {
std::cout << token_idx << "\t";
} else {
std::cout << std::setw(6) << token_idx << " ";
}
out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(token_idx)));
}
if (in_sentence_numbering) {
if (tabs) {
std::cout << (i + 1) << "\t";
} else {
std::cout << std::setw(2) << (i + 1) << " ";
}
out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(i + 1)));
}
size_t b = output_orths ? 0 : 1;
for (size_t oi = b; oi < outputs[i].size(); ++oi) {
UnicodeString u = outputs[i][oi];
if (oi + 1 < outputs[i].size()) {
if (tabs) {
std::cout << PwrNlp::to_utf8(u);
std::cout << "\t";
} else {
u.padTrailing(lengths[oi] + 1);
std::cout << PwrNlp::to_utf8(u);
if (output_orths) {
out.push_back(sentence->tokens()[i]->orth());
}
sc.set_position(i);
foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) {
boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc);
UnicodeString vstr = v->to_string_u(tagset_);
out.push_back(vstr);
if (output_variables) {
foreach (const std::string& varname, o->valid_variable_names()) {
out.push_back((*o)[varname].to_string_u(tagset_));
}
}
}
}
std::vector<int> lengths(outputs[0].size());
foreach (const std::vector< UnicodeString >& line, outputs) {
for (size_t i = 0; i < line.size(); ++i) {
lengths[i] = std::max(lengths[i], line[i].length());
}
}
foreach (const std::vector< UnicodeString >& line, outputs) {
for (size_t i = 0; i < line.size(); ++i) {
UnicodeString u = line[i];
if (tabs) {
std::cout << PwrNlp::to_utf8(line[i]) << "\t";
} else {
u.padTrailing(lengths[i] + 1);
std::cout << PwrNlp::to_utf8(u);
}
}
......@@ -192,6 +224,8 @@ int main(int argc, char** argv)
"Output global counts\n")
("output-orths,O", value(&output_orths),
"Output token orths\n")
("output-variables,V", value(&output_variables),
"Output operator variables\n")
("help,h", "Show help")
;
boost::program_options::variables_map vm;
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment