Skip to content
Snippets Groups Projects
Commit 6a53ef86 authored by ilor
Browse files

update wcclrun with variable dump mode, table header and some cleanups

parent 85ec635b
Branches
No related tags found
No related merge requests found
#include <cstdlib>
#include <cstdio>
#include <fstream>
#include <iomanip>
......@@ -11,7 +12,9 @@
#include <boost/algorithm/string.hpp>
#include <boost/make_shared.hpp>
#include <boost/program_options.hpp>
#include <boost/filesystem.hpp>
#include <libcorpus2/io/xcesreader.h>
#include <boost/lexical_cast.hpp>
#include <antlr/NoViableAltException.hpp>
#include <antlr/MismatchedTokenException.hpp>
......@@ -20,6 +23,7 @@ namespace {
bool quiet = false;
bool tabs = false;
bool output_orths = true;
bool output_variables = false;
bool global_numbering = false;
bool in_sentence_numbering = true;
}
......@@ -54,6 +58,7 @@ private:
const Corpus2::Tagset& tagset_;
Wccl::Parser parser_;
std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > ops_;
std::vector< std::string > op_names_;
int token_idx;
};
......@@ -68,6 +73,8 @@ bool Runner::load_more_operators(const std::string& filename)
retOp = parser_.parseAnyOperator(is);
if (retOp) {
boost::filesystem::path p(filename);
op_names_.push_back(p.stem());
ops_.push_back(retOp);
return true;
} else {
......@@ -97,51 +104,76 @@ bool Runner::load_more_operators(const std::string& filename)
void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence)
{
Wccl::SentenceContext sc(sentence);
std::vector< std::vector< UnicodeString > > outputs(sentence->size());
std::vector<int> lengths(ops_.size() + 1, 0);
std::vector< std::vector< UnicodeString > > outputs;
streamsave sv(std::cout);
if (1) { //header
outputs.resize(outputs.size() + 1);
std::vector< UnicodeString >& out = outputs.back();
if (global_numbering) {
out.push_back(UnicodeString::fromUTF8("##"));
}
if (in_sentence_numbering) {
out.push_back(UnicodeString::fromUTF8("#"));
}
if (output_orths) {
out.push_back(UnicodeString::fromUTF8("orth"));
}
for (size_t i = 0; i < op_names_.size(); ++i) {
out.push_back(UnicodeString::fromUTF8(op_names_[i]));
if (output_variables) {
boost::shared_ptr<Wccl::FunctionalOperator> o = ops_[i];
foreach (const std::string& varname, o->valid_variable_names()) {
const Wccl::Value& value = (*o)[varname];
std::string label = "(" + op_names_[i] + ")" + value.make_var_repr(varname);
out.push_back(UnicodeString::fromUTF8(label));
}
}
}
}
for (size_t i = 0; i < sentence->size(); ++i) {
outputs.resize(outputs.size() + 1);
std::vector< UnicodeString >& out = outputs.back();
++token_idx;
if (global_numbering) {
out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(token_idx)));
}
if (in_sentence_numbering) {
out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(i + 1)));
}
if (output_orths) {
out.push_back(sentence->tokens()[i]->orth());
}
sc.set_position(i);
UnicodeString orth = sentence->tokens()[i]->orth();
outputs[i].push_back(orth);
lengths[0] = std::max(lengths[0], orth.length());
int li = 1;
foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) {
boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc);
UnicodeString vstr = v->to_string_u(tagset_);
lengths[li] = std::max(lengths[li], vstr.length());
++li;
outputs[i].push_back(vstr);
out.push_back(vstr);
if (output_variables) {
foreach (const std::string& varname, o->valid_variable_names()) {
out.push_back((*o)[varname].to_string_u(tagset_));
}
}
for (size_t i = 0; i < sentence->size(); ++i) {
++token_idx;
if (global_numbering) {
if (tabs) {
std::cout << token_idx << "\t";
} else {
std::cout << std::setw(6) << token_idx << " ";
}
}
if (in_sentence_numbering) {
if (tabs) {
std::cout << (i + 1) << "\t";
} else {
std::cout << std::setw(2) << (i + 1) << " ";
std::vector<int> lengths(outputs[0].size());
foreach (const std::vector< UnicodeString >& line, outputs) {
for (size_t i = 0; i < line.size(); ++i) {
lengths[i] = std::max(lengths[i], line[i].length());
}
}
size_t b = output_orths ? 0 : 1;
for (size_t oi = b; oi < outputs[i].size(); ++oi) {
UnicodeString u = outputs[i][oi];
if (oi + 1 < outputs[i].size()) {
foreach (const std::vector< UnicodeString >& line, outputs) {
for (size_t i = 0; i < line.size(); ++i) {
UnicodeString u = line[i];
if (tabs) {
std::cout << PwrNlp::to_utf8(u);
std::cout << "\t";
} else {
u.padTrailing(lengths[oi] + 1);
std::cout << PwrNlp::to_utf8(u);
}
std::cout << PwrNlp::to_utf8(line[i]) << "\t";
} else {
u.padTrailing(lengths[i] + 1);
std::cout << PwrNlp::to_utf8(u);
}
}
......@@ -192,6 +224,8 @@ int main(int argc, char** argv)
"Output global counts\n")
("output-orths,O", value(&output_orths),
"Output token orths\n")
("output-variables,V", value(&output_variables),
"Output operator variables\n")
("help,h", "Show help")
;
boost::program_options::variables_map vm;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment