diff --git a/wcclrun/main.cpp b/wcclrun/main.cpp index ad391ffc1502de460ebd38600676aec23ebedc79..710a81d7bc45a53a00830c342768f878cc9ce3f8 100644 --- a/wcclrun/main.cpp +++ b/wcclrun/main.cpp @@ -1,4 +1,5 @@ #include <cstdlib> +#include <cstdio> #include <fstream> #include <iomanip> @@ -11,7 +12,9 @@ #include <boost/algorithm/string.hpp> #include <boost/make_shared.hpp> #include <boost/program_options.hpp> +#include <boost/filesystem.hpp> #include <libcorpus2/io/xcesreader.h> +#include <boost/lexical_cast.hpp> #include <antlr/NoViableAltException.hpp> #include <antlr/MismatchedTokenException.hpp> @@ -20,6 +23,7 @@ namespace { bool quiet = false; bool tabs = false; bool output_orths = true; + bool output_variables = false; bool global_numbering = false; bool in_sentence_numbering = true; } @@ -54,6 +58,7 @@ private: const Corpus2::Tagset& tagset_; Wccl::Parser parser_; std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > ops_; + std::vector< std::string > op_names_; int token_idx; }; @@ -68,6 +73,8 @@ bool Runner::load_more_operators(const std::string& filename) retOp = parser_.parseAnyOperator(is); if (retOp) { + boost::filesystem::path p(filename); + op_names_.push_back(p.stem()); ops_.push_back(retOp); return true; } else { @@ -97,51 +104,76 @@ bool Runner::load_more_operators(const std::string& filename) void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence) { Wccl::SentenceContext sc(sentence); - std::vector< std::vector< UnicodeString > > outputs(sentence->size()); - std::vector<int> lengths(ops_.size() + 1, 0); + std::vector< std::vector< UnicodeString > > outputs; + streamsave sv(std::cout); - for (size_t i = 0; i < sentence->size(); ++i) { - sc.set_position(i); - UnicodeString orth = sentence->tokens()[i]->orth(); - outputs[i].push_back(orth); - lengths[0] = std::max(lengths[0], orth.length()); - int li = 1; - foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) { - boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc); - UnicodeString vstr = v->to_string_u(tagset_); - lengths[li] = std::max(lengths[li], vstr.length()); - ++li; - outputs[i].push_back(vstr); + + if (1) { //header + outputs.resize(outputs.size() + 1); + std::vector< UnicodeString >& out = outputs.back(); + if (global_numbering) { + out.push_back(UnicodeString::fromUTF8("##")); + } + if (in_sentence_numbering) { + out.push_back(UnicodeString::fromUTF8("#")); + } + if (output_orths) { + out.push_back(UnicodeString::fromUTF8("orth")); + } + for (size_t i = 0; i < op_names_.size(); ++i) { + out.push_back(UnicodeString::fromUTF8(op_names_[i])); + if (output_variables) { + boost::shared_ptr<Wccl::FunctionalOperator> o = ops_[i]; + foreach (const std::string& varname, o->valid_variable_names()) { + const Wccl::Value& value = (*o)[varname]; + std::string label = "(" + op_names_[i] + ")" + value.make_var_repr(varname); + out.push_back(UnicodeString::fromUTF8(label)); + } + } } } for (size_t i = 0; i < sentence->size(); ++i) { + outputs.resize(outputs.size() + 1); + std::vector< UnicodeString >& out = outputs.back(); + ++token_idx; if (global_numbering) { - if (tabs) { - std::cout << token_idx << "\t"; - } else { - std::cout << std::setw(6) << token_idx << " "; - } + out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(token_idx))); } if (in_sentence_numbering) { - if (tabs) { - std::cout << (i + 1) << "\t"; - } else { - std::cout << std::setw(2) << (i + 1) << " "; - } + out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(i + 1))); } - size_t b = output_orths ? 0 : 1; - for (size_t oi = b; oi < outputs[i].size(); ++oi) { - UnicodeString u = outputs[i][oi]; - if (oi + 1 < outputs[i].size()) { - if (tabs) { - std::cout << PwrNlp::to_utf8(u); - std::cout << "\t"; - } else { - u.padTrailing(lengths[oi] + 1); - std::cout << PwrNlp::to_utf8(u); + if (output_orths) { + out.push_back(sentence->tokens()[i]->orth()); + } + + sc.set_position(i); + foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) { + boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc); + UnicodeString vstr = v->to_string_u(tagset_); + out.push_back(vstr); + if (output_variables) { + foreach (const std::string& varname, o->valid_variable_names()) { + out.push_back((*o)[varname].to_string_u(tagset_)); } + } + } + } + + std::vector<int> lengths(outputs[0].size()); + foreach (const std::vector< UnicodeString >& line, outputs) { + for (size_t i = 0; i < line.size(); ++i) { + lengths[i] = std::max(lengths[i], line[i].length()); + } + } + + foreach (const std::vector< UnicodeString >& line, outputs) { + for (size_t i = 0; i < line.size(); ++i) { + UnicodeString u = line[i]; + if (tabs) { + std::cout << PwrNlp::to_utf8(line[i]) << "\t"; } else { + u.padTrailing(lengths[i] + 1); std::cout << PwrNlp::to_utf8(u); } } @@ -192,6 +224,8 @@ int main(int argc, char** argv) "Output global counts\n") ("output-orths,O", value(&output_orths), "Output token orths\n") + ("output-variables,V", value(&output_variables), + "Output operator variables\n") ("help,h", "Show help") ; boost::program_options::variables_map vm;