diff --git a/wcclrun/main.cpp b/wcclrun/main.cpp index eddc1dd738d3e07b9033dc734d3907dc38e08f7d..61c738b9523ae120cca9d59d5438f86bc2510016 100644 --- a/wcclrun/main.cpp +++ b/wcclrun/main.cpp @@ -16,12 +16,50 @@ #include <antlr/NoViableAltException.hpp> #include <antlr/MismatchedTokenException.hpp> +namespace { + bool quiet = false; + bool tabs = false; + bool output_orths = true; + bool global_numbering = false; + bool in_sentence_numbering = true; +} -bool load_more_operators(const std::string& filename, Wccl::Parser& parser, - std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops) +class streamsave { +public: + streamsave(std::ostream& os) : os_(os), flags_(os.flags()) {} + ~streamsave() { os_.flags(flags_); } +private: + std::ostream& os_; + std::ios_base::fmtflags flags_; +}; + +class Runner +{ +public: + Runner(const Corpus2::Tagset& tagset) + : tagset_(tagset), parser_(tagset_), token_idx(0) + { + } + + bool load_more_operators(const std::string &filename); + const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& operators() const { + return ops_; + } + + void do_head(); + void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence); + void do_stream(std::istream& is, bool first); + +private: + const Corpus2::Tagset& tagset_; + Wccl::Parser parser_; + std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > ops_; + int token_idx; +}; - boost::shared_ptr<const Wccl::Value> retVal; +bool Runner::load_more_operators(const std::string& filename) +{ boost::shared_ptr<Wccl::FunctionalOperator> retOp; try { std::ifstream is(filename.c_str()); @@ -29,9 +67,9 @@ bool load_more_operators(const std::string& filename, Wccl::Parser& parser, throw Wccl::FileNotFound(filename, "", __FUNCTION__); } - retOp = parser.parseAnyOperator(is); + retOp = parser_.parseAnyOperator(is); if (retOp) { - ops.push_back(retOp); + ops_.push_back(retOp); return true; } else { std::cerr << "Problem while parsing -- " @@ -57,24 +95,13 @@ bool load_more_operators(const std::string& filename, Wccl::Parser& parser, return false; } -class streamsave -{ -public: - streamsave(std::ostream& os) : os_(os), flags_(os.flags()) {} - ~streamsave() { os_.flags(flags_); } -private: - std::ostream& os_; - std::ios_base::fmtflags flags_; -}; - -void do_head(const Corpus2::Tagset& tagset, - const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops) +void Runner::do_head() { streamsave sv(std::cout); std::cout << "## "; std::cout << std::setw(20) << "orth"; int i = 0; - foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops) { + foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) { ++i; std::cout << " "; std::cout.setf(std::ios::right); @@ -85,13 +112,11 @@ void do_head(const Corpus2::Tagset& tagset, std::cout << "\n"; } -void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence, - const Corpus2::Tagset& tagset, - const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops) +void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence) { Wccl::SentenceContext sc(sentence); std::vector< std::vector< UnicodeString > > outputs(sentence->size()); - std::vector<int> lengths(ops.size() + 1, 0); + std::vector<int> lengths(ops_.size() + 1, 0); streamsave sv(std::cout); for (size_t i = 0; i < sentence->size(); ++i) { sc.set_position(i); @@ -99,34 +124,56 @@ void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence, outputs[i].push_back(orth); lengths[0] = std::max(lengths[0], orth.length()); int li = 1; - foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops) { + foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) { boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc); - UnicodeString vstr = v->to_string_u(tagset); + UnicodeString vstr = v->to_string_u(tagset_); lengths[li] = std::max(lengths[li], vstr.length()); ++li; outputs[i].push_back(vstr); } } for (size_t i = 0; i < sentence->size(); ++i) { - std::cout << std::setw(2) << (i + 1) << " "; - for (size_t oi = 0; oi < outputs[i].size(); ++oi) { + ++token_idx; + if (global_numbering) { + if (tabs) { + std::cout << token_idx << "\t"; + } else { + std::cout << std::setw(6) << token_idx << " "; + } + } + if (in_sentence_numbering) { + if (tabs) { + std::cout << (i + 1) << "\t"; + } else { + std::cout << std::setw(2) << (i + 1) << " "; + } + } + size_t b = output_orths ? 0 : 1; + for (size_t oi = b; oi < outputs[i].size(); ++oi) { UnicodeString u = outputs[i][oi]; - u.padTrailing(lengths[oi]); - std::cout << PwrNlp::to_utf8(u) << " "; + if (oi + 1 < outputs[i].size()) { + if (tabs) { + std::cout << PwrNlp::to_utf8(u); + std::cout << "\t"; + } else { + u.padTrailing(lengths[oi] + 1); + std::cout << PwrNlp::to_utf8(u); + } + } else { + std::cout << PwrNlp::to_utf8(u); + } } std::cout << "\n"; } } -void do_file(const std::string& filename, const Corpus2::Tagset& tagset, - const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops, - bool first) +void Runner::do_stream(std::istream& is, bool first) { - Corpus2::XcesReader xr(tagset, filename); + Corpus2::XcesReader xr(tagset_, is); Corpus2::Sentence::Ptr s; //do_head(tagset, ops); while ((s = xr.get_next_sentence())) { - do_sentence(s, tagset, ops); + do_sentence(s); std::cout << "\n"; if (first) break; } @@ -137,8 +184,7 @@ int main(int argc, char** argv) std::string tagset_load = "kipi"; bool first = false; std::vector<std::string> corpora_files, ccl_files, files; - bool quiet = false; - bool dump_variables = false; + bool corpus_stdin = false; using boost::program_options::value; boost::program_options::options_description desc("Allowed options"); @@ -151,10 +197,18 @@ int main(int argc, char** argv) "CCL query file\n") ("files,f", value(&files), "Files to load, look at extecion to determine type\n") + ("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(), + "Read corpus from stdin\n") ("quiet,q", value(&quiet)->zero_tokens(), "Suppress messages\n") ("first-sentence-only,1", value(&first)->zero_tokens(), "Only process first sentence\n") + ("tabs", value(&tabs)->zero_tokens(), + "Output a tab-separated file\n") + ("local-counts,l", value(&in_sentence_numbering), + "Output in-sentence token counts\n") + ("global-counts,g", value(&global_numbering), + "Output global counts\n") ("help,h", "Show help") ; boost::program_options::variables_map vm; @@ -172,12 +226,17 @@ int main(int argc, char** argv) boost::program_options::notify(vm); if (vm.count("help")) { + std::cerr << "Usage " << argv[0] << " [OPTIONS] FILES\n" + << "Files ending with .xml are treated as corpora, otherwise \n" + << "as CCL files. Use - to read corpus from stdin (as with -I)"; std::cout << desc << "\n"; return 1; } foreach (const std::string& f, files) { - if (boost::algorithm::ends_with(f, ".xml")) { + if (f == "-") { + corpus_stdin = true; + } else if (boost::algorithm::ends_with(f, ".xml")) { corpora_files.push_back(f); } else { ccl_files.push_back(f); @@ -186,20 +245,27 @@ int main(int argc, char** argv) try { const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); - std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > operators; - Wccl::Parser parser(tagset); + Runner runner(tagset); foreach (const std::string& f, ccl_files) { - int sz = operators.size(); - if (!load_more_operators(f, parser, operators)) { + size_t sz = runner.operators().size(); + if (!runner.load_more_operators(f)) { std::cerr << "Warning: error while parsing " << f << "\n"; } - if (operators.size() == sz) { + if (runner.operators().size() == sz) { std::cerr << "Warning: no operators loaded from " << f << "\n"; } } - if (!operators.empty()) { + if (!runner.operators().empty()) { foreach (const std::string& f, corpora_files) { - do_file(f, tagset, operators, first); + std::ifstream ifs(f.c_str()); + if (ifs.good()) { + runner.do_stream(ifs, first); + } else { + std::cerr << "Error reading corpus from " << f << "\n"; + } + } + if (corpus_stdin) { + runner.do_stream(std::cin, first); } } } catch (PwrNlp::PwrNlpError& e) {