#include <cstdlib> #include <fstream> #include <iomanip> #include <libwccl/values/strset.h> #include <libwccl/parser/Parser.h> #include <libcorpus2/tagsetmanager.h> #include <boost/bind.hpp> #include <boost/algorithm/string.hpp> #include <boost/make_shared.hpp> #include <boost/program_options.hpp> #include <libcorpus2/io/xcesreader.h> #include <antlr/NoViableAltException.hpp> #include <antlr/MismatchedTokenException.hpp> namespace { bool quiet = false; bool tabs = false; bool output_orths = true; bool global_numbering = false; bool in_sentence_numbering = true; } class streamsave { public: streamsave(std::ostream& os) : os_(os), flags_(os.flags()) {} ~streamsave() { os_.flags(flags_); } private: std::ostream& os_; std::ios_base::fmtflags flags_; }; class Runner { public: Runner(const Corpus2::Tagset& tagset) : tagset_(tagset), parser_(tagset_), token_idx(0) { } bool load_more_operators(const std::string &filename); const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& operators() const { return ops_; } void do_head(); void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence); void do_stream(std::istream& is, bool first); private: const Corpus2::Tagset& tagset_; Wccl::Parser parser_; std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > ops_; int token_idx; }; bool Runner::load_more_operators(const std::string& filename) { boost::shared_ptr<Wccl::FunctionalOperator> retOp; try { std::ifstream is(filename.c_str()); if (!is.good()) { throw Wccl::FileNotFound(filename, "", __FUNCTION__); } retOp = parser_.parseAnyOperator(is); if (retOp) { ops_.push_back(retOp); return true; } else { std::cerr << "Problem while parsing -- " << "parser returned NULL!" << std::endl; } } catch (antlr::MismatchedTokenException &e) { std::cerr << e.getFileLineColumnString() << " " << e.getMessage() << std::endl; } catch(antlr::NoViableAltException &e) { std::cerr << e.getFileLineColumnString() << " " << e.getMessage() << std::endl; } catch (Wccl::InvalidVariableName &e) { std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl; } catch (Wccl::VariableTypeMismatch &e) { std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl; } catch (Wccl::WcclError& e) { std::cerr << "Wccl::WcclError:" << e.info() << std::endl; } catch (PwrNlp::PwrNlpError& e) { std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl; } catch (antlr::ANTLRException& e) { std::cerr << "Antlr error " << e.getMessage() << std::endl; } return false; } void Runner::do_head() { streamsave sv(std::cout); std::cout << "## "; std::cout << std::setw(20) << "orth"; int i = 0; foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) { ++i; std::cout << " "; std::cout.setf(std::ios::right); std::cout << std::setw(15) << "operator "; std::cout.setf(std::ios::left); std::cout << std::setw(5) << i; } std::cout << "\n"; } void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence) { Wccl::SentenceContext sc(sentence); std::vector< std::vector< UnicodeString > > outputs(sentence->size()); std::vector<int> lengths(ops_.size() + 1, 0); streamsave sv(std::cout); for (size_t i = 0; i < sentence->size(); ++i) { sc.set_position(i); UnicodeString orth = sentence->tokens()[i]->orth(); outputs[i].push_back(orth); lengths[0] = std::max(lengths[0], orth.length()); int li = 1; foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) { boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc); UnicodeString vstr = v->to_string_u(tagset_); lengths[li] = std::max(lengths[li], vstr.length()); ++li; outputs[i].push_back(vstr); } } for (size_t i = 0; i < sentence->size(); ++i) { ++token_idx; if (global_numbering) { if (tabs) { std::cout << token_idx << "\t"; } else { std::cout << std::setw(6) << token_idx << " "; } } if (in_sentence_numbering) { if (tabs) { std::cout << (i + 1) << "\t"; } else { std::cout << std::setw(2) << (i + 1) << " "; } } size_t b = output_orths ? 0 : 1; for (size_t oi = b; oi < outputs[i].size(); ++oi) { UnicodeString u = outputs[i][oi]; if (oi + 1 < outputs[i].size()) { if (tabs) { std::cout << PwrNlp::to_utf8(u); std::cout << "\t"; } else { u.padTrailing(lengths[oi] + 1); std::cout << PwrNlp::to_utf8(u); } } else { std::cout << PwrNlp::to_utf8(u); } } std::cout << "\n"; } } void Runner::do_stream(std::istream& is, bool first) { Corpus2::XcesReader xr(tagset_, is); Corpus2::Sentence::Ptr s; //do_head(tagset, ops); while ((s = xr.get_next_sentence())) { do_sentence(s); std::cout << "\n"; if (first) break; } } int main(int argc, char** argv) { std::string tagset_load = "kipi"; bool first = false; std::vector<std::string> corpora_files, ccl_files, files; bool corpus_stdin = false; using boost::program_options::value; boost::program_options::options_description desc("Allowed options"); desc.add_options() ("tagset,t", value(&tagset_load), "Tagset to use\n") ("corpus,c", value(&corpora_files), "Corpus file to load (XCES)\n") ("ccl-file,C", value(&ccl_files), "CCL query file\n") ("files,f", value(&files), "Files to load, look at extecion to determine type\n") ("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(), "Read corpus from stdin\n") ("quiet,q", value(&quiet)->zero_tokens(), "Suppress messages\n") ("first-sentence-only,1", value(&first)->zero_tokens(), "Only process first sentence\n") ("tabs", value(&tabs)->zero_tokens(), "Output a tab-separated file\n") ("local-counts,l", value(&in_sentence_numbering), "Output in-sentence token counts\n") ("global-counts,g", value(&global_numbering), "Output global counts\n") ("help,h", "Show help") ; boost::program_options::variables_map vm; boost::program_options::positional_options_description p; p.add("files", -1); try { boost::program_options::store( boost::program_options::command_line_parser(argc, argv) .options(desc).positional(p).run(), vm); } catch (boost::program_options::error& e) { std::cerr << e.what() << std::endl; return 2; } boost::program_options::notify(vm); if (vm.count("help")) { std::cerr << "Usage " << argv[0] << " [OPTIONS] FILES\n" << "Files ending with .xml are treated as corpora, otherwise \n" << "as CCL files. Use - to read corpus from stdin (as with -I)"; std::cout << desc << "\n"; return 1; } foreach (const std::string& f, files) { if (f == "-") { corpus_stdin = true; } else if (boost::algorithm::ends_with(f, ".xml")) { corpora_files.push_back(f); } else { ccl_files.push_back(f); } } try { const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); Runner runner(tagset); foreach (const std::string& f, ccl_files) { size_t sz = runner.operators().size(); if (!runner.load_more_operators(f)) { std::cerr << "Warning: error while parsing " << f << "\n"; } if (runner.operators().size() == sz) { std::cerr << "Warning: no operators loaded from " << f << "\n"; } } if (!runner.operators().empty()) { foreach (const std::string& f, corpora_files) { std::ifstream ifs(f.c_str()); if (ifs.good()) { runner.do_stream(ifs, first); } else { std::cerr << "Error reading corpus from " << f << "\n"; } } if (corpus_stdin) { runner.do_stream(std::cin, first); } } } catch (PwrNlp::PwrNlpError& e) { std::cerr << e.info() << std::endl; return 2; } return 0; }