#include <cstdlib> #include <cstdio> #include <fstream> #include <iomanip> #include <libwccl/values/strset.h> #include <libwccl/parser/Parser.h> #include <libcorpus2/tagsetmanager.h> #include <boost/bind.hpp> #include <boost/algorithm/string.hpp> #include <boost/make_shared.hpp> #include <boost/program_options.hpp> #include <boost/filesystem.hpp> #include <libcorpus2/io/xcesreader.h> #include <boost/lexical_cast.hpp> #include <antlr/NoViableAltException.hpp> #include <antlr/MismatchedTokenException.hpp> namespace { bool quiet = false; bool tabs = false; bool output_orths = true; bool output_variables = false; bool global_numbering = false; bool in_sentence_numbering = true; } class streamsave { public: streamsave(std::ostream& os) : os_(os), flags_(os.flags()) {} ~streamsave() { os_.flags(flags_); } private: std::ostream& os_; std::ios_base::fmtflags flags_; }; class Runner { public: Runner(const Corpus2::Tagset& tagset) : tagset_(tagset), parser_(tagset_), token_idx(0) { } bool load_more_operators(const std::string &filename); const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& operators() const { return ops_; } void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence); void do_stream(std::istream& is, bool first); private: const Corpus2::Tagset& tagset_; Wccl::Parser parser_; std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > ops_; std::vector< std::string > op_names_; int token_idx; }; bool Runner::load_more_operators(const std::string& filename) { boost::shared_ptr<Wccl::FunctionalOperator> retOp; try { std::ifstream is(filename.c_str()); if (!is.good()) { throw Wccl::FileNotFound(filename, "", __FUNCTION__); } retOp = parser_.parseAnyOperator(is); if (retOp) { boost::filesystem::path p(filename); op_names_.push_back(p.stem()); ops_.push_back(retOp); return true; } else { std::cerr << "Problem while parsing -- " << "parser returned NULL!" << std::endl; } } catch (antlr::MismatchedTokenException &e) { std::cerr << e.getFileLineColumnString() << " " << e.getMessage() << std::endl; } catch(antlr::NoViableAltException &e) { std::cerr << e.getFileLineColumnString() << " " << e.getMessage() << std::endl; } catch (Wccl::InvalidVariableName &e) { std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl; } catch (Wccl::VariableTypeMismatch &e) { std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl; } catch (Wccl::WcclError& e) { std::cerr << "Wccl::WcclError:" << e.info() << std::endl; } catch (PwrNlp::PwrNlpError& e) { std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl; } catch (antlr::ANTLRException& e) { std::cerr << "Antlr error " << e.getMessage() << std::endl; } return false; } void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence) { Wccl::SentenceContext sc(sentence); std::vector< std::vector< UnicodeString > > outputs; streamsave sv(std::cout); if (1) { //header outputs.resize(outputs.size() + 1); std::vector< UnicodeString >& out = outputs.back(); if (global_numbering) { out.push_back(UnicodeString::fromUTF8("##")); } if (in_sentence_numbering) { out.push_back(UnicodeString::fromUTF8("#")); } if (output_orths) { out.push_back(UnicodeString::fromUTF8("orth")); } for (size_t i = 0; i < op_names_.size(); ++i) { out.push_back(UnicodeString::fromUTF8(op_names_[i])); if (output_variables) { boost::shared_ptr<Wccl::FunctionalOperator> o = ops_[i]; foreach (const std::string& varname, o->valid_variable_names()) { const Wccl::Value& value = (*o)[varname]; std::string label = "(" + op_names_[i] + ")" + value.make_var_repr(varname); out.push_back(UnicodeString::fromUTF8(label)); } } } } for (size_t i = 0; i < sentence->size(); ++i) { outputs.resize(outputs.size() + 1); std::vector< UnicodeString >& out = outputs.back(); ++token_idx; if (global_numbering) { out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(token_idx))); } if (in_sentence_numbering) { out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(i + 1))); } if (output_orths) { out.push_back(sentence->tokens()[i]->orth()); } sc.set_position(i); foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) { boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc); UnicodeString vstr = v->to_string_u(tagset_); out.push_back(vstr); if (output_variables) { foreach (const std::string& varname, o->valid_variable_names()) { out.push_back((*o)[varname].to_string_u(tagset_)); } } } } std::vector<int> lengths(outputs[0].size()); foreach (const std::vector< UnicodeString >& line, outputs) { for (size_t i = 0; i < line.size(); ++i) { lengths[i] = std::max(lengths[i], line[i].length()); } } foreach (const std::vector< UnicodeString >& line, outputs) { for (size_t i = 0; i < line.size(); ++i) { UnicodeString u = line[i]; if (tabs) { std::cout << PwrNlp::to_utf8(line[i]) << "\t"; } else { u.padTrailing(lengths[i] + 1); std::cout << PwrNlp::to_utf8(u); } } std::cout << "\n"; } } void Runner::do_stream(std::istream& is, bool first) { Corpus2::XcesReader xr(tagset_, is); Corpus2::Sentence::Ptr s; while ((s = xr.get_next_sentence())) { do_sentence(s); std::cout << "\n"; if (first) break; } } int main(int argc, char** argv) { std::string tagset_load = "kipi"; bool first = false; std::vector<std::string> corpora_files, ccl_files, files; bool corpus_stdin = false; using boost::program_options::value; boost::program_options::options_description desc("Allowed options"); desc.add_options() ("tagset,t", value(&tagset_load), "Tagset to use\n") ("corpus,c", value(&corpora_files), "Corpus file to load (XCES)\n") ("ccl-file,C", value(&ccl_files), "CCL query file\n") ("files,f", value(&files), "Files to load, looking at the extension to determine type\n") ("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(), "Read corpus from stdin\n") ("quiet,q", value(&quiet)->zero_tokens(), "Suppress messages\n") ("first-sentence-only,1", value(&first)->zero_tokens(), "Only process first sentence\n") ("tabs", value(&tabs)->zero_tokens(), "Output a tab-separated file\n") ("local-counts,l", value(&in_sentence_numbering), "Output in-sentence token counts\n") ("global-counts,g", value(&global_numbering), "Output global counts\n") ("output-orths,O", value(&output_orths), "Output token orths\n") ("output-variables,V", value(&output_variables), "Output operator variables\n") ("help,h", "Show help") ; boost::program_options::variables_map vm; boost::program_options::positional_options_description p; p.add("files", -1); try { boost::program_options::store( boost::program_options::command_line_parser(argc, argv) .options(desc).positional(p).run(), vm); } catch (boost::program_options::error& e) { std::cerr << e.what() << std::endl; return 2; } boost::program_options::notify(vm); if (vm.count("help")) { std::cerr << "Usage " << argv[0] << " [OPTIONS] FILES\n" << "Files ending with .xml are treated as corpora, otherwise \n" << "as CCL files. Use - to read corpus from stdin (as with -I)"; std::cout << desc << "\n"; return 1; } foreach (const std::string& f, files) { if (f == "-") { corpus_stdin = true; } else if (boost::algorithm::ends_with(f, ".xml")) { corpora_files.push_back(f); } else { ccl_files.push_back(f); } } try { const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); Runner runner(tagset); foreach (const std::string& f, ccl_files) { size_t sz = runner.operators().size(); if (!runner.load_more_operators(f)) { std::cerr << "Warning: error while parsing " << f << "\n"; } if (runner.operators().size() == sz) { std::cerr << "Warning: no operators loaded from " << f << "\n"; } } if (!runner.operators().empty()) { foreach (const std::string& f, corpora_files) { std::ifstream ifs(f.c_str()); if (ifs.good()) { runner.do_stream(ifs, first); } else { std::cerr << "Error reading corpus from " << f << "\n"; } } if (corpus_stdin) { runner.do_stream(std::cin, first); } } } catch (PwrNlp::PwrNlpError& e) { std::cerr << e.info() << std::endl; return 2; } return 0; }