#include <cstdlib> #include <fstream> #include <iomanip> #include <libwccl/values/strset.h> #include <libwccl/parser/Parser.h> #include <libcorpus2/tagsetmanager.h> #include <boost/bind.hpp> #include <boost/algorithm/string.hpp> #include <boost/make_shared.hpp> #include <boost/program_options.hpp> #include <libcorpus2/io/xcesreader.h> #include <antlr/NoViableAltException.hpp> #include <antlr/MismatchedTokenException.hpp> void load_more_operators(const std::string& filename, Wccl::Parser& parser, std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops) { boost::shared_ptr<const Wccl::Value> retVal; boost::shared_ptr<Wccl::FunctionalOperator> retOp; try { std::ifstream is(filename.c_str()); if (!is.good()) { throw Wccl::FileNotFound(filename, "", __FUNCTION__); } retOp = parser.parseAnyOperator(is); if (retOp) { ops.push_back(retOp); } else { std::cerr << "Problem while parsing -- " << "parser returned NULL!" << std::endl; } } catch (antlr::MismatchedTokenException &e) { std::cerr << e.getFileLineColumnString() << " " << e.getMessage() << std::endl; } catch(antlr::NoViableAltException &e) { std::cerr << e.getFileLineColumnString() << " " << e.getMessage() << std::endl; } catch (Wccl::InvalidVariableName &e) { std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl; } catch (Wccl::VariableTypeMismatch &e) { std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl; } catch (Wccl::WcclError& e) { std::cerr << "Wccl::WcclError:" << e.info() << std::endl; } catch (PwrNlp::PwrNlpError& e) { std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl; } catch (antlr::ANTLRException& e) { std::cerr << "Antlr error " << e.getMessage() << std::endl; } } class streamsave { public: streamsave(std::ostream& os) : os_(os), flags_(os.flags()) {} ~streamsave() { os_.flags(flags_); } private: std::ostream& os_; std::ios_base::fmtflags flags_; }; void do_head(const Corpus2::Tagset& tagset, const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops) { streamsave sv(std::cout); std::cout << "## "; std::cout << std::setw(20) << "orth"; int i = 0; foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops) { ++i; std::cout << " "; std::cout.setf(std::ios::right); std::cout << std::setw(15) << "operator "; std::cout.setf(std::ios::left); std::cout << std::setw(5) << i; } std::cout << "\n"; } void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence, const Corpus2::Tagset& tagset, const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops) { Wccl::SentenceContext sc(sentence); std::vector< std::vector< UnicodeString > > outputs(sentence->size()); std::vector<int> lengths(ops.size() + 1, 0); streamsave sv(std::cout); for (size_t i = 0; i < sentence->size(); ++i) { sc.set_position(i); UnicodeString orth = sentence->tokens()[i]->orth(); outputs[i].push_back(orth); lengths[0] = std::max(lengths[0], orth.length()); int li = 1; foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops) { boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc); UnicodeString vstr = v->to_string_u(tagset); lengths[li] = std::max(lengths[li], vstr.length()); ++li; outputs[i].push_back(vstr); } } for (size_t i = 0; i < sentence->size(); ++i) { std::cout << std::setw(2) << (i + 1) << " "; for (size_t oi = 0; oi < outputs[i].size(); ++oi) { UnicodeString u = outputs[i][oi]; u.padTrailing(lengths[oi]); std::cout << PwrNlp::to_utf8(u) << " "; } std::cout << "\n"; } } void do_file(const std::string& filename, const Corpus2::Tagset& tagset, const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops, bool first) { Corpus2::XcesReader xr(tagset, filename); Corpus2::Sentence::Ptr s; //do_head(tagset, ops); while ((s = xr.get_next_sentence())) { do_sentence(s, tagset, ops); std::cout << "\n"; if (first) break; } } int main(int argc, char** argv) { std::string tagset_load = "kipi"; bool first = false; std::vector<std::string> corpora_files, ccl_files, files; bool quiet = false; bool dump_variables = false; using boost::program_options::value; boost::program_options::options_description desc("Allowed options"); desc.add_options() ("tagset,t", value(&tagset_load), "Tagset to use\n") ("corpus,c", value(&corpora_files), "Corpus file to load (XCES)\n") ("ccl-file,C", value(&ccl_files), "CCL query file\n") ("files,f", value(&files), "Files to load, look at extecion to determine type\n") ("quiet,q", value(&quiet)->zero_tokens(), "Suppress messages\n") ("first-sentence-only,1", value(&first)->zero_tokens(), "Only process first sentence\n") ("help,h", "Show help") ; boost::program_options::variables_map vm; boost::program_options::positional_options_description p; p.add("files", -1); try { boost::program_options::store( boost::program_options::command_line_parser(argc, argv) .options(desc).positional(p).run(), vm); } catch (boost::program_options::error& e) { std::cerr << e.what() << std::endl; return 2; } boost::program_options::notify(vm); if (vm.count("help")) { std::cout << desc << "\n"; return 1; } foreach (const std::string& f, files) { if (boost::algorithm::ends_with(f, ".xml")) { corpora_files.push_back(f); } else { ccl_files.push_back(f); } } try { const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > operators; Wccl::Parser parser(tagset); foreach (const std::string& f, ccl_files) { load_more_operators(f, parser, operators); } if (!operators.empty()) { foreach (const std::string& f, corpora_files) { do_file(f, tagset, operators, first); } } } catch (PwrNlp::PwrNlpError& e) { std::cerr << e.info() << std::endl; return 2; } return 0; }