#include <cstdlib> #include <fstream> #include <iomanip> #include <libwccl/values/strset.h> #include <libwccl/parser/Parser.h> #include <libcorpus2/tagsetmanager.h> #include <libcorpus2/util/tokentimer.h> #include <boost/bind.hpp> #include <boost/algorithm/string.hpp> #include <boost/make_shared.hpp> #include <boost/filesystem.hpp> #include <boost/program_options.hpp> #include <libcorpus2/io/reader.h> #include <libcorpus2/io/writer.h> namespace { bool quiet = false; bool progress = false; struct options { bool first; bool until_done; int until_done_iterations; }; } class MatchRunner { public: MatchRunner(const Corpus2::Tagset& tagset) : tagset_(tagset), parser_(tagset_) { } bool load_more_rules(const std::string &filename); bool load_operator_string(const std::string &op_string); void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, boost::shared_ptr<Corpus2::TokenWriter> writer); bool empty() { return rules_.empty(); } private: const Corpus2::Tagset& tagset_; Wccl::Parser parser_; std::vector<std::string> rule_names_; std::vector<boost::shared_ptr<Wccl::MatchRule> > rules_; }; bool MatchRunner::load_more_rules(const std::string& filename) { boost::shared_ptr<Wccl::MatchRule> retOp; try { std::ifstream is(filename.c_str()); if (!is.good()) { throw Wccl::FileNotFound(filename, "", __FUNCTION__); } retOp = parser_.parseMatchRule(is); if (retOp) { boost::filesystem::path p(filename); rule_names_.push_back(p.stem()); rules_.push_back(retOp); return true; } else { std::cerr << "Problem while parsing -- " << "parser returned NULL!" << std::endl; } } catch (PwrNlp::PwrNlpError& e) { std::cerr << e.scope() << " Error: " << e.info() << std::endl; } return false; } void MatchRunner::apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, boost::shared_ptr<Corpus2::TokenWriter> writer) { Corpus2::TokenTimer& timer = Corpus2::global_timer(); while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) { foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) { boost::shared_ptr<Corpus2::AnnotatedSentence> as; as = Corpus2::AnnotatedSentence::wrap_sentence(s); foreach (const boost::shared_ptr<Wccl::MatchRule>& r, rules_) { r->apply(as); } timer.count_sentence(*as); if (progress) { timer.check_slice(); } writer->write_sentence(*as); } //writer->write_chunk(*c); } if (progress) { timer.stats(); } } void usage(char* name) { std::cerr << "This program runs WCCL match rules.\n"; std::cerr << "Usage " << name << " [OPTIONS] FILES\n" << "Files ending with .xml are treated as corpora, otherwise \n" << "as CCL files. Use - to read corpus from stdin (as with -I)\n"; } int main(int argc, char** argv) { std::string tagset_load = "kipi"; std::string input_format; std::string output_format; options opts; opts.first = false; opts.until_done = false; opts.until_done_iterations = 1000; std::vector<std::string> corpora_files, ccl_files, files; bool corpus_stdin = true; using boost::program_options::value; std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " "); std::string readers_help = "Input format, any of: " + readers + "\n"; std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " "); std::string writers_help = "Output format, any of: " + writers + "\n";; boost::program_options::options_description desc("Allowed options"); desc.add_options() ("tagset,t", value(&tagset_load), "Tagset to use\n") ("corpus,c", value(&corpora_files), "Corpus file to load (XCES), do not load from stdin\n") ("ccl-file,C", value(&ccl_files), "CCL rule files\n") ("files,f", value(&files), "Files to load, looking at the extension to determine type\n") ("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(), "Read corpus from stdin") ("input-format,i", value(&input_format)->default_value("xces"), readers_help.c_str()) ("output-format,o", value(&output_format)->default_value("ccl"), writers_help.c_str()) ("progress,p", value(&progress)->zero_tokens(), "Show progress info") ("quiet,q", value(&quiet)->zero_tokens(), "Suppress messages\n") ("until-done,u", value(&opts.until_done)->zero_tokens(), "Until-done mode\n") ("until-done-iterations", value(&opts.until_done_iterations), "Until-done iteration limit\n") ("first-sentence-only,1", value(&opts.first)->zero_tokens(), "Only process first sentence\n") ("help,h", "Show help") ; boost::program_options::variables_map vm; boost::program_options::positional_options_description p; p.add("files", -1); try { boost::program_options::store( boost::program_options::command_line_parser(argc, argv) .options(desc).positional(p).run(), vm); } catch (boost::program_options::error& e) { std::cerr << e.what() << std::endl; return 2; } boost::program_options::notify(vm); if (vm.count("help")) { usage(argv[0]); std::cout << desc << "\n"; return 1; } foreach (const std::string& f, files) { if (f == "-") { corpus_stdin = true; } else if (boost::algorithm::ends_with(f, ".xml")) { corpora_files.push_back(f); } else { ccl_files.push_back(f); } } // consider stdin only when no corpus files given corpus_stdin = corpus_stdin && corpora_files.empty(); if (ccl_files.empty() || (corpora_files.empty() && !corpus_stdin)) { usage(argv[0]); return 2; } try { const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); MatchRunner runner(tagset); foreach (const std::string& file, ccl_files) { runner.load_more_rules(file); } if (!runner.empty()) { Corpus2::TokenTimer& timer = Corpus2::global_timer(); timer.register_signal_handler(); boost::shared_ptr<Corpus2::TokenWriter> writer; writer = Corpus2::TokenWriter::create_stream_writer(output_format, std::cout, tagset); boost::shared_ptr<Corpus2::TokenReader> reader; foreach (std::string cf, corpora_files) { reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, cf); runner.apply_rules(reader, writer); } if (corpus_stdin) { reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin); runner.apply_rules(reader, writer); } } } catch (PwrNlp::PwrNlpError& e) { std::cerr << e.info() << std::endl; return 2; } return 0; }