#include <cstdlib> #include <fstream> #include <iomanip> #include <libwccl/values/strset.h> #include <libwccl/parser/Parser.h> #include <libcorpus2/tagsetmanager.h> #include <libcorpus2/util/tokentimer.h> #include <boost/bind.hpp> #include <boost/algorithm/string.hpp> #include <boost/make_shared.hpp> #include <boost/filesystem.hpp> #include <boost/program_options.hpp> #include <libcorpus2/io/reader.h> #include <libcorpus2/io/writer.h> namespace { bool quiet = false; struct options { bool first; }; } class RuleRunner { public: RuleRunner(const Corpus2::Tagset& tagset) : tagset_(tagset), parser_(tagset_), progress_(false), search_path_(".") , tag_rule_iterations_(0), total_match_rules_(0), total_tag_rules_(0) { } void use_progress(bool use) { progress_ = use; if (use) { Corpus2::TokenTimer& timer = Corpus2::global_timer(); timer.register_signal_handler(); } } void set_tag_rule_iterations(int i) { tag_rule_iterations_ = i; } std::pair<int,int> load_more_rules(const std::string &filename); void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, boost::shared_ptr<Corpus2::TokenWriter> writer); bool empty() const { return size() > 0; } size_t size() const { return total_match_rules_ + total_tag_rules_; } size_t total_match_rules() const { return total_match_rules_; } size_t total_tag_rules() const { return total_tag_rules_; } void set_search_path(const std::string& path) { search_path_ = path; } private: const Corpus2::Tagset& tagset_; Wccl::Parser parser_; std::vector<std::string> file_names_; std::vector<boost::shared_ptr<Wccl::WcclFile> > parsed_files_; bool progress_; std::string search_path_; int tag_rule_iterations_; size_t total_match_rules_, total_tag_rules_; }; std::pair<int,int> RuleRunner::load_more_rules(const std::string& filename) { boost::shared_ptr<Wccl::WcclFile> parsed_file; try { std::ifstream is(filename.c_str()); if (!is.good()) { throw Wccl::FileNotFound(filename, "", __FUNCTION__); } parsed_file = parser_.parseWcclFile(is, search_path_); if (parsed_file) { boost::filesystem::path p(filename); file_names_.push_back(p.stem()); size_t match_rules = 0, tag_rules = 0; if (parsed_file->has_tag_rules()) { tag_rules = parsed_file->get_tag_rules().size(); } if (parsed_file->has_match_rules()) { match_rules = parsed_file->get_match_rules().size(); } total_tag_rules_ += tag_rules; total_match_rules_ += match_rules; parsed_files_.push_back(parsed_file); return std::make_pair(tag_rules, match_rules); } else { std::cerr << "Problem while parsing -- " << "parser returned NULL!" << std::endl; } } catch (PwrNlp::PwrNlpError& e) { std::cerr << e.scope() << " Error: " << e.info() << std::endl; } return std::make_pair(0,0); } void RuleRunner::apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, boost::shared_ptr<Corpus2::TokenWriter> writer) { Corpus2::TokenTimer& timer = Corpus2::global_timer(); while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) { foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) { boost::shared_ptr<Corpus2::AnnotatedSentence> as; as = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(s); if (!as) { std::cerr << "Did not get an AnnotatedSentence from reader," "'ann'' option broken?\n"; return; } foreach (boost::shared_ptr<Wccl::WcclFile>& f, parsed_files_) { if (tag_rule_iterations_ == 0) { f->get_tag_rules_ptr()->execute_once(as); } else if (tag_rule_iterations_ < 0) { f->get_tag_rules_ptr()->execute_until_done(as); } else { f->get_tag_rules_ptr()->execute_until_done(as, tag_rule_iterations_); } f->get_match_rules_ptr()->apply_all(as); } timer.count_sentence(*as); if (progress_) { timer.check_slice(); } //writer->write_sentence(*as); } writer->write_chunk(*c); } } void usage(char* name) { std::cerr << "This program runs WCCL match and/or tag rules. Tag rules are applied first.\n"; std::cerr << "Usage " << name << " [OPTIONS] FILES\n" << "Files ending with .xml are treated as corpora, otherwise " << "as WCCL files. Use - to read corpus from stdin (as with -I)\n" << "Note: the ,ann option is implied on all input formats\n"; } int main(int argc, char** argv) { std::string tagset_load = "kipi"; std::string input_format; std::string output_format; std::string search_path; bool progress = false; options opts; opts.first = false; std::vector<std::string> corpora_files, ccl_files, files; bool corpus_stdin = true; using boost::program_options::value; std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " "); std::string readers_help = "Input format, any of: " + readers + "\n"; std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " "); std::string writers_help = "Output format, any of: " + writers + "\n";; boost::program_options::options_description desc("Allowed options"); desc.add_options() ("tagset,t", value(&tagset_load), "Tagset to use\n") ("corpus,c", value(&corpora_files), "Corpus file to load (XCES), do not load from stdin\n") ("ccl-file,C", value(&ccl_files), "CCL rule files\n") ("files,f", value(&files), "Files to load, looking at the extension to determine type\n") ("search-path,P", value(&search_path), "WCCL resources (lexicons) search path") ("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(), "Read corpus from stdin (requires that no corpora filenames are passed)") ("input-format,i", value(&input_format)->default_value("xces"), readers_help.c_str()) ("output-format,o", value(&output_format)->default_value("ccl"), writers_help.c_str()) ("progress,p", value(&progress)->zero_tokens(), "Show progress info") ("quiet,q", value(&quiet)->zero_tokens(), "Suppress messages\n") ("until-done-iterations,u", value<int>()->implicit_value(1000), "Until-done iteration limit, no arg for default limit(1000)\n") ("first-sentence-only,1", value(&opts.first)->zero_tokens(), "Only process first sentence\n") ("help,h", "Show help") ; boost::program_options::variables_map vm; boost::program_options::positional_options_description p; p.add("files", -1); try { boost::program_options::store( boost::program_options::command_line_parser(argc, argv) .options(desc).positional(p).run(), vm); } catch (boost::program_options::error& e) { std::cerr << e.what() << std::endl; return 2; } boost::program_options::notify(vm); if (vm.count("help")) { usage(argv[0]); std::cout << desc << "\n"; return 1; } foreach (const std::string& f, files) { if (f == "-") { corpus_stdin = true; } else if (boost::algorithm::ends_with(f, ".xml")) { corpora_files.push_back(f); } else { ccl_files.push_back(f); } } // consider stdin only when no corpus files given corpus_stdin = corpus_stdin && corpora_files.empty(); if (ccl_files.empty() || (corpora_files.empty() && !corpus_stdin)) { usage(argv[0]); return 2; } try { const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); RuleRunner runner(tagset); if (vm.count("until-done-iterations")) { runner.set_tag_rule_iterations(vm["until-done-iterations"].as<int>()); } runner.use_progress(progress); if (!search_path.empty()) { runner.set_search_path(search_path); } foreach (const std::string& file, ccl_files) { std::pair<int,int> res = runner.load_more_rules(file); if (res.first == 0 && res.second == 0) { std::cerr << "Warning: no rules loaded from " << file << "\n"; } else if (!quiet) { std::cerr << "Loaded " << res.first << " tag rule(s) and " << res.second << " match rule(s) from " << file << "\n"; } } if (!runner.empty()) { boost::shared_ptr<Corpus2::TokenWriter> writer; writer = Corpus2::TokenWriter::create_stream_writer(output_format, std::cout, tagset); boost::shared_ptr<Corpus2::TokenReader> reader; foreach (std::string cf, corpora_files) { reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, cf); reader->set_option("ann"); runner.apply_rules(reader, writer); } if (corpus_stdin) { reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin); reader->set_option("ann"); runner.apply_rules(reader, writer); } if (progress) { Corpus2::TokenTimer& timer = Corpus2::global_timer(); timer.stats(); } } } catch (PwrNlp::PwrNlpError& e) { std::cerr << e.info() << std::endl; return 2; } return 0; }