diff --git a/wccl-apps/CMakeLists.txt b/wccl-apps/CMakeLists.txt index e07a49df24685820ba8044b67b203b6f8e4e8325..9122f0f0891a14c8f8694780b978b5ba83db3363 100644 --- a/wccl-apps/CMakeLists.txt +++ b/wccl-apps/CMakeLists.txt @@ -27,11 +27,9 @@ add_executable(wccl-rules wccl-rules.cpp) target_link_libraries (wccl-rules wccl ${Boost_LIBRARIES} antlr ${LIBS}) add_executable(wccl-parser wccl-parser.cpp) target_link_libraries (wccl-parser wccl ${Boost_LIBRARIES} antlr ${LIBS}) -add_executable(wccl-match wccl-match.cpp) -target_link_libraries (wccl-match wccl ${Boost_LIBRARIES} antlr ${LIBS}) if(UNIX) - install(TARGETS wccl-features wccl-run wccl-rules wccl-parser wccl-match + install(TARGETS wccl-features wccl-run wccl-rules wccl-parser RUNTIME DESTINATION bin ) endif(UNIX) diff --git a/wccl-apps/wccl-match.cpp b/wccl-apps/wccl-match.cpp deleted file mode 100644 index 68f7efbbaeba720600c41477b49f4ca4635bfad9..0000000000000000000000000000000000000000 --- a/wccl-apps/wccl-match.cpp +++ /dev/null @@ -1,243 +0,0 @@ -#include <cstdlib> -#include <fstream> -#include <iomanip> - -#include <libwccl/values/strset.h> -#include <libwccl/parser/Parser.h> -#include <libcorpus2/tagsetmanager.h> -#include <libcorpus2/util/tokentimer.h> - -#include <boost/bind.hpp> -#include <boost/algorithm/string.hpp> -#include <boost/make_shared.hpp> -#include <boost/filesystem.hpp> -#include <boost/program_options.hpp> -#include <libcorpus2/io/reader.h> -#include <libcorpus2/io/writer.h> - -namespace { - bool quiet = false; - - struct options { - bool first; - bool until_done; - int until_done_iterations; - }; -} - -class MatchRunner -{ -public: - MatchRunner(const Corpus2::Tagset& tagset) - : tagset_(tagset), parser_(tagset_), progress_(false) - { - } - - void use_progress(bool use) { - progress_ = use; - if (use) { - Corpus2::TokenTimer& timer = Corpus2::global_timer(); - timer.register_signal_handler(); - } - } - - bool load_more_rules(const std::string &filename); - - bool 
load_operator_string(const std::string &op_string); - - void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, - boost::shared_ptr<Corpus2::TokenWriter> writer); - - bool empty() { - return rules_.empty(); - } - -private: - const Corpus2::Tagset& tagset_; - Wccl::Parser parser_; - std::vector<std::string> rule_names_; - std::vector<boost::shared_ptr<Wccl::MatchRule> > rules_; - bool progress_; -}; - -bool MatchRunner::load_more_rules(const std::string& filename) -{ - boost::shared_ptr<Wccl::MatchRule> retOp; - try { - std::ifstream is(filename.c_str()); - if (!is.good()) { - throw Wccl::FileNotFound(filename, "", __FUNCTION__); - } - retOp = parser_.parseMatchRule(is); - if (retOp) { - boost::filesystem::path p(filename); - rule_names_.push_back(p.stem()); - rules_.push_back(retOp); - return true; - } else { - std::cerr << "Problem while parsing -- " - << "parser returned NULL!" << std::endl; - } - } catch (PwrNlp::PwrNlpError& e) { - std::cerr << e.scope() << " Error: " << e.info() << std::endl; - } - return false; -} - -void MatchRunner::apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, - boost::shared_ptr<Corpus2::TokenWriter> writer) -{ - Corpus2::TokenTimer& timer = Corpus2::global_timer(); - while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) { - foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) { - boost::shared_ptr<Corpus2::AnnotatedSentence> as; - as = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(s); - if (!as) { - std::cerr << "Did not get an AnnotatedSentence from reader," - "'ann'' option broken?\n"; - return; - } - - foreach (const boost::shared_ptr<Wccl::MatchRule>& r, rules_) { - r->apply(as); - } - - timer.count_sentence(*as); - if (progress_) { - timer.check_slice(); - } - //writer->write_sentence(*as); - } - writer->write_chunk(*c); - } -} - -void usage(char* name) -{ - std::cerr << "This program runs WCCL match rules.\n"; - std::cerr << "Usage " << name << " [OPTIONS] FILES\n" - << 
"Files ending with .xml are treated as corpora, otherwise \n" - << "as CCL files. Use - to read corpus from stdin (as with -I)\n" - << "Note: the ann option is implied on all input formats\n"; -} - -int main(int argc, char** argv) -{ - std::string tagset_load = "kipi"; - std::string input_format; - std::string output_format; - bool progress = false; - options opts; - opts.first = false; - opts.until_done = false; - opts.until_done_iterations = 1000; - std::vector<std::string> corpora_files, ccl_files, files; - bool corpus_stdin = true; - using boost::program_options::value; - - std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " "); - std::string readers_help = "Input format, any of: " + readers + "\n"; - std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " "); - std::string writers_help = "Output format, any of: " + writers + "\n";; - - boost::program_options::options_description desc("Allowed options"); - desc.add_options() - ("tagset,t", value(&tagset_load), - "Tagset to use\n") - ("corpus,c", value(&corpora_files), - "Corpus file to load (XCES), do not load from stdin\n") - ("ccl-file,C", value(&ccl_files), - "CCL rule files\n") - ("files,f", value(&files), - "Files to load, looking at the extension to determine type\n") - ("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(), - "Read corpus from stdin") - ("input-format,i", value(&input_format)->default_value("xces"), - readers_help.c_str()) - ("output-format,o", value(&output_format)->default_value("ccl"), - writers_help.c_str()) - ("progress,p", value(&progress)->zero_tokens(), - "Show progress info") - ("quiet,q", value(&quiet)->zero_tokens(), - "Suppress messages\n") - ("until-done,u", value(&opts.until_done)->zero_tokens(), - "Until-done mode\n") - ("until-done-iterations", value(&opts.until_done_iterations), - "Until-done iteration limit\n") - ("first-sentence-only,1", value(&opts.first)->zero_tokens(), - 
"Only process first sentence\n") - ("help,h", "Show help") - ; - boost::program_options::variables_map vm; - boost::program_options::positional_options_description p; - p.add("files", -1); - - try { - boost::program_options::store( - boost::program_options::command_line_parser(argc, argv) - .options(desc).positional(p).run(), vm); - } catch (boost::program_options::error& e) { - std::cerr << e.what() << std::endl; - return 2; - } - boost::program_options::notify(vm); - - if (vm.count("help")) { - usage(argv[0]); - std::cout << desc << "\n"; - return 1; - } - - foreach (const std::string& f, files) { - if (f == "-") { - corpus_stdin = true; - } else if (boost::algorithm::ends_with(f, ".xml")) { - corpora_files.push_back(f); - } else { - ccl_files.push_back(f); - } - } - - // consider stdin only when no corpus files given - corpus_stdin = corpus_stdin && corpora_files.empty(); - - if (ccl_files.empty() || (corpora_files.empty() && !corpus_stdin)) { - usage(argv[0]); - return 2; - } - - try { - const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); - MatchRunner runner(tagset); - runner.use_progress(progress); - foreach (const std::string& file, ccl_files) { - runner.load_more_rules(file); - } - if (!runner.empty()) { - Corpus2::TokenTimer& timer = Corpus2::global_timer(); - timer.register_signal_handler(); - boost::shared_ptr<Corpus2::TokenWriter> writer; - writer = Corpus2::TokenWriter::create_stream_writer(output_format, std::cout, tagset); - boost::shared_ptr<Corpus2::TokenReader> reader; - foreach (std::string cf, corpora_files) { - reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, cf); - reader->set_option("ann"); - runner.apply_rules(reader, writer); - } - if (corpus_stdin) { - reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin); - reader->set_option("ann"); - runner.apply_rules(reader, writer); - } - if (progress) { - Corpus2::TokenTimer& timer = Corpus2::global_timer(); - timer.stats(); - 
} - } - } catch (PwrNlp::PwrNlpError& e) { - std::cerr << e.info() << std::endl; - return 2; - } - - return 0; -} diff --git a/wccl-apps/wccl-rules.cpp b/wccl-apps/wccl-rules.cpp index e357f72415c650a853368d972ea7dd04abcaf6be..13f595cd0e53ab8baa3c63f26088e262cb565e25 100644 --- a/wccl-apps/wccl-rules.cpp +++ b/wccl-apps/wccl-rules.cpp @@ -2,79 +2,109 @@ #include <fstream> #include <iomanip> - #include <libwccl/values/strset.h> #include <libwccl/parser/Parser.h> -#include <libwccl/ops/tagrulesequence.h> #include <libcorpus2/tagsetmanager.h> #include <libcorpus2/util/tokentimer.h> - #include <boost/bind.hpp> #include <boost/algorithm/string.hpp> #include <boost/make_shared.hpp> +#include <boost/filesystem.hpp> #include <boost/program_options.hpp> -#include <libcorpus2/io/xcesreader.h> -#include <libcorpus2/io/xceswriter.h> - -#include <antlr/NoViableAltException.hpp> -#include <antlr/MismatchedTokenException.hpp> +#include <libcorpus2/io/reader.h> +#include <libcorpus2/io/writer.h> namespace { bool quiet = false; - bool progress = false; struct options { bool first; - bool until_done; - int until_done_iterations; }; } -bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::TagRuleSequence& rules) +class RuleRunner +{ +public: + RuleRunner(const Corpus2::Tagset& tagset) + : tagset_(tagset), parser_(tagset_), progress_(false) + , tag_rule_iterations_(0), total_match_rules_(0), total_tag_rules_(0) + { + } + + void use_progress(bool use) { + progress_ = use; + if (use) { + Corpus2::TokenTimer& timer = Corpus2::global_timer(); + timer.register_signal_handler(); + } + } + + void set_tag_rule_iterations(int i) { + tag_rule_iterations_ = i; + } + + std::pair<int,int> load_more_rules(const std::string &filename); + + void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, + boost::shared_ptr<Corpus2::TokenWriter> writer); + + bool empty() const { + return size() == 0; /* true only when no tag or match rules have been loaded */ + } + + size_t size() const { + return total_match_rules_ + total_tag_rules_;
+ } + + size_t total_match_rules() const { + return total_match_rules_; + } + + size_t total_tag_rules() const { + return total_tag_rules_; + } + +private: + const Corpus2::Tagset& tagset_; + Wccl::Parser parser_; + std::vector<std::string> file_names_; + std::vector<boost::shared_ptr<Wccl::WcclFile> > parsed_files_; + bool progress_; + int tag_rule_iterations_; + size_t total_match_rules_, total_tag_rules_; +}; + +std::pair<int,int> RuleRunner::load_more_rules(const std::string& filename) { - boost::shared_ptr<Wccl::TagRuleSequence> ret; + boost::shared_ptr<Wccl::WcclFile> parsed_file; try { std::ifstream is(filename.c_str()); if (!is.good()) { throw Wccl::FileNotFound(filename, "", __FUNCTION__); } - - ret = parser.parseTagRuleSequence(is); - if (ret) { - if (!quiet) { - std::cerr << "Loaded " << ret->size() << " rule(s) from " - << filename << "\n"; - } - std::copy(ret->begin(), ret->end(), std::back_inserter(rules)); - return true; + parsed_file = parser_.parseWcclFile(is, "."); + if (parsed_file) { + boost::filesystem::path p(filename); + file_names_.push_back(p.stem()); + size_t match_rules = parsed_file->get_match_rules().size(); + size_t tag_rules = parsed_file->get_tag_rules().size(); + total_match_rules_ += match_rules; + total_tag_rules_ += tag_rules; + parsed_files_.push_back(parsed_file); + return std::make_pair(tag_rules, match_rules); } else { std::cerr << "Problem while parsing -- " << "parser returned NULL!" 
<< std::endl; } - } catch (antlr::MismatchedTokenException &e) { - std::cerr << e.getFileLineColumnString() - << " " << e.getMessage() << std::endl; - } catch(antlr::NoViableAltException &e) { - std::cerr << e.getFileLineColumnString() - << " " << e.getMessage() << std::endl; - } catch (Wccl::InvalidVariableName &e) { - std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl; - } catch (Wccl::VariableTypeMismatch &e) { - std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl; - } catch (Wccl::WcclError& e) { - std::cerr << "Wccl::WcclError:" << e.info() << std::endl; } catch (PwrNlp::PwrNlpError& e) { - std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl; - } catch (antlr::ANTLRException& e) { - std::cerr << "Antlr error " << e.getMessage() << std::endl; + std::cerr << e.scope() << " Error: " << e.info() << std::endl; } - return false; + return std::make_pair(0,0); } -void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, - boost::shared_ptr<Corpus2::TokenWriter> writer, Wccl::TagRuleSequence& rules, - const options& opts) +void RuleRunner::apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, + boost::shared_ptr<Corpus2::TokenWriter> writer) { Corpus2::TokenTimer& timer = Corpus2::global_timer(); while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) { @@ -86,45 +116,47 @@ void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, "'ann'' option broken?\n"; return; } - if (opts.until_done) { - rules.execute_until_done(as, opts.until_done_iterations); - } else { - rules.execute_once(as); + + foreach (boost::shared_ptr<Wccl::WcclFile>& f, parsed_files_) { + if (tag_rule_iterations_ == 0) { + f->get_tag_rules_ptr()->execute_once(as); + } else if (tag_rule_iterations_ < 0) { + f->get_tag_rules_ptr()->execute_until_done(as); + } else { + f->get_tag_rules_ptr()->execute_until_done(as, tag_rule_iterations_); + } + foreach (const boost::shared_ptr<Wccl::MatchRule>& mr, f->get_match_rules()) { + 
mr->apply(as); + } } + timer.count_sentence(*as); - if (progress) { + if (progress_) { timer.check_slice(); } - if (opts.first) break; //writer->write_sentence(*as); } writer->write_chunk(*c); - if (opts.first) break; - } - if (progress) { - timer.stats(); } } void usage(char* name) { - std::cerr << "This program runs WCCL disambiguation rules.\n"; + std::cerr << "This program runs WCCL match and/or tag rules. Tag rules are applied first.\n"; std::cerr << "Usage " << name << " [OPTIONS] FILES\n" - << "Files ending with .xml are treated as corpora, otherwise \n" - << "as CCL files. Use - to read corpus from stdin (as with -I)\n" - << "Note: the ann option is implied on all input formats\n"; + << "Files ending with .xml are treated as corpora, otherwise " + << "as WCCL files. Use - to read corpus from stdin (as with -I)\n" + << "Note: the ,ann option is implied on all input formats\n"; } - int main(int argc, char** argv) { std::string tagset_load = "kipi"; std::string input_format; std::string output_format; + bool progress = false; options opts; opts.first = false; - opts.until_done = false; - opts.until_done_iterations = 1000; std::vector<std::string> corpora_files, ccl_files, files; bool corpus_stdin = true; using boost::program_options::value; @@ -145,19 +177,17 @@ int main(int argc, char** argv) ("files,f", value(&files), "Files to load, looking at the extension to determine type\n") ("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(), - "Read corpus from stdin") + "Read corpus from stdin (requires that no corpora filenames are passed)") ("input-format,i", value(&input_format)->default_value("xces"), readers_help.c_str()) - ("output-format,o", value(&output_format)->default_value("xces"), + ("output-format,o", value(&output_format)->default_value("ccl"), writers_help.c_str()) ("progress,p", value(&progress)->zero_tokens(), "Show progress info") ("quiet,q", value(&quiet)->zero_tokens(), "Suppress messages\n") - ("until-done,u", 
value(&opts.until_done)->zero_tokens(), - "Until-done mode\n") - ("until-done-iterations", value(&opts.until_done_iterations), - "Until-done iteration limit\n") + ("until-done-iterations,u", value<int>()->implicit_value(1000), + "Until-done iteration limit, no arg for default limit(1000)\n") ("first-sentence-only,1", value(&opts.first)->zero_tokens(), "Only process first sentence\n") ("help,h", "Show help") @@ -202,32 +232,31 @@ int main(int argc, char** argv) try { const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); - Wccl::Parser parser(tagset); - Wccl::TagRuleSequence rules; - foreach (const std::string& f, ccl_files) { - size_t sz = rules.size(); - if (!load_more_rules(parser, f, rules)) { - std::cerr << "Warning: error while parsing " << f << "\n"; - } - if (rules.size() == sz) { - std::cerr << "Warning: no rules loaded from " << f << "\n"; + RuleRunner runner(tagset); + if (vm.count("until-done-iterations")) runner.set_tag_rule_iterations(vm["until-done-iterations"].as<int>()); /* option has no default value: as<int>() on an absent entry throws boost::bad_any_cast, so keep the runner's initial 0 (execute once) when -u is not given */ + runner.use_progress(progress); + foreach (const std::string& file, ccl_files) { + std::pair<int,int> res = runner.load_more_rules(file); + if (res.first == 0 && res.second == 0) { + std::cerr << "Warning: no rules loaded from " << file << "\n"; + } else if (!quiet) { + std::cerr << "Loaded " << res.first << " tag rule(s) and " + << res.second << " match rule(s) from " << file << "\n"; } } - if (!rules.empty()) { - Corpus2::TokenTimer& timer = Corpus2::global_timer(); - timer.register_signal_handler(); + if (!runner.empty()) { boost::shared_ptr<Corpus2::TokenWriter> writer; writer = Corpus2::TokenWriter::create_stream_writer(output_format, std::cout, tagset); boost::shared_ptr<Corpus2::TokenReader> reader; - foreach (const std::string& f, corpora_files) { - reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, f); + foreach (std::string cf, corpora_files) { + reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, cf); reader->set_option("ann"); - apply_rules(reader,
writer, rules, opts); + runner.apply_rules(reader, writer); } if (corpus_stdin) { reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin); reader->set_option("ann"); - apply_rules(reader, writer, rules, opts); + runner.apply_rules(reader, writer); } if (progress) { Corpus2::TokenTimer& timer = Corpus2::global_timer();