diff --git a/wccl-apps/CMakeLists.txt b/wccl-apps/CMakeLists.txt
index fc167e0980e3d5a36bb217ca0f259b6c3d2003cd..b8b95da6852fbd08e959ee63d8a9e98eae8fb9d5 100644
--- a/wccl-apps/CMakeLists.txt
+++ b/wccl-apps/CMakeLists.txt
@@ -18,15 +18,17 @@ link_directories(${Boost_LIBRARY_DIRS})
 
 add_executable(wccl-features wccl-features.cpp)
 target_link_libraries (wccl-features wccl ${Boost_LIBRARIES} antlr ${LIBS})
-add_executable(wccl-run wccl-run.cpp)
+add_executable(wccl-run wccl-run.cpp stdopts.cpp)
 target_link_libraries (wccl-run wccl ${Boost_LIBRARIES} antlr ${LIBS})
 add_executable(wccl-rules wccl-rules.cpp)
 target_link_libraries (wccl-rules wccl ${Boost_LIBRARIES} antlr ${LIBS})
 add_executable(wccl-parser wccl-parser.cpp)
 target_link_libraries (wccl-parser wccl ${Boost_LIBRARIES} antlr ${LIBS})
+add_executable(wccl-match wccl-match.cpp)
+target_link_libraries (wccl-match wccl ${Boost_LIBRARIES} antlr ${LIBS})
 
 if(UNIX)
-	install(TARGETS wccl-features wccl-run wccl-rules wccl-parser
+	install(TARGETS wccl-features wccl-run wccl-rules wccl-parser wccl-match
 	RUNTIME DESTINATION bin
 	)
 endif(UNIX)
diff --git a/wccl-apps/stdopts.cpp b/wccl-apps/stdopts.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/wccl-apps/stdopts.h b/wccl-apps/stdopts.h
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/wccl-apps/wccl-match.cpp b/wccl-apps/wccl-match.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6924dcd5d620910d7e96f1f8572d1a33f090102b
--- /dev/null
+++ b/wccl-apps/wccl-match.cpp
@@ -0,0 +1,244 @@
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <libwccl/values/strset.h>
+#include <libwccl/parser/Parser.h>
+#include <libwccl/ops/rulesequence.h>
+#include <libcorpus2/tagsetmanager.h>
+#include <libcorpus2/util/tokentimer.h>
+
+#include <boost/bind.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/make_shared.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/program_options.hpp>
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/writer.h>
+
+namespace {
+	/// Set by --quiet; parsed but not read anywhere else in this file.
+	bool quiet = false;
+	/// Set by --progress; enables timer slice checks and final stats.
+	bool progress = false;
+
+	/// Run-mode switches filled in from the command line.
+	/// NOTE(review): nothing in this file reads them after parsing.
+	struct options {
+		bool first;
+		bool until_done;
+		int until_done_iterations;
+	};
+}
+
+/// Loads match rules through Wccl::Parser and streams corpora past them.
+class MatchRunner
+{
+public:
+	explicit MatchRunner(const Corpus2::Tagset& tagset)
+	 : tagset_(tagset), parser_(tagset_)
+	{
+	}
+
+	/// Parse one match rule from a file; true if a rule was added.
+	bool load_more_rules(const std::string &filename);
+
+	/// NOTE(review): declared only, no definition in this file.
+	bool load_operator_string(const std::string &op_string);
+
+	/// Read all chunks from reader, writing every sentence to writer.
+	void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
+		boost::shared_ptr<Corpus2::TokenWriter> writer);
+
+	/// True when no rule has been loaded.
+	bool empty() const {
+		return rules_.empty();
+	}
+
+private:
+	const Corpus2::Tagset& tagset_;
+	Wccl::Parser parser_;
+	std::vector<std::string> rule_names_;
+	std::vector<boost::shared_ptr<Wccl::ApplyOperator> > rules_;
+};
+
+/**
+ * Parse a single match rule from filename and append it to rules_,
+ * storing the file's stem as the rule name. A PwrNlpError raised while
+ * opening or parsing the file is reported on stderr.
+ * @return true if a rule was loaded, false otherwise
+ */
+bool MatchRunner::load_more_rules(const std::string& filename)
+{
+	boost::shared_ptr<Wccl::ApplyOperator> retOp;
+	try {
+		std::ifstream is(filename.c_str());
+		if (!is.good()) {
+			throw Wccl::FileNotFound(filename, "", __FUNCTION__);
+		}
+		retOp = parser_.parseMatchRule(is);
+		if (retOp) {
+			boost::filesystem::path p(filename);
+			rule_names_.push_back(p.stem());
+			rules_.push_back(retOp);
+			return true;
+		} else {
+			std::cerr << "Problem while parsing -- "
+				<< "parser returned NULL!" << std::endl;
+		}
+	} catch (PwrNlp::PwrNlpError& e) {
+		std::cerr << e.scope() << " Error: " << e.info() << std::endl;
+	}
+	return false;
+}
+
+/**
+ * Pull chunks from reader until it runs dry and write each sentence to
+ * writer, feeding the global token timer along the way.
+ * TODO: rule execution is still commented out, so no rule is applied.
+ */
+void MatchRunner::apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
+	boost::shared_ptr<Corpus2::TokenWriter> writer)
+{
+	Corpus2::TokenTimer& timer = Corpus2::global_timer();
+	while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) {
+		foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) {
+			boost::shared_ptr<Corpus2::AnnotatedSentence> as;
+			as = Corpus2::AnnotatedSentence::wrap_sentence(s);
+
+			foreach (const boost::shared_ptr<Wccl::ApplyOperator>& r, rules_) {
+				//r->execute();
+			}
+
+			timer.count_sentence(*as);
+			if (progress) {
+				timer.check_slice();
+			}
+			writer->write_sentence(*as);
+		}
+		//writer->write_chunk(*c);
+	}
+	if (progress) {
+		timer.stats();
+	}
+}
+
+/**
+ * Parse the command line, load match rules from the CCL files and run
+ * every named corpus (or stdin when none is named) through MatchRunner.
+ * @return 0 on success, 1 after printing help, 2 on error
+ */
+int main(int argc, char** argv)
+{
+	std::string tagset_load = "kipi";
+	std::string input_format;
+	std::string output_format;
+	options opts;
+	opts.first = false;
+	opts.until_done = false;
+	opts.until_done_iterations = 1000;
+	std::vector<std::string> corpora_files, ccl_files, files;
+	bool corpus_stdin = false;
+	using boost::program_options::value;
+
+	std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " ");
+	std::string readers_help = "Input format, any of: " + readers + "\n";
+	std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " ");
+	std::string writers_help = "Output format, any of: " + writers + "\n";
+
+	boost::program_options::options_description desc("Allowed options");
+	desc.add_options()
+			("tagset,t", value(&tagset_load),
+			 "Tagset to use\n")
+			("corpus,c", value(&corpora_files),
+			 "Corpus file to load (XCES), do not load from stdin\n")
+			("ccl-file,C", value(&ccl_files),
+			 "CCL rule files\n")
+			("files,f", value(&files),
+			 "Files to load, looking at the extension to determine type\n")
+			("input-format,i", value(&input_format)->default_value("xces"),
+			 readers_help.c_str())
+			("output-format,o", value(&output_format)->default_value("xces"),
+			 writers_help.c_str())
+			("progress,p", value(&progress)->zero_tokens(),
+			 "Show progress info")
+			("quiet,q", value(&quiet)->zero_tokens(),
+			 "Suppress messages\n")
+			("until-done,u", value(&opts.until_done)->zero_tokens(),
+			 "Until-done mode\n")
+			("until-done-iterations", value(&opts.until_done_iterations),
+			 "Until-done iteration limit\n")
+			("first-sentence-only,1", value(&opts.first)->zero_tokens(),
+			 "Only process first sentence\n")
+			("help,h", "Show help")
+			;
+	boost::program_options::variables_map vm;
+	boost::program_options::positional_options_description p;
+	p.add("files", -1);
+
+	try {
+		boost::program_options::store(
+			boost::program_options::command_line_parser(argc, argv)
+			.options(desc).positional(p).run(), vm);
+		// notify() can throw too, so keep it under the same handler.
+		boost::program_options::notify(vm);
+	} catch (const boost::program_options::error& e) {
+		std::cerr << e.what() << std::endl;
+		return 2;
+	}
+
+	if (vm.count("help")) {
+		std::cout << "Usage " << argv[0] << " [OPTIONS] FILES\n"
+			<< "Files ending with .xml are treated as corpora, otherwise \n"
+			<< "as CCL files. Use - to read corpus from stdin\n";
+		std::cout << desc << "\n";
+		return 1;
+	}
+
+	foreach (const std::string& f, files) {
+		if (f == "-") {
+			corpus_stdin = true;
+		} else if (boost::algorithm::ends_with(f, ".xml")) {
+			corpora_files.push_back(f);
+		} else {
+			ccl_files.push_back(f);
+		}
+	}
+	// No corpus file named anywhere: fall back to stdin.
+	if (corpora_files.empty()) {
+		corpus_stdin = true;
+	}
+
+	try {
+		const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
+		MatchRunner runner(tagset);
+		foreach (const std::string& file, ccl_files) {
+			runner.load_more_rules(file);
+		}
+		if (runner.empty()) {
+			std::cerr << "No match rules loaded, nothing to do" << std::endl;
+			return 2;
+		}
+		Corpus2::TokenTimer& timer = Corpus2::global_timer();
+		timer.register_signal_handler();
+		boost::shared_ptr<Corpus2::TokenWriter> writer;
+		writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset));
+		if (corpus_stdin) {
+			runner.apply_rules(Corpus2::TokenReader::create_stream_reader(
+				input_format, tagset, std::cin), writer);
+		}
+		// The corpus comes from the corpus files, never from a rule file.
+		foreach (const std::string& corpus, corpora_files) {
+			runner.apply_rules(Corpus2::TokenReader::create_path_reader(
+				input_format, tagset, corpus), writer);
+		}
+	} catch (PwrNlp::PwrNlpError& e) {
+		std::cerr << e.info() << std::endl;
+		return 2;
+	}
+
+	return 0;
+}