diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d1aa505da8a55e96fe607720b9085cc5b0518cc..27b8ca0ee5a9cb186f38817132a2a1307a6679a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,4 +60,5 @@ endif(MSVC OR BORLAND)
 add_subdirectory(libwccl)
 add_subdirectory(wcclparser)
 add_subdirectory(wcclrun)
+add_subdirectory(wcclrules)
 add_subdirectory(tests)
diff --git a/wcclrules/CMakeLists.txt b/wcclrules/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..46eb5096a4dd68dad81531ef1f59754214bacb90
--- /dev/null
+++ b/wcclrules/CMakeLists.txt
@@ -0,0 +1,26 @@
+PROJECT( wcclrules )
+
+find_package(LibXML++ REQUIRED)
+include_directories(${LibXML++_INCLUDE_DIRS})
+link_directories(${LibXML++_LIBRARY_DIRS})
+set(LIBS ${LIBS} ${LibXML++_LIBRARIES})
+
+include_directories( ${CMAKE_SOURCE_DIR} )
+
+# link_directories() only affects targets created after the call, so the
+# Boost directories have to be registered before add_executable().
+include_directories(${Boost_INCLUDE_DIR})
+link_directories(${Boost_LIBRARY_DIRS})
+
+add_definitions(-DLIBWCCL_WCCLRUN_DATA_DIR="${PROJECT_SOURCE_DIR}/")
+
+add_executable(wcclrules
+    main.cpp
+)
+target_link_libraries (wcclrules wccl ${Boost_LIBRARIES} antlr ${LIBS})
+
+if(UNIX)
+    install(TARGETS wcclrules
+        RUNTIME DESTINATION bin
+    )
+endif(UNIX)
diff --git a/wcclrules/main.cpp b/wcclrules/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..da88578e7b00b24529041fb04cfbe54cc3f57cb2
--- /dev/null
+++ b/wcclrules/main.cpp
@@ -0,0 +1,229 @@
+#include <algorithm>
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <vector>
+
+#include <libwccl/values/strset.h>
+#include <libwccl/parser/Parser.h>
+#include <libwccl/ops/rulesequence.h>
+#include <libcorpus2/tagsetmanager.h>
+
+#include <boost/bind.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/make_shared.hpp>
+#include <boost/program_options.hpp>
+#include <libcorpus2/io/xcesreader.h>
+#include <libcorpus2/io/xceswriter.h>
+
+#include <antlr/NoViableAltException.hpp>
+#include <antlr/MismatchedTokenException.hpp>
+
+namespace {
+    // Set by -q/--quiet: suppresses informational (non-error) output.
+    bool quiet = false;
+
+    // Sentence-processing options collected from the command line.
+    struct options {
+        bool first;                // stop after the first sentence of a stream
+        bool until_done;           // use execute_until_done instead of execute_once
+        int until_done_iterations; // iteration limit passed to execute_until_done
+    };
+}
+
+/**
+ * Parse the rule file `filename` with `parser` and append the resulting
+ * rules to `rules`.
+ *
+ * The exception types caught below are reported on stderr and turned into a
+ * `false` return value.
+ *
+ * @return true if the parser returned a rule sequence (its rules were
+ *         appended to `rules`), false otherwise.
+ */
+bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::RuleSequence& rules)
+{
+    boost::shared_ptr<Wccl::RuleSequence> ret;
+    try {
+        std::ifstream is(filename.c_str());
+        if (!is.good()) {
+            throw Wccl::FileNotFound(filename, "", __FUNCTION__);
+        }
+
+        ret = parser.parseRuleSequence(is);
+        if (ret) {
+            // Number of rules parsed from this file; informational only.
+            if (!quiet) {
+                std::cerr << ret->size() << "\n";
+            }
+            std::copy(ret->begin(), ret->end(), std::back_inserter(rules));
+            return true;
+        } else {
+            std::cerr << "Problem while parsing -- "
+                << "parser returned NULL!" << std::endl;
+        }
+    } catch (antlr::MismatchedTokenException &e) {
+        std::cerr << e.getFileLineColumnString()
+                << " " << e.getMessage() << std::endl;
+    } catch(antlr::NoViableAltException &e) {
+        std::cerr << e.getFileLineColumnString()
+                << " " << e.getMessage() << std::endl;
+    } catch (Wccl::InvalidVariableName &e) {
+        std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl;
+    } catch (Wccl::VariableTypeMismatch &e) {
+        std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl;
+    } catch (Wccl::WcclError& e) {
+        std::cerr << "Wccl::WcclError:" << e.info() << std::endl;
+    } catch (PwrNlp::PwrNlpError& e) {
+        std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl;
+    } catch (antlr::ANTLRException& e) {
+        std::cerr << "Antlr error " << e.getMessage() << std::endl;
+    }
+    return false;
+}
+
+/**
+ * Read XCES sentences from `is`, run `rules` on each of them and write the
+ * result through `writer`.
+ *
+ * Rules are executed once per sentence, or via execute_until_done with
+ * opts.until_done_iterations as the limit when opts.until_done is set.
+ * When opts.first is set, processing stops after the first sentence.
+ */
+void do_stream(boost::shared_ptr<Corpus2::TokenWriter> writer, const Corpus2::Tagset& tagset, Wccl::RuleSequence& rules,
+        std::istream& is, const options& opts)
+{
+    Corpus2::XcesReader xr(tagset, is);
+    Corpus2::Sentence::Ptr s;
+    while ((s = xr.get_next_sentence())) {
+        if (opts.until_done) {
+            rules.execute_until_done(s, opts.until_done_iterations);
+        } else {
+            rules.execute_once(s);
+        }
+        writer->write_sentence(*s);
+        if (opts.first) break;
+    }
+}
+
+
+/**
+ * Entry point: load CCL rule files, apply them to the given corpora (or to
+ * stdin when no corpus file is given) and write the result to stdout.
+ *
+ * Returns 0 on success, 1 after printing help and 2 on option-parsing or
+ * PwrNlp errors.
+ */
+int main(int argc, char** argv)
+{
+    std::string tagset_load = "kipi";
+    std::string output_format;
+    options opts;
+    opts.first = false;
+    opts.until_done = false;
+    opts.until_done_iterations = 1000;
+    std::vector<std::string> corpora_files, ccl_files, files;
+    // Set explicitly by a "-" argument; forced on below if no corpus file is given.
+    bool corpus_stdin = false;
+    using boost::program_options::value;
+
+    std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " ");
+    std::string writers_help = "Output format, any of: " + writers + "\n";
+
+    boost::program_options::options_description desc("Allowed options");
+    desc.add_options()
+            ("tagset,t", value(&tagset_load),
+             "Tagset to use\n")
+            ("corpus,c", value(&corpora_files),
+             "Corpus file to load (XCES), do not load from stdin\n")
+            ("ccl-file,C", value(&ccl_files),
+             "CCL rule files\n")
+            ("files,f", value(&files),
+             "Files to load, looking at the extension to determine type\n")
+            ("output-format,o", value(&output_format)->default_value("xces"),
+             writers_help.c_str())
+            ("quiet,q", value(&quiet)->zero_tokens(),
+             "Suppress messages\n")
+            ("until-done,u", value(&opts.until_done)->zero_tokens(),
+             "Until-done mode\n")
+            ("until-done-iterations", value(&opts.until_done_iterations),
+             "Until-done iteration limit\n")
+            ("first-sentence-only,1", value(&opts.first)->zero_tokens(),
+             "Only process first sentence\n")
+            ("help,h", "Show help")
+            ;
+    boost::program_options::variables_map vm;
+    boost::program_options::positional_options_description p;
+    p.add("files", -1);
+
+    try {
+        boost::program_options::store(
+            boost::program_options::command_line_parser(argc, argv)
+            .options(desc).positional(p).run(), vm);
+        // notify() can report option errors as well, so keep it guarded.
+        boost::program_options::notify(vm);
+    } catch (boost::program_options::error& e) {
+        std::cerr << e.what() << std::endl;
+        return 2;
+    }
+
+    if (vm.count("help")) {
+        std::cout << "Usage " << argv[0] << " [OPTIONS] FILES\n"
+            << "Files ending with .xml are treated as corpora, otherwise \n"
+            << "as CCL files. Use - to read corpus from stdin\n";
+        std::cout << desc << "\n";
+        return 1;
+    }
+
+    foreach (const std::string& f, files) {
+        if (f == "-") {
+            corpus_stdin = true;
+        } else if (boost::algorithm::ends_with(f, ".xml")) {
+            corpora_files.push_back(f);
+        } else {
+            ccl_files.push_back(f);
+        }
+    }
+    // Without any corpus file (positional or -c) the corpus comes from stdin.
+    if (corpora_files.empty()) {
+        corpus_stdin = true;
+    }
+
+    try {
+        const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
+        Wccl::Parser parser(tagset);
+        Wccl::RuleSequence rules;
+        foreach (const std::string& f, ccl_files) {
+            size_t sz = rules.size();
+            if (!load_more_rules(parser, f, rules)) {
+                std::cerr << "Warning: error while parsing " << f << "\n";
+            }
+            if (rules.size() == sz) {
+                std::cerr << "Warning: no rules loaded from " << f << "\n";
+            }
+        }
+        if (!rules.empty()) {
+            boost::shared_ptr<Corpus2::TokenWriter> writer;
+            writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset));
+            foreach (const std::string& f, corpora_files) {
+                std::ifstream ifs(f.c_str());
+                if (ifs.good()) {
+                    do_stream(writer, tagset, rules, ifs, opts);
+                } else {
+                    std::cerr << "Error reading corpus from " << f << "\n";
+                }
+            }
+            if (corpus_stdin) {
+                do_stream(writer, tagset, rules, std::cin, opts);
+            }
+        }
+    } catch (PwrNlp::PwrNlpError& e) {
+        std::cerr << e.info() << std::endl;
+        return 2;
+    }
+
+    return 0;
+}