Skip to content
Snippets Groups Projects
Commit a3d5ae74 authored by ilor's avatar ilor
Browse files

rough util for running wccl rules

parent a49d9fa1
No related branches found
No related tags found
No related merge requests found
......@@ -60,4 +60,5 @@ endif(MSVC OR BORLAND)
# Each listed subdirectory is processed with its own CMakeLists.txt.
add_subdirectory(libwccl)
add_subdirectory(wcclparser)
add_subdirectory(wcclrun)
add_subdirectory(wcclrules)
add_subdirectory(tests)
# Build configuration for the wcclrules command-line executable.
PROJECT( wcclrules )

# LibXML++ is mandatory; its libraries are appended to LIBS for linking below.
find_package(LibXML++ REQUIRED)
include_directories(${LibXML++_INCLUDE_DIRS})
link_directories(${LibXML++_LIBRARY_DIRS})
set(LIBS ${LIBS} ${LibXML++_LIBRARIES})

include_directories( ${CMAKE_SOURCE_DIR} )

# Boost search paths are set up BEFORE the target is created:
# link_directories() only applies to targets created after the call, so
# placing it after add_executable() would leave it without effect.
# NOTE(review): the Boost_* variables are presumably provided by a
# find_package(Boost ...) in a parent CMakeLists.txt -- confirm.
include_directories(${Boost_INCLUDE_DIR})
link_directories(${Boost_LIBRARY_DIRS})

add_definitions(-DLIBWCCL_WCCLRUN_DATA_DIR="${PROJECT_SOURCE_DIR}/")

add_executable(wcclrules
	main.cpp
)
target_link_libraries (wcclrules wccl ${Boost_LIBRARIES} antlr ${LIBS})

# The binary is only installed on UNIX platforms.
if(UNIX)
	install(TARGETS wcclrules
		RUNTIME DESTINATION bin
	)
endif(UNIX)
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <libwccl/values/strset.h>
#include <libwccl/parser/Parser.h>
#include <libwccl/ops/rulesequence.h>
#include <libcorpus2/tagsetmanager.h>
#include <boost/bind.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/make_shared.hpp>
#include <boost/program_options.hpp>
#include <libcorpus2/io/xcesreader.h>
#include <libcorpus2/io/xceswriter.h>
#include <antlr/NoViableAltException.hpp>
#include <antlr/MismatchedTokenException.hpp>
namespace {

/// Target of the --quiet/-q command-line switch ("Suppress messages").
bool quiet(false);

/// Processing switches; main() fills them in from the command line.
struct options
{
	/// When true, do_stream() stops after the first sentence of a stream.
	bool first;

	/// Target of the --until-done/-u command-line switch.
	bool until_done;

	/// Target of the --until-done-iterations command-line option.
	int until_done_iterations;
};

} // anonymous namespace
/**
 * Parse the rule file named by `filename` and append the parsed rules to
 * `rules`.
 *
 * The number of rules parsed from the file is printed on stderr. Failures
 * covered by the handlers below (ANTLR, Wccl and PwrNlp exceptions) are
 * reported on stderr and turned into a `false` return value.
 *
 * @param parser    parser used to read the rule sequence
 * @param filename  path of the rule file to open
 * @param rules     sequence that receives the newly parsed rules
 * @return true when the parser returned a rule sequence (its rules have been
 *         appended to `rules`), false otherwise
 */
bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::RuleSequence& rules)
{
	try {
		std::ifstream input(filename.c_str());
		if (!input.good()) {
			// Raised inside the try block so that the handlers below get a
			// chance to report it (presumably the Wccl::WcclError one --
			// depends on the FileNotFound class hierarchy).
			throw Wccl::FileNotFound(filename, "", __FUNCTION__);
		}

		const boost::shared_ptr<Wccl::RuleSequence> parsed =
			parser.parseRuleSequence(input);
		if (!parsed) {
			std::cerr << "Problem while parsing -- "
				<< "parser returned NULL!" << std::endl;
			return false;
		}

		std::cerr << parsed->size() << "\n";
		std::copy(parsed->begin(), parsed->end(), std::back_inserter(rules));
		return true;
	}
	// The two specific ANTLR exceptions are reported with their position;
	// the generic ANTLR handler further down only prints the message.
	catch (antlr::MismatchedTokenException &e) {
		std::cerr << e.getFileLineColumnString()
			<< " " << e.getMessage() << std::endl;
	}
	catch (antlr::NoViableAltException &e) {
		std::cerr << e.getFileLineColumnString()
			<< " " << e.getMessage() << std::endl;
	}
	catch (Wccl::InvalidVariableName &e) {
		std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl;
	}
	catch (Wccl::VariableTypeMismatch &e) {
		std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl;
	}
	catch (Wccl::WcclError& e) {
		std::cerr << "Wccl::WcclError:" << e.info() << std::endl;
	}
	catch (PwrNlp::PwrNlpError& e) {
		std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl;
	}
	catch (antlr::ANTLRException& e) {
		std::cerr << "Antlr error " << e.getMessage() << std::endl;
	}
	// Reached only after one of the handlers above has reported an error.
	return false;
}
/**
 * Read sentences from an XCES stream, run the rules on each of them and
 * write every processed sentence to `writer`.
 *
 * @param writer  destination for the processed sentences
 * @param tagset  tagset handed to the XCES reader
 * @param rules   rule sequence executed on each sentence
 * @param is      input stream the corpus is read from
 * @param opts    processing switches:
 *                - `first`: stop after the first sentence,
 *                - `until_done`: execute the rules repeatedly instead of
 *                  once, limited by `until_done_iterations`.
 */
void do_stream(boost::shared_ptr<Corpus2::TokenWriter> writer, const Corpus2::Tagset& tagset, Wccl::RuleSequence& rules,
	std::istream& is, const options& opts)
{
	Corpus2::XcesReader xr(tagset, is);
	Corpus2::Sentence::Ptr s;
	while ((s = xr.get_next_sentence())) {
		// Honour the until-done switch; previously it was parsed from the
		// command line but never consulted, so rules always ran once.
		if (opts.until_done) {
			rules.execute_until_done(s, opts.until_done_iterations);
		} else {
			rules.execute_once(s);
		}
		writer->write_sentence(*s);
		if (opts.first) break;
	}
}
/**
 * Entry point of the wcclrules utility: loads CCL rule files, applies the
 * rules to XCES corpora and writes the result to standard output.
 *
 * Positional arguments ending with ".xml" are treated as corpus files, a
 * lone "-" requests reading the corpus from standard input, and anything
 * else is treated as a CCL rule file. Standard input is also read when no
 * corpus file was given at all (neither positionally nor via --corpus).
 *
 * @return 0 on success, 1 after printing help, 2 on a command-line error or
 *         a PwrNlp::PwrNlpError raised during processing
 */
int main(int argc, char** argv)
{
	std::string tagset_load = "kipi";
	std::string output_format;
	options opts;
	opts.first = false;
	opts.until_done = false;
	opts.until_done_iterations = 1000;
	std::vector<std::string> corpora_files, ccl_files, files;
	using boost::program_options::value;

	std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " ");
	std::string writers_help = "Output format, any of: " + writers + "\n";

	boost::program_options::options_description desc("Allowed options");
	desc.add_options()
			("tagset,t", value(&tagset_load),
			 "Tagset to use\n")
			("corpus,c", value(&corpora_files),
			 "Corpus file to load (XCES), do not load from stdin\n")
			("ccl-file,C", value(&ccl_files),
			 "CCL rule files\n")
			("files,f", value(&files),
			 "Files to load, looking at the extension to determine type\n")
			("output-format,o", value(&output_format)->default_value("xces"),
			 writers_help.c_str())
			("quiet,q", value(&quiet)->zero_tokens(),
			 "Suppress messages\n")
			("until-done,u", value(&opts.until_done)->zero_tokens(),
			 "Until-done mode\n")
			("until-done-iterations", value(&opts.until_done_iterations),
			 "Until-done iteration limit\n")
			("first-sentence-only,1", value(&opts.first)->zero_tokens(),
			 "Only process first sentence\n")
			("help,h", "Show help")
			;
	boost::program_options::variables_map vm;
	boost::program_options::positional_options_description p;
	p.add("files", -1);

	try {
		boost::program_options::store(
			boost::program_options::command_line_parser(argc, argv)
			.options(desc).positional(p).run(), vm);
		// notify() can throw program_options errors as well, so it is kept
		// inside the same try block as store().
		boost::program_options::notify(vm);
	} catch (boost::program_options::error& e) {
		std::cerr << e.what() << std::endl;
		return 2;
	}

	if (vm.count("help")) {
		std::cerr << "Usage " << argv[0] << " [OPTIONS] FILES\n"
			<< "Files ending with .xml are treated as corpora, otherwise \n"
			<< "as CCL files. Use - to read corpus from stdin\n";
		std::cout << desc << "\n";
		return 1;
	}

	// Sort the positional arguments into corpora and rule files; "-" is an
	// explicit request to read a corpus from standard input.
	bool corpus_stdin = false;
	foreach (const std::string& f, files) {
		if (f == "-") {
			corpus_stdin = true;
		} else if (boost::algorithm::ends_with(f, ".xml")) {
			corpora_files.push_back(f);
		} else {
			ccl_files.push_back(f);
		}
	}
	// Without any corpus file (positional or --corpus) fall back to stdin.
	// Checking the merged list makes --corpus disable stdin, as its help
	// text promises.
	if (corpora_files.empty()) {
		corpus_stdin = true;
	}

	try {
		const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
		Wccl::Parser parser(tagset);
		Wccl::RuleSequence rules;
		foreach (const std::string& f, ccl_files) {
			size_t sz = rules.size();
			if (!load_more_rules(parser, f, rules)) {
				std::cerr << "Warning: error while parsing " << f << "\n";
			}
			if (rules.size() == sz) {
				std::cerr << "Warning: no rules loaded from " << f << "\n";
			}
		}
		if (rules.empty()) {
			// Make the no-op visible instead of exiting silently.
			std::cerr << "Warning: no rules loaded, corpus not processed\n";
		} else {
			boost::shared_ptr<Corpus2::TokenWriter> writer;
			writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset));
			foreach (const std::string& f, corpora_files) {
				std::ifstream ifs(f.c_str());
				if (ifs.good()) {
					do_stream(writer, tagset, rules, ifs, opts);
				} else {
					std::cerr << "Error reading corpus from " << f << "\n";
				}
			}
			if (corpus_stdin) {
				do_stream(writer, tagset, rules, std::cin, opts);
			}
		}
	} catch (PwrNlp::PwrNlpError& e) {
		std::cerr << e.info() << std::endl;
		return 2;
	}
	return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment