Skip to content
Snippets Groups Projects
Commit c8c8af75 authored by ilor's avatar ilor
Browse files

add wccl-match stub, stdopts in wccl- apps for future refactor

parent 1b5f1c37
Branches
No related merge requests found
......@@ -18,15 +18,17 @@ link_directories(${Boost_LIBRARY_DIRS})
# Build rules for the wccl-* command-line executables. Each target is linked
# against the wccl library, Boost, antlr and whatever ${LIBS} holds.
# NOTE(review): this text comes from a diff view whose +/- markers were lost.
# The two consecutive `add_executable(wccl-run ...)` lines and the two
# `install(TARGETS ...` lines look like the before/after sides of one change
# each (the later line of each pair presumably being the new one) -- confirm
# against the real CMakeLists.txt before reusing this fragment.
add_executable(wccl-features wccl-features.cpp)
target_link_libraries (wccl-features wccl ${Boost_LIBRARIES} antlr ${LIBS})
add_executable(wccl-run wccl-run.cpp)
add_executable(wccl-run wccl-run.cpp stdopts.cpp)
target_link_libraries (wccl-run wccl ${Boost_LIBRARIES} antlr ${LIBS})
add_executable(wccl-rules wccl-rules.cpp)
target_link_libraries (wccl-rules wccl ${Boost_LIBRARIES} antlr ${LIBS})
add_executable(wccl-parser wccl-parser.cpp)
target_link_libraries (wccl-parser wccl ${Boost_LIBRARIES} antlr ${LIBS})
# wccl-match executable, built from the single source file wccl-match.cpp.
add_executable(wccl-match wccl-match.cpp)
target_link_libraries (wccl-match wccl ${Boost_LIBRARIES} antlr ${LIBS})
# The executables are installed into bin, on UNIX platforms only.
if(UNIX)
install(TARGETS wccl-features wccl-run wccl-rules wccl-parser
install(TARGETS wccl-features wccl-run wccl-rules wccl-parser wccl-match
RUNTIME DESTINATION bin
)
endif(UNIX)
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>
#include <libwccl/values/strset.h>
#include <libwccl/parser/Parser.h>
#include <libwccl/ops/rulesequence.h>
#include <libcorpus2/tagsetmanager.h>
#include <libcorpus2/util/tokentimer.h>
#include <boost/bind.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/make_shared.hpp>
#include <boost/filesystem.hpp>
#include <boost/program_options.hpp>
#include <libcorpus2/io/reader.h>
#include <libcorpus2/io/writer.h>
// File-local state filled in from the command line by main().
namespace {
// Set by the --quiet/-q switch ("Suppress messages").
bool quiet = false;
// Set by the --progress/-p switch; when true, apply_rules() reports
// timer slices while running and prints timer statistics at the end.
bool progress = false;
// Processing-mode switches parsed from the command line.
// NOTE(review): none of these fields is read anywhere in the code visible
// here; they appear to be placeholders for the stub -- confirm.
struct options {
// --first-sentence-only/-1: only process the first sentence.
bool first;
// --until-done/-u: until-done mode.
bool until_done;
// --until-done-iterations: iteration limit for until-done mode.
int until_done_iterations;
};
}
class MatchRunner
{
public:
MatchRunner(const Corpus2::Tagset& tagset)
: tagset_(tagset), parser_(tagset_)
{
}
bool load_more_rules(const std::string &filename);
bool load_operator_string(const std::string &op_string);
void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
boost::shared_ptr<Corpus2::TokenWriter> writer);
bool empty() {
return rules_.empty();
}
private:
const Corpus2::Tagset& tagset_;
Wccl::Parser parser_;
std::vector<std::string> rule_names_;
std::vector<boost::shared_ptr<Wccl::ApplyOperator> > rules_;
};
bool MatchRunner::load_more_rules(const std::string& filename)
{
boost::shared_ptr<Wccl::ApplyOperator> retOp;
try {
std::ifstream is(filename.c_str());
if (!is.good()) {
throw Wccl::FileNotFound(filename, "", __FUNCTION__);
}
retOp = parser_.parseMatchRule(is);
if (retOp) {
boost::filesystem::path p(filename);
rule_names_.push_back(p.stem());
rules_.push_back(retOp);
return true;
} else {
std::cerr << "Problem while parsing -- "
<< "parser returned NULL!" << std::endl;
}
} catch (PwrNlp::PwrNlpError& e) {
std::cerr << e.scope() << " Error: " << e.info() << std::endl;
}
return false;
}
/**
 * Streams a corpus from the reader to the writer, sentence by sentence.
 *
 * Every sentence of every chunk is wrapped as an annotated sentence, counted
 * by the global token timer and written out. Rule execution is still a stub:
 * the loop over rules_ has no effect yet. When the file-level `progress` flag
 * is set, timer slices are checked after each sentence and timer statistics
 * are printed once the reader is exhausted.
 *
 * @param reader source of chunks; read until get_next_chunk() returns null.
 * @param writer sink that receives each processed sentence.
 */
void MatchRunner::apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
    boost::shared_ptr<Corpus2::TokenWriter> writer)
{
    Corpus2::TokenTimer& token_timer = Corpus2::global_timer();
    for (;;) {
        boost::shared_ptr<Corpus2::Chunk> chunk = reader->get_next_chunk();
        if (!chunk) {
            break;
        }
        foreach (boost::shared_ptr<Corpus2::Sentence>& sentence, chunk->sentences()) {
            boost::shared_ptr<Corpus2::AnnotatedSentence> annotated =
                Corpus2::AnnotatedSentence::wrap_sentence(sentence);
            foreach (const boost::shared_ptr<Wccl::ApplyOperator>& rule, rules_) {
                // Placeholder until rules can actually be executed.
                //rule->execute();
            }
            token_timer.count_sentence(*annotated);
            if (progress) {
                token_timer.check_slice();
            }
            writer->write_sentence(*annotated);
        }
        //writer->write_chunk(*chunk);
    }
    if (progress) {
        token_timer.stats();
    }
}
/**
 * Entry point of the wccl-match tool.
 *
 * Parses the command line, loads match rules from the CCL files and runs
 * every requested corpus through MatchRunner::apply_rules, writing the
 * result to standard output.
 *
 * Positional FILES ending in ".xml" are treated as corpora, "-" requests
 * the corpus to be read from standard input, anything else is a rule file.
 * When no corpus file is named at all, the corpus is read from standard
 * input.
 *
 * @return 0 on success, 1 after printing help, 2 on a command-line or
 *         PwrNlp error.
 */
int main(int argc, char** argv)
{
    std::string tagset_load = "kipi";
    std::string input_format;
    std::string output_format;
    // NOTE(review): opts is filled from the command line but not consumed
    // yet -- the matching logic is still a stub.
    options opts;
    opts.first = false;
    opts.until_done = false;
    opts.until_done_iterations = 1000;
    std::vector<std::string> corpora_files, ccl_files, files;
    // Becomes true when "-" is passed or when no corpus file is given.
    bool corpus_stdin = false;
    using boost::program_options::value;

    std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " ");
    std::string readers_help = "Input format, any of: " + readers + "\n";
    std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " ");
    std::string writers_help = "Output format, any of: " + writers + "\n";

    boost::program_options::options_description desc("Allowed options");
    desc.add_options()
        ("tagset,t", value(&tagset_load),
         "Tagset to use\n")
        ("corpus,c", value(&corpora_files),
         "Corpus file to load (XCES), do not load from stdin\n")
        ("ccl-file,C", value(&ccl_files),
         "CCL rule files\n")
        ("files,f", value(&files),
         "Files to load, looking at the extension to determine type\n")
        ("input-format,i", value(&input_format)->default_value("xces"),
         readers_help.c_str())
        ("output-format,o", value(&output_format)->default_value("xces"),
         writers_help.c_str())
        ("progress,p", value(&progress)->zero_tokens(),
         "Show progress info")
        ("quiet,q", value(&quiet)->zero_tokens(),
         "Suppress messages\n")
        ("until-done,u", value(&opts.until_done)->zero_tokens(),
         "Until-done mode\n")
        ("until-done-iterations", value(&opts.until_done_iterations),
         "Until-done iteration limit\n")
        ("first-sentence-only,1", value(&opts.first)->zero_tokens(),
         "Only process first sentence\n")
        ("help,h", "Show help")
        ;
    boost::program_options::variables_map vm;
    boost::program_options::positional_options_description p;
    p.add("files", -1);

    try {
        boost::program_options::store(
            boost::program_options::command_line_parser(argc, argv)
            .options(desc).positional(p).run(), vm);
        // notify() can report option errors as well, so it is kept inside
        // the same handler as store().
        boost::program_options::notify(vm);
    } catch (boost::program_options::error& e) {
        std::cerr << e.what() << std::endl;
        return 2;
    }

    if (vm.count("help")) {
        std::cerr << "Usage " << argv[0] << " [OPTIONS] FILES\n"
            << "Files ending with .xml are treated as corpora, otherwise \n"
            << "as CCL files. Use - to read corpus from stdin\n";
        std::cout << desc << "\n";
        return 1;
    }

    // Sort the positional arguments into corpora, rule files and the
    // stdin marker.
    foreach (const std::string& f, files) {
        if (f == "-") {
            corpus_stdin = true;
        } else if (boost::algorithm::ends_with(f, ".xml")) {
            corpora_files.push_back(f);
        } else {
            ccl_files.push_back(f);
        }
    }
    if (corpora_files.empty()) {
        corpus_stdin = true;
    }

    try {
        const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
        MatchRunner runner(tagset);
        foreach (const std::string& file, ccl_files) {
            // Failures are reported by load_more_rules itself; remaining
            // rule files are still tried.
            runner.load_more_rules(file);
        }
        if (runner.empty()) {
            if (!quiet) {
                std::cerr << "No match rules loaded, nothing to do" << std::endl;
            }
        } else {
            Corpus2::TokenTimer& timer = Corpus2::global_timer();
            timer.register_signal_handler();
            boost::shared_ptr<Corpus2::TokenWriter> writer;
            writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset));
            // The corpus comes from stdin and/or the named corpus files,
            // never from a rule file.
            if (corpus_stdin) {
                runner.apply_rules(
                    Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin),
                    writer);
            }
            foreach (const std::string& corpus_path, corpora_files) {
                runner.apply_rules(
                    Corpus2::TokenReader::create_path_reader(input_format, tagset, corpus_path),
                    writer);
            }
        }
    } catch (PwrNlp::PwrNlpError& e) {
        std::cerr << e.info() << std::endl;
        return 2;
    }
    return 0;
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment