Skip to content
Snippets Groups Projects
Commit c5fe1b46 authored by Adam Wardynski
Browse files

Merge branch 'master' of nlp.pwr.wroc.pl:wccl

parents 61cb697b 1923cc70
Branches
No related tags found
No related merge requests found
...@@ -36,6 +36,14 @@ boost::shared_ptr<const TagRuleSequence> WcclFile::get_tag_rules_ptr() const ...@@ -36,6 +36,14 @@ boost::shared_ptr<const TagRuleSequence> WcclFile::get_tag_rules_ptr() const
return tag_rules_; return tag_rules_;
} }
/**
 * Mutable access to the match rule sequence held by this file.
 *
 * @return shared pointer to the match rules
 * @throws WcclError when has_match_rules() reports that there are none
 */
boost::shared_ptr<MatchRuleSequence> WcclFile::get_match_rules_ptr()
{
    if (has_match_rules()) {
        return match_rules_;
    }
    throw WcclError("There are no match rules.");
}
boost::shared_ptr<const MatchRuleSequence> WcclFile::get_match_rules_ptr() const boost::shared_ptr<const MatchRuleSequence> WcclFile::get_match_rules_ptr() const
{ {
if (!has_match_rules()) { if (!has_match_rules()) {
......
...@@ -27,11 +27,9 @@ add_executable(wccl-rules wccl-rules.cpp) ...@@ -27,11 +27,9 @@ add_executable(wccl-rules wccl-rules.cpp)
target_link_libraries (wccl-rules wccl ${Boost_LIBRARIES} antlr ${LIBS}) target_link_libraries (wccl-rules wccl ${Boost_LIBRARIES} antlr ${LIBS})
add_executable(wccl-parser wccl-parser.cpp) add_executable(wccl-parser wccl-parser.cpp)
target_link_libraries (wccl-parser wccl ${Boost_LIBRARIES} antlr ${LIBS}) target_link_libraries (wccl-parser wccl ${Boost_LIBRARIES} antlr ${LIBS})
add_executable(wccl-match wccl-match.cpp)
target_link_libraries (wccl-match wccl ${Boost_LIBRARIES} antlr ${LIBS})
if(UNIX) if(UNIX)
install(TARGETS wccl-features wccl-run wccl-rules wccl-parser wccl-match install(TARGETS wccl-features wccl-run wccl-rules wccl-parser
RUNTIME DESTINATION bin RUNTIME DESTINATION bin
) )
endif(UNIX) endif(UNIX)
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <libwccl/values/strset.h>
#include <libwccl/parser/Parser.h>
#include <libcorpus2/tagsetmanager.h>
#include <libcorpus2/util/tokentimer.h>
#include <boost/bind.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/make_shared.hpp>
#include <boost/filesystem.hpp>
#include <boost/program_options.hpp>
#include <libcorpus2/io/reader.h>
#include <libcorpus2/io/writer.h>
// File-local state filled in from the command line by main().
namespace {
// Bound to the --quiet/-q switch in main().
bool quiet = false;
// Plain bag of run-mode flags; main() assigns every field before parsing
// the command line, so no constructor is provided here.
struct options {
// Bound to --first-sentence-only/-1.
bool first;
// Bound to --until-done/-u.
bool until_done;
// Bound to --until-done-iterations; main() presets it to 1000.
int until_done_iterations;
};
}
class MatchRunner
{
public:
MatchRunner(const Corpus2::Tagset& tagset)
: tagset_(tagset), parser_(tagset_), progress_(false)
{
}
void use_progress(bool use) {
progress_ = use;
if (use) {
Corpus2::TokenTimer& timer = Corpus2::global_timer();
timer.register_signal_handler();
}
}
bool load_more_rules(const std::string &filename);
bool load_operator_string(const std::string &op_string);
void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
boost::shared_ptr<Corpus2::TokenWriter> writer);
bool empty() {
return rules_.empty();
}
private:
const Corpus2::Tagset& tagset_;
Wccl::Parser parser_;
std::vector<std::string> rule_names_;
std::vector<boost::shared_ptr<Wccl::MatchRule> > rules_;
bool progress_;
};
bool MatchRunner::load_more_rules(const std::string& filename)
{
boost::shared_ptr<Wccl::MatchRule> retOp;
try {
std::ifstream is(filename.c_str());
if (!is.good()) {
throw Wccl::FileNotFound(filename, "", __FUNCTION__);
}
retOp = parser_.parseMatchRule(is);
if (retOp) {
boost::filesystem::path p(filename);
rule_names_.push_back(p.stem());
rules_.push_back(retOp);
return true;
} else {
std::cerr << "Problem while parsing -- "
<< "parser returned NULL!" << std::endl;
}
} catch (PwrNlp::PwrNlpError& e) {
std::cerr << e.scope() << " Error: " << e.info() << std::endl;
}
return false;
}
/**
 * Pull chunks from the reader until it runs out, run every loaded match
 * rule over each sentence, and hand each finished chunk to the writer.
 *
 * Every sentence is counted on the global token timer; the timer slice is
 * checked only when progress reporting is enabled. Output is written per
 * chunk, not per sentence.
 *
 * If a sentence is not a Corpus2::AnnotatedSentence, a message is printed
 * on std::cerr and the function returns at once; the chunk being processed
 * is not written.
 */
void MatchRunner::apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
    boost::shared_ptr<Corpus2::TokenWriter> writer)
{
    Corpus2::TokenTimer& timer = Corpus2::global_timer();
    while (boost::shared_ptr<Corpus2::Chunk> chunk = reader->get_next_chunk()) {
        foreach (boost::shared_ptr<Corpus2::Sentence>& sentence, chunk->sentences()) {
            boost::shared_ptr<Corpus2::AnnotatedSentence> annotated =
                boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(sentence);
            if (!annotated) {
                std::cerr << "Did not get an AnnotatedSentence from reader,"
                    "'ann'' option broken?\n";
                return;
            }

            foreach (const boost::shared_ptr<Wccl::MatchRule>& rule, rules_) {
                rule->apply(annotated);
            }

            timer.count_sentence(*annotated);
            if (progress_) {
                timer.check_slice();
            }
        }
        writer->write_chunk(*chunk);
    }
}
/**
 * Print the short usage banner on std::cerr.
 *
 * @param name program name to show in the "Usage" line (normally argv[0])
 */
void usage(char* name)
{
    // Fixed tail of the banner; adjacent literals are joined by the compiler.
    static const char* const file_notes =
        "Files ending with .xml are treated as corpora, otherwise \n"
        "as CCL files. Use - to read corpus from stdin (as with -I)\n"
        "Note: the ann option is implied on all input formats\n";

    std::cerr << "This program runs WCCL match rules.\n"
        << "Usage " << name << " [OPTIONS] FILES\n"
        << file_notes;
}
int main(int argc, char** argv)
{
std::string tagset_load = "kipi";
std::string input_format;
std::string output_format;
bool progress = false;
options opts;
opts.first = false;
opts.until_done = false;
opts.until_done_iterations = 1000;
std::vector<std::string> corpora_files, ccl_files, files;
bool corpus_stdin = true;
using boost::program_options::value;
std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " ");
std::string readers_help = "Input format, any of: " + readers + "\n";
std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " ");
std::string writers_help = "Output format, any of: " + writers + "\n";;
boost::program_options::options_description desc("Allowed options");
desc.add_options()
("tagset,t", value(&tagset_load),
"Tagset to use\n")
("corpus,c", value(&corpora_files),
"Corpus file to load (XCES), do not load from stdin\n")
("ccl-file,C", value(&ccl_files),
"CCL rule files\n")
("files,f", value(&files),
"Files to load, looking at the extension to determine type\n")
("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(),
"Read corpus from stdin")
("input-format,i", value(&input_format)->default_value("xces"),
readers_help.c_str())
("output-format,o", value(&output_format)->default_value("ccl"),
writers_help.c_str())
("progress,p", value(&progress)->zero_tokens(),
"Show progress info")
("quiet,q", value(&quiet)->zero_tokens(),
"Suppress messages\n")
("until-done,u", value(&opts.until_done)->zero_tokens(),
"Until-done mode\n")
("until-done-iterations", value(&opts.until_done_iterations),
"Until-done iteration limit\n")
("first-sentence-only,1", value(&opts.first)->zero_tokens(),
"Only process first sentence\n")
("help,h", "Show help")
;
boost::program_options::variables_map vm;
boost::program_options::positional_options_description p;
p.add("files", -1);
try {
boost::program_options::store(
boost::program_options::command_line_parser(argc, argv)
.options(desc).positional(p).run(), vm);
} catch (boost::program_options::error& e) {
std::cerr << e.what() << std::endl;
return 2;
}
boost::program_options::notify(vm);
if (vm.count("help")) {
usage(argv[0]);
std::cout << desc << "\n";
return 1;
}
foreach (const std::string& f, files) {
if (f == "-") {
corpus_stdin = true;
} else if (boost::algorithm::ends_with(f, ".xml")) {
corpora_files.push_back(f);
} else {
ccl_files.push_back(f);
}
}
// consider stdin only when no corpus files given
corpus_stdin = corpus_stdin && corpora_files.empty();
if (ccl_files.empty() || (corpora_files.empty() && !corpus_stdin)) {
usage(argv[0]);
return 2;
}
try {
const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
MatchRunner runner(tagset);
runner.use_progress(progress);
foreach (const std::string& file, ccl_files) {
runner.load_more_rules(file);
}
if (!runner.empty()) {
Corpus2::TokenTimer& timer = Corpus2::global_timer();
timer.register_signal_handler();
boost::shared_ptr<Corpus2::TokenWriter> writer;
writer = Corpus2::TokenWriter::create_stream_writer(output_format, std::cout, tagset);
boost::shared_ptr<Corpus2::TokenReader> reader;
foreach (std::string cf, corpora_files) {
reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, cf);
reader->set_option("ann");
runner.apply_rules(reader, writer);
}
if (corpus_stdin) {
reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin);
reader->set_option("ann");
runner.apply_rules(reader, writer);
}
if (progress) {
Corpus2::TokenTimer& timer = Corpus2::global_timer();
timer.stats();
}
}
} catch (PwrNlp::PwrNlpError& e) {
std::cerr << e.info() << std::endl;
return 2;
}
return 0;
}
...@@ -2,79 +2,109 @@ ...@@ -2,79 +2,109 @@
#include <fstream> #include <fstream>
#include <iomanip> #include <iomanip>
#include <libwccl/values/strset.h> #include <libwccl/values/strset.h>
#include <libwccl/parser/Parser.h> #include <libwccl/parser/Parser.h>
#include <libwccl/ops/tagrulesequence.h>
#include <libcorpus2/tagsetmanager.h> #include <libcorpus2/tagsetmanager.h>
#include <libcorpus2/util/tokentimer.h> #include <libcorpus2/util/tokentimer.h>
#include <boost/bind.hpp> #include <boost/bind.hpp>
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
#include <boost/make_shared.hpp> #include <boost/make_shared.hpp>
#include <boost/filesystem.hpp>
#include <boost/program_options.hpp> #include <boost/program_options.hpp>
#include <libcorpus2/io/xcesreader.h> #include <libcorpus2/io/reader.h>
#include <libcorpus2/io/xceswriter.h> #include <libcorpus2/io/writer.h>
#include <antlr/NoViableAltException.hpp>
#include <antlr/MismatchedTokenException.hpp>
namespace { namespace {
bool quiet = false; bool quiet = false;
bool progress = false;
struct options { struct options {
bool first; bool first;
bool until_done;
int until_done_iterations;
}; };
} }
bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::TagRuleSequence& rules) class RuleRunner
{
public:
RuleRunner(const Corpus2::Tagset& tagset)
: tagset_(tagset), parser_(tagset_), progress_(false)
, tag_rule_iterations_(0), total_match_rules_(0), total_tag_rules_(0)
{ {
boost::shared_ptr<Wccl::TagRuleSequence> ret; }
void use_progress(bool use) {
progress_ = use;
if (use) {
Corpus2::TokenTimer& timer = Corpus2::global_timer();
timer.register_signal_handler();
}
}
void set_tag_rule_iterations(int i) {
tag_rule_iterations_ = i;
}
std::pair<int,int> load_more_rules(const std::string &filename);
void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
boost::shared_ptr<Corpus2::TokenWriter> writer);
bool empty() const {
return size() > 0;
}
size_t size() const {
return total_match_rules_ + total_tag_rules_;
}
size_t total_match_rules() const {
return total_match_rules_;
}
size_t total_tag_rules() const {
return total_tag_rules_;
}
private:
const Corpus2::Tagset& tagset_;
Wccl::Parser parser_;
std::vector<std::string> file_names_;
std::vector<boost::shared_ptr<Wccl::WcclFile> > parsed_files_;
bool progress_;
int tag_rule_iterations_;
size_t total_match_rules_, total_tag_rules_;
};
std::pair<int,int> RuleRunner::load_more_rules(const std::string& filename)
{
boost::shared_ptr<Wccl::WcclFile> parsed_file;
try { try {
std::ifstream is(filename.c_str()); std::ifstream is(filename.c_str());
if (!is.good()) { if (!is.good()) {
throw Wccl::FileNotFound(filename, "", __FUNCTION__); throw Wccl::FileNotFound(filename, "", __FUNCTION__);
} }
parsed_file = parser_.parseWcclFile(is, ".");
ret = parser.parseTagRuleSequence(is); if (parsed_file) {
if (ret) { boost::filesystem::path p(filename);
if (!quiet) { file_names_.push_back(p.stem());
std::cerr << "Loaded " << ret->size() << " rule(s) from " size_t match_rules = parsed_file->get_match_rules().size();
<< filename << "\n"; size_t tag_rules = parsed_file->get_tag_rules().size();
} total_match_rules_ += match_rules;
std::copy(ret->begin(), ret->end(), std::back_inserter(rules)); total_tag_rules_ += tag_rules;
return true; parsed_files_.push_back(parsed_file);
return std::make_pair(tag_rules, match_rules);
} else { } else {
std::cerr << "Problem while parsing -- " std::cerr << "Problem while parsing -- "
<< "parser returned NULL!" << std::endl; << "parser returned NULL!" << std::endl;
} }
} catch (antlr::MismatchedTokenException &e) {
std::cerr << e.getFileLineColumnString()
<< " " << e.getMessage() << std::endl;
} catch(antlr::NoViableAltException &e) {
std::cerr << e.getFileLineColumnString()
<< " " << e.getMessage() << std::endl;
} catch (Wccl::InvalidVariableName &e) {
std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl;
} catch (Wccl::VariableTypeMismatch &e) {
std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl;
} catch (Wccl::WcclError& e) {
std::cerr << "Wccl::WcclError:" << e.info() << std::endl;
} catch (PwrNlp::PwrNlpError& e) { } catch (PwrNlp::PwrNlpError& e) {
std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl; std::cerr << e.scope() << " Error: " << e.info() << std::endl;
} catch (antlr::ANTLRException& e) {
std::cerr << "Antlr error " << e.getMessage() << std::endl;
} }
return false; return std::make_pair(0,0);
} }
void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, void RuleRunner::apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
boost::shared_ptr<Corpus2::TokenWriter> writer, Wccl::TagRuleSequence& rules, boost::shared_ptr<Corpus2::TokenWriter> writer)
const options& opts)
{ {
Corpus2::TokenTimer& timer = Corpus2::global_timer(); Corpus2::TokenTimer& timer = Corpus2::global_timer();
while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) { while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) {
...@@ -86,45 +116,45 @@ void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, ...@@ -86,45 +116,45 @@ void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
"'ann'' option broken?\n"; "'ann'' option broken?\n";
return; return;
} }
if (opts.until_done) {
rules.execute_until_done(as, opts.until_done_iterations); foreach (boost::shared_ptr<Wccl::WcclFile>& f, parsed_files_) {
if (tag_rule_iterations_ == 0) {
f->get_tag_rules_ptr()->execute_once(as);
} else if (tag_rule_iterations_ < 0) {
f->get_tag_rules_ptr()->execute_until_done(as);
} else { } else {
rules.execute_once(as); f->get_tag_rules_ptr()->execute_until_done(as, tag_rule_iterations_);
}
f->get_match_rules_ptr()->apply_all(as);
} }
timer.count_sentence(*as); timer.count_sentence(*as);
if (progress) { if (progress_) {
timer.check_slice(); timer.check_slice();
} }
if (opts.first) break;
//writer->write_sentence(*as); //writer->write_sentence(*as);
} }
writer->write_chunk(*c); writer->write_chunk(*c);
if (opts.first) break;
}
if (progress) {
timer.stats();
} }
} }
void usage(char* name) void usage(char* name)
{ {
std::cerr << "This program runs WCCL disambiguation rules.\n"; std::cerr << "This program runs WCCL match and/or tag rules. Tag rules are applied first.\n";
std::cerr << "Usage " << name << " [OPTIONS] FILES\n" std::cerr << "Usage " << name << " [OPTIONS] FILES\n"
<< "Files ending with .xml are treated as corpora, otherwise \n" << "Files ending with .xml are treated as corpora, otherwise "
<< "as CCL files. Use - to read corpus from stdin (as with -I)\n" << "as WCCL files. Use - to read corpus from stdin (as with -I)\n"
<< "Note: the ann option is implied on all input formats\n"; << "Note: the ,ann option is implied on all input formats\n";
} }
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
std::string tagset_load = "kipi"; std::string tagset_load = "kipi";
std::string input_format; std::string input_format;
std::string output_format; std::string output_format;
bool progress = false;
options opts; options opts;
opts.first = false; opts.first = false;
opts.until_done = false;
opts.until_done_iterations = 1000;
std::vector<std::string> corpora_files, ccl_files, files; std::vector<std::string> corpora_files, ccl_files, files;
bool corpus_stdin = true; bool corpus_stdin = true;
using boost::program_options::value; using boost::program_options::value;
...@@ -145,19 +175,17 @@ int main(int argc, char** argv) ...@@ -145,19 +175,17 @@ int main(int argc, char** argv)
("files,f", value(&files), ("files,f", value(&files),
"Files to load, looking at the extension to determine type\n") "Files to load, looking at the extension to determine type\n")
("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(), ("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(),
"Read corpus from stdin") "Read corpus from stdin (requires that no corpora filenames are passed)")
("input-format,i", value(&input_format)->default_value("xces"), ("input-format,i", value(&input_format)->default_value("xces"),
readers_help.c_str()) readers_help.c_str())
("output-format,o", value(&output_format)->default_value("xces"), ("output-format,o", value(&output_format)->default_value("ccl"),
writers_help.c_str()) writers_help.c_str())
("progress,p", value(&progress)->zero_tokens(), ("progress,p", value(&progress)->zero_tokens(),
"Show progress info") "Show progress info")
("quiet,q", value(&quiet)->zero_tokens(), ("quiet,q", value(&quiet)->zero_tokens(),
"Suppress messages\n") "Suppress messages\n")
("until-done,u", value(&opts.until_done)->zero_tokens(), ("until-done-iterations,u", value<int>()->implicit_value(1000),
"Until-done mode\n") "Until-done iteration limit, no arg for default limit(1000)\n")
("until-done-iterations", value(&opts.until_done_iterations),
"Until-done iteration limit\n")
("first-sentence-only,1", value(&opts.first)->zero_tokens(), ("first-sentence-only,1", value(&opts.first)->zero_tokens(),
"Only process first sentence\n") "Only process first sentence\n")
("help,h", "Show help") ("help,h", "Show help")
...@@ -202,32 +230,31 @@ int main(int argc, char** argv) ...@@ -202,32 +230,31 @@ int main(int argc, char** argv)
try { try {
const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
Wccl::Parser parser(tagset); RuleRunner runner(tagset);
Wccl::TagRuleSequence rules; runner.set_tag_rule_iterations(vm["until-done-iterations"].as<int>());
foreach (const std::string& f, ccl_files) { runner.use_progress(progress);
size_t sz = rules.size(); foreach (const std::string& file, ccl_files) {
if (!load_more_rules(parser, f, rules)) { std::pair<int,int> res = runner.load_more_rules(file);
std::cerr << "Warning: error while parsing " << f << "\n"; if (res.first == 0 && res.second == 0) {
} std::cerr << "Warning: no rules loaded from " << file << "\n";
if (rules.size() == sz) { } else if (!quiet) {
std::cerr << "Warning: no rules loaded from " << f << "\n"; std::cerr << "Loaded " << res.first << " tag rule(s) and "
<< res.second << " match rule(s) from " << file << "\n";
} }
} }
if (!rules.empty()) { if (!runner.empty()) {
Corpus2::TokenTimer& timer = Corpus2::global_timer();
timer.register_signal_handler();
boost::shared_ptr<Corpus2::TokenWriter> writer; boost::shared_ptr<Corpus2::TokenWriter> writer;
writer = Corpus2::TokenWriter::create_stream_writer(output_format, std::cout, tagset); writer = Corpus2::TokenWriter::create_stream_writer(output_format, std::cout, tagset);
boost::shared_ptr<Corpus2::TokenReader> reader; boost::shared_ptr<Corpus2::TokenReader> reader;
foreach (const std::string& f, corpora_files) { foreach (std::string cf, corpora_files) {
reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, f); reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, cf);
reader->set_option("ann"); reader->set_option("ann");
apply_rules(reader, writer, rules, opts); runner.apply_rules(reader, writer);
} }
if (corpus_stdin) { if (corpus_stdin) {
reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin); reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin);
reader->set_option("ann"); reader->set_option("ann");
apply_rules(reader, writer, rules, opts); runner.apply_rules(reader, writer);
} }
if (progress) { if (progress) {
Corpus2::TokenTimer& timer = Corpus2::global_timer(); Corpus2::TokenTimer& timer = Corpus2::global_timer();
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment