Merge branch 'master' of nlp.pwr.wroc.pl:wccl

c5fe1b46 · Adam Wardynski · 61cb697b · 1923cc70 · c5fe1b46 · c5fe1b46
Commit c5fe1b46 authored May 9, 2011 by Adam Wardynski
--- a/libwccl/wcclfile.cpp
+++ b/libwccl/wcclfile.cpp
@@ -36,6 +36,14 @@ boost::shared_ptr<const TagRuleSequence> WcclFile::get_tag_rules_ptr() const
 	return tag_rules_;
 }
+/**
+ * Mutable access to the match rules stored in this file.
+ * @returns shared pointer to the match rule sequence
+ * @throws WcclError when has_match_rules() reports that there are none
+ */
+boost::shared_ptr<MatchRuleSequence> WcclFile::get_match_rules_ptr()
+{
+	if (has_match_rules()) {
+		return match_rules_;
+	}
+	throw WcclError("There are no match rules.");
+}
 boost::shared_ptr<const MatchRuleSequence> WcclFile::get_match_rules_ptr() const
 {
 	if (!has_match_rules()) {

--- a/wccl-apps/CMakeLists.txt
+++ b/wccl-apps/CMakeLists.txt
@@ -27,11 +27,9 @@ add_executable(wccl-rules wccl-rules.cpp)
 target_link_libraries (wccl-rules wccl ${Boost_LIBRARIES} antlr ${LIBS})
 add_executable(wccl-parser wccl-parser.cpp)
 target_link_libraries (wccl-parser wccl ${Boost_LIBRARIES} antlr ${LIBS})
-add_executable(wccl-match wccl-match.cpp)
-target_link_libraries (wccl-match wccl ${Boost_LIBRARIES} antlr ${LIBS})
 if(UNIX)
-	install(TARGETS wccl-features wccl-run wccl-rules wccl-parser wccl-match
+	install(TARGETS wccl-features wccl-run wccl-rules wccl-parser
 		RUNTIME DESTINATION bin
 	)
 endif(UNIX)
--- a/wccl-apps/wccl-match.cpp
+++ b/wccl-apps/wccl-match.cpp
-#include <cstdlib>
-#include <fstream>
-#include <iomanip>
-#include <libwccl/values/strset.h>
-#include <libwccl/parser/Parser.h>
-#include <libcorpus2/tagsetmanager.h>
-#include <libcorpus2/util/tokentimer.h>
-#include <boost/bind.hpp>
-#include <boost/algorithm/string.hpp>
-#include <boost/make_shared.hpp>
-#include <boost/filesystem.hpp>
-#include <boost/program_options.hpp>
-#include <libcorpus2/io/reader.h>
-#include <libcorpus2/io/writer.h>
-namespace {
-	bool quiet = false;
-	struct options {
-		bool first;
-		bool until_done;
-		int until_done_iterations;
-	};
-}
-class MatchRunner
-{
-public:
-	MatchRunner(const Corpus2::Tagset& tagset)
-		: tagset_(tagset), parser_(tagset_), progress_(false)
-	{
-	}
-	void use_progress(bool use) {
-		progress_ = use;
-		if (use) {
-			Corpus2::TokenTimer& timer = Corpus2::global_timer();
-			timer.register_signal_handler();
-		}
-	}
-	bool load_more_rules(const std::string &filename);
-	bool load_operator_string(const std::string &op_string);
-	void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
-		boost::shared_ptr<Corpus2::TokenWriter> writer);
-	bool empty() {
-		return rules_.empty();
-	}
-private:
-	const Corpus2::Tagset& tagset_;
-	Wccl::Parser parser_;
-	std::vector<std::string> rule_names_;
-	std::vector<boost::shared_ptr<Wccl::MatchRule> > rules_;
-	bool progress_;
-};
-bool MatchRunner::load_more_rules(const std::string& filename)
-{
-	boost::shared_ptr<Wccl::MatchRule> retOp;
-	try {
-		std::ifstream is(filename.c_str());
-		if (!is.good()) {
-			throw Wccl::FileNotFound(filename, "", __FUNCTION__);
-		}
-		retOp = parser_.parseMatchRule(is);
-		if (retOp) {
-			boost::filesystem::path p(filename);
-			rule_names_.push_back(p.stem());
-			rules_.push_back(retOp);
-			return true;
-		} else {
-			std::cerr << "Problem while parsing -- "
-				<< "parser returned NULL!" << std::endl;
-		}
-	} catch (PwrNlp::PwrNlpError& e) {
-		std::cerr << e.scope() << " Error: " << e.info() << std::endl;
-	}
-	return false;
-}
-void MatchRunner::apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
-	boost::shared_ptr<Corpus2::TokenWriter> writer)
-{
-	Corpus2::TokenTimer& timer = Corpus2::global_timer();
-	while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) {
-		foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) {
-			boost::shared_ptr<Corpus2::AnnotatedSentence> as;
-			as = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(s);
-			if (!as) {
-				std::cerr << "Did not get an AnnotatedSentence from reader,"
-					"'ann'' option broken?\n";
-				return;
-			}
-			foreach (const boost::shared_ptr<Wccl::MatchRule>& r, rules_) {
-				r->apply(as);
-			}
-			timer.count_sentence(*as);
-			if (progress_) {
-				timer.check_slice();
-			}
-			//writer->write_sentence(*as);
-		}
-		writer->write_chunk(*c);
-	}
-}
-void usage(char* name)
-{
-	std::cerr << "This program runs WCCL match rules.\n";
-	std::cerr << "Usage " << name << " [OPTIONS] FILES\n"
-		<< "Files ending with .xml are treated as corpora, otherwise \n"
-		<< "as CCL files. Use - to read corpus from stdin (as with -I)\n"
-		<< "Note: the ann option is implied on all input formats\n";
-}
-int main(int argc, char** argv)
-{
-	std::string tagset_load = "kipi";
-	std::string input_format;
-	std::string output_format;
-	bool progress = false;
-	options opts;
-	opts.first = false;
-	opts.until_done = false;
-	opts.until_done_iterations = 1000;
-	std::vector<std::string> corpora_files, ccl_files, files;
-	bool corpus_stdin = true;
-	using boost::program_options::value;
-	std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " ");
-	std::string readers_help = "Input format, any of: " + readers + "\n";
-	std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " ");
-	std::string writers_help = "Output format, any of: " + writers + "\n";;
-	boost::program_options::options_description desc("Allowed options");
-	desc.add_options()
-			("tagset,t", value(&tagset_load),
-			 "Tagset to use\n")
-			("corpus,c", value(&corpora_files),
-			 "Corpus file to load (XCES), do not load from stdin\n")
-			("ccl-file,C", value(&ccl_files),
-			 "CCL rule files\n")
-			("files,f", value(&files),
-			 "Files to load, looking at the extension to determine type\n")
-			("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(),
-			 "Read corpus from stdin")
-			("input-format,i", value(&input_format)->default_value("xces"),
-			 readers_help.c_str())
-			("output-format,o", value(&output_format)->default_value("ccl"),
-			 writers_help.c_str())
-			("progress,p", value(&progress)->zero_tokens(),
-			 "Show progress info")
-			("quiet,q", value(&quiet)->zero_tokens(),
-			 "Suppress messages\n")
-			("until-done,u", value(&opts.until_done)->zero_tokens(),
-			 "Until-done mode\n")
-			("until-done-iterations", value(&opts.until_done_iterations),
-			 "Until-done iteration limit\n")
-			("first-sentence-only,1", value(&opts.first)->zero_tokens(),
-			 "Only process first sentence\n")
-			("help,h", "Show help")
-			;
-	boost::program_options::variables_map vm;
-	boost::program_options::positional_options_description p;
-	p.add("files", -1);
-	try {
-		boost::program_options::store(
-			boost::program_options::command_line_parser(argc, argv)
-			.options(desc).positional(p).run(), vm);
-	} catch (boost::program_options::error& e) {
-		std::cerr << e.what() << std::endl;
-		return 2;
-	}
-	boost::program_options::notify(vm);
-	if (vm.count("help")) {
-		usage(argv[0]);
-		std::cout << desc << "\n";
-		return 1;
-	}
-	foreach (const std::string& f, files) {
-		if (f == "-") {
-			corpus_stdin = true;
-		} else if (boost::algorithm::ends_with(f, ".xml")) {
-			corpora_files.push_back(f);
-		} else {
-			ccl_files.push_back(f);
-		}
-	}
-	// consider stdin only when no corpus files given
-	corpus_stdin = corpus_stdin && corpora_files.empty();
-	if (ccl_files.empty() || (corpora_files.empty() && !corpus_stdin)) {
-		usage(argv[0]);
-		return 2;
-	}
-	try {
-		const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
-		MatchRunner runner(tagset);
-		runner.use_progress(progress);
-		foreach (const std::string& file, ccl_files) {
-			runner.load_more_rules(file);
-		}
-		if (!runner.empty()) {
-			Corpus2::TokenTimer& timer = Corpus2::global_timer();
-			timer.register_signal_handler();
-			boost::shared_ptr<Corpus2::TokenWriter> writer;
-			writer = Corpus2::TokenWriter::create_stream_writer(output_format, std::cout, tagset);
-			boost::shared_ptr<Corpus2::TokenReader> reader;
-			foreach (std::string cf, corpora_files) {
-				reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, cf);
-				reader->set_option("ann");
-				runner.apply_rules(reader, writer);
-			}
-			if (corpus_stdin) {
-				reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin);
-				reader->set_option("ann");
-				runner.apply_rules(reader, writer);
-			}
-			if (progress) {
-				Corpus2::TokenTimer& timer = Corpus2::global_timer();
-				timer.stats();
-			}
-		}
-	} catch (PwrNlp::PwrNlpError& e) {
-		std::cerr << e.info() << std::endl;
-		return 2;
-	}
-	return 0;
-}
--- a/wccl-apps/wccl-rules.cpp
+++ b/wccl-apps/wccl-rules.cpp
@@ -2,79 +2,109 @@
 #include <fstream>
 #include <iomanip>
 #include <libwccl/values/strset.h>
 #include <libwccl/parser/Parser.h>
-#include <libwccl/ops/tagrulesequence.h>
 #include <libcorpus2/tagsetmanager.h>
 #include <libcorpus2/util/tokentimer.h>
 #include <boost/bind.hpp>
 #include <boost/algorithm/string.hpp>
 #include <boost/make_shared.hpp>
+#include <boost/filesystem.hpp>
 #include <boost/program_options.hpp>
-#include <libcorpus2/io/xcesreader.h>
+#include <libcorpus2/io/reader.h>
-#include <libcorpus2/io/xceswriter.h>
+#include <libcorpus2/io/writer.h>
-#include <antlr/NoViableAltException.hpp>
-#include <antlr/MismatchedTokenException.hpp>
 namespace {
 	bool quiet = false;
-	bool progress = false;
 	struct options {
 		bool first;
-		bool until_done;
-		int until_done_iterations;
 	};
 }
-bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::TagRuleSequence& rules)
+/**
+ * Collects parsed WCCL files and applies the rules they contain to the
+ * sentences obtained from a token reader.
+ */
+class RuleRunner
+{
+public:
+	/// Creates a runner with no rules loaded; the parser is built on the given tagset.
+	RuleRunner(const Corpus2::Tagset& tagset)
+		: tagset_(tagset), parser_(tagset_), progress_(false)
+		, tag_rule_iterations_(0), total_match_rules_(0), total_tag_rules_(0)
 	{
-	boost::shared_ptr<Wccl::TagRuleSequence> ret;
+	}
+	/// Turns progress reporting on or off; turning it on also registers
+	/// the signal handler of the global token timer.
+	void use_progress(bool use) {
+		progress_ = use;
+		if (use) {
+			Corpus2::TokenTimer& timer = Corpus2::global_timer();
+			timer.register_signal_handler();
+		}
+	}
+	/// Sets how tag rules are run by apply_rules: 0 executes them once,
+	/// a negative value executes until done with no explicit limit,
+	/// a positive value executes until done with that iteration limit.
+	void set_tag_rule_iterations(int i) {
+		tag_rule_iterations_ = i;
+	}
+	/// Parses one WCCL file and stores it. Returns (tag rule count,
+	/// match rule count) for that file, or (0, 0) when loading failed.
+	std::pair<int,int> load_more_rules(const std::string &filename);
+	/// Applies the rules of every loaded file to each sentence from the
+	/// reader and writes the processed chunks to the writer.
+	void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
+		boost::shared_ptr<Corpus2::TokenWriter> writer);
+	/// True when no rules at all have been loaded. Callers guard rule
+	/// application with !empty(), so this must hold exactly when size() is 0.
+	bool empty() const {
+		return size() == 0;
+	}
+	/// Total number of loaded rules (match rules plus tag rules).
+	size_t size() const {
+		return total_match_rules_ + total_tag_rules_;
+	}
+	/// Number of match rules loaded so far, summed over all files.
+	size_t total_match_rules() const {
+		return total_match_rules_;
+	}
+	/// Number of tag rules loaded so far, summed over all files.
+	size_t total_tag_rules() const {
+		return total_tag_rules_;
+	}
+private:
+	const Corpus2::Tagset& tagset_;
+	Wccl::Parser parser_;
+	/// Stem of each successfully loaded file name, parallel to parsed_files_.
+	std::vector<std::string> file_names_;
+	/// Successfully parsed files, in load order.
+	std::vector<boost::shared_ptr<Wccl::WcclFile> > parsed_files_;
+	bool progress_;
+	int tag_rule_iterations_;
+	size_t total_match_rules_, total_tag_rules_;
+};
+/**
+ * Parses the WCCL file at the given path and, on success, stores it in
+ * this runner and adds its rule counts to the running totals.
+ *
+ * A PwrNlp::PwrNlpError raised while loading is reported on stderr and
+ * not propagated. A null parser result is reported on stderr as well.
+ *
+ * @param filename path of the WCCL file to load
+ * @returns (number of tag rules, number of match rules) found in the
+ *          file; (0, 0) when the file could not be loaded
+ */
+std::pair<int,int> RuleRunner::load_more_rules(const std::string& filename)
+{
+	boost::shared_ptr<Wccl::WcclFile> parsed_file;
 	try {
 		std::ifstream is(filename.c_str());
 		if (!is.good()) {
 			throw Wccl::FileNotFound(filename, "", __FUNCTION__);
 		}
+		parsed_file = parser_.parseWcclFile(is, ".");
-		ret = parser.parseTagRuleSequence(is);
+		if (parsed_file) {
-		if (ret) {
+			boost::filesystem::path p(filename);
-			if (!quiet) {
+			file_names_.push_back(p.stem());
-				std::cerr << "Loaded " << ret->size() << " rule(s) from "
+			size_t match_rules = parsed_file->get_match_rules().size();
-					<< filename << "\n";
+			size_t tag_rules = parsed_file->get_tag_rules().size();
-			}
+			total_match_rules_ += match_rules;
-			std::copy(ret->begin(), ret->end(), std::back_inserter(rules));
+			total_tag_rules_ += tag_rules;
-			return true;
+			parsed_files_.push_back(parsed_file);
+			return std::make_pair(tag_rules, match_rules);
 		} else {
 			std::cerr << "Problem while parsing -- "
 				<< "parser returned NULL!" << std::endl;
 		}
-	} catch (antlr::MismatchedTokenException &e) {
-		std::cerr << e.getFileLineColumnString()
-				<< " " << e.getMessage() << std::endl;
-	} catch(antlr::NoViableAltException &e) {
-		std::cerr << e.getFileLineColumnString()
-				<< " " << e.getMessage() << std::endl;
-	} catch (Wccl::InvalidVariableName &e) {
-		std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl;
-	} catch (Wccl::VariableTypeMismatch &e) {
-		std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl;
-	} catch (Wccl::WcclError& e) {
-		std::cerr << "Wccl::WcclError:" << e.info() << std::endl;
 	} catch (PwrNlp::PwrNlpError& e) {
-		std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl;
+		std::cerr << e.scope() << " Error: " << e.info() << std::endl;
-	} catch (antlr::ANTLRException& e) {
-		std::cerr << "Antlr error " << e.getMessage() << std::endl;
 	}
-	return false;
+	return std::make_pair(0,0);
 }
-void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
+void RuleRunner::apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
-	boost::shared_ptr<Corpus2::TokenWriter> writer, Wccl::TagRuleSequence& rules,
+	boost::shared_ptr<Corpus2::TokenWriter> writer)
-	const options& opts)
 {
 	Corpus2::TokenTimer& timer = Corpus2::global_timer();
 	while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) {
@@ -86,45 +116,45 @@ void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
 					"'ann'' option broken?\n";
 				return;
 			}
-			if (opts.until_done) {
-				rules.execute_until_done(as, opts.until_done_iterations);
+			foreach (boost::shared_ptr<Wccl::WcclFile>& f, parsed_files_) {
+				if (tag_rule_iterations_ == 0) {
+					f->get_tag_rules_ptr()->execute_once(as);
+				} else if (tag_rule_iterations_ < 0) {
+					f->get_tag_rules_ptr()->execute_until_done(as);
 				} else  {
-				rules.execute_once(as);
+					f->get_tag_rules_ptr()->execute_until_done(as, tag_rule_iterations_);
+				}
+				f->get_match_rules_ptr()->apply_all(as);
 			}
 			timer.count_sentence(*as);
-			if (progress) {
+			if (progress_) {
 				timer.check_slice();
 			}
-			if (opts.first) break;
 			//writer->write_sentence(*as);
 		}
 		writer->write_chunk(*c);
-		if (opts.first) break;
-	}
-	if (progress) {
-		timer.stats();
 	}
 }
 void usage(char* name)
 {
-	std::cerr << "This program runs WCCL disambiguation rules.\n";
+	std::cerr << "This program runs WCCL match and/or tag rules. Tag rules are applied first.\n";
 	std::cerr << "Usage " << name << " [OPTIONS] FILES\n"
-		<< "Files ending with .xml are treated as corpora, otherwise \n"
+		<< "Files ending with .xml are treated as corpora, otherwise "
-		<< "as CCL files. Use - to read corpus from stdin (as with -I)\n"
+		<< "as WCCL files. Use - to read corpus from stdin (as with -I)\n"
-		<< "Note: the ann option is implied on all input formats\n";
+		<< "Note: the ,ann option is implied on all input formats\n";
 }
 int main(int argc, char** argv)
 {
 	std::string tagset_load = "kipi";
 	std::string input_format;
 	std::string output_format;
+	bool progress = false;
 	options opts;
 	opts.first = false;
-	opts.until_done = false;
-	opts.until_done_iterations = 1000;
 	std::vector<std::string> corpora_files, ccl_files, files;
 	bool corpus_stdin = true;
 	using boost::program_options::value;
@@ -145,19 +175,17 @@ int main(int argc, char** argv)
 			("files,f", value(&files),
 			 "Files to load, looking at the extension to determine type\n")
 			("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(),
-			 "Read corpus from stdin")
+			 "Read corpus from stdin (requires that no corpora filenames are passed)")
 			("input-format,i", value(&input_format)->default_value("xces"),
 			 readers_help.c_str())
-			("output-format,o", value(&output_format)->default_value("xces"),
+			("output-format,o", value(&output_format)->default_value("ccl"),
 			 writers_help.c_str())
 			("progress,p", value(&progress)->zero_tokens(),
 			 "Show progress info")
 			("quiet,q", value(&quiet)->zero_tokens(),
 			 "Suppress messages\n")
-			("until-done,u", value(&opts.until_done)->zero_tokens(),
+			("until-done-iterations,u", value<int>()->implicit_value(1000),
-			 "Until-done mode\n")
+			 "Until-done iteration limit, no arg for default limit(1000)\n")
-			("until-done-iterations", value(&opts.until_done_iterations),
-			 "Until-done iteration limit\n")
 			("first-sentence-only,1", value(&opts.first)->zero_tokens(),
 			 "Only process first sentence\n")
 			("help,h", "Show help")
@@ -202,32 +230,31 @@ int main(int argc, char** argv)
 	try {
 		const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
-		Wccl::Parser parser(tagset);
+		RuleRunner runner(tagset);
-		Wccl::TagRuleSequence rules;
+		runner.set_tag_rule_iterations(vm["until-done-iterations"].as<int>());
-		foreach (const std::string& f, ccl_files) {
+		runner.use_progress(progress);
-			size_t sz = rules.size();
+		foreach (const std::string& file, ccl_files) {
-			if (!load_more_rules(parser, f, rules)) {
+			std::pair<int,int> res = runner.load_more_rules(file);
-				std::cerr << "Warning: error while parsing " << f << "\n";
+			if (res.first == 0 && res.second == 0) {
-			}
+				std::cerr << "Warning: no rules loaded from " << file << "\n";
-			if (rules.size() == sz) {
+			} else if (!quiet) {
-				std::cerr << "Warning: no rules loaded from " << f << "\n";
+				std::cerr << "Loaded " << res.first << " tag rule(s) and "
+					<< res.second << " match rule(s) from " << file << "\n";
 			}
 		}
-		if (!rules.empty()) {
+		if (!runner.empty()) {
-			Corpus2::TokenTimer& timer = Corpus2::global_timer();
-			timer.register_signal_handler();
 			boost::shared_ptr<Corpus2::TokenWriter> writer;
 			writer = Corpus2::TokenWriter::create_stream_writer(output_format, std::cout, tagset);
 			boost::shared_ptr<Corpus2::TokenReader> reader;
-			foreach (const std::string& f, corpora_files) {
+			foreach (std::string cf, corpora_files) {
-				reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, f);
+				reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, cf);
 				reader->set_option("ann");
-				apply_rules(reader, writer, rules, opts);
+				runner.apply_rules(reader, writer);
 			}
 			if (corpus_stdin) {
 				reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin);
 				reader->set_option("ann");
-				apply_rules(reader, writer, rules, opts);
+				runner.apply_rules(reader, writer);
 			}
 			if (progress) {
 				Corpus2::TokenTimer& timer = Corpus2::global_timer();