From a3d5ae74248d26b514d9b792b1b53eb118355035 Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Fri, 11 Feb 2011 13:06:38 +0100
Subject: [PATCH] rough util for running wccl rules

---
 CMakeLists.txt           |   1 +
 wcclrules/CMakeLists.txt |  24 +++++
 wcclrules/main.cpp       | 183 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 208 insertions(+)
 create mode 100644 wcclrules/CMakeLists.txt
 create mode 100644 wcclrules/main.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d1aa50..27b8ca0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,4 +60,5 @@ endif(MSVC OR BORLAND)
 add_subdirectory(libwccl)
 add_subdirectory(wcclparser)
 add_subdirectory(wcclrun)
+add_subdirectory(wcclrules)
 add_subdirectory(tests)
diff --git a/wcclrules/CMakeLists.txt b/wcclrules/CMakeLists.txt
new file mode 100644
index 0000000..46eb509
--- /dev/null
+++ b/wcclrules/CMakeLists.txt
@@ -0,0 +1,24 @@
+PROJECT( wcclrules )
+
+find_package(LibXML++ REQUIRED)
+include_directories(${LibXML++_INCLUDE_DIRS})
+link_directories(${LibXML++_LIBRARY_DIRS})
+set(LIBS ${LIBS} ${LibXML++_LIBRARIES})
+
+include_directories( ${CMAKE_SOURCE_DIR} )
+# Boost paths must precede add_executable(): link_directories() only affects targets created afterwards.
+include_directories(${Boost_INCLUDE_DIR})
+link_directories(${Boost_LIBRARY_DIRS})
+
+add_definitions(-DLIBWCCL_WCCLRUN_DATA_DIR="${PROJECT_SOURCE_DIR}/")
+
+add_executable(wcclrules
+  main.cpp
+)
+target_link_libraries (wcclrules wccl ${Boost_LIBRARIES} antlr ${LIBS})
+
+if(UNIX)
+	install(TARGETS wcclrules
+		RUNTIME DESTINATION bin
+	)
+endif(UNIX)
diff --git a/wcclrules/main.cpp b/wcclrules/main.cpp
new file mode 100644
index 0000000..da88578
--- /dev/null
+++ b/wcclrules/main.cpp
@@ -0,0 +1,183 @@
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+
+
+#include <libwccl/values/strset.h>
+#include <libwccl/parser/Parser.h>
+#include <libwccl/ops/rulesequence.h>
+#include <libcorpus2/tagsetmanager.h>
+
+#include <boost/bind.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/make_shared.hpp>
+#include <boost/program_options.hpp>
+#include <libcorpus2/io/xcesreader.h>
+#include <libcorpus2/io/xceswriter.h>
+
+#include <antlr/NoViableAltException.hpp>
+#include <antlr/MismatchedTokenException.hpp>
+
+namespace {
+	// Run-time switches; main() fills them in from the command line.
+	struct options {
+		bool first; // when set, do_stream stops after the first sentence
+		bool until_done; // target of the --until-done switch
+		int until_done_iterations; // target of --until-done-iterations
+	};
+	bool quiet = false; // target of the --quiet switch
+}
+
+// Parses the rule file `filename` with `parser` and appends the parsed rules to `rules`.
+// Returns true on success; every failure is reported on stderr and yields false.
+bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::RuleSequence& rules)
+{
+	try {
+		std::ifstream input(filename.c_str());
+		if (!input.good()) {
+			throw Wccl::FileNotFound(filename, "", __FUNCTION__);
+		}
+		boost::shared_ptr<Wccl::RuleSequence> parsed = parser.parseRuleSequence(input);
+		if (!parsed) {
+			std::cerr << "Problem while parsing -- "
+				<< "parser returned NULL!" << std::endl;
+			return false;
+		}
+		std::cerr << parsed->size() << "\n";
+		std::copy(parsed->begin(), parsed->end(), std::back_inserter(rules));
+		return true;
+	} catch (antlr::MismatchedTokenException& err) {
+		std::cerr << err.getFileLineColumnString()
+				<< " " << err.getMessage() << std::endl;
+	} catch (antlr::NoViableAltException& err) {
+		std::cerr << err.getFileLineColumnString()
+				<< " " << err.getMessage() << std::endl;
+	} catch (Wccl::InvalidVariableName& err) {
+		std::cerr << "Wccl::InvalidVariableName " << err.info() << std::endl;
+	} catch (Wccl::VariableTypeMismatch& err) {
+		std::cerr << "Wccl::VariableTypeMismatch " << err.info() << std::endl;
+	} catch (Wccl::WcclError& err) {
+		std::cerr << "Wccl::WcclError:" << err.info() << std::endl;
+	} catch (PwrNlp::PwrNlpError& err) {
+		std::cerr << "PwrNlp::PwrNlpError " << err.info() << std::endl;
+	} catch (antlr::ANTLRException& err) {
+		std::cerr << "Antlr error " << err.getMessage() << std::endl;
+	}
+	return false;
+}
+
+// Runs `rules` on each sentence read (XCES) from `is` and writes it to `writer`; honours --until-done.
+void do_stream(boost::shared_ptr<Corpus2::TokenWriter> writer, const Corpus2::Tagset& tagset, Wccl::RuleSequence& rules,
+	std::istream& is, const options& opts)
+{
+	Corpus2::XcesReader xr(tagset, is);
+	for (Corpus2::Sentence::Ptr s; (s = xr.get_next_sentence()); ) {
+		if (opts.until_done) rules.execute_until_done(s, opts.until_done_iterations);
+		else rules.execute_once(s);
+		writer->write_sentence(*s);
+		if (opts.first) break;
+	}
+}
+
+// Entry point: loads CCL rule files, runs the rules on each corpus (stdin when none is named), writes to stdout.
+int main(int argc, char** argv)
+{
+	std::string tagset_load = "kipi";
+	std::string output_format;
+	options opts;
+	opts.first = false;
+	opts.until_done = false;
+	opts.until_done_iterations = 1000;
+	std::vector<std::string> corpora_files, ccl_files, files;
+	using boost::program_options::value;
+
+	std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " ");
+	std::string writers_help = "Output format, any of: " + writers + "\n";
+
+	boost::program_options::options_description desc("Allowed options");
+	desc.add_options()
+			("tagset,t", value(&tagset_load),
+			 "Tagset to use\n")
+			("corpus,c", value(&corpora_files),
+			 "Corpus file to load (XCES), do not load from stdin\n")
+			("ccl-file,C", value(&ccl_files),
+			 "CCL rule files\n")
+			("files,f", value(&files),
+			 "Files to load, looking at the extension to determine type\n")
+			("output-format,o", value(&output_format)->default_value("xces"),
+			 writers_help.c_str())
+			("quiet,q", value(&quiet)->zero_tokens(),
+			 "Suppress messages\n")
+			("until-done,u", value(&opts.until_done)->zero_tokens(),
+			 "Until-done mode\n")
+			("until-done-iterations", value(&opts.until_done_iterations),
+			 "Until-done iteration limit\n")
+			("first-sentence-only,1", value(&opts.first)->zero_tokens(),
+			 "Only process first sentence\n")
+			("help,h", "Show help")
+			;
+	boost::program_options::variables_map vm;
+	boost::program_options::positional_options_description p;
+	p.add("files", -1);
+	try {
+		boost::program_options::store(
+			boost::program_options::command_line_parser(argc, argv)
+			.options(desc).positional(p).run(), vm);
+		boost::program_options::notify(vm); // may throw too, so keep it under the same handler
+	} catch (boost::program_options::error& e) {
+		std::cerr << e.what() << std::endl;
+		return 2;
+	}
+
+	if (vm.count("help")) {
+		std::cerr << "Usage " << argv[0] << " [OPTIONS] FILES\n"
+			<< "Files ending with .xml are treated as corpora, otherwise \n"
+			<< "as CCL files. The corpus is read from stdin when no corpus file is given.\n";
+		std::cout << desc << "\n";
+		return 1;
+	}
+
+	foreach (const std::string& f, files) {
+		if (boost::algorithm::ends_with(f, ".xml")) {
+			corpora_files.push_back(f);
+		} else {
+			ccl_files.push_back(f);
+		}
+	}
+	// Read stdin only when no corpus file was named at all, whether via -c or positionally.
+	const bool corpus_stdin = corpora_files.empty();
+
+	try {
+		const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
+		Wccl::Parser parser(tagset);
+		Wccl::RuleSequence rules;
+		foreach (const std::string& f, ccl_files) {
+			size_t sz = rules.size();
+			if (!load_more_rules(parser, f, rules)) {
+				std::cerr << "Warning: error while parsing " << f << "\n";
+			}
+			if (rules.size() == sz) {
+				std::cerr << "Warning: no rules loaded from " << f << "\n";
+			}
+		}
+		if (!rules.empty()) {
+			boost::shared_ptr<Corpus2::TokenWriter> writer(Corpus2::TokenWriter::create(output_format, std::cout, tagset));
+			foreach (const std::string& f, corpora_files) {
+				std::ifstream ifs(f.c_str());
+				if (ifs.good()) {
+					do_stream(writer, tagset, rules, ifs, opts);
+				} else {
+					std::cerr << "Error reading corpus from " << f << "\n";
+				}
+			}
+			if (corpus_stdin) {
+				do_stream(writer, tagset, rules, std::cin, opts);
+			}
+		}
+	} catch (PwrNlp::PwrNlpError& e) {
+		std::cerr << e.info() << std::endl;
+		return 2;
+	}
+	return 0;
+}
-- 
GitLab