wccl-features

c79eaf53 · ilor · 8ef00260 · c79eaf53 · c79eaf53 · c79eaf53
Commit c79eaf53 authored Apr 17, 2011 by ilor
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,4 +61,5 @@ add_subdirectory(libwccl)
 add_subdirectory(wcclparser)
 add_subdirectory(wcclrun)
 add_subdirectory(wcclrules)
+add_subdirectory(wccl-features)
 add_subdirectory(tests)
--- a/wccl-features/CMakeLists.txt
+++ b/wccl-features/CMakeLists.txt
+PROJECT( wccl-features )
+
+# libedit is optional: when found, HAVE_LIBEDIT is defined and the library is linked.
+find_package(Libedit)
+if (Libedit_FOUND)
+	message(STATUS "Building with libedit")
+	add_definitions( -DHAVE_LIBEDIT )
+	set(LIBS ${LIBS} ${Libedit_LIBRARIES})
+endif (Libedit_FOUND)
+
+find_package(LibXML++ REQUIRED)
+include_directories(${LibXML++_INCLUDE_DIRS})
+link_directories(${LibXML++_LIBRARY_DIRS})
+set(LIBS ${LIBS} ${LibXML++_LIBRARIES})
+
+include_directories( ${CMAKE_SOURCE_DIR} )
+
+# Boost search paths have to be declared before add_executable():
+# link_directories() only affects targets created after the call.
+# NOTE(review): the Boost_* variables are presumably set by a find_package(Boost)
+# in the top-level CMakeLists.txt - confirm.
+include_directories(${Boost_INCLUDE_DIR})
+link_directories(${Boost_LIBRARY_DIRS})
+
+# NOTE(review): the macro name still refers to wcclrun - confirm it is intended here.
+add_definitions(-DLIBWCCL_WCCLRUN_DATA_DIR="${PROJECT_SOURCE_DIR}/")
+
+add_executable(wccl-features
+  main.cpp
+)
+target_link_libraries (wccl-features wccl ${Boost_LIBRARIES} antlr ${LIBS})
+
+if(UNIX)
+	install(TARGETS wccl-features
+		RUNTIME DESTINATION bin
+	)
+endif(UNIX)
--- a/wccl-features/main.cpp
+++ b/wccl-features/main.cpp
+#include <cstdlib>
+#include <cstdio>
+#include <fstream>
+#include <iomanip>
+#include <libwccl/values/strset.h>
+#include <libwccl/parser/Parser.h>
+#include <libcorpus2/tagsetmanager.h>
+#include <boost/bind.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/make_shared.hpp>
+#include <boost/program_options.hpp>
+#include <boost/filesystem.hpp>
+#include <libcorpus2/io/xcesreader.h>
+#include <boost/lexical_cast.hpp>
+#include <boost/regex.hpp>
+#include <antlr/NoViableAltException.hpp>
+#include <antlr/MismatchedTokenException.hpp>
+// Option flags local to this file; main() binds each of them to a command-line switch.
+namespace {
+	bool quiet = false; // --quiet / -q
+	bool tabs = false; // --tabs
+	bool output_orths = true; // --output-orths / -O
+	bool output_variables = false; // --output-variables / -V
+	bool global_numbering = false; // --global-counts / -g
+	bool output_header = true; // --output-header / -H
+	bool in_sentence_numbering = true; // --local-counts / -l
+}
+/// Scope guard: records a stream's format flags on construction and
+/// writes them back to the same stream on destruction.
+class streamsave
+{
+public:
+	/// The stream is held by reference, so it must outlive the guard.
+	streamsave(std::ostream& stream)
+	 : os_(stream), flags_(stream.flags())
+	{
+	}
+
+	~streamsave()
+	{
+		os_.flags(flags_);
+	}
+
+private:
+	std::ostream& os_;
+	std::ios_base::fmtflags flags_;
+};
+/**
+ * Keeps the WCCL operators loaded so far (string, boolean and tag-set valued)
+ * and writes their values for every token of a corpus to standard output,
+ * preceded by a @RELATION / @ATTRIBUTE / @DATA header.
+ */
+class Runner
+{
+public:
+	/// The tagset is stored by reference, so it must outlive the Runner.
+	Runner(const Corpus2::Tagset& tagset)
+	 : tagset_(tagset), parser_(tagset_), token_idx(0)
+	{
+	}
+
+	/// Loads operator definitions line by line from a file; returns the
+	/// number of operators loaded.
+	int load_more_operators(const std::string &filename);
+
+	/// Loads the operator(s) described by one definition line; returns the
+	/// number of operators loaded (0 when the line is not understood).
+	int load_operator_string(const std::string &line);
+
+	/// Writes the generator comment and the @RELATION line.
+	void print_header_head();
+
+	/// Writes one @ATTRIBUTE line per output column, each attribute name
+	/// prefixed with attribute_prefix.
+	void print_header_body(const std::string &attribute_prefix);
+
+	/// Writes the @DATA marker.
+	void print_header_foot();
+
+	/// Writes every row of data as one comma-separated line.
+	void print_data(const std::vector< std::vector<std::string> >& data);
+
+	/// Applies all loaded operators at every position of the sentence and
+	/// returns one row of feature strings per position.
+	std::vector< std::vector<std::string> > do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence);
+
+	/// Reads sentences from an XCES stream and prints their feature rows;
+	/// stops after the first sentence when first is true.
+	void do_stream(std::istream& is, bool first);
+
+	/// True when no operator of any kind has been loaded.
+	bool empty() const {
+		return bool_ops_.empty() && str_ops_.empty() && tset_ops_.empty();
+	}
+
+private:
+	const Corpus2::Tagset& tagset_;
+	Wccl::Parser parser_;
+
+	/// Boolean operators keyed by output name.
+	typedef std::map<
+		std::string,
+		boost::shared_ptr<Wccl::Operator<Wccl::Bool> >
+	> bool_ops_map_t;
+	bool_ops_map_t bool_ops_;
+
+	/// String-set operators keyed by output name.
+	typedef std::map<
+		std::string,
+		boost::shared_ptr<Wccl::Operator<Wccl::StrSet> >
+	> str_ops_map_t;
+	str_ops_map_t str_ops_;
+
+	/// Tag-set operators keyed by output name; each entry also carries the
+	/// set of tags that become separate 0/1 output columns.
+	typedef std::map<
+		std::string,
+		std::pair<std::set<Corpus2::Tag>, boost::shared_ptr<Wccl::Operator<Wccl::TSet> > >
+	> tset_ops_map_t;
+	tset_ops_map_t tset_ops_;
+
+	int token_idx;
+};
+/**
+ * Reads operator definitions from a file, one per line, and hands each line
+ * to load_operator_string(). Lines shorter than three characters and lines
+ * starting with '#' are skipped; lines that load nothing are reported on
+ * stderr together with their line number.
+ *
+ * @param filename path of the operator file
+ * @return total number of operators loaded from the file
+ * @throws Wccl::FileNotFound when the file cannot be opened for reading
+ */
+int Runner::load_more_operators(const std::string& filename)
+{
+	std::ifstream input(filename.c_str());
+	if (!input.good()) {
+		throw Wccl::FileNotFound(filename, "", __FUNCTION__);
+	}
+	int total = 0;
+	std::string text;
+	for (int line_number = 1; std::getline(input, text); ++line_number) {
+		const bool too_short = text.size() < 3;
+		if (too_short || text[0] == '#') {
+			continue;
+		}
+		const int count = load_operator_string(text);
+		if (count <= 0) {
+			std::cerr << "Line " << line_number << " did not match: " << text << "\n";
+			continue;
+		}
+		total += count;
+	}
+	return total;
+}
+/**
+ * Parses one operator definition line of the form
+ *   (STRING | BOOL | MASK <symbols>) [name:<name>] [range:<from>:<to>] <operator>
+ * and stores the resulting operator(s).
+ *
+ * Without a name, the operator text itself is used as the name. With a
+ * range, one operator is created for every integer in [from, to]: each
+ * occurrence of "_R_" in the operator text is replaced by that integer,
+ * which is also appended to the name.
+ *
+ * PwrNlp errors and malformed range bounds are reported on stderr and are
+ * not propagated to the caller.
+ *
+ * @param line the definition line
+ * @return number of operators loaded; 0 when the line does not match
+ */
+int Runner::load_operator_string(const std::string &line)
+{
+	int ops_loaded = 0;
+	// The pattern never changes, so it is compiled only once.
+	static const boost::regex definition_re("(STRING|BOOL|MASK\\h([a-z@,]+))\\h+"
+			"(?:name:([a-zA-Z0-9_-]+)\\h)?"
+			"(?:range:([0-9-]+):([0-9-]+)\\h)?"
+			"(.*)");
+	boost::smatch what;
+	if(boost::regex_match(line, what, definition_re, boost::match_extra)) {
+		try {
+			const std::string orig_name = what[3].matched ? what[3] : what[6];
+			const std::string orig_op_string = what[6];
+			std::vector <std::string> op_strings;
+			std::vector <std::string> names;
+			if (what[4].matched) {
+				// The bounds pattern also admits text such as "1-2";
+				// lexical_cast then throws bad_lexical_cast (handled below).
+				int rfrom = boost::lexical_cast<int>(what[4]);
+				int rto = boost::lexical_cast<int>(what[5]);
+				for (int i = rfrom; i <= rto; ++i) {
+					std::string pos = boost::lexical_cast<std::string>(i);
+					op_strings.push_back(boost::algorithm::replace_all_copy(
+						orig_op_string, "_R_", pos));
+					names.push_back(orig_name + pos);
+				}
+			} else {
+				op_strings.push_back(orig_op_string);
+				names.push_back(orig_name);
+			}
+			for (size_t opi = 0; opi < op_strings.size(); ++opi) {
+				const std::string& name = names[opi];
+				const std::string& op_string = op_strings[opi];
+				if (what[1] == "STRING") {
+					str_ops_.insert(std::make_pair(name,
+						parser_.parseStringOperator(op_string)));
+					++ops_loaded;
+				} else if (what[1] == "BOOL") {
+					bool_ops_.insert(std::make_pair(name,
+						parser_.parseBoolOperator(op_string)));
+					++ops_loaded;
+				} else {
+					// MASK: every symbol of the mask becomes its own tag,
+					// and later its own output column.
+					Corpus2::Tag tag = tagset_.parse_symbol_string(what[2]);
+					std::vector<std::string> sym = tagset_.tag_to_symbol_string_vector(tag, false);
+					std::set<Corpus2::Tag> t;
+					foreach (const std::string& s, sym) {
+						t.insert(tagset_.parse_symbol(s));
+					}
+					tset_ops_.insert(std::make_pair(name, std::make_pair(t,
+						parser_.parseSymSetOperator(op_string))));
+					++ops_loaded;
+				}
+			}
+		} catch (const boost::bad_lexical_cast& e) {
+			// Thrown before anything is loaded, so ops_loaded is still 0 here.
+			std::cerr << "Invalid range " << what[4] << ":" << what[5]
+				<< ": " << e.what() << std::endl;
+		} catch (PwrNlp::PwrNlpError& e) {
+			std::cerr << e.scope() <<  " error: " << e.info() << std::endl;
+		}
+	}
+	return ops_loaded;
+}
+/// Writes the generator comment, the @RELATION line and a blank line to stdout.
+void Runner::print_header_head()
+{
+	std::cout
+		<< "% Generated by wccl-features\n"
+		<< "@RELATION wccl\n"
+		<< "\n";
+}
+/**
+ * Writes one @ATTRIBUTE line per output column to stdout: first the string
+ * operators, then the boolean operators, then one line per tag of every
+ * tag-set operator (named <operator>_<tag symbol>). Within each group the
+ * order is the key order of the corresponding map.
+ *
+ * @param attribute_prefix text put in front of every attribute name
+ */
+void Runner::print_header_body(const std::string& attribute_prefix)
+{
+	// Map entries are taken by const reference: a by-value loop variable
+	// would copy the key, the shared_ptr and (for tag-set operators) the
+	// whole tag set on every step.
+	foreach (const str_ops_map_t::value_type& v, str_ops_) {
+		std::cout << "@ATTRIBUTE "
+			<< attribute_prefix << v.first << " string\n";
+	}
+	foreach (const bool_ops_map_t::value_type& v, bool_ops_) {
+		std::cout << "@ATTRIBUTE "
+			<< attribute_prefix << v.first << " class {0,1}\n";
+	}
+	foreach (const tset_ops_map_t::value_type& v, tset_ops_) {
+		foreach (const Corpus2::Tag& tag, v.second.first) {
+			std::cout << "@ATTRIBUTE "
+				<< attribute_prefix << v.first << "_"
+				<< tagset_.tag_to_symbol_string(tag) << " class {0,1}\n";
+		}
+	}
+}
+/// Writes a blank line followed by the @DATA marker to stdout.
+void Runner::print_header_foot()
+{
+	const char* const data_marker = "\n@DATA\n";
+	std::cout << data_marker;
+}
+/// Writes each row of data to stdout as a single line, its values joined with commas.
+void Runner::print_data(const std::vector<std::vector<std::string> > &data)
+{
+	typedef std::vector<std::vector<std::string> >::const_iterator row_iterator;
+	for (row_iterator row = data.begin(); row != data.end(); ++row) {
+		const std::string joined = boost::algorithm::join(*row, ",");
+		std::cout << joined << "\n";
+	}
+}
+/**
+ * Applies every loaded operator at each position of the sentence.
+ *
+ * Each row holds, in this order: one double-quoted string per string
+ * operator (the first element of the returned set, or "" when the set is
+ * empty), one "0"/"1" per boolean operator, and one "0"/"1" per tag of each
+ * tag-set operator ("1" when masking the tag with the operator's result is
+ * not null). This matches the column order of print_header_body().
+ *
+ * @param sentence the sentence to process
+ * @return one row of feature strings per sentence position
+ */
+std::vector< std::vector<std::string> > Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence)
+{
+	Wccl::SentenceContext sc(sentence);
+	std::vector< std::vector<std::string> > sfeats;
+	while (sc.is_current_inside()) {
+		sfeats.resize(sfeats.size() + 1);
+		std::vector<std::string>& feats = sfeats.back();
+		// The loops below run once per token, so map entries are taken by
+		// const reference instead of being copied (the tag-set entries
+		// carry a whole std::set).
+		foreach (const str_ops_map_t::value_type& v, str_ops_) {
+			boost::shared_ptr<const Wccl::StrSet> s = v.second->apply(sc);
+			assert(s);
+			if (s->contents().empty()) {
+				feats.push_back("\"\"");
+			} else {
+				// NOTE(review): a double quote inside the value is emitted
+				// unescaped - confirm the consumer of this output copes.
+				feats.push_back("\"" + PwrNlp::to_utf8(*s->contents().begin()) + "\"");
+			}
+		}
+		foreach (const bool_ops_map_t::value_type& v, bool_ops_) {
+			boost::shared_ptr<const Wccl::Bool> b = v.second->apply(sc);
+			assert(b);
+			if (*b) {
+				feats.push_back("1");
+			} else {
+				feats.push_back("0");
+			}
+		}
+		foreach (const tset_ops_map_t::value_type& v, tset_ops_) {
+			boost::shared_ptr<const Wccl::TSet> t = v.second.second->apply(sc);
+			assert(t);
+			foreach (const Corpus2::Tag& tag, v.second.first) {
+				if (!tag.get_masked(t->contents()).is_null()) {
+					feats.push_back("1");
+				} else {
+					feats.push_back("0");
+				}
+			}
+		}
+		sc.advance();
+	}
+	return sfeats;
+}
+/**
+ * Reads sentences from an XCES stream and prints the feature rows of each,
+ * followed by an empty line.
+ *
+ * The header is written only when the --output-header option (on by
+ * default) is enabled.
+ * NOTE(review): the header is written once per call, so processing several
+ * corpora in one run repeats it before each corpus.
+ *
+ * @param is    stream with the XCES corpus
+ * @param first when true, processing stops after the first sentence
+ */
+void Runner::do_stream(std::istream& is, bool first)
+{
+	Corpus2::XcesReader xr(tagset_, is);
+	Corpus2::Sentence::Ptr s;
+	if (output_header) {
+		print_header_head();
+		print_header_body("");
+		print_header_foot();
+	}
+	while ((s = xr.get_next_sentence())) {
+		print_data(do_sentence(s));
+		std::cout << "\n";
+		if (first) break;
+	}
+}
+//void Runner::do_files(std::istream& is, bool first)
+/**
+ * Entry point: parses the command line, loads operators from .ccl files and
+ * from raw operator strings, then runs them over every corpus file and,
+ * if requested, over standard input.
+ *
+ * @return 0 on normal completion, 1 after printing help, 2 on a
+ *         command-line error, when there is nothing to do, or when a
+ *         PwrNlp error is caught
+ */
+int main(int argc, char** argv)
+{
+	std::string tagset_load = "kipi";
+	bool first = false;
+	std::vector<std::string> corpora_files, files, operator_strings;
+	bool corpus_stdin = false;
+	using boost::program_options::value;
+	boost::program_options::options_description desc("Allowed options");
+	desc.add_options()
+			("tagset,t", value(&tagset_load),
+			 "Tagset to use")
+			("corpus,c", value(&corpora_files),
+			 "Corpus file to load (XCES)")
+			("ccl-operator,C", value(&operator_strings),
+			 "CCL operator file or string")
+			("files,f", value(&files),
+			 "Files to load, looking at the extension to determine type")
+			("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(),
+			 "Read corpus from stdin")
+			("quiet,q", value(&quiet)->zero_tokens(),
+			 "Suppress messages")
+			("first-sentence-only,1", value(&first)->zero_tokens(),
+			 "Only process first sentence")
+			("tabs", value(&tabs)->zero_tokens(),
+			 "Output a tab-separated file")
+			("local-counts,l", value(&in_sentence_numbering),
+			 "Output in-sentence token counts")
+			("global-counts,g", value(&global_numbering),
+			 "Output global counts")
+			("output-orths,O", value(&output_orths),
+			 "Output token orths")
+			("output-variables,V", value(&output_variables),
+			 "Output operator variables")
+			("output-header,H", value(&output_header),
+			 "Output table header")
+			("help,h", "Show help")
+			;
+	boost::program_options::variables_map vm;
+	boost::program_options::positional_options_description p;
+	p.add("files", -1);
+	try {
+		boost::program_options::store(
+			boost::program_options::command_line_parser(argc, argv)
+			.options(desc).positional(p).run(), vm);
+		// notify() is kept inside the try block so that an option error
+		// raised there is reported like a parsing error.
+		boost::program_options::notify(vm);
+	} catch (const boost::program_options::error& e) {
+		std::cerr << e.what() << std::endl;
+		return 2;
+	}
+	if (vm.count("help")) {
+		std::cerr << "Usage " << argv[0] << " [OPTIONS] FILES\n"
+			<< "Files ending with .xml are treated as corpora, otherwise \n"
+			<< "as CCL files. Use - to read corpus from stdin (as with -I)\n"
+			<< "Files not ending with an extension are treated as raw operator strings\n";
+		std::cout << desc << "\n";
+		return 1;
+	}
+	// Sort the positional arguments into corpora and operator sources.
+	foreach (const std::string& f, files) {
+		if (f == "-") {
+			corpus_stdin = true;
+		} else if (boost::algorithm::ends_with(f, ".xml")) {
+			corpora_files.push_back(f);
+		} else {
+			operator_strings.push_back(f);
+		}
+	}
+	if ((corpora_files.empty() && !corpus_stdin) || (operator_strings.empty() && !output_orths)) {
+		std::cerr << "Nothing to do, try " << argv[0] << " -h\n";
+		return 2;
+	}
+	try {
+		const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
+		Runner runner(tagset);
+		foreach (const std::string& f, operator_strings) {
+			if (boost::algorithm::ends_with(f, ".ccl")) {
+				if (!runner.load_more_operators(f)) {
+					std::cerr << "Warning: error while parsing " << f << "\n";
+				}
+			} else if (!runner.load_operator_string(f)) {
+				// Anything that is not a .ccl file is a raw operator
+				// string, as the help text promises.
+				std::cerr << "Warning: could not parse operator string: " << f << "\n";
+			}
+		}
+		if (runner.empty()) {
+			// Say why no output follows; the exit code stays 0 as before.
+			std::cerr << "No operators loaded, nothing to do\n";
+		} else {
+			foreach (const std::string& f, corpora_files) {
+				std::ifstream ifs(f.c_str());
+				if (ifs.good()) {
+					runner.do_stream(ifs, first);
+				} else {
+					std::cerr << "Error reading corpus from " << f << "\n";
+				}
+			}
+			if (corpus_stdin) {
+				runner.do_stream(std::cin, first);
+			}
+		}
+	} catch (PwrNlp::PwrNlpError& e) {
+		std::cerr << e.info() << std::endl;
+		return 2;
+	}
+	return 0;
+}