wccl-features update

9c7dabf1 · ilor · 4d705f86 · 9c7dabf1
Commit 9c7dabf1 authored 13 years ago by ilor
--- a/wccl-features/main.cpp
+++ b/wccl-features/main.cpp
@@ -43,10 +43,10 @@ private:
 	std::ios_base::fmtflags flags_;
 };

-class Runner
+class FeatureRunner
 {
 public:
-	Runner(const Corpus2::Tagset& tagset)
+	FeatureRunner(const Corpus2::Tagset& tagset)
 	 : tagset_(tagset), parser_(tagset_), token_idx(0)
 	{
 	}
@@ -56,15 +56,26 @@ public:
 	int load_operator_string(const std::string &line);

 	void print_header_head();
-	void print_header_body(const std::string &attribute_prefix);
+	void print_header_body(const std::string &attribute_prefix,
+		bool nos = false);
 	void print_header_foot();

 	void print_data(const std::vector< std::vector<std::string> >& data);
+	void print_data(const std::vector<std::vector<std::string> > &data,
+		const std::vector<bool> rowmask);

-	std::vector< std::vector<std::string> > do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence);
+	std::vector< std::vector<std::string> > do_sentence(
+		const boost::shared_ptr<Corpus2::Sentence>& sentence);
+
+	void do_sentence(
+		const boost::shared_ptr<Corpus2::Sentence>& sentence,
+		std::vector< std::vector<std::string> >& sfeats,
+		bool nos = false);

 	void do_stream(std::istream& is, bool first);

+	void do_files(std::vector<std::string>& files, bool first);
+
 	bool empty() {
 		return bool_ops_.empty() && str_ops_.empty() && tset_ops_.empty();
 	}
@@ -93,7 +104,7 @@ private:
 	int token_idx;
 };

-int Runner::load_more_operators(const std::string& filename)
+int FeatureRunner::load_more_operators(const std::string& filename)
 {
 	int ops_parsed = 0;

@@ -117,7 +128,7 @@ int Runner::load_more_operators(const std::string& filename)
 	return ops_parsed;
 }

-int Runner::load_operator_string(const std::string &line)
+int FeatureRunner::load_operator_string(const std::string &line)
 {
 	int ops_loaded = 0;
 	boost::regex e("(STRING|BOOL|MASK\\h([a-z@,]+))\\h+"
@@ -174,59 +185,93 @@ int Runner::load_operator_string(const std::string &line)
 	return ops_loaded;
 }

-void Runner::print_header_head()
+void FeatureRunner::print_header_head() // writes the fixed preamble to stdout: generator note, @RELATION line, blank line
 {
 	std::cout << "% Generated by wccl-features\n";
 	std::cout << "@RELATION wccl\n";
 	std::cout << "\n";
 }

-void Runner::print_header_body(const std::string& attribute_prefix)
+void FeatureRunner::print_header_body(const std::string& attribute_prefix, // prints one @ATTRIBUTE line per loaded operator, each name prefixed with attribute_prefix
+	bool nos /*=false*/) // nos == true leaves out the string-operator attributes
 {
-	foreach (const str_ops_map_t::value_type v, str_ops_) {
-		std::cout << "@ATTRIBUTE "
-			<< attribute_prefix << v.first << " string\n";
+	if (!nos) {
+		foreach (const str_ops_map_t::value_type& v, str_ops_) { // by reference: no copy of the entry per iteration
+			std::cout << "@ATTRIBUTE "
+				<< attribute_prefix << v.first << " string\n";
+		}
 	}
-	foreach (const bool_ops_map_t::value_type v, bool_ops_) {
+	foreach (const bool_ops_map_t::value_type& v, bool_ops_) { // boolean operators become {0,1} attributes
 		std::cout << "@ATTRIBUTE "
-			<< attribute_prefix << v.first << " class {0,1}\n";
+			<< attribute_prefix << v.first << " {0,1}\n";
 	}
-	foreach (const tset_ops_map_t::value_type v, tset_ops_) {
+	foreach (const tset_ops_map_t::value_type& v, tset_ops_) { // one {0,1} attribute per tag listed in v.second.first
 		foreach (const Corpus2::Tag& tag, v.second.first) {
 			std::cout << "@ATTRIBUTE "
 				<< attribute_prefix << v.first << "_"
-				<< tagset_.tag_to_symbol_string(tag) << " class {0,1}\n";
+				<< tagset_.tag_to_symbol_string(tag) << " {0,1}\n";
 		}
 	}
 }

-void Runner::print_header_foot()
+void FeatureRunner::print_header_foot() // ends the header on stdout: a blank line followed by the @DATA marker
 {
 	std::cout << "\n@DATA\n";
 }

-void Runner::print_data(const std::vector<std::vector<std::string> > &data)
+void FeatureRunner::print_data(const std::vector<std::vector<std::string> > &data) // prints every row of data as one comma-joined line on stdout
 {
 	foreach (const std::vector<std::string>& feats, data) {
 		std::cout << boost::algorithm::join(feats, ",") << "\n";
 	}
 }

-std::vector< std::vector<std::string> > Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence)
+void FeatureRunner::print_data( // masked variant: prints only the rows selected by rowmask
+	const std::vector<std::vector<std::string> > &data,
+	const std::vector<bool> rowmask) // rowmask[i] == true means row i is printed; NOTE(review): passed by value, so the mask is copied on every call
 {
-	Wccl::SentenceContext sc(sentence);
+	assert(data.size() == rowmask.size()); // one flag per row is required; only enforced in builds where assert is active
+	for (size_t i = 0; i < data.size(); ++i) {
+		if (rowmask[i]) {
+			std::cout << boost::algorithm::join(data[i], ",") << "\n"; // comma-joined, one line per kept row
+		}
+	}
+}

+std::vector< std::vector<std::string> > FeatureRunner::do_sentence( // convenience overload: builds and returns a fresh feature table
+	const boost::shared_ptr<Corpus2::Sentence>& sentence)
+{
 	std::vector< std::vector<std::string> > sfeats;
+	do_sentence(sentence, sfeats); // delegates to the three-argument overload; nos keeps its declared default (false)
+	return sfeats;
+}
+
+
+void FeatureRunner::do_sentence(
+	const boost::shared_ptr<Corpus2::Sentence>& sentence,
+	std::vector< std::vector<std::string> >& sfeats,
+	bool nos /*=false*/)
+{
+	Wccl::SentenceContext sc(sentence);
+
 	while (sc.is_current_inside()) {
-		sfeats.resize(sfeats.size() + 1);
-		std::vector<std::string>& feats = sfeats.back();
-		foreach (const str_ops_map_t::value_type v, str_ops_) {
-			boost::shared_ptr<const Wccl::StrSet> s = v.second->apply(sc);
-			assert(s);
-			if (s->contents().empty()) {
-				feats.push_back("\"\"");
-			} else {
-				feats.push_back("\"" + PwrNlp::to_utf8(*s->contents().begin()) + "\"");
+		if (sfeats.size() < static_cast<size_t>(sc.get_position() + 1)) {
+			sfeats.resize(sc.get_position() + 1);
+		}
+		assert(!sfeats.empty());
+		std::vector<std::string>& feats = sfeats[sc.get_position()];
+		if (!nos) {
+			foreach (const str_ops_map_t::value_type v, str_ops_) {
+				boost::shared_ptr<const Wccl::StrSet> s = v.second->apply(sc);
+				assert(s);
+				if (s->contents().empty()) {
+					feats.push_back("\"\"");
+				} else {
+					feats.push_back("\"" +
+						boost::algorithm::replace_all_copy(
+							PwrNlp::to_utf8(*s->contents().begin()),
+							"\"", "\\\"") + "\"");
+				}
 			}
 		}
 		foreach (const bool_ops_map_t::value_type v, bool_ops_) {
@@ -251,11 +296,10 @@ std::vector< std::vector<std::string> > Runner::do_sentence(const boost::shared_
 		}
 		sc.advance();
 	}
-	return sfeats;
 }


-void Runner::do_stream(std::istream& is, bool first)
+void FeatureRunner::do_stream(std::istream& is, bool first)
 {
 	Corpus2::XcesReader xr(tagset_, is);
 	Corpus2::Sentence::Ptr s;
@@ -269,7 +313,98 @@ void Runner::do_stream(std::istream& is, bool first)
 	}
 }

-//void Runner::do_files(std::istream& is, bool first)
+// Reads files[0] as the gold corpus and every further file with the
+// "xces,disamb_only" reader, prints the header, then per sentence the feature
+// rows of tokens whose top tag count is tied. `first` stops after one round.
+void FeatureRunner::do_files(std::vector<std::string>& files, bool first)
+{
+	if (files.size() < 2) { // nothing to compare with fewer than two corpora
+		std::cerr << "Need a gold corpus and at least one more corpus\n";
+		return;
+	}
+	std::vector<boost::shared_ptr<Corpus2::TokenReader> > readers;
+	readers.push_back(Corpus2::TokenReader::create_path_reader(
+		"xces", tagset_, files[0]));
+	for (size_t i = 1; i < files.size(); ++i) {
+		readers.push_back(Corpus2::TokenReader::create_path_reader(
+			"xces,disamb_only", tagset_, files[i]));
+	}
+	print_header_head();
+	for (size_t i = 0; i < files.size(); ++i) {
+		print_header_body("T" + boost::lexical_cast<std::string>(i) + "_");
+	}
+	std::cout << "@ATTRIBUTE correct {0"; // one nominal value per file index
+	for (size_t si = 1; si < files.size(); ++si) {
+		std::cout << "," << si;
+	}
+	std::cout << "}\n";
+	print_header_foot();
+	bool more = !first; // first == true: the loop below runs exactly once
+	int processed = 0; // sentence rounds completed, used in the mismatch message
+	do {
+		std::vector<Corpus2::Sentence::Ptr> sentences;
+		foreach (const boost::shared_ptr<Corpus2::TokenReader>& r, readers) {
+			Corpus2::Sentence::Ptr s = r->get_next_sentence();
+			if (s) sentences.push_back(s);
+		}
+		if (sentences.size() == readers.size()) {
+			std::vector< std::vector< std::string> > data;
+			std::vector<bool> rowmask;
+			size_t gold_size = sentences[0]->size();
+			for (size_t si = 1; si < sentences.size(); ++si) {
+				if (sentences[si]->size() != gold_size) {
+					std::cerr << "Sentence size mismatch at " <<
+						processed << " " << si << "\n";
+					return;
+				}
+			}
+			data.resize(gold_size);
+			rowmask.resize(gold_size);
+			// Each call appends its features to the row of the matching token
+			// position, so a row ends up holding one feature group per file.
+			for (size_t si = 0; si < sentences.size(); ++si) {
+				do_sentence(sentences[si], data, false);
+			}
+			for (size_t i = 0; i < gold_size; ++i) {
+				std::set<Corpus2::Tag> gold_tags;
+				const Corpus2::Token& gold_token = *(*sentences[0])[i];
+				foreach (const Corpus2::Lexeme& gl, gold_token.disamb_lexemes()) {
+					gold_tags.insert(gl.tag());
+				}
+				int wci = 0; // index of the last file offering a gold tag; 0 if none does
+				std::map<Corpus2::Tag, int> v; // tag -> number of lexemes carrying it in files 1..n
+				for (size_t si = 1; si < sentences.size(); ++si) {
+					const Corpus2::Token& token = *(*sentences[si])[i];
+					foreach (const Corpus2::Lexeme& gl, token.lexemes()) {
+						if (gold_tags.find(gl.tag()) != gold_tags.end()) {
+							wci = static_cast<int>(si);
+						}
+						++v[gl.tag()];
+					}
+				}
+				// Keep only rows whose highest count is shared by two or more
+				// tags; every other row is masked out of the output.
+				typedef std::map<Corpus2::Tag, int>::value_type pp;
+				int mv = 0;
+				bool tie = false;
+				foreach (const pp& p, v) {
+					if (p.second == mv) {
+						tie = true;
+					} else if (p.second > mv) {
+						tie = false;
+						mv = p.second;
+					}
+				}
+				data[i].push_back(boost::lexical_cast<std::string>(wci));
+				rowmask[i] = tie;
+			}
+			print_data(data, rowmask);
+			++processed;
+		} else {
+			more = false; // at least one reader returned no sentence
+		}
+	} while (more);
+}


 int main(int argc, char** argv)
@@ -348,7 +483,7 @@ int main(int argc, char** argv)
 	}
 	try {
 		const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
-		Runner runner(tagset);
+		FeatureRunner runner(tagset);
 		foreach (const std::string& f, operator_strings) {
 			if (boost::algorithm::ends_with(f, ".ccl")) {
 				if (!runner.load_more_operators(f)) {
@@ -357,16 +492,16 @@ int main(int argc, char** argv)
 			}
 		}
 		if (!runner.empty()) {
-			foreach (const std::string& f, corpora_files) {
-				std::ifstream ifs(f.c_str());
+			if (corpora_files.size() == 1) {
+				std::ifstream ifs(corpora_files[0].c_str());
 				if (ifs.good()) {
 					runner.do_stream(ifs, first);
 				} else {
-					std::cerr << "Error reading corpus from " << f << "\n";
+					std::cerr << "Error reading corpus from "
+						<< corpora_files[0] << "\n";
 				}
-			}
-			if (corpus_stdin) {
-				runner.do_stream(std::cin, first);
+			} else {
+				runner.do_files(corpora_files, first);
 			}
 		}
 	} catch (PwrNlp::PwrNlpError& e) {