diff --git a/wccl-features/main.cpp b/wccl-features/main.cpp index c829ed3142c807373eed58c94373e83d25dfa361..c1e864103787e1514a1d6f8e6fdda6de9bc0b365 100644 --- a/wccl-features/main.cpp +++ b/wccl-features/main.cpp @@ -43,10 +43,10 @@ private: std::ios_base::fmtflags flags_; }; -class Runner +class FeatureRunner { public: - Runner(const Corpus2::Tagset& tagset) + FeatureRunner(const Corpus2::Tagset& tagset) : tagset_(tagset), parser_(tagset_), token_idx(0) { } @@ -56,15 +56,26 @@ public: int load_operator_string(const std::string &line); void print_header_head(); - void print_header_body(const std::string &attribute_prefix); + void print_header_body(const std::string &attribute_prefix, + bool nos = false); void print_header_foot(); void print_data(const std::vector< std::vector<std::string> >& data); + void print_data(const std::vector<std::vector<std::string> > &data, + const std::vector<bool> rowmask); - std::vector< std::vector<std::string> > do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence); + std::vector< std::vector<std::string> > do_sentence( + const boost::shared_ptr<Corpus2::Sentence>& sentence); + + void do_sentence( + const boost::shared_ptr<Corpus2::Sentence>& sentence, + std::vector< std::vector<std::string> >& sfeats, + bool nos = false); void do_stream(std::istream& is, bool first); + void do_files(std::vector<std::string>& files, bool first); + bool empty() { return bool_ops_.empty() && str_ops_.empty() && tset_ops_.empty(); } @@ -93,7 +104,7 @@ private: int token_idx; }; -int Runner::load_more_operators(const std::string& filename) +int FeatureRunner::load_more_operators(const std::string& filename) { int ops_parsed = 0; @@ -117,7 +128,7 @@ int Runner::load_more_operators(const std::string& filename) return ops_parsed; } -int Runner::load_operator_string(const std::string &line) +int FeatureRunner::load_operator_string(const std::string &line) { int ops_loaded = 0; boost::regex e("(STRING|BOOL|MASK\\h([a-z@,]+))\\h+" @@ -174,59 +185,93 @@ int Runner::load_operator_string(const std::string &line) return ops_loaded; } -void Runner::print_header_head() +void FeatureRunner::print_header_head() { std::cout << "% Generated by wccl-features\n"; std::cout << "@RELATION wccl\n"; std::cout << "\n"; } -void Runner::print_header_body(const std::string& attribute_prefix) +void FeatureRunner::print_header_body(const std::string& attribute_prefix, + bool nos /*=false*/) { - foreach (const str_ops_map_t::value_type v, str_ops_) { - std::cout << "@ATTRIBUTE " - << attribute_prefix << v.first << " string\n"; + if (!nos) { + foreach (const str_ops_map_t::value_type v, str_ops_) { + std::cout << "@ATTRIBUTE " + << attribute_prefix << v.first << " string\n"; + } } foreach (const bool_ops_map_t::value_type v, bool_ops_) { std::cout << "@ATTRIBUTE " - << attribute_prefix << v.first << " class {0,1}\n"; + << attribute_prefix << v.first << " {0,1}\n"; } foreach (const tset_ops_map_t::value_type v, tset_ops_) { foreach (const Corpus2::Tag& tag, v.second.first) { std::cout << "@ATTRIBUTE " << attribute_prefix << v.first << "_" - << tagset_.tag_to_symbol_string(tag) << " class {0,1}\n"; + << tagset_.tag_to_symbol_string(tag) << " {0,1}\n"; } } } -void Runner::print_header_foot() +void FeatureRunner::print_header_foot() { std::cout << "\n@DATA\n"; } -void Runner::print_data(const std::vector<std::vector<std::string> > &data) +void FeatureRunner::print_data(const std::vector<std::vector<std::string> > &data) { foreach (const std::vector<std::string>& feats, data) { std::cout << boost::algorithm::join(feats, ",") << "\n"; } } -std::vector< std::vector<std::string> > Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence) +void FeatureRunner::print_data( + const std::vector<std::vector<std::string> > &data, + const std::vector<bool> rowmask) { - Wccl::SentenceContext sc(sentence); + assert(data.size() == rowmask.size()); + for (size_t i = 0; i < data.size(); ++i) { + if (rowmask[i]) { + std::cout << boost::algorithm::join(data[i], ",") << "\n"; + } + } +} +std::vector< std::vector<std::string> > FeatureRunner::do_sentence( + const boost::shared_ptr<Corpus2::Sentence>& sentence) +{ std::vector< std::vector<std::string> > sfeats; + do_sentence(sentence, sfeats); + return sfeats; +} + + +void FeatureRunner::do_sentence( + const boost::shared_ptr<Corpus2::Sentence>& sentence, + std::vector< std::vector<std::string> >& sfeats, + bool nos /*=false*/) +{ + Wccl::SentenceContext sc(sentence); + while (sc.is_current_inside()) { - sfeats.resize(sfeats.size() + 1); - std::vector<std::string>& feats = sfeats.back(); - foreach (const str_ops_map_t::value_type v, str_ops_) { - boost::shared_ptr<const Wccl::StrSet> s = v.second->apply(sc); - assert(s); - if (s->contents().empty()) { - feats.push_back("\"\""); - } else { - feats.push_back("\"" + PwrNlp::to_utf8(*s->contents().begin()) + "\""); + if (sfeats.size() < static_cast<size_t>(sc.get_position() + 1)) { + sfeats.resize(sc.get_position() + 1); + } + assert(!sfeats.empty()); + std::vector<std::string>& feats = sfeats[sc.get_position()]; + if (!nos) { + foreach (const str_ops_map_t::value_type v, str_ops_) { + boost::shared_ptr<const Wccl::StrSet> s = v.second->apply(sc); + assert(s); + if (s->contents().empty()) { + feats.push_back("\"\""); + } else { + feats.push_back("\"" + + boost::algorithm::replace_all_copy( + PwrNlp::to_utf8(*s->contents().begin()), + "\"", "\\\"") + "\""); + } } } foreach (const bool_ops_map_t::value_type v, bool_ops_) { @@ -251,11 +296,10 @@ std::vector< std::vector<std::string> > Runner::do_sentence(const boost::shared_ } sc.advance(); } - return sfeats; } -void Runner::do_stream(std::istream& is, bool first) +void FeatureRunner::do_stream(std::istream& is, bool first) { Corpus2::XcesReader xr(tagset_, is); Corpus2::Sentence::Ptr s; @@ -269,7 +313,98 @@ void Runner::do_stream(std::istream& is, bool first) } } -//void Runner::do_files(std::istream& is, bool first) +void FeatureRunner::do_files(std::vector<std::string>& files, bool first) +{ + std::vector<boost::shared_ptr<Corpus2::TokenReader> > readers; + if (files.size() < 2) return; + readers.push_back(Corpus2::TokenReader::create_path_reader( + "xces", tagset_, files[0])); + for (size_t i = 1; i < files.size(); ++i) { + readers.push_back(Corpus2::TokenReader::create_path_reader( + "xces,disamb_only", tagset_, files[i])); + } + print_header_head(); + for (size_t i = 0; i < files.size(); ++i) { + print_header_body("T" + boost::lexical_cast<std::string>(i) + "_"); + } + std::cout << "@ATTRIBUTE correct {0"; + for (size_t si = 1; si < files.size(); ++si) { + //std::cout << "@ATTRIBUTE tag" << si << "ok " << "{0,1}" << "\n"; + std::cout << "," << si; + } + std::cout << "}\n"; + print_header_foot(); + bool more = !first; + int processed = 0; + do { + std::vector<Corpus2::Sentence::Ptr> sentences; + foreach (const boost::shared_ptr<Corpus2::TokenReader>& r, readers) { + Corpus2::Sentence::Ptr s = r->get_next_sentence(); + if (s) { + sentences.push_back(s); + } + } + if (sentences.size() == readers.size()) { + std::vector< std::vector< std::string> > data; + std::vector<bool> rowmask; + size_t gold_size = sentences[0]->size(); + for (size_t si = 1; si < sentences.size(); ++si) { + if (sentences[si]->size() != gold_size) { + std::cerr << "Sentence size mismatch at " << + processed << " " << si << "\n"; + return; + } + } + data.resize(gold_size); + rowmask.resize(gold_size); + do_sentence(sentences[0], data, false); + for (size_t si = 1; si < sentences.size(); ++si) { + do_sentence(sentences[si], data, false); + } + int maxv = 0; + for (size_t i = 0; i < gold_size; ++i) { + std::set<Corpus2::Tag> gold_tags; + const Corpus2::Token& gold_token = *(*sentences[0])[i]; + foreach (const Corpus2::Lexeme& gl, gold_token.disamb_lexemes()) { + gold_tags.insert(gl.tag()); + } + int wci = 0; + std::map<Corpus2::Tag, int> v; + for (size_t si = 1; si < sentences.size(); ++si) { + const Corpus2::Token& token = *(*sentences[si])[i]; + bool wc = false; + foreach (const Corpus2::Lexeme& gl, token.lexemes()) { + if (gold_tags.find(gl.tag()) != gold_tags.end()) { + wc = true; + wci = si; + } + v[gl.tag()]++; + maxv = std::max(maxv, v[gl.tag()]); + } + //data[i].push_back(wc ? "1" : "0"); + } + typedef std::pair<Corpus2::Tag, int> pp; + int mv = 0; + bool tie = false; + foreach (const pp& p, v) { + if (p.second == mv) { + tie = true; + } else if (p.second > mv) { + tie = false; + mv = p.second; + } + } + + data[i].push_back(boost::lexical_cast<std::string>(wci)); + rowmask[i] = tie; + } + print_data(data, rowmask); + ++processed; + } else { + more = false; + } + } while (more); +} int main(int argc, char** argv) @@ -348,7 +483,7 @@ int main(int argc, char** argv) } try { const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); - Runner runner(tagset); + FeatureRunner runner(tagset); foreach (const std::string& f, operator_strings) { if (boost::algorithm::ends_with(f, ".ccl")) { if (!runner.load_more_operators(f)) { @@ -357,16 +492,16 @@ int main(int argc, char** argv) } } if (!runner.empty()) { - foreach (const std::string& f, corpora_files) { - std::ifstream ifs(f.c_str()); + if (corpora_files.size() == 1) { + std::ifstream ifs(corpora_files[0].c_str()); if (ifs.good()) { runner.do_stream(ifs, first); } else { - std::cerr << "Error reading corpus from " << f << "\n"; + std::cerr << "Error reading corpus from " + << corpora_files[0] << "\n"; } - } - if (corpus_stdin) { - runner.do_stream(std::cin, first); + } else { + runner.do_files(corpora_files, first); } } } catch (PwrNlp::PwrNlpError& e) {