Skip to content
Snippets Groups Projects
Commit 9c7dabf1 authored by ilor's avatar ilor
Browse files

wccl-features update

parent 4d705f86
Branches
No related merge requests found
......@@ -43,10 +43,10 @@ private:
std::ios_base::fmtflags flags_;
};
class Runner
class FeatureRunner
{
public:
Runner(const Corpus2::Tagset& tagset)
FeatureRunner(const Corpus2::Tagset& tagset)
: tagset_(tagset), parser_(tagset_), token_idx(0)
{
}
......@@ -56,15 +56,26 @@ public:
int load_operator_string(const std::string &line);
void print_header_head();
void print_header_body(const std::string &attribute_prefix);
void print_header_body(const std::string &attribute_prefix,
bool nos = false);
void print_header_foot();
void print_data(const std::vector< std::vector<std::string> >& data);
void print_data(const std::vector<std::vector<std::string> > &data,
const std::vector<bool> rowmask);
std::vector< std::vector<std::string> > do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence);
std::vector< std::vector<std::string> > do_sentence(
const boost::shared_ptr<Corpus2::Sentence>& sentence);
void do_sentence(
const boost::shared_ptr<Corpus2::Sentence>& sentence,
std::vector< std::vector<std::string> >& sfeats,
bool nos = false);
void do_stream(std::istream& is, bool first);
void do_files(std::vector<std::string>& files, bool first);
bool empty() {
return bool_ops_.empty() && str_ops_.empty() && tset_ops_.empty();
}
......@@ -93,7 +104,7 @@ private:
int token_idx;
};
int Runner::load_more_operators(const std::string& filename)
int FeatureRunner::load_more_operators(const std::string& filename)
{
int ops_parsed = 0;
......@@ -117,7 +128,7 @@ int Runner::load_more_operators(const std::string& filename)
return ops_parsed;
}
int Runner::load_operator_string(const std::string &line)
int FeatureRunner::load_operator_string(const std::string &line)
{
int ops_loaded = 0;
boost::regex e("(STRING|BOOL|MASK\\h([a-z@,]+))\\h+"
......@@ -174,59 +185,93 @@ int Runner::load_operator_string(const std::string &line)
return ops_loaded;
}
void Runner::print_header_head()
/**
 * Write the opening lines of the output header to standard output:
 * a generator comment, the @RELATION declaration and one blank line.
 */
void FeatureRunner::print_header_head()
{
    // One chained insertion; the emitted bytes are the same three lines.
    std::cout << "% Generated by wccl-features\n"
              << "@RELATION wccl\n"
              << "\n";
}
void Runner::print_header_body(const std::string& attribute_prefix)
void FeatureRunner::print_header_body(const std::string& attribute_prefix,
bool nos /*=false*/)
{
foreach (const str_ops_map_t::value_type v, str_ops_) {
std::cout << "@ATTRIBUTE "
<< attribute_prefix << v.first << " string\n";
if (!nos) {
foreach (const str_ops_map_t::value_type v, str_ops_) {
std::cout << "@ATTRIBUTE "
<< attribute_prefix << v.first << " string\n";
}
}
foreach (const bool_ops_map_t::value_type v, bool_ops_) {
std::cout << "@ATTRIBUTE "
<< attribute_prefix << v.first << " class {0,1}\n";
<< attribute_prefix << v.first << " {0,1}\n";
}
foreach (const tset_ops_map_t::value_type v, tset_ops_) {
foreach (const Corpus2::Tag& tag, v.second.first) {
std::cout << "@ATTRIBUTE "
<< attribute_prefix << v.first << "_"
<< tagset_.tag_to_symbol_string(tag) << " class {0,1}\n";
<< tagset_.tag_to_symbol_string(tag) << " {0,1}\n";
}
}
}
void Runner::print_header_foot()
/**
 * Close the header: write a blank line followed by the @DATA marker line
 * to standard output.
 */
void FeatureRunner::print_header_foot()
{
    static const char data_marker[] = "\n@DATA\n";
    std::cout << data_marker;
}
void Runner::print_data(const std::vector<std::vector<std::string> > &data)
void FeatureRunner::print_data(const std::vector<std::vector<std::string> > &data)
{
foreach (const std::vector<std::string>& feats, data) {
std::cout << boost::algorithm::join(feats, ",") << "\n";
}
}
std::vector< std::vector<std::string> > Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence)
/**
 * Write the selected rows of @p data to standard output, one row per line,
 * with the row's values joined by commas.
 *
 * @param data rows of already-formatted feature values
 * @param rowmask per-row flag; row i is written only when rowmask[i] is
 *        true. Expected to have the same length as @p data.
 */
void FeatureRunner::print_data(
        const std::vector<std::vector<std::string> > &data,
        const std::vector<bool> rowmask)
{
    assert(data.size() == rowmask.size());
    // The assert vanishes under NDEBUG, so also bound the loop by the
    // shorter container to avoid indexing past the end of the mask.
    const size_t rows =
        data.size() < rowmask.size() ? data.size() : rowmask.size();
    for (size_t i = 0; i < rows; ++i) {
        if (rowmask[i]) {
            std::cout << boost::algorithm::join(data[i], ",") << "\n";
        }
    }
}
/**
 * Convenience overload: run the in-place do_sentence() overload on a
 * fresh, empty table and return that table by value.
 *
 * @param sentence sentence to process
 * @return the feature table filled in by the in-place overload
 */
std::vector< std::vector<std::string> > FeatureRunner::do_sentence(
        const boost::shared_ptr<Corpus2::Sentence>& sentence)
{
    typedef std::vector< std::vector<std::string> > feature_table_t;
    feature_table_t table;
    this->do_sentence(sentence, table);
    return table;
}
void FeatureRunner::do_sentence(
const boost::shared_ptr<Corpus2::Sentence>& sentence,
std::vector< std::vector<std::string> >& sfeats,
bool nos /*=false*/)
{
Wccl::SentenceContext sc(sentence);
while (sc.is_current_inside()) {
sfeats.resize(sfeats.size() + 1);
std::vector<std::string>& feats = sfeats.back();
foreach (const str_ops_map_t::value_type v, str_ops_) {
boost::shared_ptr<const Wccl::StrSet> s = v.second->apply(sc);
assert(s);
if (s->contents().empty()) {
feats.push_back("\"\"");
} else {
feats.push_back("\"" + PwrNlp::to_utf8(*s->contents().begin()) + "\"");
if (sfeats.size() < static_cast<size_t>(sc.get_position() + 1)) {
sfeats.resize(sc.get_position() + 1);
}
assert(!sfeats.empty());
std::vector<std::string>& feats = sfeats[sc.get_position()];
if (!nos) {
foreach (const str_ops_map_t::value_type v, str_ops_) {
boost::shared_ptr<const Wccl::StrSet> s = v.second->apply(sc);
assert(s);
if (s->contents().empty()) {
feats.push_back("\"\"");
} else {
feats.push_back("\"" +
boost::algorithm::replace_all_copy(
PwrNlp::to_utf8(*s->contents().begin()),
"\"", "\\\"") + "\"");
}
}
}
foreach (const bool_ops_map_t::value_type v, bool_ops_) {
......@@ -251,11 +296,10 @@ std::vector< std::vector<std::string> > Runner::do_sentence(const boost::shared_
}
sc.advance();
}
return sfeats;
}
void Runner::do_stream(std::istream& is, bool first)
void FeatureRunner::do_stream(std::istream& is, bool first)
{
Corpus2::XcesReader xr(tagset_, is);
Corpus2::Sentence::Ptr s;
......@@ -269,7 +313,98 @@ void Runner::do_stream(std::istream& is, bool first)
}
}
//void Runner::do_files(std::istream& is, bool first)
void FeatureRunner::do_files(std::vector<std::string>& files, bool first)
{
std::vector<boost::shared_ptr<Corpus2::TokenReader> > readers;
if (files.size() < 2) return;
readers.push_back(Corpus2::TokenReader::create_path_reader(
"xces", tagset_, files[0]));
for (size_t i = 1; i < files.size(); ++i) {
readers.push_back(Corpus2::TokenReader::create_path_reader(
"xces,disamb_only", tagset_, files[i]));
}
print_header_head();
for (size_t i = 0; i < files.size(); ++i) {
print_header_body("T" + boost::lexical_cast<std::string>(i) + "_");
}
std::cout << "@ATTRIBUTE correct {0";
for (size_t si = 1; si < files.size(); ++si) {
//std::cout << "@ATTRIBUTE tag" << si << "ok " << "{0,1}" << "\n";
std::cout << "," << si;
}
std::cout << "}\n";
print_header_foot();
bool more = !first;
int processed = 0;
do {
std::vector<Corpus2::Sentence::Ptr> sentences;
foreach (const boost::shared_ptr<Corpus2::TokenReader>& r, readers) {
Corpus2::Sentence::Ptr s = r->get_next_sentence();
if (s) {
sentences.push_back(s);
}
}
if (sentences.size() == readers.size()) {
std::vector< std::vector< std::string> > data;
std::vector<bool> rowmask;
size_t gold_size = sentences[0]->size();
for (size_t si = 1; si < sentences.size(); ++si) {
if (sentences[si]->size() != gold_size) {
std::cerr << "Sentence size mismatch at " <<
processed << " " << si << "\n";
return;
}
}
data.resize(gold_size);
rowmask.resize(gold_size);
do_sentence(sentences[0], data, false);
for (size_t si = 1; si < sentences.size(); ++si) {
do_sentence(sentences[si], data, false);
}
int maxv = 0;
for (size_t i = 0; i < gold_size; ++i) {
std::set<Corpus2::Tag> gold_tags;
const Corpus2::Token& gold_token = *(*sentences[0])[i];
foreach (const Corpus2::Lexeme& gl, gold_token.disamb_lexemes()) {
gold_tags.insert(gl.tag());
}
int wci = 0;
std::map<Corpus2::Tag, int> v;
for (size_t si = 1; si < sentences.size(); ++si) {
const Corpus2::Token& token = *(*sentences[si])[i];
bool wc = false;
foreach (const Corpus2::Lexeme& gl, token.lexemes()) {
if (gold_tags.find(gl.tag()) != gold_tags.end()) {
wc = true;
wci = si;
}
v[gl.tag()]++;
maxv = std::max(maxv, v[gl.tag()]);
}
//data[i].push_back(wc ? "1" : "0");
}
typedef std::pair<Corpus2::Tag, int> pp;
int mv = 0;
bool tie = false;
foreach (const pp& p, v) {
if (p.second == mv) {
tie = true;
} else if (p.second > mv) {
tie = false;
mv = p.second;
}
}
data[i].push_back(boost::lexical_cast<std::string>(wci));
rowmask[i] = tie;
}
print_data(data, rowmask);
++processed;
} else {
more = false;
}
} while (more);
}
int main(int argc, char** argv)
......@@ -348,7 +483,7 @@ int main(int argc, char** argv)
}
try {
const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
Runner runner(tagset);
FeatureRunner runner(tagset);
foreach (const std::string& f, operator_strings) {
if (boost::algorithm::ends_with(f, ".ccl")) {
if (!runner.load_more_operators(f)) {
......@@ -357,16 +492,16 @@ int main(int argc, char** argv)
}
}
if (!runner.empty()) {
foreach (const std::string& f, corpora_files) {
std::ifstream ifs(f.c_str());
if (corpora_files.size() == 1) {
std::ifstream ifs(corpora_files[0].c_str());
if (ifs.good()) {
runner.do_stream(ifs, first);
} else {
std::cerr << "Error reading corpus from " << f << "\n";
std::cerr << "Error reading corpus from "
<< corpora_files[0] << "\n";
}
}
if (corpus_stdin) {
runner.do_stream(std::cin, first);
} else {
runner.do_files(corpora_files, first);
}
}
} catch (PwrNlp::PwrNlpError& e) {
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment