Skip to content
Snippets Groups Projects
Commit 967dd4d1 authored by ilor's avatar ilor
Browse files

wccl-run upgrade: simple filtering, sentence # output, output aggregation

parent 4a9bf953
No related branches found
No related tags found
No related merge requests found
......@@ -26,6 +26,7 @@ namespace {
bool output_orths = true;
bool output_variables = false;
bool global_numbering = false;
bool sentence_indices = true;
bool output_header = true;
bool in_sentence_numbering = true;
}
......@@ -45,7 +46,7 @@ class Runner
public:
Runner(const Corpus2::Tagset& tagset)
: tagset_(tagset), parser_(tagset_), token_idx(0), progress_(false),
search_path_(".")
search_path_("."), want_header_(true), aggregate_output_(false)
{
}
......@@ -57,31 +58,52 @@ public:
}
}
void set_aggregate_output(bool v) { aggregate_output_ = v; }
bool load_more_operators(const std::string &filename);
bool load_operator_string(const std::string &op_string);
size_t size() const {
return ops_.size() + (filter_op_ ? 1 : 0);
}
const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& operators() const {
return ops_;
}
void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence);
void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence,
std::vector< std::vector< UnicodeString > >& outputs, int sidx);
void run(boost::shared_ptr<Corpus2::TokenReader> , bool first);
void output_tabular(const std::vector< std::vector< UnicodeString > > outputs);
void output_tabular(const std::vector< std::vector< UnicodeString > >& outputs);
void set_search_path(const std::string& path) {
search_path_ = path;
}
void set_filter_op(const std::string op_name, const std::string& op_value) {
filter_op_name_ = op_name;
filter_op_value_ = op_value;
}
private:
void do_operator_variables(const boost::shared_ptr<Wccl::FunctionalOperator>& op,
std::vector<UnicodeString>& out, bool variables);
const Corpus2::Tagset& tagset_;
Wccl::Parser parser_;
std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > ops_;
boost::shared_ptr< Wccl::FunctionalOperator > filter_op_;
std::vector< std::string > op_names_;
int token_idx;
bool progress_;
std::string search_path_;
std::string filter_op_name_;
std::string filter_op_value_;
bool want_header_;
bool aggregate_output_;
};
bool Runner::load_more_operators(const std::string& filename)
......@@ -96,12 +118,17 @@ bool Runner::load_more_operators(const std::string& filename)
retOp = parser_.parseWcclFile(is, search_path_);
if (retOp) {
boost::filesystem::path p(filename);
std::string prefix = p.stem() + ":";
std::string prefix = ""; //p.stem() + ":";
Wccl::UntypedOpSequence::name_op_v_t pairs = retOp->gen_all_op_pairs();
foreach (const Wccl::UntypedOpSequence::name_op_pair_t v, pairs) {
op_names_.push_back(prefix + v.first);
std::string opname = v.first;
if (opname == filter_op_name_) {
filter_op_ = v.second;
} else {
op_names_.push_back(opname);
ops_.push_back(v.second);
}
}
return true;
} else {
std::cerr << "Problem while parsing -- "
......@@ -148,26 +175,48 @@ bool Runner::load_operator_string(const std::string& op_string)
return false;
}
void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence)
// Appends the string form of each variable of `op` to the output row `out`.
// Variables are appended only when `variables` is true.
//   op        - operator whose valid_variable_names() are iterated
//   out       - output row that receives one entry per variable name
//   variables - guard for the loop below; when false no variable is appended
void Runner::do_operator_variables(const boost::shared_ptr<Wccl::FunctionalOperator>& op,
std::vector<UnicodeString>& out, bool variables)
{
// NOTE(review): the next three statements use `sentence`, which is not a
// parameter of this function, and declare an `outputs` that is never used here.
// They look like lines of the previous do_sentence() body left in by the diff
// view -- confirm whether they belong in this function at all.
std::cerr << "dos";
Wccl::SentenceContext sc(sentence);
std::vector< std::vector< UnicodeString > > outputs;
if (variables) {
foreach (const std::string& varname, op->valid_variable_names()) {
// Look the variable up by name on the operator and render it using tagset_.
out.push_back((*op)[varname].to_string_u(tagset_));
}
}
}
void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence,
std::vector< std::vector< UnicodeString > >& outputs, int sidx)
{
Wccl::SentenceContext sc(sentence);
streamsave sv(std::cout);
if (output_header) {
if (output_header && want_header_) {
outputs.resize(outputs.size() + 1);
std::vector< UnicodeString >& out = outputs.back();
if (global_numbering) {
out.push_back(UnicodeString::fromUTF8("##"));
}
if (sentence_indices) {
out.push_back(UnicodeString::fromUTF8("S#"));
}
if (in_sentence_numbering) {
out.push_back(UnicodeString::fromUTF8("#"));
}
if (output_orths) {
out.push_back(UnicodeString::fromUTF8("orth"));
}
if (filter_op_) {
out.push_back(UnicodeString::fromUTF8(filter_op_name_));
if (output_variables) {
boost::shared_ptr<Wccl::FunctionalOperator> o = filter_op_;
foreach (const std::string& varname, o->valid_variable_names()) {
const Wccl::Value& value = (*o)[varname];
std::string label = "(" + filter_op_name_ + ")" + value.make_var_repr(varname);
out.push_back(UnicodeString::fromUTF8(label));
}
}
}
for (size_t i = 0; i < op_names_.size(); ++i) {
out.push_back(UnicodeString::fromUTF8(op_names_[i]));
if (output_variables) {
......@@ -181,36 +230,48 @@ void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence)
}
}
for (size_t i = 0; i < sentence->size(); ++i) {
++token_idx;
sc.set_position(i);
UnicodeString vstr;
if (filter_op_) {
boost::shared_ptr<const Wccl::Value> v = filter_op_->base_apply(sc);
vstr = v->to_string_u(tagset_);
std::string uvstr = PwrNlp::to_utf8(vstr);
if (uvstr != filter_op_value_) {
continue;
} else {
}
}
outputs.resize(outputs.size() + 1);
std::vector< UnicodeString >& out = outputs.back();
++token_idx;
if (global_numbering) {
out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(token_idx)));
}
if (sentence_indices) {
out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(sidx)));
}
if (in_sentence_numbering) {
out.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(i + 1)));
}
if (output_orths) {
out.push_back(sentence->tokens()[i]->orth());
}
if (filter_op_) {
out.push_back(vstr);
do_operator_variables(filter_op_, out, output_variables);
}
sc.set_position(i);
foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) {
boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc);
UnicodeString vstr = v->to_string_u(tagset_);
out.push_back(vstr);
if (output_variables) {
foreach (const std::string& varname, o->valid_variable_names()) {
out.push_back((*o)[varname].to_string_u(tagset_));
do_operator_variables(o, out, output_variables);
}
}
}
}
output_tabular(outputs);
}
void Runner::output_tabular(const std::vector<std::vector<UnicodeString> > outputs)
void Runner::output_tabular(const std::vector<std::vector<UnicodeString> >& outputs)
{
std::vector<int> lengths(outputs[0].size());
foreach (const std::vector< UnicodeString >& line, outputs) {
......@@ -238,27 +299,42 @@ void Runner::output_tabular(const std::vector<std::vector<UnicodeString> > outpu
// Reads sentences from `reader` one at a time, runs do_sentence() on each and
// collects the resulting rows in a local `outputs` table.
//   reader - source of sentences; the loop ends when get_next_sentence()
//            returns a value that tests false
//   first  - when true, the loop stops after the first sentence
// When aggregate_output_ is set, output_tabular() is called once after the
// loop; otherwise it is called after every sentence for which `outputs` is
// non-empty.
void Runner::run(boost::shared_ptr<Corpus2::TokenReader> reader, bool first)
{
std::vector<std::vector<UnicodeString> > outputs;
Corpus2::Sentence::Ptr s;
Corpus2::TokenTimer& timer = Corpus2::global_timer();
// Sentence index handed to do_sentence(); starts at 0 and grows by one per sentence.
int sidx = 0;
while ((s = reader->get_next_sentence())) {
// NOTE(review): both a one-argument and a three-argument do_sentence() call
// appear here; this looks like the old and the new diff line side by side --
// confirm which one belongs in the file.
do_sentence(s);
do_sentence(s, outputs, sidx++);
timer.count_sentence(*s);
if (aggregate_output_) {
// do_sentence() only adds a header row while want_header_ is true, so after
// the first sentence no further header rows are added in aggregate mode.
want_header_ = false;
}
if (!outputs.empty() && !aggregate_output_) {
// NOTE(review): `outputs` is not cleared after printing, so rows of earlier
// sentences appear to be printed again on every iteration -- confirm.
output_tabular(outputs);
}
if (progress_) {
timer.check_slice();
}
// Write a newline to stdout after each sentence unless a filter operator is set.
if (!filter_op_) {
std::cout << "\n";
}
if (first) break;
}
// Aggregate mode: print everything collected over all sentences in one table.
if (!outputs.empty() && aggregate_output_) {
output_tabular(outputs);
}
}
int main(int argc, char** argv)
{
std::string tagset_load = "kipi";
std::string filter_op_name, filter_op_value;
bool first = false, progress = false;
std::string input_format;
std::string search_path;
std::vector<std::string> corpora_files, files, operator_strings;
bool corpus_stdin = false;
bool aggregate = false;
using boost::program_options::value;
std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " ");
std::string readers_help = "Input format, any of: " + readers + "\n";
......@@ -289,12 +365,20 @@ int main(int argc, char** argv)
"Output in-sentence token counts")
("global-counts,g", value(&global_numbering),
"Output global counts")
("output-sentence-indices,H", value(&sentence_indices),
"Output sentence indices")
("output-orths,O", value(&output_orths),
"Output token orths")
("output-variables,V", value(&output_variables),
"Output operator variables")
("output-header,H", value(&output_header),
"Output table header")
("filter-operator,F", value(&filter_op_name),
"Filter operator name")
("filter-value", value(&filter_op_value)->default_value("True"),
"Filter operator expected valye")
("aggregate-output,A", value(&aggregate),
"Aggregate output (prettier, slower)")
("progress,p", value(&progress)->zero_tokens(),
"Show progress info")
("help,h", "Show help")
......@@ -342,20 +426,24 @@ int main(int argc, char** argv)
if (!search_path.empty()) {
runner.set_search_path(search_path);
}
if (!filter_op_name.empty()) {
runner.set_filter_op(filter_op_name, filter_op_value);
}
runner.set_aggregate_output(aggregate);
foreach (const std::string& f, operator_strings) {
if (boost::algorithm::ends_with(f, ".ccl")) {
size_t sz = runner.operators().size();
size_t sz = runner.size();
if (!runner.load_more_operators(f)) {
std::cerr << "Warning: error while parsing " << f << "\n";
}
if (runner.operators().size() == sz) {
if (runner.size() == sz) {
std::cerr << "Warning: no operators loaded from " << f << "\n";
}
} else {
runner.load_operator_string(f);
}
}
if (!runner.operators().empty()) {
if (runner.size() > 0) {
foreach (const std::string& f, corpora_files) {
runner.run(Corpus2::TokenReader::create_path_reader(
input_format, tagset, f), first);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment