Skip to content
Snippets Groups Projects
Commit 24af99d1 authored by ilor's avatar ilor
Browse files

refactor wcclrun a bit, add formatting options: --tabs makes it output plain...

refactor wcclrun a bit, add formatting options: --tabs makes it output plain tab separated data, -l and -g control local / global token counts
parent 8c302114
Branches
No related tags found
No related merge requests found
...@@ -16,12 +16,50 @@ ...@@ -16,12 +16,50 @@
#include <antlr/NoViableAltException.hpp> #include <antlr/NoViableAltException.hpp>
#include <antlr/MismatchedTokenException.hpp> #include <antlr/MismatchedTokenException.hpp>
// File-local output settings, filled in by the option parser in main() and
// read by the Runner printing code.
namespace {
// Bound to --quiet / -q ("Suppress messages").
bool quiet = false;
// Bound to --tabs: separate columns with a tab instead of space padding.
bool tabs = false;
// When true the orth column (index 0 of each output row) is printed;
// when false printing starts at column 1.
bool output_orths = true;
// Bound to --global-counts / -g: prefix each row with the Runner's running
// token counter, which keeps increasing across sentences.
bool global_numbering = false;
// Bound to --local-counts / -l: prefix each row with the 1-based position of
// the token inside its sentence.
bool in_sentence_numbering = true;
}
bool load_more_operators(const std::string& filename, Wccl::Parser& parser, class streamsave
std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops)
{ {
// streamsave: scope guard for a stream's format flags. It records os.flags()
// when constructed and writes the recorded value back when destroyed, so
// flag changes made in between do not leak out of the enclosing scope.
public:
// Keeps a reference to the stream and a copy of its current format flags.
streamsave(std::ostream& os) : os_(os), flags_(os.flags()) {}
// Puts back the flags captured by the constructor.
~streamsave() { os_.flags(flags_); }
private:
// Stream whose flags are restored on destruction (held by reference).
std::ostream& os_;
// Format flags as they were at construction time.
std::ios_base::fmtflags flags_;
};
boost::shared_ptr<const Wccl::Value> retVal; class Runner
{
// Runner: bundles the tagset, a Wccl::Parser built from it, the operators
// loaded so far and a running token counter, so the loading and printing
// steps share this state instead of receiving it as arguments.
public:
// Builds the parser from the given tagset and starts the token counter at 0.
// Only a reference to the tagset is stored, so the tagset presumably has to
// outlive the Runner (NOTE(review): confirm against callers).
Runner(const Corpus2::Tagset& tagset)
: tagset_(tagset), parser_(tagset_), token_idx(0)
{
}
// Opens the named file and asks the parser for an operator; returns true
// when an operator was parsed and appended to the operator list.
bool load_more_operators(const std::string &filename);
// Read-only view of the operators loaded so far.
const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& operators() const {
return ops_;
}
// Writes a "## "-prefixed header row to std::cout, starting with an "orth"
// column followed by one entry per loaded operator.
void do_head();
// Applies every loaded operator at each token position of the sentence and
// prints one row per token to std::cout.
void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence);
// Reads sentences from the stream with Corpus2::XcesReader and runs
// do_sentence on each, printing a blank line after every sentence; stops
// after the first sentence when `first` is true.
void do_stream(std::istream& is, bool first);
private:
// Tagset used for parsing and for turning operator results into strings.
const Corpus2::Tagset& tagset_;
// Parser that turns operator files into FunctionalOperator objects.
Wccl::Parser parser_;
// Operators collected by load_more_operators, in load order.
std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > ops_;
// Number of tokens printed so far across all sentences; shown as the
// global count column when global_numbering is set.
int token_idx;
};
bool Runner::load_more_operators(const std::string& filename)
{
boost::shared_ptr<Wccl::FunctionalOperator> retOp; boost::shared_ptr<Wccl::FunctionalOperator> retOp;
try { try {
std::ifstream is(filename.c_str()); std::ifstream is(filename.c_str());
...@@ -29,9 +67,9 @@ bool load_more_operators(const std::string& filename, Wccl::Parser& parser, ...@@ -29,9 +67,9 @@ bool load_more_operators(const std::string& filename, Wccl::Parser& parser,
throw Wccl::FileNotFound(filename, "", __FUNCTION__); throw Wccl::FileNotFound(filename, "", __FUNCTION__);
} }
retOp = parser.parseAnyOperator(is); retOp = parser_.parseAnyOperator(is);
if (retOp) { if (retOp) {
ops.push_back(retOp); ops_.push_back(retOp);
return true; return true;
} else { } else {
std::cerr << "Problem while parsing -- " std::cerr << "Problem while parsing -- "
...@@ -57,24 +95,13 @@ bool load_more_operators(const std::string& filename, Wccl::Parser& parser, ...@@ -57,24 +95,13 @@ bool load_more_operators(const std::string& filename, Wccl::Parser& parser,
return false; return false;
} }
class streamsave void Runner::do_head()
{
public:
streamsave(std::ostream& os) : os_(os), flags_(os.flags()) {}
~streamsave() { os_.flags(flags_); }
private:
std::ostream& os_;
std::ios_base::fmtflags flags_;
};
void do_head(const Corpus2::Tagset& tagset,
const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops)
{ {
streamsave sv(std::cout); streamsave sv(std::cout);
std::cout << "## "; std::cout << "## ";
std::cout << std::setw(20) << "orth"; std::cout << std::setw(20) << "orth";
int i = 0; int i = 0;
foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops) { foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) {
++i; ++i;
std::cout << " "; std::cout << " ";
std::cout.setf(std::ios::right); std::cout.setf(std::ios::right);
...@@ -85,13 +112,11 @@ void do_head(const Corpus2::Tagset& tagset, ...@@ -85,13 +112,11 @@ void do_head(const Corpus2::Tagset& tagset,
std::cout << "\n"; std::cout << "\n";
} }
void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence, void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence)
const Corpus2::Tagset& tagset,
const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops)
{ {
Wccl::SentenceContext sc(sentence); Wccl::SentenceContext sc(sentence);
std::vector< std::vector< UnicodeString > > outputs(sentence->size()); std::vector< std::vector< UnicodeString > > outputs(sentence->size());
std::vector<int> lengths(ops.size() + 1, 0); std::vector<int> lengths(ops_.size() + 1, 0);
streamsave sv(std::cout); streamsave sv(std::cout);
for (size_t i = 0; i < sentence->size(); ++i) { for (size_t i = 0; i < sentence->size(); ++i) {
sc.set_position(i); sc.set_position(i);
...@@ -99,34 +124,56 @@ void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence, ...@@ -99,34 +124,56 @@ void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence,
outputs[i].push_back(orth); outputs[i].push_back(orth);
lengths[0] = std::max(lengths[0], orth.length()); lengths[0] = std::max(lengths[0], orth.length());
int li = 1; int li = 1;
foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops) { foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops_) {
boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc); boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc);
UnicodeString vstr = v->to_string_u(tagset); UnicodeString vstr = v->to_string_u(tagset_);
lengths[li] = std::max(lengths[li], vstr.length()); lengths[li] = std::max(lengths[li], vstr.length());
++li; ++li;
outputs[i].push_back(vstr); outputs[i].push_back(vstr);
} }
} }
for (size_t i = 0; i < sentence->size(); ++i) { for (size_t i = 0; i < sentence->size(); ++i) {
++token_idx;
if (global_numbering) {
if (tabs) {
std::cout << token_idx << "\t";
} else {
std::cout << std::setw(6) << token_idx << " ";
}
}
if (in_sentence_numbering) {
if (tabs) {
std::cout << (i + 1) << "\t";
} else {
std::cout << std::setw(2) << (i + 1) << " "; std::cout << std::setw(2) << (i + 1) << " ";
for (size_t oi = 0; oi < outputs[i].size(); ++oi) { }
}
size_t b = output_orths ? 0 : 1;
for (size_t oi = b; oi < outputs[i].size(); ++oi) {
UnicodeString u = outputs[i][oi]; UnicodeString u = outputs[i][oi];
u.padTrailing(lengths[oi]); if (oi + 1 < outputs[i].size()) {
std::cout << PwrNlp::to_utf8(u) << " "; if (tabs) {
std::cout << PwrNlp::to_utf8(u);
std::cout << "\t";
} else {
u.padTrailing(lengths[oi] + 1);
std::cout << PwrNlp::to_utf8(u);
}
} else {
std::cout << PwrNlp::to_utf8(u);
}
} }
std::cout << "\n"; std::cout << "\n";
} }
} }
void do_file(const std::string& filename, const Corpus2::Tagset& tagset, void Runner::do_stream(std::istream& is, bool first)
const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops,
bool first)
{ {
Corpus2::XcesReader xr(tagset, filename); Corpus2::XcesReader xr(tagset_, is);
Corpus2::Sentence::Ptr s; Corpus2::Sentence::Ptr s;
//do_head(tagset, ops); //do_head(tagset, ops);
while ((s = xr.get_next_sentence())) { while ((s = xr.get_next_sentence())) {
do_sentence(s, tagset, ops); do_sentence(s);
std::cout << "\n"; std::cout << "\n";
if (first) break; if (first) break;
} }
...@@ -137,8 +184,7 @@ int main(int argc, char** argv) ...@@ -137,8 +184,7 @@ int main(int argc, char** argv)
std::string tagset_load = "kipi"; std::string tagset_load = "kipi";
bool first = false; bool first = false;
std::vector<std::string> corpora_files, ccl_files, files; std::vector<std::string> corpora_files, ccl_files, files;
bool quiet = false; bool corpus_stdin = false;
bool dump_variables = false;
using boost::program_options::value; using boost::program_options::value;
boost::program_options::options_description desc("Allowed options"); boost::program_options::options_description desc("Allowed options");
...@@ -151,10 +197,18 @@ int main(int argc, char** argv) ...@@ -151,10 +197,18 @@ int main(int argc, char** argv)
"CCL query file\n") "CCL query file\n")
("files,f", value(&files), ("files,f", value(&files),
"Files to load, look at extecion to determine type\n") "Files to load, look at extecion to determine type\n")
("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(),
"Read corpus from stdin\n")
("quiet,q", value(&quiet)->zero_tokens(), ("quiet,q", value(&quiet)->zero_tokens(),
"Suppress messages\n") "Suppress messages\n")
("first-sentence-only,1", value(&first)->zero_tokens(), ("first-sentence-only,1", value(&first)->zero_tokens(),
"Only process first sentence\n") "Only process first sentence\n")
("tabs", value(&tabs)->zero_tokens(),
"Output a tab-separated file\n")
("local-counts,l", value(&in_sentence_numbering),
"Output in-sentence token counts\n")
("global-counts,g", value(&global_numbering),
"Output global counts\n")
("help,h", "Show help") ("help,h", "Show help")
; ;
boost::program_options::variables_map vm; boost::program_options::variables_map vm;
...@@ -172,12 +226,17 @@ int main(int argc, char** argv) ...@@ -172,12 +226,17 @@ int main(int argc, char** argv)
boost::program_options::notify(vm); boost::program_options::notify(vm);
if (vm.count("help")) { if (vm.count("help")) {
std::cerr << "Usage " << argv[0] << " [OPTIONS] FILES\n"
<< "Files ending with .xml are treated as corpora, otherwise \n"
<< "as CCL files. Use - to read corpus from stdin (as with -I)";
std::cout << desc << "\n"; std::cout << desc << "\n";
return 1; return 1;
} }
foreach (const std::string& f, files) { foreach (const std::string& f, files) {
if (boost::algorithm::ends_with(f, ".xml")) { if (f == "-") {
corpus_stdin = true;
} else if (boost::algorithm::ends_with(f, ".xml")) {
corpora_files.push_back(f); corpora_files.push_back(f);
} else { } else {
ccl_files.push_back(f); ccl_files.push_back(f);
...@@ -186,20 +245,27 @@ int main(int argc, char** argv) ...@@ -186,20 +245,27 @@ int main(int argc, char** argv)
try { try {
const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > operators; Runner runner(tagset);
Wccl::Parser parser(tagset);
foreach (const std::string& f, ccl_files) { foreach (const std::string& f, ccl_files) {
int sz = operators.size(); size_t sz = runner.operators().size();
if (!load_more_operators(f, parser, operators)) { if (!runner.load_more_operators(f)) {
std::cerr << "Warning: error while parsing " << f << "\n"; std::cerr << "Warning: error while parsing " << f << "\n";
} }
if (operators.size() == sz) { if (runner.operators().size() == sz) {
std::cerr << "Warning: no operators loaded from " << f << "\n"; std::cerr << "Warning: no operators loaded from " << f << "\n";
} }
} }
if (!operators.empty()) { if (!runner.operators().empty()) {
foreach (const std::string& f, corpora_files) { foreach (const std::string& f, corpora_files) {
do_file(f, tagset, operators, first); std::ifstream ifs(f.c_str());
if (ifs.good()) {
runner.do_stream(ifs, first);
} else {
std::cerr << "Error reading corpus from " << f << "\n";
}
}
if (corpus_stdin) {
runner.do_stream(std::cin, first);
} }
} }
} catch (PwrNlp::PwrNlpError& e) { } catch (PwrNlp::PwrNlpError& e) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment