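// (Stray "Newer" / "Older" labels appeared here; they look like web-viewer
// navigation residue and are not part of the program.)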
#include <cstdlib>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <libwccl/values/strset.h>
#include <libwccl/parser/Parser.h>
#include <libcorpus2/tagsetmanager.h>
#include <boost/bind.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/make_shared.hpp>
#include <boost/program_options.hpp>
#include <boost/filesystem.hpp>
#include <libcorpus2/io/xcesreader.h>
#include <boost/lexical_cast.hpp>
#include <antlr/NoViableAltException.hpp>
#include <antlr/MismatchedTokenException.hpp>
// Output settings local to this translation unit. Each flag is bound to a
// command-line option in main(); the initializers below are the defaults.
namespace {

// --quiet / -q : "Suppress messages".
bool quiet = false;

// --tabs : emit tab-separated cells instead of space-padded columns.
bool tabs = false;

// Column selection for the printed table.
bool global_numbering = false;      // --global-counts / -g : "##" column
bool in_sentence_numbering = true;  // --local-counts / -l  : "#" column
bool output_orths = true;           // --output-orths / -O  : "orth" column
bool output_variables = false;      // --output-variables / -V : one column per operator variable

} // anonymous namespace
/**
 * Scope guard for a stream's formatting flags.
 *
 * The constructor records the current os.flags(); the destructor writes the
 * recorded value back, so any flag changes made while the guard is alive are
 * undone when it goes out of scope.
 */
class streamsave
{
public:
    /// Remember the formatting flags of @p os as they are right now.
    explicit streamsave(std::ostream& os) : os_(os), flags_(os.flags()) {}

    /// Put the remembered formatting flags back on the stream.
    ~streamsave() { os_.flags(flags_); }

private:
    // Declared but intentionally not defined: copying a guard would only
    // schedule a second, redundant restore of the same flags.
    streamsave(const streamsave&);
    streamsave& operator=(const streamsave&);

    std::ostream& os_;               ///< stream whose flags are restored
    std::ios_base::fmtflags flags_;  ///< flags captured at construction
};
class Runner
{
public:
Runner(const Corpus2::Tagset& tagset)
: tagset_(tagset), parser_(tagset_), token_idx(0)
{
}
bool load_more_operators(const std::string &filename);
const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& operators() const {
return ops_;
}
void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence);
void do_stream(std::istream& is, bool first);
private:
const Corpus2::Tagset& tagset_;
Wccl::Parser parser_;
std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > ops_;
std::vector< std::string > op_names_;
int token_idx;
};
/**
 * Parse one operator from a file and append it to the loaded set.
 *
 * On success the operator is appended to ops_ and the stem of @p filename
 * is appended to op_names_, keeping the two vectors aligned. Every failure
 * (unreadable file, parser returning a null operator, or any of the
 * exceptions handled below) is reported on std::cerr and leaves both
 * vectors untouched.
 *
 * @param filename path of the file holding the operator source
 * @return true if an operator was parsed and stored, false otherwise
 */
bool Runner::load_more_operators(const std::string& filename)
{
    boost::shared_ptr<Wccl::FunctionalOperator> retOp;
    try {
        std::ifstream is(filename.c_str());
        if (!is.good()) {
            throw Wccl::FileNotFound(filename, "", __FUNCTION__);
        }
        retOp = parser_.parseAnyOperator(is);
        if (retOp) {
            boost::filesystem::path p(filename);
            op_names_.push_back(p.stem());
            ops_.push_back(retOp);
            // The only successful exit: an operator was actually stored.
            return true;
        } else {
            std::cerr << "Problem while parsing -- "
                << "parser returned NULL!" << std::endl;
        }
    } catch (antlr::MismatchedTokenException &e) {
        std::cerr << e.getFileLineColumnString()
            << " " << e.getMessage() << std::endl;
    } catch(antlr::NoViableAltException &e) {
        std::cerr << e.getFileLineColumnString()
            << " " << e.getMessage() << std::endl;
    } catch (Wccl::InvalidVariableName &e) {
        std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl;
    } catch (Wccl::VariableTypeMismatch &e) {
        std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl;
    } catch (Wccl::WcclError& e) {
        std::cerr << "Wccl::WcclError:" << e.info() << std::endl;
    } catch (PwrNlp::PwrNlpError& e) {
        std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl;
    } catch (antlr::ANTLRException& e) {
        std::cerr << "Antlr error " << e.getMessage() << std::endl;
    }
    // Reached after a null parse result or any handled exception. Without
    // this return the bool function would fall off its end.
    return false;
}
/**
 * Apply every loaded operator at each token position of @p sentence and
 * print the results to std::cout as a table.
 *
 * The first row is a header; each following row corresponds to one token.
 * Which leading columns appear is controlled by the file-level flags
 * global_numbering, in_sentence_numbering and output_orths; with
 * output_variables set, each operator column is followed by one column per
 * operator variable. Cells are either tab-terminated (tabs set) or padded
 * to the widest cell of their column plus one.
 *
 * token_idx is advanced once per token, so global numbers continue across
 * successive calls.
 *
 * @param sentence sentence to evaluate the operators on
 */
void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence)
{
    typedef std::vector< UnicodeString > Row;
    typedef boost::shared_ptr<Wccl::FunctionalOperator> OpPtr;

    Wccl::SentenceContext sc(sentence);
    std::vector< Row > table;
    streamsave sv(std::cout);

    // --- header row -------------------------------------------------------
    table.push_back(Row());
    {
        Row& header = table.back();
        if (global_numbering) {
            header.push_back(UnicodeString::fromUTF8("##"));
        }
        if (in_sentence_numbering) {
            header.push_back(UnicodeString::fromUTF8("#"));
        }
        if (output_orths) {
            header.push_back(UnicodeString::fromUTF8("orth"));
        }
        for (size_t k = 0; k < op_names_.size(); ++k) {
            header.push_back(UnicodeString::fromUTF8(op_names_[k]));
            if (!output_variables) {
                continue;
            }
            // One extra header cell per variable, labelled with the operator name.
            OpPtr op = ops_[k];
            foreach (const std::string& varname, op->valid_variable_names()) {
                const Wccl::Value& value = (*op)[varname];
                std::string label = "(" + op_names_[k] + ")" + value.make_var_repr(varname);
                header.push_back(UnicodeString::fromUTF8(label));
            }
        }
    }

    // --- one row per token --------------------------------------------------
    for (size_t pos = 0; pos < sentence->size(); ++pos) {
        table.push_back(Row());
        Row& row = table.back();
        ++token_idx;
        if (global_numbering) {
            row.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(token_idx)));
        }
        if (in_sentence_numbering) {
            row.push_back(UnicodeString::fromUTF8(boost::lexical_cast<std::string>(pos + 1)));
        }
        if (output_orths) {
            row.push_back(sentence->tokens()[pos]->orth());
        }
        sc.set_position(pos);
        for (size_t k = 0; k < ops_.size(); ++k) {
            const OpPtr& op = ops_[k];
            boost::shared_ptr<const Wccl::Value> result = op->base_apply(sc);
            row.push_back(result->to_string_u(tagset_));
            if (!output_variables) {
                continue;
            }
            // Variable values are read after the operator has been applied.
            foreach (const std::string& varname, op->valid_variable_names()) {
                row.push_back((*op)[varname].to_string_u(tagset_));
            }
        }
    }

    // --- column widths (sized from the header row) --------------------------
    std::vector<int> widths(table[0].size());
    for (size_t r = 0; r < table.size(); ++r) {
        const Row& line = table[r];
        for (size_t c = 0; c < line.size(); ++c) {
            const int len = line[c].length();
            if (widths[c] < len) {
                widths[c] = len;
            }
        }
    }

    // --- print ---------------------------------------------------------------
    for (size_t r = 0; r < table.size(); ++r) {
        const Row& line = table[r];
        for (size_t c = 0; c < line.size(); ++c) {
            if (tabs) {
                std::cout << PwrNlp::to_utf8(line[c]) << "\t";
                continue;
            }
            UnicodeString padded = line[c];
            padded.padTrailing(widths[c] + 1);
            std::cout << PwrNlp::to_utf8(padded);
        }
        std::cout << "\n";
    }
}
void Runner::do_stream(std::istream& is, bool first)
Corpus2::XcesReader xr(tagset_, is);
Corpus2::Sentence::Ptr s;
while ((s = xr.get_next_sentence())) {
do_sentence(s);
std::cout << "\n";
if (first) break;
}
}
int main(int argc, char** argv)
{
std::string tagset_load = "kipi";
bool first = false;
std::vector<std::string> corpora_files, ccl_files, files;
bool corpus_stdin = false;
using boost::program_options::value;
boost::program_options::options_description desc("Allowed options");
desc.add_options()
("tagset,t", value(&tagset_load),
"Tagset to use\n")
("corpus,c", value(&corpora_files),
"Corpus file to load (XCES)\n")
("ccl-file,C", value(&ccl_files),
"CCL query file\n")
("files,f", value(&files),
("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(),
"Read corpus from stdin\n")
("quiet,q", value(&quiet)->zero_tokens(),
"Suppress messages\n")
("first-sentence-only,1", value(&first)->zero_tokens(),
"Only process first sentence\n")
("tabs", value(&tabs)->zero_tokens(),
"Output a tab-separated file\n")
("local-counts,l", value(&in_sentence_numbering),
"Output in-sentence token counts\n")
("global-counts,g", value(&global_numbering),
"Output global counts\n")
("output-orths,O", value(&output_orths),
"Output token orths\n")
("output-variables,V", value(&output_variables),
"Output operator variables\n")
("help,h", "Show help")
;
boost::program_options::variables_map vm;
boost::program_options::positional_options_description p;
p.add("files", -1);
try {
boost::program_options::store(
boost::program_options::command_line_parser(argc, argv)
.options(desc).positional(p).run(), vm);
} catch (boost::program_options::error& e) {
std::cerr << e.what() << std::endl;
return 2;
}
boost::program_options::notify(vm);
if (vm.count("help")) {
std::cerr << "Usage " << argv[0] << " [OPTIONS] FILES\n"
<< "Files ending with .xml are treated as corpora, otherwise \n"
<< "as CCL files. Use - to read corpus from stdin (as with -I)";
std::cout << desc << "\n";
return 1;
}
foreach (const std::string& f, files) {
if (f == "-") {
corpus_stdin = true;
} else if (boost::algorithm::ends_with(f, ".xml")) {
corpora_files.push_back(f);
} else {
ccl_files.push_back(f);
}
}
try {
const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
Runner runner(tagset);
foreach (const std::string& f, ccl_files) {
size_t sz = runner.operators().size();
if (!runner.load_more_operators(f)) {
std::cerr << "Warning: error while parsing " << f << "\n";
}
if (runner.operators().size() == sz) {
std::cerr << "Warning: no operators loaded from " << f << "\n";
}
if (!runner.operators().empty()) {
foreach (const std::string& f, corpora_files) {
std::ifstream ifs(f.c_str());
if (ifs.good()) {
runner.do_stream(ifs, first);
} else {
std::cerr << "Error reading corpus from " << f << "\n";
}
}
if (corpus_stdin) {
runner.do_stream(std::cin, first);