#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <libwccl/values/strset.h>
#include <libwccl/parser/Parser.h>
#include <libwccl/ops/rulesequence.h>
#include <libcorpus2/tagsetmanager.h>
#include <libcorpus2/util/tokentimer.h>
#include <boost/bind.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/make_shared.hpp>
#include <boost/program_options.hpp>
#include <libcorpus2/io/xcesreader.h>
#include <libcorpus2/io/xceswriter.h>
#include <antlr/NoViableAltException.hpp>
#include <antlr/MismatchedTokenException.hpp>
namespace {
	/// When true, the "Loaded N rule(s) from FILE" message is not printed.
	bool quiet = false;

	/// When true, apply_rules() asks the global token timer to report progress.
	bool progress = false;

	/// Processing options handed to apply_rules().
	struct options {
		/// Start from the same defaults main() assigns, so that an options
		/// object never carries indeterminate values if a field is left unset.
		options()
			: first(false), until_done(false), until_done_iterations(1000)
		{
		}

		/// Only process the first sentence.
		bool first;
		/// Use RuleSequence::execute_until_done() instead of execute_once().
		bool until_done;
		/// Iteration limit passed to execute_until_done().
		int until_done_iterations;
	};
}
/**
 * Parse one rule file and append the rules it contains to @p rules.
 *
 * @param parser   parser used to read the rule sequence from the file
 * @param filename path of the rule file to open
 * @param rules    sequence that receives the parsed rules (appended at the end)
 * @return true when parsing yielded a rule sequence and its rules were
 *         appended; false when the parser returned a null pointer or one of
 *         the exceptions handled below was caught. A diagnostic is written
 *         to std::cerr in every failing case.
 */
bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::RuleSequence& rules)
{
	try {
		std::ifstream input(filename.c_str());
		if (!input.good()) {
			// Reported through the handlers below (presumably the
			// Wccl::WcclError one -- confirm FileNotFound's base class).
			throw Wccl::FileNotFound(filename, "", __FUNCTION__);
		}

		boost::shared_ptr<Wccl::RuleSequence> parsed(parser.parseRuleSequence(input));
		if (!parsed) {
			// Always reported, even in quiet mode.
			std::cerr << "Problem while parsing -- "
				<< "parser returned NULL!" << std::endl;
			return false;
		}

		if (!quiet) {
			std::cerr << "Loaded " << parsed->size() << " rule(s) from "
				<< filename << "\n";
		}
		// Append rather than replace, so several files can be accumulated.
		std::copy(parsed->begin(), parsed->end(), std::back_inserter(rules));
		return true;
	}
	// The two specific ANTLR exceptions come first so their source position
	// is printed; the generic ANTLR handler sits at the end.
	catch (antlr::MismatchedTokenException& mismatch) {
		std::cerr << mismatch.getFileLineColumnString()
			<< " " << mismatch.getMessage() << std::endl;
	}
	catch (antlr::NoViableAltException& no_alt) {
		std::cerr << no_alt.getFileLineColumnString()
			<< " " << no_alt.getMessage() << std::endl;
	}
	catch (Wccl::InvalidVariableName& bad_name) {
		std::cerr << "Wccl::InvalidVariableName " << bad_name.info() << std::endl;
	}
	catch (Wccl::VariableTypeMismatch& bad_type) {
		std::cerr << "Wccl::VariableTypeMismatch " << bad_type.info() << std::endl;
	}
	catch (Wccl::WcclError& wccl_err) {
		std::cerr << "Wccl::WcclError:" << wccl_err.info() << std::endl;
	}
	catch (PwrNlp::PwrNlpError& nlp_err) {
		std::cerr << "PwrNlp::PwrNlpError " << nlp_err.info() << std::endl;
	}
	catch (antlr::ANTLRException& antlr_err) {
		std::cerr << "Antlr error " << antlr_err.getMessage() << std::endl;
	}
	return false;
}
// Runs the rule sequence over the sentences read from `reader` and writes
// sentences out through `writer`.
//
// reader - source of chunks; read until get_next_chunk() yields a null pointer
// writer - receives sentences via write_sentence()
// rules  - rule sequence executed on each sentence
// opts   - selects until-done execution (with its iteration limit) and the
//          first-sentence-only early exit
//
// Also reads the file-level `progress` flag to decide whether the global
// token timer should report progress.
//
// NOTE(review): this listing has no opening brace after the signature, no
// `else` between the two execute calls, and fewer closing braces than opening
// ones -- looks like lines were lost. Confirm the exact nesting against the
// original file before relying on it.
void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
boost::shared_ptr<Corpus2::TokenWriter> writer, Wccl::RuleSequence& rules,
const options& opts)
// Shared timer used for per-sentence counting and progress output.
Corpus2::TokenTimer& timer = Corpus2::global_timer();
// Loop ends when the reader returns a null chunk pointer.
while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) {
foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) {
// Rules are executed on an AnnotatedSentence obtained by wrapping the sentence.
boost::shared_ptr<Corpus2::AnnotatedSentence> as;
as = Corpus2::AnnotatedSentence::wrap_sentence(s);
if (opts.until_done) {
// Until-done mode, bounded by the configured iteration limit.
rules.execute_until_done(as, opts.until_done_iterations);
// NOTE(review): presumably the `else` branch (single pass) -- confirm.
rules.execute_once(as);
timer.count_sentence(*as);
if (progress) {
timer.check_slice();
}
// First-sentence-only mode: leave the enclosing loop.
if (opts.first) break;
writer->write_sentence(*as);
//writer->write_chunk(*c);
if (progress) {
timer.stats();
}
}
int main(int argc, char** argv)
{
std::string tagset_load = "kipi";
std::string input_format;
std::string output_format;
options opts;
opts.first = false;
opts.until_done = false;
opts.until_done_iterations = 1000;
std::vector<std::string> corpora_files, ccl_files, files;
bool corpus_stdin = true;
using boost::program_options::value;
std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " ");
std::string readers_help = "Input format, any of: " + readers + "\n";
std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " ");
std::string writers_help = "Output format, any of: " + writers + "\n";;
boost::program_options::options_description desc("Allowed options");
desc.add_options()
("tagset,t", value(&tagset_load),
"Tagset to use\n")
("corpus,c", value(&corpora_files),
"Corpus file to load (XCES), do not load from stdin\n")
("ccl-file,C", value(&ccl_files),
"CCL rule files\n")
("files,f", value(&files),
"Files to load, looking at the extension to determine type\n")
("input-format,i", value(&input_format)->default_value("xces"),
readers_help.c_str())
("output-format,o", value(&output_format)->default_value("xces"),
writers_help.c_str())
("progress,p", value(&progress)->zero_tokens(),
"Show progress info")
("quiet,q", value(&quiet)->zero_tokens(),
"Suppress messages\n")
("until-done,u", value(&opts.until_done)->zero_tokens(),
"Until-done mode\n")
("until-done-iterations", value(&opts.until_done_iterations),
"Until-done iteration limit\n")
("first-sentence-only,1", value(&opts.first)->zero_tokens(),
"Only process first sentence\n")
("help,h", "Show help")
;
boost::program_options::variables_map vm;
boost::program_options::positional_options_description p;
p.add("files", -1);
try {
boost::program_options::store(
boost::program_options::command_line_parser(argc, argv)
.options(desc).positional(p).run(), vm);
} catch (boost::program_options::error& e) {
std::cerr << e.what() << std::endl;
return 2;
}
boost::program_options::notify(vm);
if (vm.count("help")) {
std::cerr << "Usage " << argv[0] << " [OPTIONS] FILES\n"
<< "Files ending with .xml are treated as corpora, otherwise \n"
<< "as CCL files. Use - to read corpus from stdin (as with -I)";
std::cout << desc << "\n";
return 1;
}
foreach (const std::string& f, files) {
if (boost::algorithm::ends_with(f, ".xml")) {
corpora_files.push_back(f);
} else {
ccl_files.push_back(f);
}
}
// consider stdin only when no corpus files given
corpus_stdin = corpus_stdin && corpora_files.empty();
try {
const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
Wccl::Parser parser(tagset);
Wccl::RuleSequence rules;
foreach (const std::string& f, ccl_files) {
size_t sz = rules.size();
if (!load_more_rules(parser, f, rules)) {
std::cerr << "Warning: error while parsing " << f << "\n";
}
if (rules.size() == sz) {
std::cerr << "Warning: no rules loaded from " << f << "\n";
}
}
if (!rules.empty()) {
Corpus2::TokenTimer& timer = Corpus2::global_timer();
timer.register_signal_handler();
boost::shared_ptr<Corpus2::TokenWriter> writer;
writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset));
boost::shared_ptr<Corpus2::TokenReader> reader;
foreach (const std::string& f, corpora_files) {
reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, f);
apply_rules(reader, writer, rules, opts);
reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin);
apply_rules(reader, writer, rules, opts);