#include <cstdlib> #include <fstream> #include <iomanip> #include <libwccl/values/strset.h> #include <libwccl/parser/Parser.h> #include <libwccl/ops/rulesequence.h> #include <libcorpus2/tagsetmanager.h> #include <libcorpus2/util/tokentimer.h> #include <boost/bind.hpp> #include <boost/algorithm/string.hpp> #include <boost/make_shared.hpp> #include <boost/program_options.hpp> #include <libcorpus2/io/xcesreader.h> #include <libcorpus2/io/xceswriter.h> #include <antlr/NoViableAltException.hpp> #include <antlr/MismatchedTokenException.hpp> namespace { bool quiet = false; bool progress = false; struct options { bool first; bool until_done; int until_done_iterations; }; } bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::RuleSequence& rules) { boost::shared_ptr<Wccl::RuleSequence> ret; try { std::ifstream is(filename.c_str()); if (!is.good()) { throw Wccl::FileNotFound(filename, "", __FUNCTION__); } ret = parser.parseRuleSequence(is); if (ret) { if (!quiet) { std::cerr << "Loaded " << ret->size() << " rule(s) from " << filename << "\n"; } std::copy(ret->begin(), ret->end(), std::back_inserter(rules)); return true; } else { std::cerr << "Problem while parsing -- " << "parser returned NULL!" << std::endl; } } catch (antlr::MismatchedTokenException &e) { std::cerr << e.getFileLineColumnString() << " " << e.getMessage() << std::endl; } catch(antlr::NoViableAltException &e) { std::cerr << e.getFileLineColumnString() << " " << e.getMessage() << std::endl; } catch (Wccl::InvalidVariableName &e) { std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl; } catch (Wccl::VariableTypeMismatch &e) { std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl; } catch (Wccl::WcclError& e) { std::cerr << "Wccl::WcclError:" << e.info() << std::endl; } catch (PwrNlp::PwrNlpError& e) { std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl; } catch (antlr::ANTLRException& e) { std::cerr << "Antlr error " << e.getMessage() << std::endl; } return false; } void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, boost::shared_ptr<Corpus2::TokenWriter> writer, Wccl::RuleSequence& rules, const options& opts) { Corpus2::TokenTimer& timer = Corpus2::global_timer(); while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) { foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) { boost::shared_ptr<Corpus2::AnnotatedSentence> as; as = Corpus2::AnnotatedSentence::wrap_sentence(s); if (opts.until_done) { rules.execute_until_done(as, opts.until_done_iterations); } else { rules.execute_once(as); } timer.count_sentence(*as); if (progress) { timer.check_slice(); } if (opts.first) break; writer->write_sentence(*as); } //writer->write_chunk(*c); if (opts.first) break; } if (progress) { timer.stats(); } } int main(int argc, char** argv) { std::string tagset_load = "kipi"; std::string input_format; std::string output_format; options opts; opts.first = false; opts.until_done = false; opts.until_done_iterations = 1000; std::vector<std::string> corpora_files, ccl_files, files; bool corpus_stdin = true; using boost::program_options::value; std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " "); std::string readers_help = "Input format, any of: " + readers + "\n"; std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " "); std::string writers_help = "Output format, any of: " + writers + "\n";; boost::program_options::options_description desc("Allowed options"); desc.add_options() ("tagset,t", value(&tagset_load), "Tagset to use\n") ("corpus,c", value(&corpora_files), "Corpus file to load (XCES), do not load from stdin\n") ("ccl-file,C", value(&ccl_files), "CCL rule files\n") ("files,f", value(&files), "Files to load, looking at the extension to determine type\n") ("input-format,i", value(&input_format)->default_value("xces"), readers_help.c_str()) ("output-format,o", value(&output_format)->default_value("xces"), writers_help.c_str()) ("progress,p", value(&progress)->zero_tokens(), "Show progress info") ("quiet,q", value(&quiet)->zero_tokens(), "Suppress messages\n") ("until-done,u", value(&opts.until_done)->zero_tokens(), "Until-done mode\n") ("until-done-iterations", value(&opts.until_done_iterations), "Until-done iteration limit\n") ("first-sentence-only,1", value(&opts.first)->zero_tokens(), "Only process first sentence\n") ("help,h", "Show help") ; boost::program_options::variables_map vm; boost::program_options::positional_options_description p; p.add("files", -1); try { boost::program_options::store( boost::program_options::command_line_parser(argc, argv) .options(desc).positional(p).run(), vm); } catch (boost::program_options::error& e) { std::cerr << e.what() << std::endl; return 2; } boost::program_options::notify(vm); if (vm.count("help")) { std::cerr << "Usage " << argv[0] << " [OPTIONS] FILES\n" << "Files ending with .xml are treated as corpora, otherwise \n" << "as CCL files. Use - to read corpus from stdin (as with -I)"; std::cout << desc << "\n"; return 1; } foreach (const std::string& f, files) { if (boost::algorithm::ends_with(f, ".xml")) { corpora_files.push_back(f); } else { ccl_files.push_back(f); } } // consider stdin only when no corpus files given corpus_stdin = corpus_stdin && corpora_files.empty(); try { const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); Wccl::Parser parser(tagset); Wccl::RuleSequence rules; foreach (const std::string& f, ccl_files) { size_t sz = rules.size(); if (!load_more_rules(parser, f, rules)) { std::cerr << "Warning: error while parsing " << f << "\n"; } if (rules.size() == sz) { std::cerr << "Warning: no rules loaded from " << f << "\n"; } } if (!rules.empty()) { Corpus2::TokenTimer& timer = Corpus2::global_timer(); timer.register_signal_handler(); boost::shared_ptr<Corpus2::TokenWriter> writer; writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset)); boost::shared_ptr<Corpus2::TokenReader> reader; foreach (const std::string& f, corpora_files) { reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, f); apply_rules(reader, writer, rules, opts); } if (corpus_stdin) { reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin); apply_rules(reader, writer, rules, opts); } } } catch (PwrNlp::PwrNlpError& e) { std::cerr << e.info() << std::endl; return 2; } return 0; }