// main.cpp

#include <cstdlib>
#include <fstream>
#include <iomanip>


#include <libwccl/values/strset.h>
#include <libwccl/parser/Parser.h>
#include <libwccl/ops/rulesequence.h>
#include <libcorpus2/tagsetmanager.h>
#include <libcorpus2/util/tokentimer.h>


#include <boost/bind.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/make_shared.hpp>
#include <boost/program_options.hpp>
#include <libcorpus2/io/xcesreader.h>
#include <libcorpus2/io/xceswriter.h>

#include <antlr/NoViableAltException.hpp>
#include <antlr/MismatchedTokenException.hpp>

namespace {
	/// When true, the per-file "Loaded N rule(s)" message is not printed.
	bool quiet = false;
	/// When true, the token timer is polled after each sentence and its
	/// stats are printed once a reader has been consumed.
	bool progress = false;

	/// Settings that control how the rule sequence is applied to a corpus.
	struct options {
		/// Stop after the first sentence.
		bool first;
		/// Run the rules with execute_until_done instead of execute_once.
		bool until_done;
		/// Iteration limit handed to execute_until_done.
		int until_done_iterations;

		/// Start from well-defined defaults so a default-constructed object
		/// never carries indeterminate values.
		options()
			: first(false), until_done(false), until_done_iterations(1000)
		{
		}
	};
}

/**
 * Parse the rule file at @p filename and append its rules to @p rules.
 *
 * Exceptions of the types caught below are reported on stderr and turned
 * into a false return value instead of being propagated.
 *
 * @param parser    parser used to read the rule sequence
 * @param filename  path of the rule file to open
 * @param rules     sequence that receives the parsed rules
 * @return true when the parser produced a rule sequence and it was appended,
 *         false otherwise
 */
bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::RuleSequence& rules)
{
	try {
		std::ifstream input(filename.c_str());
		if (!input.good()) {
			throw Wccl::FileNotFound(filename, "", __FUNCTION__);
		}

		boost::shared_ptr<Wccl::RuleSequence> parsed;
		parsed = parser.parseRuleSequence(input);
		if (!parsed) {
			// The parser gave up without throwing.
			std::cerr << "Problem while parsing -- "
				<< "parser returned NULL!" << std::endl;
			return false;
		}

		if (!quiet) {
			std::cerr << "Loaded " << parsed->size() << " rule(s) from "
				<< filename << "\n";
		}
		std::copy(parsed->begin(), parsed->end(), std::back_inserter(rules));
		return true;
	}
	// Handlers for the specific exception types come before the more
	// general ones so each keeps its own message format.
	catch (antlr::MismatchedTokenException &e) {
		std::cerr << e.getFileLineColumnString()
				<< " " << e.getMessage() << std::endl;
	}
	catch (antlr::NoViableAltException &e) {
		std::cerr << e.getFileLineColumnString()
				<< " " << e.getMessage() << std::endl;
	}
	catch (Wccl::InvalidVariableName &e) {
		std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl;
	}
	catch (Wccl::VariableTypeMismatch &e) {
		std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl;
	}
	catch (Wccl::WcclError& e) {
		std::cerr << "Wccl::WcclError:" << e.info() << std::endl;
	}
	catch (PwrNlp::PwrNlpError& e) {
		std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl;
	}
	catch (antlr::ANTLRException& e) {
		std::cerr << "Antlr error " << e.getMessage() << std::endl;
	}
	// Reached only after one of the handlers above has reported an error.
	return false;
}

/**
 * Read chunks from @p reader, run @p rules on every sentence and write each
 * processed sentence to @p writer.
 *
 * @param reader  source of chunks; consumed until get_next_chunk() returns
 *                an empty pointer (or the first sentence when opts.first)
 * @param writer  receives every processed sentence
 * @param rules   rule sequence executed on each sentence
 * @param opts    selects once/until-done execution and first-sentence-only
 *                mode
 */
void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
	boost::shared_ptr<Corpus2::TokenWriter> writer, Wccl::RuleSequence& rules,
	const options& opts)
{
	Corpus2::TokenTimer& timer = Corpus2::global_timer();
	while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) {
		foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) {
			boost::shared_ptr<Corpus2::AnnotatedSentence> as;
			as = Corpus2::AnnotatedSentence::wrap_sentence(s);
			if (opts.until_done) {
				rules.execute_until_done(as, opts.until_done_iterations);
			} else {
				rules.execute_once(as);
			}
			timer.count_sentence(*as);
			if (progress) {
				timer.check_slice();
			}
			// Write before checking opts.first: breaking first would discard
			// the only sentence processed in first-sentence-only mode.
			writer->write_sentence(*as);
			if (opts.first) break;
		}
		// Sentences are written one by one above, so the chunk itself is
		// not written here.
		if (opts.first) break;
	}
	if (progress) {
		timer.stats();
	}
}


/**
 * Entry point: parse the command line, load CCL rule files and apply the
 * rules to the given corpora, writing the result to stdout.
 *
 * Positional files ending with ".xml" are treated as corpora, "-" requests
 * reading a corpus from stdin, and everything else is treated as a CCL rule
 * file. Stdin is also read when no corpus file is given.
 *
 * @return 0 on success, 1 after printing help, 2 on a command-line or
 *         PwrNlp error
 */
int main(int argc, char** argv)
{
	std::string tagset_load = "kipi";
	std::string input_format;
	std::string output_format;
	options opts;
	opts.first = false;
	opts.until_done = false;
	opts.until_done_iterations = 1000;
	std::vector<std::string> corpora_files, ccl_files, files;
	// Set when "-" appears among the positional files.
	bool stdin_requested = false;
	using boost::program_options::value;

	std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " ");
	std::string readers_help = "Input format, any of: " + readers + "\n";
	std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " ");
	std::string writers_help = "Output format, any of: " + writers + "\n";

	boost::program_options::options_description desc("Allowed options");
	desc.add_options()
			("tagset,t", value(&tagset_load),
			 "Tagset to use\n")
			("corpus,c", value(&corpora_files),
			 "Corpus file to load (XCES), do not load from stdin\n")
			("ccl-file,C", value(&ccl_files),
			 "CCL rule files\n")
			("files,f", value(&files),
			 "Files to load, looking at the extension to determine type\n")
			("input-format,i", value(&input_format)->default_value("xces"),
			 readers_help.c_str())
			("output-format,o", value(&output_format)->default_value("xces"),
			 writers_help.c_str())
			("progress,p", value(&progress)->zero_tokens(),
			 "Show progress info")
			("quiet,q", value(&quiet)->zero_tokens(),
			 "Suppress messages\n")
			("until-done,u", value(&opts.until_done)->zero_tokens(),
			 "Until-done mode\n")
			("until-done-iterations", value(&opts.until_done_iterations),
			 "Until-done iteration limit\n")
			("first-sentence-only,1", value(&opts.first)->zero_tokens(),
			 "Only process first sentence\n")
			("help,h", "Show help")
			;
	boost::program_options::variables_map vm;
	boost::program_options::positional_options_description p;
	p.add("files", -1);

	try {
		boost::program_options::store(
			boost::program_options::command_line_parser(argc, argv)
			.options(desc).positional(p).run(), vm);
		// notify() can report option errors as well, so it belongs under
		// the same handler as store().
		boost::program_options::notify(vm);
	} catch (boost::program_options::error& e) {
		std::cerr << e.what() << std::endl;
		return 2;
	}

	if (vm.count("help")) {
		std::cerr << "Usage " << argv[0] << " [OPTIONS] FILES\n"
			<< "Files ending with .xml are treated as corpora, otherwise \n"
			<< "as CCL files. Use - to read corpus from stdin; stdin is also\n"
			<< "read when no corpus file is given.\n";
		std::cout << desc << "\n";
		return 1;
	}

	foreach (const std::string& f, files) {
		if (f == "-") {
			// Explicit request for stdin; not a CCL file name.
			stdin_requested = true;
		} else if (boost::algorithm::ends_with(f, ".xml")) {
			corpora_files.push_back(f);
		} else {
			ccl_files.push_back(f);
		}
	}

	// Read stdin when asked to explicitly, or when no corpus files were given.
	const bool corpus_stdin = stdin_requested || corpora_files.empty();

	try {
		const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
		Wccl::Parser parser(tagset);
		Wccl::RuleSequence rules;
		foreach (const std::string& f, ccl_files) {
			size_t sz = rules.size();
			if (!load_more_rules(parser, f, rules)) {
				std::cerr << "Warning: error while parsing " << f << "\n";
			}
			if (rules.size() == sz) {
				std::cerr << "Warning: no rules loaded from " << f << "\n";
			}
		}
		if (!rules.empty()) {
			Corpus2::TokenTimer& timer = Corpus2::global_timer();
			timer.register_signal_handler();
			boost::shared_ptr<Corpus2::TokenWriter> writer;
			writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset));
			boost::shared_ptr<Corpus2::TokenReader> reader;
			foreach (const std::string& f, corpora_files) {
				reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, f);
				apply_rules(reader, writer, rules, opts);
			}
			if (corpus_stdin) {
				reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin);
				apply_rules(reader, writer, rules, opts);
			}
		} else {
			// Without rules no corpus is read; say so instead of exiting
			// silently.
			std::cerr << "Warning: no rules loaded, nothing to do\n";
		}
	} catch (PwrNlp::PwrNlpError& e) {
		std::cerr << e.info() << std::endl;
		return 2;
	}

	return 0;
}