// main.cpp
#include <cstdlib>
#include <fstream>
#include <iomanip>


#include <libwccl/values/strset.h>
#include <libwccl/parser/Parser.h>
#include <libcorpus2/tagsetmanager.h>

#include <boost/bind.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/make_shared.hpp>
#include <boost/program_options.hpp>
#include <libcorpus2/io/xcesreader.h>

#include <antlr/NoViableAltException.hpp>
#include <antlr/MismatchedTokenException.hpp>


/**
 * Parse one operator from a file and append it to ops.
 *
 * Every exception type listed in the handlers below is reported on
 * std::cerr and then swallowed, so the function returns normally and ops
 * is left unchanged in that case. A NULL result from the parser is
 * reported the same way.
 *
 * @param filename path of the file to read the operator from
 * @param parser   parser used to read the operator
 * @param ops      vector that receives the parsed operator on success
 */
void load_more_operators(const std::string& filename, Wccl::Parser& parser,
	std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops)
{
	try {
		std::ifstream is(filename.c_str());
		if (!is.good()) {
			// NOTE(review): presumably caught by one of the handlers
			// below -- confirm the base class of Wccl::FileNotFound.
			throw Wccl::FileNotFound(filename, "", __FUNCTION__);
		}

		boost::shared_ptr<Wccl::FunctionalOperator> retOp =
			parser.parseAnyOperator(is);
		if (retOp) {
			ops.push_back(retOp);
		} else {
			std::cerr << "Problem while parsing -- "
				<< "parser returned NULL!" << std::endl;
		}
	} catch (antlr::MismatchedTokenException &e) {
		// Parse errors carry a position; print it in front of the message.
		std::cerr << e.getFileLineColumnString()
				<< " " << e.getMessage() << std::endl;
	} catch(antlr::NoViableAltException &e) {
		std::cerr << e.getFileLineColumnString()
				<< " " << e.getMessage() << std::endl;
	} catch (Wccl::InvalidVariableName &e) {
		std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl;
	} catch (Wccl::VariableTypeMismatch &e) {
		std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl;
	} catch (Wccl::WcclError& e) {
		std::cerr << "Wccl::WcclError:" << e.info() << std::endl;
	} catch (PwrNlp::PwrNlpError& e) {
		std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl;
	} catch (antlr::ANTLRException& e) {
		std::cerr << "Antlr error " << e.getMessage() << std::endl;
	}
}

/**
 * Scope guard for the formatting flags of an output stream.
 *
 * The constructor records os.flags(); the destructor writes the recorded
 * value back. Only the fmtflags word is saved -- width, precision and the
 * fill character are not touched by this class.
 */
class streamsave
{
public:
	/// Record the current flags of os. os is held by reference, so it
	/// must outlive this object.
	explicit streamsave(std::ostream& os) : os_(os), flags_(os.flags()) {}
	/// Write the recorded flags back to the stream.
	~streamsave() { os_.flags(flags_); }
private:
	// Not copyable: a copy would write the same saved flags back a second
	// time when it is destroyed. Declared private and never defined, in
	// keeping with the pre-C++11 style of the rest of the file.
	streamsave(const streamsave&);
	streamsave& operator=(const streamsave&);

	std::ostream& os_;
	std::ios_base::fmtflags flags_;
};

/**
 * Print a header line to std::cout: "## ", an "orth" column of width 20,
 * then for every entry in ops the text "operator " right-aligned in 15
 * characters followed by the 1-based operator number left-aligned in 5.
 *
 * The stream's format flags are restored on return.
 *
 * @param tagset not used by this function
 * @param ops    operators; only their count is used
 */
void do_head(const Corpus2::Tagset& tagset,
	const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops)
{
	streamsave sv(std::cout);
	std::cout << "## ";
	std::cout << std::setw(20) << "orth";
	for (size_t i = 1; i <= ops.size(); ++i) {
		std::cout << " ";
		// Use the two-argument setf so the other adjustfield bits are
		// cleared first. With the one-argument form both left and right
		// end up set, and the stream then pads as if right-aligned.
		std::cout.setf(std::ios::right, std::ios::adjustfield);
		std::cout << std::setw(15) << "operator ";
		std::cout.setf(std::ios::left, std::ios::adjustfield);
		std::cout << std::setw(5) << i;
	}
	std::cout << "\n";
}

/**
 * Apply every operator at every position of a sentence and print the
 * results to std::cout as an aligned table.
 *
 * Each output row holds the 1-based token number, the token's orth, and
 * then one cell per operator. Every column is padded with
 * UnicodeString::padTrailing to the longest cell seen in that column.
 *
 * @param sentence sentence to process
 * @param tagset   tagset passed to to_string_u when rendering values
 * @param ops      operators to apply, in column order
 */
void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence,
	const Corpus2::Tagset& tagset,
	const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops)
{
	Wccl::SentenceContext sc(sentence);
	const size_t token_count = sentence->size();
	// outputs[i] is the row for token i: orth first, then operator results.
	std::vector< std::vector< UnicodeString > > outputs(token_count);
	// lengths[c] is the widest cell in column c (column 0 is the orth).
	std::vector<int> lengths(ops.size() + 1, 0);
	streamsave sv(std::cout);

	// First pass: evaluate everything and collect the column widths.
	for (size_t i = 0; i < token_count; ++i) {
		sc.set_position(i);
		UnicodeString orth = sentence->tokens()[i]->orth();
		outputs[i].push_back(orth);
		lengths[0] = std::max(lengths[0], orth.length());
		int li = 1;
		foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops) {
			boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc);
			UnicodeString vstr = v->to_string_u(tagset);
			lengths[li] = std::max(lengths[li], vstr.length());
			++li;
			outputs[i].push_back(vstr);
		}
	}

	// Second pass: print the rows, padding each cell to its column width.
	for (size_t i = 0; i < token_count; ++i) {
		std::cout << std::setw(2) << (i + 1) << " ";
		for (size_t oi = 0; oi < outputs[i].size(); ++oi) {
			// Work on a copy because padTrailing modifies the string.
			UnicodeString u = outputs[i][oi];
			u.padTrailing(lengths[oi]);
			std::cout << PwrNlp::to_utf8(u) << " ";
		}
		std::cout << "\n";
	}
}

/**
 * Read sentences from an XCES corpus file and print the operator results
 * for each one via do_sentence, followed by an empty line.
 *
 * @param filename path handed to Corpus2::XcesReader
 * @param tagset   tagset used by the reader and for output
 * @param ops      operators to evaluate on each sentence
 * @param first    when true, stop after the first sentence
 */
void do_file(const std::string& filename, const Corpus2::Tagset& tagset,
	const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops,
	bool first)
{
	Corpus2::XcesReader reader(tagset, filename);
	// Header output is currently switched off:
	//do_head(tagset, ops);
	for (Corpus2::Sentence::Ptr sentence = reader.get_next_sentence();
			sentence;
			sentence = reader.get_next_sentence()) {
		do_sentence(sentence, tagset, ops);
		std::cout << "\n";
		if (first) {
			break;
		}
	}
}

int main(int argc, char** argv)
{
	std::string tagset_load = "kipi";
	bool first = false;
	std::vector<std::string> corpora_files, ccl_files, files;
	bool quiet = false;
	bool dump_variables = false;
	using boost::program_options::value;

	boost::program_options::options_description desc("Allowed options");
	desc.add_options()
			("tagset,t", value(&tagset_load),
			 "Tagset to use\n")
			("corpus,c", value(&corpora_files),
			 "Corpus file to load (XCES)\n")
			("ccl-file,C", value(&ccl_files),
			 "CCL query file\n")
			("files,f", value(&files),
			 "Files to load, look at extecion to determine type\n")
			("quiet,q", value(&quiet)->zero_tokens(),
			 "Suppress messages\n")
			("first-sentence-only,1", value(&first)->zero_tokens(),
			 "Only process first sentence\n")
			("help,h", "Show help")
			;
	boost::program_options::variables_map vm;
	boost::program_options::positional_options_description p;
	p.add("files", -1);

	try {
		boost::program_options::store(
			boost::program_options::command_line_parser(argc, argv)
			.options(desc).positional(p).run(), vm);
	} catch (boost::program_options::error& e) {
		std::cerr << e.what() << std::endl;
		return 2;
	}
	boost::program_options::notify(vm);

	if (vm.count("help")) {
		std::cout << desc << "\n";
		return 1;
	}

	foreach (const std::string& f, files) {
		if (boost::algorithm::ends_with(f, ".xml")) {
			corpora_files.push_back(f);
		} else {
			ccl_files.push_back(f);
		}
	}

	try {
		const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
		std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > operators;
		Wccl::Parser parser(tagset);
		foreach (const std::string& f, ccl_files) {
			load_more_operators(f, parser, operators);
		}
		if (!operators.empty()) {
			foreach (const std::string& f, corpora_files) {
				do_file(f, tagset, operators, first);
			}
		}
	} catch (PwrNlp::PwrNlpError& e) {
		std::cerr << e.info() << std::endl;
		return 2;
	}

	return 0;
}