/*
    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
    Part of the libcorpus2 project

    This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.

    This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. 

    See the LICENSE.CORPUS2, LICENSE.POLIQARP, COPYING.LESSER and COPYING files for more details.
*/

#include <libcorpus2/io/xcesvalidate.h>
#include <libcorpus2/tagset.h>

#include <boost/foreach.hpp>
#include <libxml++/libxml++.h>
#include <libxml++/nodes/node.h>
#include <libxml++/nodes/element.h>
#include <libxml++/parsers/saxparser.h>

#include <iostream>
#include <sstream>
#include <string>
namespace Corpus2 {

/**
 * SAX-based worker behind XcesValidator.
 *
 * Follows the tok / orth / lex / ctag element nesting with a small state
 * machine, collects character data in a buffer and checks the text of each
 * ctag element against the tagset. Findings are written as text to the
 * output stream passed at construction.
 */
class XcesValidatorImpl : public xmlpp::SaxParser
{
public:
	/// Binds the validator to a tagset and to the stream receiving reports.
	/// Both are held by reference, so they must outlive this object.
	XcesValidatorImpl(const Tagset& tagset, std::ostream& out);

protected:
	/// Element-open callback: updates the state, the counters and the buffer.
	void on_start_element(const Glib::ustring & name,
			const AttributeList& attributes);
	/// Element-close callback: stores the orth text or validates a ctag.
	void on_end_element(const Glib::ustring & name);
	/// CDATA callback; the implementation in this file does nothing.
	void on_cdata_block(const Glib::ustring & text);
	/// Character-data callback: feeds character data into sbuf_.
	void on_characters(const Glib::ustring & text);

	/// Tagset used to parse and validate ctag contents.
	const Tagset& tagset_;

	/// Parser position: outside a token, directly in tok, in orth, in lex,
	/// or in ctag. XS_LEMMA is declared but never entered in this file.
	enum state_t { XS_NONE, XS_TOK, XS_ORTH, XS_LEX, XS_LEMMA, XS_TAG };
	state_t state_;

	/// Text of the most recently closed orth of the current token
	/// (cleared whenever a tok element opens).
	std::string last_orth_;

	/// Character data gathered since the last orth or ctag was opened.
	std::string sbuf_;

	/// Destination for validation messages.
	std::ostream& os_;

	/// Number of tok elements opened so far (the first token is 1).
	int token_idx_;

	/// Number of ctag elements opened in the current token (the first is 1).
	int tag_idx_;
};

/**
 * Creates a validator that checks tags against the given tagset and writes
 * its findings to out.
 *
 * The implementation object keeps references to both arguments, so they
 * must stay alive for as long as this validator is used.
 */
XcesValidator::XcesValidator(const Tagset& tagset, std::ostream& out)
	: impl_(new XcesValidatorImpl(tagset, out))
{
}

/// Destructor; defined in this translation unit, where XcesValidatorImpl is
/// a complete type. NOTE(review): releasing impl_ is presumably handled by
/// its (smart pointer) type declared in the header -- confirm there.
XcesValidator::~XcesValidator()
{
}

void XcesValidator::validate_stream(std::istream &is)
{
	impl_->parse_stream(is);
}

void XcesValidator::validate_file(const std::string& filename)
{
	impl_->parse_file(filename);
}

/// Sets up an idle validator: no token seen yet, both counters at zero.
/// The base parser and the two string buffers are default-constructed,
/// which leaves the buffers empty.
XcesValidatorImpl::XcesValidatorImpl(const Tagset& tagset,
		std::ostream& out)
	: tagset_(tagset)
	, state_(XS_NONE)
	, os_(out)
	, token_idx_(0)
	, tag_idx_(0)
{
}



/// Element-open handler driving the state machine.
///
/// A tok element is accepted in any state and starts a fresh token; orth
/// and lex are only recognised directly inside tok, ctag only directly
/// inside lex. Every other element leaves the state untouched.
void XcesValidatorImpl::on_start_element(const Glib::ustring &name,
		const AttributeList& /*attributes*/)
{
	if (name == "tok") {
		// New token: bump the token counter, restart per-token bookkeeping.
		last_orth_.clear();
		tag_idx_ = 0;
		token_idx_ += 1;
		state_ = XS_TOK;
		return;
	}

	switch (state_) {
	case XS_TOK:
		if (name == "orth") {
			// Start collecting the orth text from scratch.
			sbuf_.clear();
			state_ = XS_ORTH;
		} else if (name == "lex") {
			state_ = XS_LEX;
		}
		break;

	case XS_LEX:
		if (name == "ctag") {
			// Start collecting the tag text from scratch.
			sbuf_.clear();
			tag_idx_ += 1;
			state_ = XS_TAG;
		}
		break;

	default:
		break;
	}
}


namespace {
	/// Writes the common prefix of a validation message to os, in the form
	/// "Token <tokenid> (<orth>), tag <tagid> (<tag>): ".
	/// No newline is emitted; the caller appends the actual complaint.
	void error_preamble(std::ostream& os, const std::string& orth,
			const std::string& tag, int tokenid, int tagid) {
		os << "Token " << tokenid;
		os << " (" << orth << ")";
		os << ", tag " << tagid;
		os << " (" << tag << "): ";
	}
}

/// Element-close handler.
///
/// Closing orth stores the collected text as the current token's surface
/// form; closing ctag parses and strictly validates the collected tag text,
/// reporting any problem on the output stream; closing lex and tok step the
/// state back outwards. A closing element that does not match the current
/// state is ignored.
void XcesValidatorImpl::on_end_element(const Glib::ustring &name)
{
	switch (state_) {
	case XS_ORTH:
		if (name == "orth") {
			// Remember the surface form so tag messages can quote it.
			last_orth_ = sbuf_;
			state_ = XS_TOK;
		}
		break;

	case XS_TAG:
		if (name == "ctag") {
			try {
				Tag parsed = tagset_.parse_simple_tag(sbuf_);
				std::stringstream complaints;
				if (!tagset_.validate_tag(parsed, Tagset::ParseStrict,
						&complaints)) {
					error_preamble(os_, last_orth_, sbuf_,
							token_idx_, tag_idx_);
					os_ << complaints.str() << "\n";
				}
			} catch (TagParseError& e) {
				// The tag string could not even be parsed.
				error_preamble(os_, last_orth_, sbuf_,
						token_idx_, tag_idx_);
				os_ << e.info() << "\n";
			}
			// Back inside lex whether or not the tag was fine.
			state_ = XS_LEX;
		}
		break;

	case XS_LEX:
		if (name == "lex") {
			state_ = XS_TOK;
		}
		break;

	case XS_TOK:
		if (name == "tok") {
			state_ = XS_NONE;
		}
		break;

	default:
		break;
	}
}

/// CDATA handler. The section's text is not used: nothing is added to the
/// text buffer and the state is left as it is.
void XcesValidatorImpl::on_cdata_block(const Glib::ustring&)
{
	// Nothing to do.
}

/// Character-data handler: collects text that belongs to an orth or ctag
/// element into the text buffer.
///
/// The buffer is only ever read when an orth or ctag element closes, and it
/// is cleared whenever one of them opens, so text seen in any other state
/// could never reach a reader. Skipping it keeps the buffer from growing
/// with all the unrelated character data of a document (without bound when
/// the input contains no orth/ctag elements at all).
///
/// @param text chunk of character data reported by the parser; one element's
///             text may arrive in several consecutive chunks
void XcesValidatorImpl::on_characters(const Glib::ustring &text)
{
	if (state_ == XS_ORTH || state_ == XS_TAG) {
		// raw() exposes the underlying UTF-8 std::string without a copy.
		sbuf_ += text.raw();
	}
}

} /* end ns Corpus2 */