Skip to content
Snippets Groups Projects
xcesreader.cpp 4.50 KiB
#include <libcorpus2/io/xcesreader.h>
#include <libcorpus2/io/sax.h>
#include <libpwrutils/foreach.h>
#include <libxml++/libxml++.h>

namespace Corpus2 {

class XcesReaderImpl : public BasicSaxParser
{
public:
	XcesReaderImpl(const Tagset& tagset, std::deque<Chunk*>& obuf,
			bool disamb_only, bool disamb_sh);

	~XcesReaderImpl();

protected:
	void on_start_element(const Glib::ustring & name,
			const AttributeList& attributes);
	void on_end_element(const Glib::ustring & name);

	const Tagset& tagset_;

	enum state_t { XS_NONE, XS_CHUNK, XS_SENTENCE, XS_TOK, XS_ORTH, XS_LEX,
			XS_LEMMA, XS_TAG };
	state_t state_;

	PwrNlp::Whitespace::Enum wa_;

	Glib::ustring sbuf_;

	Token* tok_;

	Sentence* sent_;

	Chunk* chunk_;

	std::deque<Chunk*>& obuf_;

	bool disamb_only_;

	bool disamb_sh_;
};

XcesReader::XcesReader(const Tagset& tagset, std::istream& is,
		bool disamb_only, bool disamb_sh)
	: BufferedChunkReader(tagset), is_(is)
	, impl_(new XcesReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh))
{
}

XcesReader::~XcesReader()
{
}

void XcesReader::ensure_more()
{
	static const int BUFSIZE=1024;
	while (chunk_buf_.empty() && is().good()) {
		unsigned char buf[BUFSIZE+1];
		is().read(reinterpret_cast<char*>(buf), BUFSIZE);
		impl_->parse_chunk_raw(buf, is().gcount());
		if (is().eof()) {
			impl_->finish_chunk_parsing();
		}
	}
}

XcesReaderImpl::XcesReaderImpl(const Tagset& tagset,
		std::deque<Chunk*>& obuf, bool disamb_only, bool disamb_sh)
	: BasicSaxParser()
	, tagset_(tagset), state_(XS_NONE), wa_(PwrNlp::Whitespace::Newline)
	, sbuf_(), tok_(NULL), sent_(NULL), chunk_(NULL), obuf_(obuf)
	, disamb_only_(disamb_only), disamb_sh_(disamb_sh)
{
}

XcesReaderImpl::~XcesReaderImpl()
{
	delete tok_;
	delete sent_;
	delete chunk_;
}

void XcesReaderImpl::on_start_element(const Glib::ustring &name,
		const AttributeList& attributes)
{
	if (name == "chunk") {
		std::string type;
		foreach (const Attribute& a, attributes) {
			if (a.name == "type") {
				type = a.value;
			}
		}
		if (state_ == XS_NONE) {
			if (type == "s") {
				throw XcesError("Top level <chunk> is type=\"s\"");
			}
			state_ = XS_CHUNK;
			chunk_ = new Chunk;
			foreach (const Attribute& a, attributes) {
				chunk_->set_attribute(a.name, a.value);
			}
		} else if (state_ == XS_CHUNK) {
			if (type != "s") {
				throw XcesError("Sub level <chunk> not type=\"s\"");
			}
			state_ = XS_SENTENCE;
			sent_ = new Sentence;
		} else {
			throw XcesError("Unexpected <chunk>");
		}
	} else if (state_ == XS_SENTENCE && name == "tok") {
		state_ = XS_TOK;
		tok_ = new Token();
		tok_->set_wa(wa_);
		wa_ = PwrNlp::Whitespace::Space;
	} else if (state_ == XS_TOK && name == "orth") {
		state_ = XS_ORTH;
		grab_characters_ = true;
		clear_buf();
	} else if (state_ == XS_TOK && name == "lex") {
		assert(tok_ != NULL);
		bool is_disamb = false;
		if (!disamb_sh_) {
			foreach (const Attribute& a, attributes) {
				if (a.name == "disamb" && a.value == "1") {
					is_disamb = true;
				}
			}
		} else {
			is_disamb = true;
			foreach (const Attribute& a, attributes) {
				if (a.name == "disamb_sh" && a.value == "0") {
					is_disamb = false;
				}
			}
		}
		if (!disamb_only_ || is_disamb) {
			tok_->add_lexeme(Lexeme());
			tok_->lexemes().back().set_disamb(is_disamb);
			state_ = XS_LEX;
		}
	} else if (state_ == XS_LEX && name == "base") {
		state_ = XS_LEMMA;
		grab_characters_ = true;
		clear_buf();
	} else if (state_ == XS_LEX && name == "ctag") {
		state_ = XS_TAG;
		grab_characters_ = true;
		clear_buf();
	} else if (name == "ns") {
		wa_ = PwrNlp::Whitespace::None;
	}
}

void XcesReaderImpl::on_end_element(const Glib::ustring &name)
{
	if (state_ == XS_ORTH && name == "orth") {
		tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
		grab_characters_ = false;
		state_ = XS_TOK;
	} else if (state_ == XS_LEMMA && name == "base") {
		tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
		grab_characters_ = false;
		state_ = XS_LEX;
	} else if (state_ == XS_TAG && name == "ctag") {
		Tag tag = tagset_.parse_simple_tag(get_buf(), true);
		tok_->lexemes().back().set_tag(tag);
		grab_characters_ = false;
		state_ = XS_LEX;
	} else if (state_ == XS_LEX && name == "lex") {
		state_ = XS_TOK;
	} else if (state_ == XS_TOK && name == "tok") {
		sent_->append(tok_);
		tok_ = NULL;
		state_ = XS_SENTENCE;
	} else if (state_ == XS_SENTENCE && name == "chunk") {
		chunk_->append(sent_);
		sent_ = NULL;
		state_ = XS_CHUNK;
	} else if (state_ == XS_CHUNK && name == "chunk") {
		obuf_.push_back(chunk_);
		chunk_ = NULL;
		state_ = XS_NONE;
	}
}

} /* end ns Corpus2 */