part 1 of the xml readers refactoring -- extract xmlreader from xces and ccl readers

c03647e8 · ilor · 3061dc24 · c03647e8 · c03647e8 · c03647e8
Commit c03647e8 authored 14 years ago by ilor
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -68,6 +68,7 @@ SET(libcorpus2_STAT_SRC
 	io/xcesreader.cpp
 	io/xcesvalidate.cpp
 	io/xceswriter.cpp
+	io/xmlreader.cpp
 	util/settings.cpp
 	util/symboldictionary.cpp
 	util/tokentimer.cpp

--- a/libcorpus2/io/cclreader.cpp
+++ b/libcorpus2/io/cclreader.cpp
@@ -15,7 +15,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 */

 #include <libcorpus2/io/cclreader.h>
-#include <libcorpus2/io/sax.h>
+#include <libcorpus2/io/xmlreader.h>
 #include <libpwrutils/foreach.h>
 #include <libxml++/libxml++.h>
 #include <libxml2/libxml/parser.h>
@@ -26,7 +26,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.

 namespace Corpus2 {

-class CclReaderImpl : public BasicSaxParser
+class CclReaderImpl : public XmlReader
 {
 public:
 	CclReaderImpl(const Tagset& tagset,
@@ -36,29 +36,23 @@ public:
 	~CclReaderImpl();

 protected:
-	void on_start_element(const Glib::ustring & name,
-			const AttributeList& attributes);
-	void on_end_element(const Glib::ustring & name);
+	bool process_start_element(const Glib::ustring & name,
+		const AttributeList& attributes);

-	void finish_sentence();
+	bool process_end_element(const Glib::ustring& name);

-	const Tagset& tagset_;
+	void start_chunk(const AttributeList &attributes);

-	enum state_t { XS_NONE, XS_CHUNK, XS_SENTENCE, XS_TOK, XS_ANN, XS_ORTH, XS_LEX,
-			XS_LEMMA, XS_TAG, XS_REL };
-	state_t state_;
+	void start_sentence(const AttributeList &attributes);

-	bool chunkless_;
+	void start_token(const AttributeList &attributes);

-	bool out_of_chunk_;
+	void finish_token();

-	PwrNlp::Whitespace::Enum wa_;
+	static const int STATE_ANN = 901;
+	static const int STATE_REL = 902;

-	Glib::ustring sbuf_;
-
-	Token* tok_;
-
-	boost::shared_ptr<AnnotatedSentence> sent_;
+	boost::shared_ptr<AnnotatedSentence> ann_sent_;

 	std::string ann_chan_;

@@ -69,14 +63,6 @@ protected:
 	token_ann_t token_anns_;

 	std::set<std::string> token_ann_heads_;
-
-	boost::shared_ptr<Chunk> chunk_;
-
-	std::deque< boost::shared_ptr<Chunk> >& obuf_;
-
-	bool disamb_only_;
-
-	bool disamb_sh_;
 };

 CclReader::CclReader(const Tagset& tagset, std::istream& is,
@@ -121,58 +107,53 @@ void CclReader::ensure_more()
 CclReaderImpl::CclReaderImpl(const Tagset& tagset,
 		std::deque< boost::shared_ptr<Chunk> >& obuf,
 		bool disamb_only, bool disamb_sh)
-	: BasicSaxParser()
-	, tagset_(tagset), state_(XS_NONE), chunkless_(false), out_of_chunk_(false)
-	, wa_(PwrNlp::Whitespace::Newline)
-	, sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf)
-	, disamb_only_(disamb_only), disamb_sh_(disamb_sh)
+	: XmlReader(tagset, obuf)
 {
+	XmlReader::set_disamb_only(disamb_only);
+	XmlReader::set_disamb_sh(disamb_sh);
+	sentence_tag_name_ = "sentence";
 }

 CclReaderImpl::~CclReaderImpl()
 {
-	delete tok_;
 }

-void CclReaderImpl::on_start_element(const Glib::ustring &name,
-		const AttributeList& attributes)
+void CclReaderImpl::start_chunk(const AttributeList& attributes)
 {
-	if (name == "chunk") {
-		std::string type;
-		foreach (const Attribute& a, attributes) {
-			if (a.name == "type") {
-				type = a.value;
-			}
-		}
-		if (type == "s") {
-			throw XcesError("Trying to parse XCES as CCL (<chunk type=\"s\">)");
-		} else if (state_ == XS_NONE) {
-			chunk_ = boost::make_shared<Chunk>();
-			state_ = XS_CHUNK;
-			foreach (const Attribute& a, attributes) {
-				chunk_->set_attribute(a.name, a.value);
-			}
-		} else if (state_ == XS_CHUNK) {
-			throw XcesError("Nested <chunk>");
-		} else {
-			throw XcesError("Unexpected <chunk>");
-		}
-	} else if (state_ == XS_CHUNK && name == "sentence") {
-		state_ = XS_SENTENCE;
-		sent_ = boost::make_shared<AnnotatedSentence>();
-	} else if (state_ == XS_SENTENCE && name == "tok") {
-		state_ = XS_TOK;
-		tok_ = new Token();
-		tok_->set_wa(wa_);
-		wa_ = PwrNlp::Whitespace::Space;
-		token_anns_.clear();
-		token_ann_heads_.clear();
-	} else if (state_ == XS_TOK && name == "orth") {
-		state_ = XS_ORTH;
-		grab_characters_ = true;
-		clear_buf();
-	} else if (state_ == XS_TOK && name == "ann") {
-		state_ = XS_ANN;
+	chunk_ = boost::make_shared<Chunk>();
+	std::string type = get_type_from_attributes(attributes);
+	// A type="s" chunk is the XCES sentence marker: reject it as wrong format.
+	if (type == "s") {
+		throw XcesError("Trying to parse XCES as CCL (<chunk type=\"s\">)");
+	}
+	foreach (const Attribute& a, attributes) {
+		chunk_->set_attribute(a.name, a.value);
+	}
+	state_ = STATE_CHUNK;
+}
+
+
+
+void CclReaderImpl::start_sentence(const AttributeList& /*attributes*/)
+{
+	ann_sent_ = boost::make_shared<AnnotatedSentence>();
+	sent_ = ann_sent_;
+	state_ = STATE_SENTENCE;
+}
+
+
+void CclReaderImpl::start_token(const AttributeList& attributes)
+{
+	XmlReader::start_token(attributes);
+	token_anns_.clear();
+	token_ann_heads_.clear();
+}
+
+bool CclReaderImpl::process_start_element(const Glib::ustring & name,
+	const AttributeList& attributes)
+{
+	if (state_ == STATE_TOK && name == "ann") {
+		state_ = STATE_ANN;
 		grab_characters_ = true;
 		clear_buf();
 		ann_chan_ = "";
@@ -187,102 +168,40 @@ void CclReaderImpl::on_start_element(const Glib::ustring &name,
 		if (ann_chan_.empty()) {
 			throw XcesError("<ann> with no channel name");
 		}
-	} else if (state_ == XS_TOK && name == "lex") {
-		assert(tok_ != NULL);
-		bool is_disamb = false;
-		foreach (const Attribute& a, attributes) {
-			if (a.name == "disamb" && a.value == "1") {
-				is_disamb = true;
-			}
-		}
-		if (!disamb_only_ || is_disamb) {
-			tok_->add_lexeme(Lexeme());
-			tok_->lexemes().back().set_disamb(is_disamb);
-			state_ = XS_LEX;
-		}
-	} else if (state_ == XS_LEX && name == "base") {
-		state_ = XS_LEMMA;
-		grab_characters_ = true;
-		clear_buf();
-	} else if (state_ == XS_LEX && name == "ctag") {
-		state_ = XS_TAG;
-		grab_characters_ = true;
-		clear_buf();
-	} else if (name == "ns") {
-		wa_ = PwrNlp::Whitespace::None;
-	} else if (name == "tok" && state_ == XS_NONE) {
-		std::cerr << "Warning: out-of-chunk token, assuming sentence start on line ";
-		std::cerr << this->context_->input->line << "\n";
-		chunkless_ = true;
-		out_of_chunk_ = true;
-		chunk_ = boost::make_shared<Chunk>();
-		sent_ = boost::make_shared<AnnotatedSentence>();
-		state_ = XS_TOK;
-		tok_ = new Token();
-		tok_->set_wa(wa_);
-		wa_ = PwrNlp::Whitespace::Space;
-	}
-}
-
-void CclReaderImpl::finish_sentence()
-{
-	chunk_->append(sent_);
-	sent_.reset();
-	if (chunkless_) {
-		obuf_.push_back(chunk_);
-		chunk_.reset();
-		state_ = XS_NONE;
-		chunkless_ = false;
+		return true;
 	} else {
-		state_ = XS_CHUNK;
+		return false;
 	}
 }

-void CclReaderImpl::on_end_element(const Glib::ustring &name)
+bool CclReaderImpl::process_end_element(const Glib::ustring & name)
 {
-	if (state_ == XS_ORTH && name == "orth") {
-		tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
-		grab_characters_ = false;
-		state_ = XS_TOK;
-	} else if (state_ == XS_ANN && name == "ann") {
+	if (state_ == STATE_ANN && name == "ann") {
 		std::string buf = get_buf();
 		grab_characters_ = false;
 		int segid = atoi(buf.c_str());
-		if (!sent_->has_channel(ann_chan_)) {
-			sent_->create_channel(ann_chan_);
+		if (!ann_sent_->has_channel(ann_chan_)) {
+			ann_sent_->create_channel(ann_chan_);
 		}
 		if (segid > 0) {
 			token_anns_.insert(std::make_pair(ann_chan_, segid));
 			token_ann_heads_.insert(ann_chan_);
 		}
-		state_ = XS_TOK;
-	} else if (state_ == XS_LEMMA && name == "base") {
-		tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
-		grab_characters_ = false;
-		state_ = XS_LEX;
-	} else if (state_ == XS_TAG && name == "ctag") {
-		Tag tag = tagset_.parse_simple_tag(get_buf(), true);
-		tok_->lexemes().back().set_tag(tag);
-		grab_characters_ = false;
-		state_ = XS_LEX;
-	} else if (state_ == XS_LEX && name == "lex") {
-		state_ = XS_TOK;
-	} else if (state_ == XS_TOK && name == "tok") {
-		sent_->append(tok_);
-		tok_ = NULL;
-		state_ = XS_SENTENCE;
-		foreach (const token_ann_t::value_type& v, token_anns_) {
-			sent_->get_channel(v.first).set_segment_at(sent_->size() - 1, v.second);
-			if (token_ann_heads_.find(v.first) != token_ann_heads_.end()) {
-				sent_->get_channel(v.first).set_head_at(sent_->size() - 1, true);
-			}
+		state_ = STATE_TOK;
+		return true;
+	} else {
+		return false;
+	}
+}
+
+void CclReaderImpl::finish_token()
+{
+	XmlReader::finish_token();
+	foreach (const token_ann_t::value_type& v, token_anns_) {
+		ann_sent_->get_channel(v.first).set_segment_at(sent_->size() - 1, v.second);
+		if (token_ann_heads_.find(v.first) != token_ann_heads_.end()) {
+			ann_sent_->get_channel(v.first).set_head_at(sent_->size() - 1, true);
 		}
-	} else if (state_ == XS_SENTENCE && name == "sentence") {
-		finish_sentence();
-	} else if (state_ == XS_CHUNK && name == "chunk") {
-		obuf_.push_back(chunk_);
-		chunk_.reset();
-		state_ = XS_NONE;
 	}
 }


--- a/libcorpus2/io/xcesreader.cpp
+++ b/libcorpus2/io/xcesreader.cpp
@@ -15,7 +15,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 */

 #include <libcorpus2/io/xcesreader.h>
-#include <libcorpus2/io/sax.h>
+#include <libcorpus2/io/xmlreader.h>
 #include <libpwrutils/foreach.h>
 #include <libxml++/libxml++.h>
 #include <libxml2/libxml/parser.h>
@@ -24,7 +24,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.

 namespace Corpus2 {

-class XcesReaderImpl : public BasicSaxParser
+class XcesReaderImpl : public XmlReader
 {
 public:
 	XcesReaderImpl(const Tagset& tagset,
@@ -34,37 +34,6 @@ public:
 	~XcesReaderImpl();

 protected:
-	void on_start_element(const Glib::ustring & name,
-			const AttributeList& attributes);
-	void on_end_element(const Glib::ustring & name);
-
-	void finish_sentence();
-
-	const Tagset& tagset_;
-
-	enum state_t { XS_NONE, XS_CHUNK, XS_SENTENCE, XS_TOK, XS_ORTH, XS_LEX,
-			XS_LEMMA, XS_TAG };
-	state_t state_;
-
-	bool chunkless_;
-
-	bool out_of_chunk_;
-
-	PwrNlp::Whitespace::Enum wa_;
-
-	Glib::ustring sbuf_;
-
-	Token* tok_;
-
-	Sentence::Ptr sent_;
-
-	boost::shared_ptr<Chunk> chunk_;
-
-	std::deque< boost::shared_ptr<Chunk> >& obuf_;
-
-	bool disamb_only_;
-
-	bool disamb_sh_;
 };

 XcesReader::XcesReader(const Tagset& tagset, std::istream& is,
@@ -109,153 +78,15 @@ void XcesReader::ensure_more()
 XcesReaderImpl::XcesReaderImpl(const Tagset& tagset,
 		std::deque< boost::shared_ptr<Chunk> >& obuf,
 		bool disamb_only, bool disamb_sh)
-	: BasicSaxParser()
-	, tagset_(tagset), state_(XS_NONE), chunkless_(false), out_of_chunk_(false)
-	, wa_(PwrNlp::Whitespace::Newline)
-	, sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf)
-	, disamb_only_(disamb_only), disamb_sh_(disamb_sh)
+	: XmlReader(tagset, obuf)
 {
+	XmlReader::set_disamb_only(disamb_only);
+	XmlReader::set_disamb_sh(disamb_sh);
+	sentence_tag_name_ = "chunk";
 }

 XcesReaderImpl::~XcesReaderImpl()
 {
-	delete tok_;
-}
-
-void XcesReaderImpl::on_start_element(const Glib::ustring &name,
-		const AttributeList& attributes)
-{
-	if (name == "chunk") {
-		std::string type;
-		foreach (const Attribute& a, attributes) {
-			if (a.name == "type") {
-				type = a.value;
-			}
-		}
-		if (out_of_chunk_) {
-			finish_sentence();
-			out_of_chunk_ = false;
-		}
-		if (state_ == XS_NONE) {
-			if (type == "s") {
-				//throw XcesError("Top level <chunk> is type=\"s\"");
-				state_ = XS_SENTENCE;
-				chunkless_ = true;
-				chunk_ = boost::make_shared<Chunk>();
-				sent_ = boost::make_shared<Sentence>();
-			} else {
-				chunk_ = boost::make_shared<Chunk>();
-				state_ = XS_CHUNK;
-				foreach (const Attribute& a, attributes) {
-					chunk_->set_attribute(a.name, a.value);
-				}
-			}
-		} else if (state_ == XS_CHUNK) {
-			if (type != "s") {
-				throw XcesError("Sub level <chunk> not type=\"s\"");
-			}
-			state_ = XS_SENTENCE;
-			sent_ = boost::make_shared<Sentence>();
-		} else {
-			throw XcesError("Unexpected <chunk>");
-		}
-	} else if (state_ == XS_SENTENCE && name == "tok") {
-		state_ = XS_TOK;
-		tok_ = new Token();
-		tok_->set_wa(wa_);
-		wa_ = PwrNlp::Whitespace::Space;
-	} else if (state_ == XS_TOK && name == "orth") {
-		state_ = XS_ORTH;
-		grab_characters_ = true;
-		clear_buf();
-	} else if (state_ == XS_TOK && name == "lex") {
-		assert(tok_ != NULL);
-		bool is_disamb = false;
-		if (!disamb_sh_) {
-			foreach (const Attribute& a, attributes) {
-				if (a.name == "disamb" && a.value == "1") {
-					is_disamb = true;
-				}
-			}
-		} else {
-			is_disamb = true;
-			foreach (const Attribute& a, attributes) {
-				if (a.name == "disamb_sh" && a.value == "0") {
-					is_disamb = false;
-				}
-			}
-		}
-		if (!disamb_only_ || is_disamb) {
-			tok_->add_lexeme(Lexeme());
-			tok_->lexemes().back().set_disamb(is_disamb);
-			state_ = XS_LEX;
-		}
-	} else if (state_ == XS_LEX && name == "base") {
-		state_ = XS_LEMMA;
-		grab_characters_ = true;
-		clear_buf();
-	} else if (state_ == XS_LEX && name == "ctag") {
-		state_ = XS_TAG;
-		grab_characters_ = true;
-		clear_buf();
-	} else if (name == "ns") {
-		wa_ = PwrNlp::Whitespace::None;
-	} else if (name == "tok" && state_ == XS_NONE) {
-		std::cerr << "Warning: out-of-chunk token, assuming sentence start on line ";
-		std::cerr << this->context_->input->line << "\n";
-		chunkless_ = true;
-		out_of_chunk_ = true;
-		chunk_ = boost::make_shared<Chunk>();
-		sent_ = boost::make_shared<Sentence>();
-		state_ = XS_TOK;
-		tok_ = new Token();
-		tok_->set_wa(wa_);
-		wa_ = PwrNlp::Whitespace::Space;
-	}
-}
-
-void XcesReaderImpl::finish_sentence()
-{
-	chunk_->append(sent_);
-	sent_.reset();
-	if (chunkless_) {
-		obuf_.push_back(chunk_);
-		chunk_.reset();
-		state_ = XS_NONE;
-		chunkless_ = false;
-	} else {
-		state_ = XS_CHUNK;
-	}
-}
-
-void XcesReaderImpl::on_end_element(const Glib::ustring &name)
-{
-	if (state_ == XS_ORTH && name == "orth") {
-		tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
-		grab_characters_ = false;
-		state_ = XS_TOK;
-	} else if (state_ == XS_LEMMA && name == "base") {
-		tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
-		grab_characters_ = false;
-		state_ = XS_LEX;
-	} else if (state_ == XS_TAG && name == "ctag") {
-		Tag tag = tagset_.parse_simple_tag(get_buf(), true);
-		tok_->lexemes().back().set_tag(tag);
-		grab_characters_ = false;
-		state_ = XS_LEX;
-	} else if (state_ == XS_LEX && name == "lex") {
-		state_ = XS_TOK;
-	} else if (state_ == XS_TOK && name == "tok") {
-		sent_->append(tok_);
-		tok_ = NULL;
-		state_ = XS_SENTENCE;
-	} else if (state_ == XS_SENTENCE && name == "chunk") {
-		finish_sentence();
-	} else if (state_ == XS_CHUNK && name == "chunk") {
-		obuf_.push_back(chunk_);
-		chunk_.reset();
-		state_ = XS_NONE;
-	}
 }

 } /* end ns Corpus2 */
--- a/libcorpus2/io/xmlreader.cpp
+++ b/libcorpus2/io/xmlreader.cpp
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. 
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#include <libcorpus2/io/xmlreader.h>
+#include <libpwrutils/foreach.h>
+#include <libxml++/libxml++.h>
+#include <libxml2/libxml/parser.h>
+#include <boost/make_shared.hpp>
+#include <fstream>
+
+namespace Corpus2 {
+
+
+/**
+ * Create an idle reader: no chunk, sentence or token is open, the first
+ * token will get Newline as its preceding whitespace, both disamb options
+ * are off and both warning flags are on.
+ *
+ * @param tagset tagset used to parse the contents of <ctag> elements
+ * @param obuf   output buffer that receives finished chunks; it is kept by
+ *               reference
+ */
+XmlReader::XmlReader(const Tagset& tagset,
+		std::deque< boost::shared_ptr<Chunk> >& obuf)
+	: BasicSaxParser(),
+	  tagset_(tagset),
+	  state_(STATE_NONE),
+	  chunkless_(false),
+	  out_of_chunk_(false),
+	  wa_(PwrNlp::Whitespace::Newline),
+	  sbuf_(),
+	  tok_(NULL),
+	  sent_(),
+	  chunk_(),
+	  obuf_(obuf),
+	  disamb_only_(false),
+	  disamb_sh_(false),
+	  warn_on_inconsistent_(true),
+	  warn_on_unexpected_(true)
+{
+}
+
+/**
+ * Release a token that was started but never passed on to a sentence
+ * (finish_token() resets tok_ to NULL once it has appended the token).
+ */
+XmlReader::~XmlReader()
+{
+	delete tok_;
+}
+
+/**
+ * Look up the "type" attribute of an element.
+ *
+ * @param attributes attribute list of the element being opened
+ * @return value of the "type" attribute (the last one, should it occur more
+ *         than once), or an empty string when there is none
+ */
+std::string XmlReader::get_type_from_attributes(const AttributeList& attributes) const
+{
+	std::string found;
+	for (AttributeList::const_iterator it = attributes.begin();
+			it != attributes.end(); ++it) {
+		if (it->name == "type") {
+			found = it->value;
+		}
+	}
+	return found;
+}
+
+
+/**
+ * SAX start-tag callback driving the shared reader state machine.
+ *
+ * Tags are dispatched on the pair (current state, tag name):
+ *  - <chunk>, the sentence tag and <tok> open the respective structure via
+ *    the virtual start_* hooks,
+ *  - <orth>, <base> and <ctag> switch on character grabbing,
+ *  - <lex> goes through start_lexeme(),
+ *  - <ns> (in any state) makes the next token have no preceding whitespace,
+ *  - a <tok> met outside of any chunk opens an implicit chunk and sentence,
+ *  - top-level <cesAna> and <chunkList> are accepted and ignored,
+ *  - anything else is offered to process_start_element(); if that declines
+ *    too, a warning is printed when warn_on_unexpected_ is set.
+ *
+ * @param name       name of the element being opened
+ * @param attributes attributes of the element
+ */
+void XmlReader::on_start_element(const Glib::ustring &name,
+		const AttributeList& attributes)
+{
+	if (out_of_chunk_ && state_ == STATE_SENTENCE && name == "chunk") {
+		// Out-of-chunk tokens were being gathered into an implicit sentence
+		// and a real <chunk> follows: close the implicit sentence first, so
+		// that the state is back at STATE_NONE and the chunk is accepted
+		// by the dispatch below.
+		finish_sentence();
+		out_of_chunk_ = false;
+	}
+
+	if (state_ == STATE_NONE && name == "chunk") {
+		start_chunk(attributes);
+	} else if (state_ == STATE_CHUNK && name == sentence_tag_name_) {
+		start_sentence(attributes);
+	} else if (state_ == STATE_SENTENCE && name == "tok") {
+		start_token(attributes);
+	} else if (state_ == STATE_TOK && name == "orth") {
+		state_ = STATE_ORTH;
+		grab_characters_ = true;
+		clear_buf();
+	} else if (state_ == STATE_TOK && name == "lex") {
+		start_lexeme(attributes);
+	} else if (state_ == STATE_LEX && name == "base") {
+		state_ = STATE_LEMMA;
+		grab_characters_ = true;
+		clear_buf();
+	} else if (state_ == STATE_LEX && name == "ctag") {
+		state_ = STATE_TAG;
+		grab_characters_ = true;
+		clear_buf();
+	} else if (name == "ns") {
+		wa_ = PwrNlp::Whitespace::None;
+	} else if (state_ == STATE_NONE && name == "tok") {
+		if (warn_on_inconsistent_) {
+			std::cerr << "Warning: out-of-chunk token, assuming sentence start on line ";
+			std::cerr << this->context_->input->line << "\n";
+		}
+		// Open the implicit chunk and sentence *before* raising the
+		// out-of-chunk flags: the base start_chunk() reads a set
+		// out_of_chunk_ as "flush the pending sentence", and at this point
+		// there is neither a sentence nor a chunk to flush.
+		out_of_chunk_ = false;
+		AttributeList no_attributes;
+		start_chunk(no_attributes);
+		// The base start_sentence() only accepts elements with type="s".
+		AttributeList sentence_attributes;
+		sentence_attributes.push_back(Attribute("type", "s"));
+		start_sentence(sentence_attributes);
+		chunkless_ = true;
+		out_of_chunk_ = true;
+		start_token(attributes);
+	} else if (state_ == STATE_NONE && name == "cesAna") {
+		//nop
+	} else if (state_ == STATE_NONE && name == "chunkList") {
+		//nop
+	} else if (process_start_element(name, attributes)) {
+		//nop
+	} else if (warn_on_unexpected_) {
+		std::cerr << "Unexpected tag <" << name << "> on line ";
+		std::cerr << this->context_->input->line << " (" << state_ << ")\n";
+	}
+}
+
+/**
+ * Hook for start tags that on_start_element() does not handle itself.
+ * The base implementation handles nothing.
+ *
+ * @return true if the tag was consumed; always false here
+ */
+bool XmlReader::process_start_element(const Glib::ustring& /*name*/,
+		const AttributeList& /*attributes*/)
+{
+	return false;
+}
+
+/**
+ * Hook for end tags that on_end_element() does not handle itself.
+ * The base implementation handles nothing.
+ *
+ * @return true if the tag was consumed; always false here
+ */
+bool XmlReader::process_end_element(const Glib::ustring& /*name*/)
+{
+	return false;
+}
+
+
+/**
+ * Open a new chunk for a top-level <chunk> element.
+ *
+ * A sentence left pending by out-of-chunk tokens is finished first. A chunk
+ * with type="s" is itself a sentence: it is wrapped in a fresh chunk that
+ * gets none of the attributes and the reader is marked chunkless, so that
+ * finishing the sentence also emits the chunk. Any other chunk receives
+ * all attributes of the element and the reader waits for sentences.
+ *
+ * @param attributes attributes of the <chunk> element
+ */
+void XmlReader::start_chunk(const AttributeList& attributes)
+{
+	if (out_of_chunk_) {
+		finish_sentence();
+		out_of_chunk_ = false;
+	}
+	const std::string chunk_type = get_type_from_attributes(attributes);
+	chunk_ = boost::make_shared<Chunk>();
+
+	if (chunk_type == "s") {
+		// top-level chunk is a sentence
+		start_sentence(attributes);
+		chunkless_ = true;
+		return;
+	}
+
+	for (AttributeList::const_iterator it = attributes.begin();
+			it != attributes.end(); ++it) {
+		chunk_->set_attribute(it->name, it->value);
+	}
+	state_ = STATE_CHUNK;
+}
+
+/**
+ * Open a new plain sentence and switch to STATE_SENTENCE.
+ *
+ * @param attributes attributes of the sentence element; its "type" must
+ *                   be "s"
+ * @throws XcesError when the "type" attribute is missing or not "s"
+ */
+void XmlReader::start_sentence(const AttributeList& attributes)
+{
+	if (get_type_from_attributes(attributes) != "s") {
+		throw XcesError("Sub level <chunk> not type=\"s\"");
+	}
+	sent_ = boost::make_shared<Corpus2::Sentence>();
+	state_ = STATE_SENTENCE;
+}
+
+/**
+ * Begin a new token and switch to STATE_TOK.
+ *
+ * The token takes the pending whitespace amount (wa_); the amount for the
+ * following token is then reset to Space. The attributes are not used by
+ * the base implementation.
+ */
+void XmlReader::start_token(const AttributeList& /*attributes*/)
+{
+	state_ = STATE_TOK;
+
+	tok_ = new Token();
+	tok_->set_wa(wa_);
+
+	wa_ = PwrNlp::Whitespace::Space;
+}
+
+/**
+ * Handle a <lex> start tag inside a token.
+ *
+ * Whether the lexeme counts as disambiguated depends on disamb_sh_:
+ *  - off: the lexeme is disamb only if it carries disamb="1",
+ *  - on:  the lexeme is disamb unless it carries disamb_sh="0".
+ *
+ * When disamb_only_ is set and the lexeme is not disamb, nothing is added
+ * and the state stays at STATE_TOK. The <base> and <ctag> children of such
+ * a lexeme then match no branch of on_start_element() and are reported as
+ * unexpected tags when warn_on_unexpected_ is set.
+ *
+ * @param attributes attributes of the <lex> element
+ */
+void XmlReader::start_lexeme(const AttributeList& attributes)
+{
+	assert(tok_ != NULL);
+
+	// Attribute name/value pair that flips the default verdict.
+	const char* const marker_name = disamb_sh_ ? "disamb_sh" : "disamb";
+	const char* const marker_value = disamb_sh_ ? "0" : "1";
+
+	bool is_disamb = disamb_sh_;
+	for (AttributeList::const_iterator it = attributes.begin();
+			it != attributes.end(); ++it) {
+		if (it->name == marker_name && it->value == marker_value) {
+			is_disamb = !disamb_sh_;
+		}
+	}
+
+	if (disamb_only_ && !is_disamb) {
+		return;
+	}
+	tok_->add_lexeme(Lexeme());
+	tok_->lexemes().back().set_disamb(is_disamb);
+	state_ = STATE_LEX;
+}
+
+/**
+ * Close the current chunk: push it onto the output buffer, drop the
+ * reader's reference to it and return to STATE_NONE.
+ *
+ * A chunk must be open (checked with assert).
+ */
+void XmlReader::finish_chunk()
+{
+	assert(chunk_);
+	obuf_.push_back(chunk_);
+	chunk_.reset();
+	state_ = STATE_NONE;
+}
+
+/**
+ * Close the current sentence and append it to the current chunk.
+ *
+ * In the regular case the reader goes back to STATE_CHUNK and waits for
+ * more sentences. In chunkless mode the chunk exists only to carry this
+ * one sentence, so it is emitted to the output buffer right away and the
+ * reader returns to STATE_NONE with the chunkless flag cleared.
+ *
+ * A chunk must be open (checked with assert).
+ */
+void XmlReader::finish_sentence()
+{
+	assert(chunk_);
+	chunk_->append(sent_);
+	sent_.reset();
+
+	if (!chunkless_) {
+		state_ = STATE_CHUNK;
+		return;
+	}
+
+	obuf_.push_back(chunk_);
+	chunk_.reset();
+	state_ = STATE_NONE;
+	chunkless_ = false;
+}
+
+/**
+ * Close the current token: append it to the current sentence, forget the
+ * pointer (so the destructor will not delete it) and go back to
+ * STATE_SENTENCE.
+ *
+ * A sentence must be open (checked with assert).
+ */
+void XmlReader::finish_token()
+{
+	assert(sent_);
+	sent_->append(tok_);
+	state_ = STATE_SENTENCE;
+	tok_ = NULL;
+}
+
+/**
+ * SAX end-tag callback driving the shared reader state machine.
+ *
+ * Closing <orth>, <base> and <ctag> stores the grabbed character data in
+ * the current token / lexeme (the tag text is parsed with the tagset) and
+ * stops grabbing. Closing <lex>, <tok>, the sentence tag and <chunk> steps
+ * the state back, using the virtual finish_* hooks for the latter three.
+ * Any other end tag is offered to process_end_element().
+ *
+ * @param name name of the element being closed
+ */
+void XmlReader::on_end_element(const Glib::ustring &name)
+{
+	if (state_ == STATE_ORTH && name == "orth") {
+		tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
+		grab_characters_ = false;
+		state_ = STATE_TOK;
+	} else if (state_ == STATE_LEMMA && name == "base") {
+		tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
+		grab_characters_ = false;
+		state_ = STATE_LEX;
+	} else if (state_ == STATE_TAG && name == "ctag") {
+		Tag tag = tagset_.parse_simple_tag(get_buf(), true);
+		tok_->lexemes().back().set_tag(tag);
+		grab_characters_ = false;
+		state_ = STATE_LEX;
+	} else if (state_ == STATE_LEX && name == "lex") {
+		state_ = STATE_TOK;
+	} else if (state_ == STATE_TOK && name == "tok") {
+		finish_token();
+	} else if (state_ == STATE_SENTENCE && name == sentence_tag_name_) {
+		finish_sentence();
+	} else if (state_ == STATE_CHUNK && name == "chunk") {
+		finish_chunk();
+	} else {
+		// Not a core tag for the current state: let the subclass look at
+		// it. The result is not used; unmatched end tags are ignored.
+		process_end_element(name);
+	}
+}
+
+} /* end ns Corpus2 */
--- a/libcorpus2/io/xmlreader.h
+++ b/libcorpus2/io/xmlreader.h
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. 
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#ifndef LIBCORPUS2_IO_XMLREADER_H
+#define LIBCORPUS2_IO_XMLREADER_H
+
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/xces.h>
+#include <libcorpus2/chunk.h>
+#include <deque>
+#include <boost/scoped_ptr.hpp>
+#include <libcorpus2/io/sax.h>
+#include <libpwrutils/foreach.h>
+
+namespace Corpus2 {
+
+/**
+ * Base class for SAX readers of XCES-like corpus XML.
+ *
+ * Implements the state machine shared by the XCES and CCL readers:
+ * chunk -> sentence -> tok -> orth / lex -> base, ctag. Finished chunks are
+ * pushed onto the output buffer given to the constructor. Subclasses adapt
+ * the reader by overriding the start_* / finish_* hooks, by handling extra
+ * tags in process_start_element() / process_end_element(), and by setting
+ * sentence_tag_name_.
+ */
+class XmlReader : public BasicSaxParser
+{
+public:
+	/// Create a reader that parses tags with tagset and outputs to obuf
+	XmlReader(const Tagset& tagset,
+		std::deque< boost::shared_ptr<Chunk> >& obuf);
+
+	/// Deletes a token left under construction, if any
+	virtual ~XmlReader();
+
+	/// Set the flag to only read disamb lexemes
+	void set_disamb_only(bool v) { disamb_only_ = v; }
+
+	/// Set the flag to use disamb_sh markings instead of disamb ones
+	void set_disamb_sh(bool v) { disamb_sh_ = v; }
+
+protected:
+	/// Value of the "type" attribute, or an empty string if there is none
+	std::string get_type_from_attributes(const AttributeList& attributes) const;
+
+	/// SAX start-tag callback, dispatches on the current state and tag name
+	void on_start_element(const Glib::ustring & name,
+			const AttributeList& attributes);
+
+	/// SAX end-tag callback, dispatches on the current state and tag name
+	void on_end_element(const Glib::ustring & name);
+
+	/// Hook for start tags not handled by on_start_element; returns true
+	/// if the tag was consumed. The base version consumes nothing.
+	virtual bool process_start_element(const Glib::ustring & name,
+			const AttributeList& attributes);
+
+	/// Hook for end tags not handled by on_end_element; returns true
+	/// if the tag was consumed. The base version consumes nothing.
+	virtual bool process_end_element(const Glib::ustring & name);
+
+	/// Called for a top-level chunk element
+	virtual void start_chunk(const AttributeList& attributes);
+
+	/// Called for a sentence element inside a chunk
+	virtual void start_sentence(const AttributeList& attributes);
+
+	/// Called for a tok element inside a sentence
+	virtual void start_token(const AttributeList& attributes);
+
+	/// Called for a lex element inside a token
+	void start_lexeme(const AttributeList& attributes);
+
+	/// Called when a chunk ends: emits it to the output buffer
+	virtual void finish_chunk();
+
+	/// Called when a sentence ends: appends it to the current chunk
+	virtual void finish_sentence();
+
+	/// Called when a token ends: appends it to the current sentence
+	virtual void finish_token();
+
+	/// Tagset used to parse tag strings
+	const Tagset& tagset_;
+
+	/// Parser states shared by all readers; subclasses may define more
+	static const int STATE_NONE = 0;
+	static const int STATE_CHUNK = 1;
+	static const int STATE_SENTENCE = 2;
+	static const int STATE_TOK = 3;
+	static const int STATE_ORTH = 4;
+	static const int STATE_LEX = 5;
+	static const int STATE_LEMMA = 6;
+	static const int STATE_TAG = 7;
+
+	/// Current parser state (one of the STATE_* values)
+	int state_;
+
+	/// Set when the current chunk only wraps a single sentence, so that
+	/// finishing the sentence emits the chunk as well
+	bool chunkless_;
+
+	/// Set when a token has been met outside of any chunk
+	bool out_of_chunk_;
+
+	/// Whitespace for the next token
+	PwrNlp::Whitespace::Enum wa_;
+
+	/// Character data buffer
+	Glib::ustring sbuf_;
+
+	/// Token being constructed
+	Token* tok_;
+
+	/// Sentence being constructed
+	Sentence::Ptr sent_;
+
+	/// Chunk being constructed
+	boost::shared_ptr<Chunk> chunk_;
+
+	/// Output chunk buffer
+	std::deque< boost::shared_ptr<Chunk> >& obuf_;
+
+	/// Flag to only read disamb tags
+	bool disamb_only_;
+
+	/// Read Pantera-like disamb_sh disamb tag markings
+	bool disamb_sh_;
+
+	/// Warn on stderr about tokens met outside of any chunk
+	bool warn_on_inconsistent_;
+
+	/// Warn on stderr about tags that nothing handles
+	bool warn_on_unexpected_;
+
+	/// Name of the element that delimits a sentence; empty until a
+	/// subclass sets it
+	std::string sentence_tag_name_;
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_XMLREADER_H
--- a/tests/ioann.cpp
+++ b/tests/ioann.cpp
@@ -99,6 +99,7 @@ BOOST_AUTO_TEST_CASE( iobase )
 	ssin << swiatopoglad_ann;
 	Corpus2::CclReader xr(tagset, ssin);
 	boost::shared_ptr<Corpus2::Chunk> chunk = xr.get_next_chunk();
+	BOOST_REQUIRE(chunk);
 	std::stringstream ss;
 	boost::shared_ptr<Corpus2::TokenWriter> w(Corpus2::TokenWriter::create("xces,flat", ss, tagset));
 	w->write_chunk(*chunk);