diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index 0994d5fcef88f610f68941223339adc84748b180..aa066f935f3e062ffa7310725030968159db976b 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -54,6 +54,7 @@ SET(libcorpus2_STAT_SRC
 	tagsetmanager.cpp
 	tagsetparser.cpp
 	token.cpp
+	io/cclreader.cpp
 	io/orthwriter.cpp
 	io/plainwriter.cpp
 	io/premorphwriter.cpp
diff --git a/libcorpus2/ann/annotatedsentence.cpp b/libcorpus2/ann/annotatedsentence.cpp
index 68b561f92d1af65a3412bb6dbf420cf5c1bca184..327e9bad2be2d574c84d35e077ff1a088a0a861e 100644
--- a/libcorpus2/ann/annotatedsentence.cpp
+++ b/libcorpus2/ann/annotatedsentence.cpp
@@ -1,6 +1,8 @@
 #include <libcorpus2/ann/annotatedsentence.h>
 #include <libcorpus2/ann/view.h>
 #include <boost/make_shared.hpp>
+#include <sstream>
+#include <libpwrutils/plural.h>
 
 namespace Corpus2 {
 
@@ -56,7 +58,7 @@ boost::shared_ptr<AnnotationView> create_view(
 	const std::string& ann_name)
 {
 	const AnnotationChannel& chan = s->get_channel(ann_name);
-	std::vector<Annotation> ann = chan.make_annotation_vector();
+	std::vector<Annotation> ann = chan.make_annotation_vector(AnnotationChannel::O_INCLUSIVE);
 	boost::shared_ptr<AnnotationView> view;
 	view = boost::make_shared<AnnotationView>(s, ann_name);
 	foreach (const Annotation& a, ann) {
@@ -76,4 +78,29 @@ boost::shared_ptr<AnnotationView> create_view(
 	return view;
 }
 
+void AnnotatedSentence::append(Token *t)
+{
+	Sentence::append(t); // add the token first so that size() already counts it
+	for (chan_map_t::iterator ch = channels_.begin(); ch != channels_.end(); ++ch) {
+		ch->second.resize(size()); // every channel keeps one slot per token
+	}
+}
+
+/// Build a per-channel text summary: annotation count, disjoint count and token coverage.
+std::string AnnotatedSentence::annotation_info() const
+{
+	std::stringstream ss;
+	foreach (const chan_map_t::value_type& v, channels_) {
+		ss << "Channel " << v.first << ": \t";
+		int ann, disj, un;
+		v.second.do_counts(ann, disj, un);
+		ss << PwrNlp::enpln(ann, "annotation") << ", " << disj << " disjoint, ";
+		int a = size() - un; // tokens covered by some annotation in this channel
+		// An empty sentence would otherwise divide 0 by 0 and print a not-a-number percentage.
+		double r = size() ? static_cast<double>(a) / size() : 0.0;
+		ss << "annotations span: " << a << "/" << size() << " tokens (" << r*100 << "%)\n";
+	}
+	return ss.str();
+}
+
 } /* end ns Corpus2 */
diff --git a/libcorpus2/ann/annotatedsentence.h b/libcorpus2/ann/annotatedsentence.h
index db2d4eade8c42a946a8db84071a096f057eba027..c69203bfc8a7101061052f1ed486cd4bd3159b77 100644
--- a/libcorpus2/ann/annotatedsentence.h
+++ b/libcorpus2/ann/annotatedsentence.h
@@ -4,11 +4,10 @@
 #include <libcorpus2/sentence.h>
 #include <libcorpus2/exception.h>
 #include <libcorpus2/ann/channel.h>
+#include <libcorpus2/ann/view.h>
 
 namespace Corpus2 {
 
-class AnnotationView;
-
 /**
  * Exception class for use when a requested annotation channel does not exist
  */
@@ -44,6 +43,9 @@ public:
 
 	Sentence::Ptr clone_shared() const;
 
+	/// typedef for the channels
+	typedef std::map<std::string, AnnotationChannel> chan_map_t;
+
 	/**
 	 * Create an AnnotatedSentence from a Sentence, grabing all the tokens
 	 * directly (afterwards the source Sentence has no tokens).
@@ -94,10 +96,16 @@ public:
 		return i->second;
 	}
 
-private:
-	/// typedef for tha channels
-	typedef std::map<std::string, AnnotationChannel> chan_map_t;
+	/// Read-only access to all channels, keyed by channel name.
+	const chan_map_t& all_channels() const {
+		return channels_;
+	}
+	/// Sentence override: appends the token, then resizes every channel to match.
+	void append(Token *t);
 
+	/// Human-readable annotation statistics, one text line per channel.
+	std::string annotation_info() const;
+private:
 	/// the actual channels
 	chan_map_t channels_;
 };
diff --git a/libcorpus2/ann/channel.cpp b/libcorpus2/ann/channel.cpp
index cb5fa318467103fa11fb509c4a8c958fe97c865e..eae9000dec58dd8fa89a80875c3554ef7ad2a683 100644
--- a/libcorpus2/ann/channel.cpp
+++ b/libcorpus2/ann/channel.cpp
@@ -3,6 +3,7 @@
 #include <algorithm>
 #include <boost/bind.hpp>
 #include <sstream>
+#include <set>
 
 namespace Corpus2 {
 
@@ -16,6 +17,13 @@ AnnotationChannel::AnnotationChannel(int size)
 {
 }
 
+void AnnotationChannel::resize(int size)
+{
+	segments_.resize(size); // new slots are 0, which do_counts() treats as unannotated
+	iobs_.resize(size); // IOB tag per token
+	heads_.resize(size); // head flag per token
+}
+
 void AnnotationChannel::make_iob_from_segments()
 {
 	int prev_seg = 0;
@@ -78,6 +86,13 @@ int AnnotationChannel::get_segment_at(int idx) const
 	}
 }
 
+void AnnotationChannel::set_segment_at(int token_idx, int segment_idx)
+{
+	// Token indices outside [0, size) are ignored rather than reported.
+	const int slots = static_cast<int>(segments_.size());
+	if (0 <= token_idx && token_idx < slots) { segments_[token_idx] = segment_idx; }
+}
+
 IOB::Enum AnnotationChannel::get_iob_at(int idx)
 {
 	if (idx >= 0 && idx < static_cast<int>(iobs_.size())) {
@@ -110,9 +125,11 @@ void  AnnotationChannel::set_head_at(int idx, bool v)
 	}
 }
 
-std::vector<Annotation> AnnotationChannel::make_annotation_vector() const
+std::vector<Annotation> AnnotationChannel::make_annotation_vector(
+	AnnotationVectorMode mode) const
 {
 	std::vector<Annotation> rv;
+	std::vector<int> not_annotated;
 	int smax = 0;
 	for (size_t i = 0; i < segments_.size(); ++i) {
 		int s = segments_[i];
@@ -124,8 +141,15 @@ std::vector<Annotation> AnnotationChannel::make_annotation_vector() const
 			if (heads_[i]) {
 				rv[s - 1].head_index = i;
 			}
+		} else if (mode & AnnotationChannel::O_INCLUSIVE) {
+			not_annotated.push_back(i);
 		}
 	}
+	foreach (int na, not_annotated) {
+		rv.push_back(Annotation());
+		rv.back().indices.push_back(na);
+		rv.back().head_index = na;
+	}
 	rv.erase(std::remove_if(rv.begin(), rv.end(),
 		boost::bind(&Annotation::empty, _1)), rv.end());
 	foreach (Annotation& a, rv) {
@@ -146,4 +170,26 @@ std::string AnnotationChannel::dump_iob() const
 	return ss.str();
 }
 
+void AnnotationChannel::do_counts(int& annotations, int& disjoint, int& unannotated) const
+{
+	// seen: every non-zero segment id met so far;
+	// split: ids that show up again after a different id came in between.
+	std::set<int> seen, split;
+	int prev = 0;
+	annotations = 0;
+	disjoint = 0;
+	unannotated = 0;
+	for (size_t i = 0; i < segments_.size(); ++i) {
+		const int cur = segments_[i];
+		if (cur == 0) {
+			++unannotated; // segment id 0 marks a token outside any annotation
+		} else if (!seen.insert(cur).second && prev != cur) {
+			split.insert(cur);
+		}
+		prev = cur;
+	}
+	annotations = seen.size();
+	disjoint = split.size();
+}
+
 } /* end ns Corpus2 */
diff --git a/libcorpus2/ann/channel.h b/libcorpus2/ann/channel.h
index cea905e31a57e52a3c59fa438b4d3a398ff05ac6..e6765d36eb12f3d4e3f6aaeda19765fd4f6da4b5 100644
--- a/libcorpus2/ann/channel.h
+++ b/libcorpus2/ann/channel.h
@@ -52,6 +52,12 @@ public:
 	 */
 	explicit AnnotationChannel(int size);
 
+	/// Number of per-token slots (length of the segment-index vector).
+	int size() const {
+		return segments_.size();
+	}
+	/// Resize the segment, IOB and head vectors to the given number of tokens.
+	void resize(int size);
 	/**
 	 * Discard IOB annotation information, regenerate it from the segment info.
 	 */
@@ -68,18 +74,22 @@ public:
 	 */
 	int renumber_segments();
 
-	/**
-	 * Create a vector of AnnotationSegment objects, each corresponding to
-	 * an annotation, with the annotations possibly being disjoint.
-	 */
-	std::vector<Annotation> make_annotation_vector() const;
+	enum AnnotationVectorMode
+	{
+		O_DISJOINT_EXCLUSIVE = 0, ///< no flag set (the default mode)
+		O_CONTINUOUS = 1, ///< NOTE(review): no handling of this flag is visible in channel.cpp
+		O_INCLUSIVE = 2, ///< unannotated tokens are returned as one-token annotations
+		O_CONTINUOUS_INCLUSIVE = 3, ///< O_CONTINUOUS and O_INCLUSIVE combined
+	};
 
 	/**
 	 * Create a vector of AnnotationSegment objects, each corresponding to
-	 * an annotation, forcing the annotations to be continous (disjoint
-	 * annotations are split)
+	 * an annotation, with the annotations possibly being disjoint unless
+	 * O_CONTINUOUS is specified in mode, and omitting unannotated tokens unless
+	 * O_INCLUSIVE is specified.
 	 */
-	std::vector<Annotation> make_continuous_annotation_vector() const;
+	std::vector<Annotation> make_annotation_vector(
+		AnnotationVectorMode mode = O_DISJOINT_EXCLUSIVE) const;
 
 	/**
 	 * The segment-index array accesor
@@ -100,6 +110,11 @@ public:
 	 */
 	int get_segment_at(int idx) const;
 
+	/**
+	 * Segment index setter, out of range indices are not processed.
+	 */
+	void set_segment_at(int token_idx, int segment_idx);
+
 	/**
 	 * The IOB data vector
 	 */
@@ -132,6 +147,8 @@ public:
 	 */
 	std::string dump_iob() const;
 
+	/// Count distinct annotations, disjoint annotations and unannotated tokens.
+	void do_counts(int& annotations, int& disjoint, int& unannotated) const;
 private:
 	/// segment indices
 	std::vector<int> segments_;
diff --git a/libcorpus2/io/cclreader.cpp b/libcorpus2/io/cclreader.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..23179f9237e892f277d6ec5a057bef8da9357013
--- /dev/null
+++ b/libcorpus2/io/cclreader.cpp
@@ -0,0 +1,289 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. 
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#include <libcorpus2/io/cclreader.h>
+#include <libcorpus2/io/sax.h>
+#include <libpwrutils/foreach.h>
+#include <libxml++/libxml++.h>
+#include <libxml2/libxml/parser.h>
+#include <boost/make_shared.hpp>
+#include <libcorpus2/ann/annotatedsentence.h>
+#include <cstdlib>
+#include <fstream>
+
+namespace Corpus2 {
+
+class CclReaderImpl : public BasicSaxParser
+{
+public:
+	CclReaderImpl(const Tagset& tagset,
+		std::deque< boost::shared_ptr<Chunk> >& obuf,
+		bool disamb_only, bool disamb_sh);
+	/// Deletes the token under construction, if one is still pending.
+	~CclReaderImpl();
+
+protected:
+	void on_start_element(const Glib::ustring & name,
+			const AttributeList& attributes);
+	void on_end_element(const Glib::ustring & name);
+	/// Append the current sentence to the current chunk and reset the parser state.
+	void finish_sentence();
+	/// Tagset used to parse the text of <ctag> elements.
+	const Tagset& tagset_;
+	/// Parser states, one per element the parser can currently be inside.
+	enum state_t { XS_NONE, XS_CHUNK, XS_SENTENCE, XS_TOK, XS_ANN, XS_ORTH, XS_LEX,
+			XS_LEMMA, XS_TAG, XS_REL };
+	state_t state_;
+	/// Set when a chunk was synthesised for a <tok> found outside any <chunk>.
+	bool chunkless_;
+	/// Also set on an out-of-chunk <tok>; never read in cclreader.cpp.
+	bool out_of_chunk_;
+	/// Whitespace assigned to the next token; <ns> switches it to None.
+	PwrNlp::Whitespace::Enum wa_;
+	/// NOTE(review): initialised but otherwise unused in cclreader.cpp.
+	Glib::ustring sbuf_;
+	/// Token under construction; NULL when no <tok> is open.
+	Token* tok_;
+	/// Sentence under construction.
+	boost::shared_ptr<AnnotatedSentence> sent_;
+	/// Channel name of the <ann> element being read.
+	std::string ann_chan_;
+	/// Whether the <ann> element being read carries head="1".
+	bool ann_head_;
+	/// Channel name -> segment index, collected for the current token.
+	typedef std::map<std::string, int> token_ann_t;
+
+	token_ann_t token_anns_;
+	/// Channels in which the current token is recorded as an annotation head.
+	std::set<std::string> token_ann_heads_;
+	/// Chunk under construction.
+	boost::shared_ptr<Chunk> chunk_;
+	/// Queue that receives finished chunks; a reference supplied by the caller.
+	std::deque< boost::shared_ptr<Chunk> >& obuf_;
+	/// When set, <lex> elements without disamb="1" are not added to the token.
+	bool disamb_only_;
+	/// Stored from the constructor; never read in cclreader.cpp.
+	bool disamb_sh_;
+};
+
+CclReader::CclReader(const Tagset& tagset, std::istream& is,
+		bool disamb_only, bool disamb_sh)
+	: BufferedChunkReader(tagset),
+	is_(&is), // borrowed: the caller keeps ownership of the stream
+	impl_(new CclReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh))
+{
+}
+
+CclReader::CclReader(const Tagset& tagset, const std::string& filename, bool disamb_only, bool disamb_sh)
+	: BufferedChunkReader(tagset),
+	impl_(new CclReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh))
+{
+	// The reader owns the file stream it opens; is_ only aliases it.
+	this->is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));
+	// A failed open sets failbit, not badbit, so bad() never reported a missing
+	// file; fail() is true for either bit.
+	if (this->is_owned_->fail()) {
+		throw Corpus2Error("File not found!");
+	}
+	this->is_ = is_owned_.get();
+}
+
+CclReader::~CclReader()
+{ // NOTE(review): presumably defined here so impl_ is destroyed where CclReaderImpl is complete
+}
+
+void CclReader::ensure_more()
+{
+	// Feed the parser fixed-size slices until a chunk is buffered or input ends.
+	static const int kSliceBytes = 1024;
+	unsigned char slice[kSliceBytes + 1];
+	std::istream& in = is();
+	while (chunk_buf_.empty() && in.good()) {
+		in.read(reinterpret_cast<char*>(slice), kSliceBytes);
+		impl_->parse_chunk_raw(slice, in.gcount());
+		if (in.eof()) { impl_->finish_chunk_parsing(); }
+	}
+}
+
+CclReaderImpl::CclReaderImpl(const Tagset& tagset,
+		std::deque< boost::shared_ptr<Chunk> >& obuf,
+		bool disamb_only, bool disamb_sh)
+	: BasicSaxParser()
+	, tagset_(tagset), state_(XS_NONE), chunkless_(false), out_of_chunk_(false)
+	, wa_(PwrNlp::Whitespace::Newline), sbuf_(), tok_(NULL), sent_()
+	, ann_chan_(), ann_head_(false) // ann_head_ used to be left indeterminate
+	, chunk_(), obuf_(obuf), disamb_only_(disamb_only), disamb_sh_(disamb_sh)
+{
+}
+
+CclReaderImpl::~CclReaderImpl()
+{
+	delete tok_; // non-NULL only if parsing stopped between <tok> and </tok>
+}
+
+/// SAX start-tag callback: advances the state machine and opens chunk, sentence and token objects.
+void CclReaderImpl::on_start_element(const Glib::ustring &name,
+		const AttributeList& attributes)
+{
+	if (name == "chunk") {
+		std::string type;
+		foreach (const Attribute& a, attributes) {
+			if (a.name == "type") { type = a.value; }
+		}
+		if (type == "s") {
+			throw XcesError("Trying to parse XCES as CCL (<chunk type=\"s\">)");
+		} else if (state_ == XS_NONE) {
+			chunk_ = boost::make_shared<Chunk>();
+			state_ = XS_CHUNK;
+			foreach (const Attribute& a, attributes) {
+				chunk_->set_attribute(a.name, a.value);
+			}
+		} else if (state_ == XS_CHUNK) {
+			throw XcesError("Nested <chunk>");
+		} else {
+			throw XcesError("Unexpected <chunk>");
+		}
+	} else if (state_ == XS_CHUNK && name == "sentence") {
+		state_ = XS_SENTENCE;
+		sent_ = boost::make_shared<AnnotatedSentence>();
+	} else if (state_ == XS_SENTENCE && name == "tok") {
+		state_ = XS_TOK;
+		tok_ = new Token();
+		tok_->set_wa(wa_);
+		wa_ = PwrNlp::Whitespace::Space;
+		token_anns_.clear();
+		token_ann_heads_.clear();
+	} else if (state_ == XS_TOK && name == "orth") {
+		state_ = XS_ORTH;
+		grab_characters_ = true;
+		clear_buf();
+	} else if (state_ == XS_TOK && name == "ann") {
+		state_ = XS_ANN;
+		grab_characters_ = true;
+		clear_buf();
+		ann_chan_ = "";
+		ann_head_ = false;
+		foreach (const Attribute& a, attributes) {
+			if (a.name == "chan") {
+				ann_chan_ = a.value;
+			} else if (a.name == "head" && a.value == "1") {
+				ann_head_ = true;
+			}
+		}
+		if (ann_chan_.empty()) {
+			throw XcesError("<ann> with no channel name");
+		}
+	} else if (state_ == XS_TOK && name == "lex") {
+		assert(tok_ != NULL);
+		bool is_disamb = false;
+		foreach (const Attribute& a, attributes) {
+			if (a.name == "disamb" && a.value == "1") {
+				is_disamb = true;
+			}
+		}
+		if (!disamb_only_ || is_disamb) {
+			tok_->add_lexeme(Lexeme());
+			tok_->lexemes().back().set_disamb(is_disamb);
+			state_ = XS_LEX;
+		}
+	} else if (state_ == XS_LEX && name == "base") {
+		state_ = XS_LEMMA;
+		grab_characters_ = true;
+		clear_buf();
+	} else if (state_ == XS_LEX && name == "ctag") {
+		state_ = XS_TAG;
+		grab_characters_ = true;
+		clear_buf();
+	} else if (name == "ns") {
+		wa_ = PwrNlp::Whitespace::None;
+	} else if (name == "tok" && state_ == XS_NONE) {
+		std::cerr << "Warning: out-of-chunk token, assuming sentence start on line " << this->context_->input->line << "\n";
+		chunkless_ = true;
+		out_of_chunk_ = true;
+		chunk_ = boost::make_shared<Chunk>();
+		sent_ = boost::make_shared<AnnotatedSentence>();
+		state_ = XS_TOK;
+		tok_ = new Token();
+		tok_->set_wa(wa_);
+		wa_ = PwrNlp::Whitespace::Space;
+		token_anns_.clear(); // do not carry annotations over from the previous token
+		token_ann_heads_.clear();
+	}
+}
+
+void CclReaderImpl::finish_sentence()
+{
+	chunk_->append(sent_);
+	sent_.reset();
+	if (!chunkless_) {
+		state_ = XS_CHUNK; // regular input: stay inside the enclosing <chunk>
+		return;
+	}
+	obuf_.push_back(chunk_); // synthetic chunk: emit it together with its sentence
+	chunk_.reset();
+	state_ = XS_NONE;
+	chunkless_ = false;
+}
+
+/// SAX end-tag callback: stores the collected text and closes the element opened by on_start_element.
+void CclReaderImpl::on_end_element(const Glib::ustring &name)
+{
+	if (state_ == XS_ORTH && name == "orth") {
+		tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
+		grab_characters_ = false;
+		state_ = XS_TOK;
+	} else if (state_ == XS_ANN && name == "ann") {
+		std::string buf = get_buf();
+		grab_characters_ = false;
+		int segid = atoi(buf.c_str()); // atoi yields 0 for non-numeric text
+		if (!sent_->has_channel(ann_chan_)) { sent_->create_channel(ann_chan_); }
+		if (segid > 0) {
+			token_anns_.insert(std::make_pair(ann_chan_, segid));
+			// Only an <ann head="1"> makes the token a head; it used to be every <ann>.
+			if (ann_head_) { token_ann_heads_.insert(ann_chan_); }
+		}
+		state_ = XS_TOK;
+	} else if (state_ == XS_LEMMA && name == "base") {
+		tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
+		grab_characters_ = false;
+		state_ = XS_LEX;
+	} else if (state_ == XS_TAG && name == "ctag") {
+		Tag tag = tagset_.parse_simple_tag(get_buf(), true);
+		tok_->lexemes().back().set_tag(tag);
+		grab_characters_ = false;
+		state_ = XS_LEX;
+	} else if (state_ == XS_LEX && name == "lex") {
+		state_ = XS_TOK;
+	} else if (state_ == XS_TOK && name == "tok") {
+		sent_->append(tok_);
+		tok_ = NULL;
+		state_ = XS_SENTENCE;
+		foreach (const token_ann_t::value_type& v, token_anns_) {
+			sent_->get_channel(v.first).set_segment_at(sent_->size() - 1, v.second);
+			if (token_ann_heads_.find(v.first) != token_ann_heads_.end()) {
+				sent_->get_channel(v.first).set_head_at(sent_->size() - 1, true);
+			}
+		}
+	} else if (state_ == XS_SENTENCE && name == "sentence") {
+		finish_sentence();
+	} else if (state_ == XS_CHUNK && name == "chunk") {
+		obuf_.push_back(chunk_);
+		chunk_.reset();
+		state_ = XS_NONE;
+	}
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/cclreader.h b/libcorpus2/io/cclreader.h
new file mode 100644
index 0000000000000000000000000000000000000000..3807ff9d5cd8462fe1a48cf5dd43255b8159e60c
--- /dev/null
+++ b/libcorpus2/io/cclreader.h
@@ -0,0 +1,57 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. 
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#ifndef LIBCORPUS2_IO_CCLREADER_H
+#define LIBCORPUS2_IO_CCLREADER_H
+
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/xces.h>
+#include <libcorpus2/chunk.h>
+#include <deque>
+#include <boost/scoped_ptr.hpp>
+
+namespace Corpus2 {
+
+class CclReaderImpl;
+
+/// Reads chunks from CCL XML input; sentences are built as AnnotatedSentence objects.
+class CclReader : public BufferedChunkReader
+{
+public:
+	/// Read from a stream that stays owned by the caller.
+	CclReader(const Tagset& tagset, std::istream& is,
+			bool disamb_only = false, bool disamb_sh = false);
+	/// Open the named file and read from it; the reader owns that stream.
+	CclReader(const Tagset& tagset, const std::string& filename,
+			bool disamb_only = false, bool disamb_sh = false);
+	~CclReader();
+	/// The stream currently being read.
+	std::istream& is() {
+		return *is_;
+	}
+
+protected:
+	/// Parse more input until a chunk is buffered or the stream stops being good.
+	void ensure_more();
+	/// Stream in use; points at is_owned_ when the file-name constructor was used.
+	std::istream* is_;
+	boost::scoped_ptr<std::istream> is_owned_;
+	boost::scoped_ptr<CclReaderImpl> impl_; ///< SAX parser that fills the chunk buffer
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_CCLREADER_H
diff --git a/libcorpus2/io/xcesreader.h b/libcorpus2/io/xcesreader.h
index f742f036e3125f229dee3415bfa6382c7f51160d..004b62fd8f3fd112e0110d766039fbd89d7b2bbc 100644
--- a/libcorpus2/io/xcesreader.h
+++ b/libcorpus2/io/xcesreader.h
@@ -52,7 +52,6 @@ protected:
 	boost::scoped_ptr<XcesReaderImpl> impl_;
 };
 
-
 } /* end ns Corpus2 */
 
 #endif // LIBCORPUS2_IO_XCESREADER_H
diff --git a/libcorpus2/sentence.h b/libcorpus2/sentence.h
index 2a08e84674c047a897e18b396bb4f63d837a2311..6eb8522ec744579dd18460c569218ff2d3177f6e 100644
--- a/libcorpus2/sentence.h
+++ b/libcorpus2/sentence.h
@@ -69,7 +69,9 @@ public:
 	}
 
 	/// Helper function for appending tokens
-	void append(Token* t) {
+	/// Might be overridden in a child class to make adding a token keep
+	/// extra invariants
+	virtual void append(Token* t) {
 		tokens_.push_back(t);
 	}
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 2d7d8bbee77ddf04ab6910c9d18af7d240e93c7a..e2b53b044182c393a80620bb129624158172586b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -9,6 +9,7 @@ add_executable( tests
 	ann_basic.cpp
 	basic.cpp
 	io.cpp
+	ioann.cpp
 	tag_split.cpp
 	tagset_parse.cpp
 )
diff --git a/tests/ioann.cpp b/tests/ioann.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8334bf32effb64027a96cf93eb1604ae23484147
--- /dev/null
+++ b/tests/ioann.cpp
@@ -0,0 +1,133 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#include <boost/test/unit_test.hpp>
+#include <set>
+#include <libpwrutils/foreach.h>
+#include <libpwrutils/bitset.h>
+#include <libcorpus2/tagsetmanager.h>
+#include <libcorpus2/io/cclreader.h>
+#include <libcorpus2/io/writer.h>
+#include <libcorpus2/ann/annotatedsentence.h>
+
+namespace {
+static char swiatopoglad[] =
+"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+"<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n"
+"<cesAna xmlns:xlink=\"http://www.w3.org/1999/xlink\" version=\"1.0\" type=\"lex disamb\">\n"
+"<chunkList>\n"
+"<chunk id=\"ch51\" type=\"tok\">\n"
+"<chunk type=\"s\">\n"
+"<tok>\n"
+"<orth>UwaÅ¼am</orth>\n"
+"<lex disamb=\"1\"><base>uwaÅ¼aÄ‡</base><ctag>fin:sg:pri:imperf</ctag></lex>\n"
+"</tok>\n"
+"<ns/>\n"
+"<tok>\n"
+"<orth>,</orth>\n"
+"<lex disamb=\"1\"><base>,</base><ctag>interp</ctag></lex>\n"
+"</tok>\n"
+"<tok>\n"
+"<orth>Å¼e</orth>\n"
+"<lex disamb=\"1\"><base>Å¼e</base><ctag>conj</ctag></lex>\n"
+"</tok>\n"
+"<tok>\n"
+"<orth>Å›wiatopoglÄ…d</orth>\n"
+"<lex><base>Å›wiatopoglÄ…d</base><ctag>subst:sg:acc:m3</ctag></lex>\n"
+"<lex disamb=\"1\"><base>Å›wiatopoglÄ…d</base><ctag>subst:sg:nom:m3</ctag></lex>\n"
+"</tok>\n"
+"</chunk>\n"
+"</chunk>\n"
+"</chunkList>\n"
+"</cesAna>\n"
+;
+
+static char swiatopoglad_ann[] =
+"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+"<chunkList>\n"
+"<chunk id=\"ch51\" type=\"tok\">\n"
+"<sentence>\n"
+"<tok>\n"
+"<ann chan=\"cute\">1</ann>\n"
+"<ann chan=\"meh\">1</ann>\n"
+"<orth>UwaÅ¼am</orth>\n"
+"<lex disamb=\"1\"><base>uwaÅ¼aÄ‡</base><ctag>fin:sg:pri:imperf</ctag></lex>\n"
+"</tok>\n"
+"<ns/>\n"
+"<tok>\n"
+"<ann chan=\"cute\">1</ann>\n"
+"<ann chan=\"meh\">2</ann>\n"
+"<orth>,</orth>\n"
+"<lex disamb=\"1\"><base>,</base><ctag>interp</ctag></lex>\n"
+"</tok>\n"
+"<tok>\n"
+"<ann chan=\"meh\" head=\"1\">1</ann>\n"
+"<orth>Å¼e</orth>\n"
+"<lex disamb=\"1\"><base>Å¼e</base><ctag>conj</ctag></lex>\n"
+"</tok>\n"
+"<tok>\n"
+"<orth>Å›wiatopoglÄ…d</orth>\n"
+"<lex><base>Å›wiatopoglÄ…d</base><ctag>subst:sg:acc:m3</ctag></lex>\n"
+"<lex disamb=\"1\"><base>Å›wiatopoglÄ…d</base><ctag>subst:sg:nom:m3</ctag></lex>\n"
+"<ann chan=\"cute\">2</ann>\n"
+"</tok>\n"
+"</sentence>\n"
+"</chunk>\n"
+"</chunkList>\n"
+;
+}
+
+BOOST_AUTO_TEST_SUITE( ioann )
+
+BOOST_AUTO_TEST_CASE( iobase )
+{
+	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");
+	std::stringstream ssin;
+	ssin << swiatopoglad_ann;
+	Corpus2::CclReader xr(tagset, ssin);
+	boost::shared_ptr<Corpus2::Chunk> chunk = xr.get_next_chunk();
+	std::stringstream ss;
+	boost::shared_ptr<Corpus2::TokenWriter> w(Corpus2::TokenWriter::create("xces,flat", ss, tagset));
+	w->write_chunk(*chunk);
+	w->finish();
+	BOOST_CHECK_EQUAL(ss.str(), swiatopoglad);
+	BOOST_REQUIRE(!chunk->sentences().empty());
+	boost::shared_ptr<Corpus2::AnnotatedSentence> as;
+	as = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(chunk->sentences()[0]);
+	BOOST_REQUIRE(as);
+	// "cute": tokens 1-2 carry segment 1, token 3 has no <ann>, token 4 carries segment 2.
+	BOOST_REQUIRE(as->has_channel("cute"));
+	as->get_channel("cute").make_iob_from_segments();
+	BOOST_CHECK_EQUAL(as->get_channel("cute").dump_iob(), "BIOB");
+	Corpus2::Sentence::Ptr cute = Corpus2::create_view(as, "cute");
+	BOOST_REQUIRE_EQUAL(cute->size(), 3);
+	BOOST_CHECK_EQUAL(cute->tokens()[0]->orth_utf8(), "UwaÅ¼am,");
+	BOOST_CHECK_EQUAL(cute->tokens()[1]->orth_utf8(), "Å¼e");
+	BOOST_CHECK_EQUAL(cute->tokens()[2]->orth_utf8(), "Å›wiatopoglÄ…d");
+	// "meh": segment 1 sits on tokens 1 and 3 (disjoint), segment 2 on token 2, token 4 has none.
+	BOOST_REQUIRE(as->has_channel("meh"));
+	as->get_channel("meh").make_iob_from_segments();
+	BOOST_CHECK_EQUAL(as->get_channel("meh").dump_iob(), "BBBO");
+	Corpus2::Sentence::Ptr meh = Corpus2::create_view(as, "meh");
+	BOOST_REQUIRE_EQUAL(meh->size(), 3);
+	BOOST_CHECK_EQUAL(meh->tokens()[0]->orth_utf8(), ",");
+	BOOST_CHECK_EQUAL(meh->tokens()[1]->orth_utf8(), "UwaÅ¼am Å¼e");
+	BOOST_CHECK_EQUAL(meh->tokens()[2]->orth_utf8(), "Å›wiatopoglÄ…d");
+	// Print the per-channel statistics for manual inspection; nothing is asserted on them.
+	std::cerr << as->annotation_info();
+}
+
+BOOST_AUTO_TEST_SUITE_END();