From e9277e51b678c5766ad621512f07c946349cbe92 Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Tue, 12 Apr 2011 17:20:47 +0200
Subject: [PATCH] add mark and unmark operators, tweak wcclrules to work with
 annotated data

---
 libwccl/CMakeLists.txt            |   2 +
 libwccl/ops/tagactions/mark.cpp   |  93 ++++++++++++++++++++
 libwccl/ops/tagactions/mark.h     |  88 +++++++++++++++++++
 libwccl/ops/tagactions/unmark.cpp |  60 +++++++++++++
 libwccl/ops/tagactions/unmark.h   |  67 +++++++++++++++
 libwccl/parser/grammar.g          |  38 +++++++++
 tests/CMakeLists.txt              |   1 +
 tests/mark.cpp                    | 137 ++++++++++++++++++++++++++++++
 wcclrules/main.cpp                |  38 +++++----
 9 files changed, 508 insertions(+), 16 deletions(-)
 create mode 100644 libwccl/ops/tagactions/mark.cpp
 create mode 100644 libwccl/ops/tagactions/mark.h
 create mode 100644 libwccl/ops/tagactions/unmark.cpp
 create mode 100644 libwccl/ops/tagactions/unmark.h
 create mode 100644 tests/mark.cpp

diff --git a/libwccl/CMakeLists.txt b/libwccl/CMakeLists.txt
index fd26bba..89ee70f 100644
--- a/libwccl/CMakeLists.txt
+++ b/libwccl/CMakeLists.txt
@@ -67,9 +67,11 @@ SET(libwccl_STAT_SRC
 	ops/rulesequence.cpp
 	ops/tagaction.cpp
 	ops/tagactions/delete.cpp
+	ops/tagactions/mark.cpp
 	ops/tagactions/relabel.cpp
 	ops/tagactions/select.cpp
 	ops/tagactions/unify.cpp
+	ops/tagactions/unmark.cpp
 	ops/tagrule.cpp
 	parser/grammar.g
 	parser/Parser.cpp
diff --git a/libwccl/ops/tagactions/mark.cpp b/libwccl/ops/tagactions/mark.cpp
new file mode 100644
index 0000000..42eea2d
--- /dev/null
+++ b/libwccl/ops/tagactions/mark.cpp
@@ -0,0 +1,93 @@
+#include <libwccl/ops/tagactions/mark.h>
+#include <libpwrutils/foreach.h>
+#include <libcorpus2/ann/annotatedsentence.h>
+#include <sstream>
+
+namespace Wccl {
+
+Bool Mark::execute(const ActionExecContext& context) const
+{
+	SentenceContext& sc = context.sentence_context();
+
+	const boost::shared_ptr<const Position>& range_left = pos_begin_->apply(context);
+	if (range_left->get_value() == Position::Nowhere) {
+		return Bool(false);
+	}
+	const boost::shared_ptr<const Position>& range_right = pos_end_->apply(context);
+	if (range_right->get_value() == Position::Nowhere) {
+		return Bool(false);
+	}
+	int abs_left, abs_right;
+	if (!sc.validate_range(*range_left, *range_right, abs_left, abs_right)) {
+		return Bool(false);
+	}
+
+	int abs_head;
+	if (pos_head_) {
+		const boost::shared_ptr<const Position>& head = pos_head_->apply(context);
+		if (head->get_value() == Position::Nowhere) {
+			return Bool(false);
+		}
+		abs_head = sc.get_abs_position(*head);
+		if (abs_head < abs_left || abs_head > abs_right) {
+			return Bool(false);
+		}
+	} else {
+		abs_head = abs_left;
+	}
+
+	boost::shared_ptr<Corpus2::AnnotatedSentence> as;
+	as = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(sc.get_sentence_ptr());
+	if (!as) {
+		throw WcclError("Operator needs an annotated sentence");
+	}
+	if (!as->has_channel(chan_name_)) {
+		as->create_channel(chan_name_);
+	}
+	Corpus2::AnnotationChannel& channel = as->get_channel(chan_name_);
+
+	int segment_idx = channel.get_new_segment_index();
+	//std::cerr << "Marking " << chan_name_ << " from " << abs_left << " to "
+	//	<< abs_right << " as " << segment_idx << "\n";
+
+	for (int i = abs_left; i <= abs_right; ++i) {
+		if (channel.get_segment_at(i) > 0) {
+			//std::cerr << "There already is an annotation at " << i
+			//	<< " (" << channel.get_segment_at(i) << ")\n";
+			//throw WcclError("Mark action would overwrite existing annotation");
+			return Bool(false);
+		}
+	}
+
+	for (int i = abs_left; i <= abs_right; ++i) {
+		channel.set_segment_at(i, segment_idx);
+		channel.set_head_at(i, false);
+	}
+	channel.set_head_at(abs_head, true);
+	return Bool(true);
+}
+
+std::string Mark::to_string(const Corpus2::Tagset& tagset) const
+{
+	std::ostringstream os;
+	os << name() << "(" << pos_begin_->to_string(tagset) << ", "
+			<< pos_end_->to_string(tagset);
+	if (pos_head_) {
+		os << ", " << pos_head_->to_string(tagset);
+	}
+	os << ", \"" << chan_name_ << "\")";
+	return os.str();
+}
+
+std::ostream& Mark::write_to(std::ostream& os) const
+{
+	os << name() << "(" << *pos_begin_ << ", " << *pos_end_;
+	if (pos_head_) {
+		os << ", " << *pos_head_ ;
+	}
+	os << ", \"" << chan_name_ << "\")";
+	return os;
+}
+
+
+} /* end ns Wccl */
diff --git a/libwccl/ops/tagactions/mark.h b/libwccl/ops/tagactions/mark.h
new file mode 100644
index 0000000..de0e471
--- /dev/null
+++ b/libwccl/ops/tagactions/mark.h
@@ -0,0 +1,88 @@
+#ifndef LIBWCCL_OPS_TAGACTIONS_MARK_H
+#define LIBWCCL_OPS_TAGACTIONS_MARK_H
+
+#include <libwccl/ops/tagaction.h>
+#include <libwccl/values/position.h>
+#include <libwccl/values/bool.h>
+#include <libwccl/ops/function.h>
+
+namespace Wccl {
+
+/**
+ * Action to mark an annotation fragment on a channel.
+ */
+class Mark : public TagAction
+{
+public:
+	typedef boost::shared_ptr<Function<Position> > PosFunctionPtr;
+
+	Mark(
+		const PosFunctionPtr& pos_begin,
+		const PosFunctionPtr& pos_end,
+		const PosFunctionPtr& pos_head,
+		const std::string& chan_name)
+		: pos_begin_(pos_begin),
+		  pos_end_(pos_end),
+		  pos_head_(pos_head),
+		  chan_name_(chan_name)
+	{
+		BOOST_ASSERT(pos_begin_);
+		BOOST_ASSERT(pos_end_);
+		BOOST_ASSERT(!chan_name.empty());
+	}
+
+	Mark(
+		const PosFunctionPtr& pos_begin,
+		const PosFunctionPtr& pos_end,
+		const std::string& chan_name)
+		: pos_begin_(pos_begin),
+		  pos_end_(pos_end),
+		  pos_head_(),
+		  chan_name_(chan_name)
+	{
+		BOOST_ASSERT(pos_begin_);
+		BOOST_ASSERT(pos_end_);
+		BOOST_ASSERT(!chan_name.empty());
+	}
+
+	/**
+	 * @returns Name of the function.
+	 */
+	std::string name() const {
+		return "mark";
+	}
+
+	/**
+	 * @returns String representation of the Action
+	 */
+	std::string to_string(const Corpus2::Tagset& tagset) const;
+
+	/**
+	 * Executes the Action on given context: Marks an annotation within
+	 * the given channel on the range supplied. It is an error if there
+	 * already is an annotation anywhere in the range in the channel.
+	 * The Range is trimmed to sentence boundaries.
+	 * No action is made in case of an invalid/empty range.
+	 * @returns True if there were any changes made; False otherwise
+	 */
+	Bool execute(const ActionExecContext &context) const;
+
+protected:
+	/**
+	 * Writes string representation of the Action to
+	 * an output stream.
+	 * @returns Stream written to.
+	 * @note May be incomplete and/or containt internal info.
+	 */
+	std::ostream& write_to(std::ostream& ostream) const;
+
+private:
+	const PosFunctionPtr pos_begin_;
+	const PosFunctionPtr pos_end_;
+	const PosFunctionPtr pos_head_;
+	const std::string chan_name_;
+};
+
+} /* end ns Wccl */
+
+#endif // LIBWCCL_OPS_TAGACTIONS_MARK_H
diff --git a/libwccl/ops/tagactions/unmark.cpp b/libwccl/ops/tagactions/unmark.cpp
new file mode 100644
index 0000000..e090cbc
--- /dev/null
+++ b/libwccl/ops/tagactions/unmark.cpp
@@ -0,0 +1,60 @@
+#include <libwccl/ops/tagactions/unmark.h>
+#include <libpwrutils/foreach.h>
+#include <libcorpus2/ann/annotatedsentence.h>
+#include <sstream>
+
+namespace Wccl {
+
+Bool Unmark::execute(const ActionExecContext& context) const
+{
+	SentenceContext& sc = context.sentence_context();
+
+	const boost::shared_ptr<const Position>& position = pos_->apply(context);
+	if (position->get_value() == Position::Nowhere) {
+		return Bool(false);
+	}
+
+	int abs_pos = sc.get_abs_position(*position);
+	if (!sc.is_inside(abs_pos)) {
+		return Bool(false);
+	}
+
+	boost::shared_ptr<Corpus2::AnnotatedSentence> as;
+	as = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(sc.get_sentence_ptr());
+	if (!as) {
+		throw WcclError("Operator needs an annotated sentence");
+	}
+	if (!as->has_channel(chan_name_)) {
+		return Bool(false);
+	}
+	Corpus2::AnnotationChannel& channel = as->get_channel(chan_name_);
+
+	int segment_idx = channel.get_segment_at(abs_pos);
+	if (segment_idx == 0) {
+		return Bool(false);
+	}
+
+	for (int i = 0; i < channel.size(); ++i) {
+		if (channel.segments()[i] == segment_idx) {
+			channel.set_segment_at(i, 0);
+		}
+	}
+	return Bool(true);
+}
+
+std::string Unmark::to_string(const Corpus2::Tagset& tagset) const
+{
+	std::ostringstream os;
+	os << name() << "(" << pos_->to_string(tagset) << ", \""
+			<< chan_name_ << "\")";
+	return os.str();
+}
+
+std::ostream& Unmark::write_to(std::ostream& os) const
+{
+	os << name() << "(" << *pos_ << ", \"" << chan_name_ << "\")";
+	return os;
+}
+
+
+} /* end ns Wccl */
diff --git a/libwccl/ops/tagactions/unmark.h b/libwccl/ops/tagactions/unmark.h
new file mode 100644
index 0000000..c344fc0
--- /dev/null
+++ b/libwccl/ops/tagactions/unmark.h
@@ -0,0 +1,67 @@
+#ifndef LIBWCCL_OPS_TAGACTIONS_UNMARK_H
+#define LIBWCCL_OPS_TAGACTIONS_UNMARK_H
+
+#include <libwccl/ops/tagaction.h>
+#include <libwccl/values/position.h>
+#include <libwccl/values/bool.h>
+#include <libwccl/ops/function.h>
+
+namespace Wccl {
+
+/**
+ * Action to unmark (delete) an annotation passing through a token.
+ */
+class Unmark : public TagAction
+{
+public:
+	typedef boost::shared_ptr<Function<Position> > PosFunctionPtr;
+
+	Unmark(
+		const PosFunctionPtr& pos,
+		const std::string& chan_name)
+		: pos_(pos),
+		  chan_name_(chan_name)
+	{
+		BOOST_ASSERT(pos_);
+		BOOST_ASSERT(!chan_name.empty());
+	}
+
+	/**
+	 * @returns Name of the function.
+	 */
+	std::string name() const {
+		return "unmark";
+	}
+
+	/**
+	 * @returns String representation of the Action
+	 */
+	std::string to_string(const Corpus2::Tagset& tagset) const;
+
+protected:
+	/**
+	 * Writes string representation of the Action to
+	 * an output stream.
+	 * @returns Stream written to.
+	 * @note May be incomplete and/or containt internal info.
+	 */
+	std::ostream& write_to(std::ostream& ostream) const;
+
+	/**
+	 * Executes the Action on given context: Marks an annotation within
+	 * the given channel on the range supplied. It is an error if there
+	 * already is an annotation anywhere in the range in the channel.
+	 * The Range is trimmed to sentence boundaries.
+	 * No action is made in case of an invalid/empty range.
+	 * @returns True if there were any changes made; False otherwise
+	 */
+	Bool execute(const ActionExecContext &context) const;
+
+private:
+	const PosFunctionPtr pos_;
+	const std::string chan_name_;
+};
+
+} /* end ns Wccl */
+
+#endif // LIBWCCL_OPS_TAGACTIONS_UNMARK_H
diff --git a/libwccl/parser/grammar.g b/libwccl/parser/grammar.g
index 376de76..3bf431b 100644
--- a/libwccl/parser/grammar.g
+++ b/libwccl/parser/grammar.g
@@ -66,6 +66,8 @@ header {
 	#include <libwccl/ops/tagactions/delete.h>
 	#include <libwccl/ops/tagactions/select.h>
 	#include <libwccl/ops/tagactions/relabel.h>
+	#include <libwccl/ops/tagactions/mark.h>
+	#include <libwccl/ops/tagactions/unmark.h>
 
 	// Match operators
 	#include <libwccl/values/tokenmatch.h>
@@ -1485,6 +1487,8 @@ action
 	//
 	| act = action_unify [tagset, vars]
 	//
+	| act = action_mark [tagset, vars]
+	| act = action_unmark [tagset, vars]
 ;
 
 // Action sequence - the actions are separated with commas:
@@ -1697,6 +1701,40 @@ action_unify
 		}
 ;
 
+// ----------------------------------------------------------------------------
+// Mark action
+action_mark
+	[const Corpus2::Tagset& tagset, Variables& vars]
+	returns [boost::shared_ptr<Mark> action]
+{
+	boost::shared_ptr<Function<Position> > pos_begin, pos_end, pos_head;
+}
+	: "mark" LPAREN
+			pos_begin    = position_operator [tagset, vars] COMMA
+			pos_end      = position_operator [tagset, vars] COMMA
+			(pos_head    = position_operator [tagset, vars] COMMA)?
+			chan_name: STRING
+		RPAREN  {
+			action.reset(new Mark(pos_begin, pos_end, pos_head, ((antlr::Token*)chan_name)->getText()));
+		}
+;
+
+// ----------------------------------------------------------------------------
+// Unmark action
+action_unmark
+	[const Corpus2::Tagset& tagset, Variables& vars]
+	returns [boost::shared_ptr<Unmark> action]
+{
+	boost::shared_ptr<Function<Position> > pos;
+}
+	: "unmark" LPAREN
+			pos = position_operator [tagset, vars] COMMA
+			chan_name: STRING
+		RPAREN  {
+			action.reset(new Unmark(pos, ((antlr::Token*)chan_name)->getText()));
+		}
+;
+
 // ----------------------------------------------------------------------------
 // ----------------------------------------------------------------------------
 // Match rules
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 5ff0f38..4cd9787 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -27,6 +27,7 @@ add_executable(tests
 	getorth.cpp
 	logicalpredicates.cpp
 	main.cpp
+	mark.cpp
 	match.cpp
 	position.cpp
 	positionpredicates.cpp
diff --git a/tests/mark.cpp b/tests/mark.cpp
new file mode 100644
index 0000000..23e52b7
--- /dev/null
+++ b/tests/mark.cpp
@@ -0,0 +1,137 @@
+#include <boost/test/unit_test.hpp>
+#include <boost/bind.hpp>
+#include <libcorpus2/ann/annotatedsentence.h>
+#include <libwccl/ops/functions/constant.h>
+
+#include <libwccl/ops/tagactions/mark.h>
+
+using namespace Wccl;
+
+BOOST_AUTO_TEST_SUITE(mark)
+
+struct MarkFix
+{
+	MarkFix()
+		: as(boost::make_shared<Corpus2::AnnotatedSentence>()),
+		sc(as),
+		cx(sc, boost::make_shared<Variables>()),
+		pos_zero(0),
+		pos_one(1),
+		pos_minus_one(-1),
+		nowhere(Position::Nowhere),
+		begin(Position::Begin),
+		end(Position::End),
+		pos_zero_constant(new Constant<Position>(pos_zero)),
+		pos_one_constant(new Constant<Position>(pos_one)),
+		pos_minus_one_constant(new Constant<Position>(pos_minus_one)),
+		nowhere_constant(new Constant<Position>(nowhere)),
+		begin_constant(new Constant<Position>(begin)),
+		end_constant(new Constant<Position>(end))
+	{
+		as->append(new Corpus2::Token(UnicodeString::fromUTF8("t1"), PwrNlp::Whitespace::Newline));
+		as->append(new Corpus2::Token(UnicodeString::fromUTF8("t2"), PwrNlp::Whitespace::Newline));
+		as->append(new Corpus2::Token(UnicodeString::fromUTF8("t3"), PwrNlp::Whitespace::Newline));
+		as->append(new Corpus2::Token(UnicodeString::fromUTF8("t4"), PwrNlp::Whitespace::Newline));
+		as->append(new Corpus2::Token(UnicodeString::fromUTF8("t5"), PwrNlp::Whitespace::Newline));
+		as->create_channel("ch1");
+		as->get_channel("ch1").set_segment_at(2, 1);
+		as->get_channel("ch1").set_segment_at(3, 1);
+		as->get_channel("ch1").set_head_at(3, true);
+	}
+
+	boost::shared_ptr<Corpus2::AnnotatedSentence> as;
+	SentenceContext sc;
+	Corpus2::Tagset tagset;
+	ActionExecContext cx;
+
+	Position pos_zero;
+	Position pos_one;
+	Position pos_minus_one;
+	Position nowhere;
+	Position begin;
+	Position end;
+	boost::shared_ptr<Function<Position> > pos_zero_constant;
+	boost::shared_ptr<Function<Position> > pos_one_constant;
+	boost::shared_ptr<Function<Position> > pos_minus_one_constant;
+	boost::shared_ptr<Function<Position> > nowhere_constant;
+	boost::shared_ptr<Function<Position> > begin_constant;
+	boost::shared_ptr<Function<Position> > end_constant;
+};
+
+BOOST_FIXTURE_TEST_CASE(mark_empty, MarkFix)
+{
+	boost::shared_ptr<Corpus2::AnnotatedSentence> as_clone = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(as->clone_shared());
+	Mark mark(pos_minus_one_constant, pos_minus_one_constant, "ch1");
+	BOOST_CHECK(!mark.execute(cx));
+	//BOOST_CHECK((*as) == (*as_clone));
+	BOOST_CHECK_EQUAL(as->get_channel("ch1").dump_alpha(), "__aA_");
+}
+
+BOOST_FIXTURE_TEST_CASE(mark_already_there, MarkFix)
+{
+	boost::shared_ptr<Corpus2::AnnotatedSentence> as_clone = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(as->clone_shared());
+	sc.set_position(1);
+	Mark mark(begin_constant, pos_one_constant, "ch1");
+	BOOST_CHECK(!mark.execute(cx));
+	BOOST_CHECK_EQUAL(as->get_channel("ch1").dump_alpha(), "__aA_");
+}
+
+BOOST_FIXTURE_TEST_CASE(mark_begin, MarkFix)
+{
+	boost::shared_ptr<Corpus2::AnnotatedSentence> as_clone = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(as->clone_shared());
+	sc.set_position(1);
+	Mark mark(begin_constant, pos_minus_one_constant, "ch1");
+	BOOST_CHECK(mark.execute(cx));
+	BOOST_CHECK_EQUAL(as->get_channel("ch1").dump_alpha(), "B_aA_");
+}
+
+BOOST_FIXTURE_TEST_CASE(mark_begin_head0, MarkFix)
+{
+	boost::shared_ptr<Corpus2::AnnotatedSentence> as_clone = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(as->clone_shared());
+	sc.set_position(0);
+	Mark mark(begin_constant, pos_one_constant, pos_zero_constant, "ch1");
+	BOOST_CHECK(mark.execute(cx));
+	BOOST_CHECK_EQUAL(as->get_channel("ch1").dump_alpha(), "BbaA_");
+}
+
+BOOST_FIXTURE_TEST_CASE(mark_begin_head1, MarkFix)
+{
+	boost::shared_ptr<Corpus2::AnnotatedSentence> as_clone = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(as->clone_shared());
+	sc.set_position(0);
+	Mark mark(begin_constant, pos_one_constant, pos_one_constant, "ch1");
+	BOOST_CHECK(mark.execute(cx));
+	BOOST_CHECK_EQUAL(as->get_channel("ch1").dump_alpha(), "bBaA_");
+}
+
+BOOST_FIXTURE_TEST_CASE(mark_other, MarkFix)
+{
+	sc.set_position(1);
+	Mark mark(begin_constant, pos_one_constant, begin_constant, "ch2");
+	BOOST_CHECK(mark.execute(cx));
+	BOOST_CHECK_EQUAL(as->get_channel("ch2").dump_alpha(), "Aaa__");
+}
+
+
+//------ to_string test cases -------
+
+BOOST_FIXTURE_TEST_CASE(mark_to_string, MarkFix)
+{
+	Mark mark(begin_constant, end_constant, "ch");
+	BOOST_CHECK_EQUAL("mark(begin, end, \"ch\")", mark.to_string(tagset));
+	Mark mark2(begin_constant, pos_one_constant, "ch2");
+	BOOST_CHECK_EQUAL("mark(begin, 1, \"ch2\")", mark2.to_string(tagset));
+	Mark mark3(pos_minus_one_constant, end_constant, "ch3");
+	BOOST_CHECK_EQUAL("mark(-1, end, \"ch3\")", mark3.to_string(tagset));
+}
+
+BOOST_FIXTURE_TEST_CASE(mark_to_string_head, MarkFix)
+	{
+	Mark mark(begin_constant, end_constant, pos_one_constant, "ch");
+	BOOST_CHECK_EQUAL("mark(begin, end, 1, \"ch\")", mark.to_string(tagset));
+	Mark mark2(begin_constant, pos_one_constant, pos_zero_constant, "ch2");
+	BOOST_CHECK_EQUAL("mark(begin, 1, 0, \"ch2\")", mark2.to_string(tagset));
+	Mark mark3(pos_minus_one_constant, end_constant, end_constant, "ch3");
+	BOOST_CHECK_EQUAL("mark(-1, end, end, \"ch3\")", mark3.to_string(tagset));
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/wcclrules/main.cpp b/wcclrules/main.cpp
index 7b83fe0..2126e5d 100644
--- a/wcclrules/main.cpp
+++ b/wcclrules/main.cpp
@@ -72,25 +72,28 @@ bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::Ru
 	return false;
 }
 
-void do_stream(boost::shared_ptr<Corpus2::TokenWriter> writer, const Corpus2::Tagset& tagset, Wccl::RuleSequence& rules,
-	std::istream& is, const options& opts)
+void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
+	boost::shared_ptr<Corpus2::TokenWriter> writer, Wccl::RuleSequence& rules,
+	const options& opts)
 {
-	Corpus2::XcesReader reader(tagset, is);
 	Corpus2::TokenTimer& timer = Corpus2::global_timer();
-	while (boost::shared_ptr<Corpus2::Chunk> c = reader.get_next_chunk()) {
+	while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) {
 		foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) {
+			boost::shared_ptr<Corpus2::AnnotatedSentence> as;
+			as = Corpus2::AnnotatedSentence::wrap_sentence(s);
 			if (opts.until_done) {
-				rules.execute_until_done(s, opts.until_done_iterations);
+				rules.execute_until_done(as, opts.until_done_iterations);
 			} else {
-				rules.execute_once(s);
+				rules.execute_once(as);
 			}
-			timer.count_sentence(*s);
+			timer.count_sentence(*as);
 			if (progress) {
 				timer.check_slice();
 			}
 			if (opts.first) break;
+			writer->write_sentence(*as);
 		}
-		writer->write_chunk(*c);
+		//writer->write_chunk(*c);
 		if (opts.first) break;
 	}
 	if (progress) {
@@ -102,6 +105,7 @@ void do_stream(boost::shared_ptr<Corpus2::TokenWriter> writer, const Corpus2::Ta
 int main(int argc, char** argv)
 {
 	std::string tagset_load = "kipi";
+	std::string input_format;
 	std::string output_format;
 	options opts;
 	opts.first = false;
@@ -111,8 +115,10 @@ int main(int argc, char** argv)
 	bool corpus_stdin = true;
 	using boost::program_options::value;
 
+	std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " ");
+	std::string readers_help = "Input format, any of: " + readers + "\n";
 	std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " ");
-	std::string writers_help = "Output format, any of: " + writers + "\n";
+	std::string writers_help = "Output format, any of: " + writers + "\n";;
 
 	boost::program_options::options_description desc("Allowed options");
 	desc.add_options()
@@ -124,6 +130,8 @@ int main(int argc, char** argv)
 			 "CCL rule files\n")
 			("files,f", value(&files),
 			 "Files to load, looking at the extension to determine type\n")
+			("input-format,i", value(&input_format)->default_value("xces"),
+			 readers_help.c_str())
 			("output-format,o", value(&output_format)->default_value("xces"),
 			 writers_help.c_str())
 			("progress,p", value(&progress)->zero_tokens(),
@@ -189,16 +197,14 @@ int main(int argc, char** argv)
 			timer.register_signal_handler();
 			boost::shared_ptr<Corpus2::TokenWriter> writer;
 			writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset));
+			boost::shared_ptr<Corpus2::TokenReader> reader;
 			foreach (const std::string& f, corpora_files) {
-				std::ifstream ifs(f.c_str());
-				if (ifs.good()) {
-					do_stream(writer, tagset, rules, ifs, opts);
-				} else {
-					std::cerr << "Error reading corpus from " << f << "\n";
-				}
+				reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, f);
+				apply_rules(reader, writer, rules, opts);
 			}
 			if (corpus_stdin) {
-				do_stream(writer, tagset, rules, std::cin, opts);
+				reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin);
+				apply_rules(reader, writer, rules, opts);
 			}
 		}
 	} catch (PwrNlp::PwrNlpError& e) {
-- 
GitLab