diff --git a/CMakeLists.txt b/CMakeLists.txt
index 952c9deab6259b08f73a09e4bdea9aaf9ac8172e..86c2925c6867f3d1323290c893f180621b7bcc9b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,7 +65,7 @@ MARK_AS_ADVANCED(LIBCORPUS2_SRC_DATA_DIR)
 
 add_subdirectory(libpwrutils)
 add_subdirectory(libcorpus2)
-add_subdirectory(tagset-tool)
+add_subdirectory(corpus2tools)  # builds the tagset-tool and corpus-get executables
 add_subdirectory(tests)
-# add_subdirectory(swig)
+add_subdirectory(swig)  # the SWIG wrappers are now configured with every build
 
diff --git a/tagset-tool/CMakeLists.txt b/corpus2tools/CMakeLists.txt
similarity index 68%
rename from tagset-tool/CMakeLists.txt
rename to corpus2tools/CMakeLists.txt
index c527edaab7ccfb8a7e2130c0c56fc6e6ef1e49cb..3e7b198ba60e00065d0545eabb59dda56df2a145 100644
--- a/tagset-tool/CMakeLists.txt
+++ b/corpus2tools/CMakeLists.txt
@@ -9,16 +9,18 @@ endif (Libedit_FOUND)
 
 include_directories( ${CMAKE_SOURCE_DIR} )
 
-add_executable( tagset-tool main.cpp )
-
+add_executable( tagset-tool tagset-tool.cpp )
 target_link_libraries ( tagset-tool corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
 
+add_executable( corpus-get corpus-get.cpp )  # tool printing one sentence/token of a corpus
+target_link_libraries ( corpus-get corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
+
 include_directories(${Boost_INCLUDE_DIR})
 link_directories(${Boost_LIBRARY_DIRS})
 
 if(UNIX)
 
-	install(TARGETS tagset-tool
+	install(TARGETS tagset-tool corpus-get
 		RUNTIME DESTINATION bin)
 
 endif(UNIX)
diff --git a/corpus2tools/corpus-get.cpp b/corpus2tools/corpus-get.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9ea144f6a296670d209c069c527d471dbfe07ddc
--- /dev/null
+++ b/corpus2tools/corpus-get.cpp
@@ -0,0 +1,77 @@
+#include <libcorpus2/tagsetmanager.h>
+#include <libcorpus2/util/ioformat-options.h>
+#include <boost/program_options.hpp>
+#include <boost/algorithm/string.hpp>
+
+/// Prints sentence number `sentence` (0-based) of a corpus, or a single token of it.
+int main(int argc, char** argv)
+{
+	namespace po = boost::program_options;
+	std::string tagset_name, filename;
+	// -1 marks "not given"; `sentence` used to be read uninitialised.
+	int sentence = -1, token = -1;
+	// Non-zero: echo sentences longer than this and print a length histogram.
+	size_t stats = 0;
+	po::options_description desc("Allowed options");
+	desc.add_options()
+			("help,h", "Show help")
+			("filename,F", po::value(&filename), "filename")
+			("sentence,S", po::value(&sentence), "Sentence idx")
+			("stats,s", po::value(&stats), "Stats")
+			("token,T", po::value(&token), "Token idx ")
+			("tagset,t", po::value(&tagset_name)->default_value("kipi"),
+			 "Tagset name")
+			;
+	Corpus2::add_input_options(desc);
+	Corpus2::add_output_options(desc);
+	po::variables_map vm;
+	po::positional_options_description positional;
+	positional.add("filename", 1);
+	positional.add("sentence", 1);
+	positional.add("token", 1);
+
+	try {
+		po::store(po::command_line_parser(argc, argv)
+			.options(desc).positional(positional).run(), vm);
+		po::notify(vm); // may throw too, so it is covered by the same handler
+	} catch (po::error& e) {
+		std::cerr << e.what() << "\n";
+		return 2;
+	}
+	// Without a sentence index there is nothing to fetch: show the usage text.
+	if (vm.count("help") || sentence < 0) {
+		std::cout << desc << "\n";
+		return 1;
+	}
+	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_name);
+	boost::shared_ptr<Corpus2::TokenReader> reader =
+		Corpus2::create_reader(vm, tagset, filename);
+	boost::shared_ptr<Corpus2::TokenWriter> writer =
+		Corpus2::create_writer(vm, tagset);
+	Corpus2::Sentence::Ptr s;
+	std::map<int,int> lens;
+	// Read forward so that `s` ends up holding sentence number `sentence`.
+	for (int i = 0; i <= sentence; ++i) {
+		s = reader->get_next_sentence();
+		if (!s) break; // input ended before the requested sentence
+		lens[s->size()]++;
+		// Stats mode only: with stats == 0 this used to dump every sentence.
+		if (stats > 0 && s->size() > stats) {
+			std::cerr << i << "\n";
+			writer->write_sentence(*s);
+		}
+	}
+	if (s) {
+		if (token == -1) {
+			writer->write_sentence(*s);
+		} else if (static_cast<size_t>(token) < s->size()) {
+			writer->write_token(*(*s)[token]);
+		}
+	}
+	if (stats) {
+		typedef std::map<int,int>::value_type len_count;
+		foreach (const len_count& lc, lens) {
+			std::cerr << lc.first << " " << lc.second << "\n";
+		}
+	}
+}
diff --git a/tagset-tool/main.cpp b/corpus2tools/tagset-tool.cpp
similarity index 100%
rename from tagset-tool/main.cpp
rename to corpus2tools/tagset-tool.cpp
diff --git a/doc/corpstats.py b/doc/corpstats.py
new file mode 100755
index 0000000000000000000000000000000000000000..bbfc6dbe2fd547a0c20a28a3cd1cf33fafc11b71
--- /dev/null
+++ b/doc/corpstats.py
@@ -0,0 +1,101 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import sys
+from optparse import OptionParser
+from collections import defaultdict as dd
+import corpus2
+
+descr = """%prog [options] CORPUSFILE
+Reads a corpus file and reports some statistics.
+This script is a demo of the Python API.
+"""
+
+def tokens(rdr):
+	"""Generates the consecutive tokens read from rdr.
+	Kept in this script as an API demonstration."""
+	tok = rdr.get_next_token()
+	while tok:
+		yield tok
+		tok = rdr.get_next_token()
+	# a false-y token ends the iteration
+
+def sentences(rdr):
+	"""Generates the consecutive sentences read from rdr.
+	Kept in this script as an API demonstration."""
+	sent = rdr.get_next_sentence()
+	while sent:
+		yield sent
+		sent = rdr.get_next_sentence()
+	# a false-y sentence ends the iteration
+
+def chunks(rdr):
+	"""Generates the consecutive chunks read from rdr."""
+	chunk = rdr.get_next_chunk()
+	while chunk:
+		yield chunk
+		chunk = rdr.get_next_chunk()
+	# a false-y chunk ends the iteration
+
+def go():
+	"""Parses the command line, reads the corpus chunk by chunk and prints counts plus the most frequent tags."""
+	parser = OptionParser(usage=descr)
+	parser.add_option('-i', '--input-format', type='string', action='store',
+		dest='input_format', default='xces',
+		help='set the input format; default: xces')
+	#parser.add_option('-o', '--output-format', type='string', action='store',
+		#dest='output_format', default='xces',
+		#help='set the output format; default: xces')
+	parser.add_option('-t', '--tagset', type='string', action='store',
+		dest='tagset', default='kipi',
+		help='set the tagset used in input; default: kipi')
+	parser.add_option('-v', '--verbose', action='store_true',
+		dest='verbose', default=False,
+		help='report each token')
+	parser.add_option('-n', '--number-of-tags', type='int', action='store',
+		dest='num_tags', default=10,
+		help='set the max number of tags to report')
+	(options, args) = parser.parse_args()
+	# exactly one positional argument is accepted: the corpus path
+	if len(args) != 1:
+		print 'You need to provide an input corpus.'
+		print 'See %s --help' % sys.argv[0]
+		sys.exit(1)
+	
+	inpath = args[0]
+	# load a tagset, create a reader
+	tagset = corpus2.get_named_tagset(options.tagset)
+	rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, inpath)
+	# init stats (for this example)
+	num_toks, num_sents, num_chunks = 0, 0, 0
+	tag_count = dd(int) 
+	
+	for chunk in chunks(rdr):
+		for sent in chunk.sentences():
+			for tok in sent.tokens():
+				if options.verbose:
+					print tok.orth_utf8()
+				# the tag of every lexeme of the token is counted, not only one per token
+				for lex in tok.lexemes():
+					tag_str = tagset.tag_to_string(lex.tag())
+					tag_count[tag_str] += 1
+					
+					if options.verbose:
+						lemma = lex.lemma_utf8()
+						print ('+' if lex.is_disamb() else ' '), lemma, tag_str
+						# if you want a unicode object, orth_utf8().decode('utf-8')
+				num_toks += 1
+			num_sents += 1
+		num_chunks += 1
+	# summary: totals first, then tags by descending count (ties broken by tag string)
+	print 'Tokens:', num_toks
+	print 'Sents: ', num_sents
+	print 'Chunks:', num_chunks
+	print
+	print 'Most frequent tags:'
+	for tc in sorted(tag_count.items(), key=lambda tc: (-tc[1], tc[0]))[:options.num_tags]:
+		print '\t%s\t%d' % tc
+		
+
+
+if __name__ == '__main__':
+	go()
diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index 9b8fbd46064246325a2a2b8fe53d37261f137275..d351a6de1f3c64bd047701674e26c1a41e1461b3 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -57,6 +57,7 @@ SET(libcorpus2_STAT_SRC
 	tokenmetadata.cpp
 	io/cclreader.cpp
 	io/cclwriter.cpp
+	io/helpers.cpp
 	io/fastxces.cpp
 	io/iob-chan.cpp
 	io/nonewriter.cpp
@@ -76,6 +77,7 @@ SET(libcorpus2_STAT_SRC
 	io/xceswriter.cpp
 	io/xmlreader.cpp
 	io/xmlwriter.cpp
+	util/ioformat-options.cpp
 	util/settings.cpp
 	util/symboldictionary.cpp
 	util/tokentimer.cpp
diff --git a/libcorpus2/io/helpers.cpp b/libcorpus2/io/helpers.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9aac87844ee18597100b350a75af290b724aed34
--- /dev/null
+++ b/libcorpus2/io/helpers.cpp
@@ -0,0 +1,22 @@
+#include <libcorpus2/io/helpers.h>
+#include <libcorpus2/io/reader.h>
+#include <sstream>
+namespace Corpus2 {
+
+/// Parses `data` with the reader registered as `format` and returns every chunk read.
+std::vector<boost::shared_ptr<Chunk> > read_chunks_from_utf8_string(
+	const std::string& data,
+	const Tagset& tagset,
+	const std::string& format)
+{
+	std::stringstream buffer(data);
+	boost::shared_ptr<TokenReader> rdr =
+		TokenReader::create_stream_reader(format, tagset, buffer);
+	std::vector<boost::shared_ptr<Chunk> > result;
+	for (boost::shared_ptr<Chunk> c = rdr->get_next_chunk(); c; c = rdr->get_next_chunk()) {
+		result.push_back(c);
+	}
+	return result;
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/helpers.h b/libcorpus2/io/helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..92d8a896e4342572b399f91c9ccf82f2387d10ac
--- /dev/null
+++ b/libcorpus2/io/helpers.h
@@ -0,0 +1,16 @@
+#ifndef LIBCORPUS2_IO_HELPERS_H
+#define LIBCORPUS2_IO_HELPERS_H
+
+#include <libcorpus2/chunk.h>
+#include <libcorpus2/tagset.h>
+
+namespace Corpus2 {
+/// Reads all chunks from the in-memory string `data` using the reader registered as `format`.
+std::vector<boost::shared_ptr<Chunk> > read_chunks_from_utf8_string(
+	const std::string& data,
+	const Tagset& tagset,
+	const std::string& format);
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_HELPERS_H
diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp
index 67552dff647ab71e176e2b233e6d003383a63580..8f28008dafccacd006d0656fa87a81844df67257 100644
--- a/libcorpus2/io/rft.cpp
+++ b/libcorpus2/io/rft.cpp
@@ -25,7 +25,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 namespace Corpus2 {
 
 bool RftWriter::registered = TokenWriter::register_writer<RftWriter>(
-		"rft", "mbt,nowarn,colon,alltags,opt");
+		"rft", "mbt,nowarn,colon,alltags,opt,latin2");
 
 RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
 		const string_range_vector& params)
@@ -46,6 +46,8 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
 			opt_ = true;
 		} else if (p == "colon") {
 			colon_ = true;
+		} else if (p == "latin2") {
+			encoding_ = p;
 		}
 
 	}
@@ -53,7 +55,18 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
 
 void RftWriter::write_token(const Token& t)
 {
-	os() << t.orth_utf8();
+	if (encoding_.empty()) {
+		os() << t.orth_utf8();
+	} else {
+		char buf[256]; // extract() NUL-terminates only when the result is shorter than the capacity
+		int len = t.orth().extract(0, t.orth().length(), buf, sizeof(buf), encoding_.c_str());
+		if (len >= 0 && len < static_cast<int>(sizeof(buf))) {
+			os() << buf;
+		} else {
+			std::cerr << "Characetr encoding error in codepage rft output\n";
+			os() << "???";
+		}
+	}
 	if (t.lexemes().empty()) {
 		if (warn_on_no_lexemes_) {
 			std::cerr << "No lexemes for token!";
diff --git a/libcorpus2/io/rft.h b/libcorpus2/io/rft.h
index 394df9720346576e5ddf394a7ccdfaf61029f122..b87b5dd616b9df8ef614965cfdc994ea2f40c193 100644
--- a/libcorpus2/io/rft.h
+++ b/libcorpus2/io/rft.h
@@ -64,6 +64,9 @@ private:
 
 	/// Dialect flag: output all lexemes, not just the preferred one
 	bool alltags_;
+
+	/// Output codepage name; empty means the orth is written as UTF-8
+	std::string encoding_;
 };
 
 class RftReader : public BufferedSentenceReader
diff --git a/libcorpus2/util/ioformat-options.cpp b/libcorpus2/util/ioformat-options.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ed6ccf7e71c50004231cc9cab3c488da1b87691
--- /dev/null
+++ b/libcorpus2/util/ioformat-options.cpp
@@ -0,0 +1,66 @@
+#include <libcorpus2/util/ioformat-options.h>
+#include <boost/algorithm/string/join.hpp>
+
+namespace Corpus2 {
+
+/// Adds --input-format/-i to desc; its help text lists the available reader types.
+void add_input_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format /*= "xces"*/
+	)
+{
+	namespace po = boost::program_options;
+	const std::string help_text = "Input format, any of: " + boost::algorithm::join(
+		TokenReader::available_reader_types_help(), " ") + "\n";
+	desc.add_options()
+		("input-format,i",
+		 po::value<std::string>()->default_value(default_format), help_text.c_str());
+}
+
+/// Adds --output-format/-o to desc; its help text lists the available writer types.
+void add_output_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format /*= "xces"*/
+	)
+{
+	namespace po = boost::program_options;
+	const std::string help_text = "Output format, any of: " + boost::algorithm::join(
+		TokenWriter::available_writer_types_help(), " ") + "\n";
+	desc.add_options()
+		("output-format,o",
+		 po::value<std::string>()->default_value(default_format), help_text.c_str());
+}
+
+/// Builds a reader for the format in vm["input-format"]; "" or "-" selects std::cin.
+boost::shared_ptr<Corpus2::TokenReader> create_reader(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset,
+	const std::string& filename /*= "-"*/
+	)
+{
+	const std::string reader_type = vm["input-format"].as<std::string>();
+	const bool from_stdin = filename.empty() || filename == "-";
+	if (!from_stdin) {
+		return TokenReader::create_path_reader(reader_type, tagset, filename);
+	}
+	// no usable path given: read the corpus from standard input
+	return TokenReader::create_stream_reader(reader_type, tagset, std::cin);
+}
+
+/// Builds a writer for the format in vm["output-format"]; "" or "-" selects std::cout.
+boost::shared_ptr<Corpus2::TokenWriter> create_writer(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset,
+	const std::string& filename /*= "-"*/
+	)
+{
+	const std::string writer_type = vm["output-format"].as<std::string>();
+	const bool to_stdout = filename.empty() || filename == "-";
+	if (!to_stdout) {
+		return TokenWriter::create_path_writer(writer_type, filename, tagset);
+	}
+	// no usable path given: write the corpus to standard output
+	return TokenWriter::create_stream_writer(writer_type, std::cout, tagset);
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/util/ioformat-options.h b/libcorpus2/util/ioformat-options.h
new file mode 100644
index 0000000000000000000000000000000000000000..cab00546d311f2db27d3b2924f579702b2fa90fc
--- /dev/null
+++ b/libcorpus2/util/ioformat-options.h
@@ -0,0 +1,34 @@
+#ifndef LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
+#define LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
+
+#include <boost/program_options.hpp>
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/writer.h>
+
+namespace Corpus2 {
+/// Registers the "input-format" (-i) option; its help lists the available readers.
+void add_input_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format = "xces"
+	);
+/// Registers the "output-format" (-o) option; its help lists the available writers.
+void add_output_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format = "xces"
+	);
+/// Creates a reader of the format stored in vm; an empty filename or "-" reads std::cin.
+boost::shared_ptr<Corpus2::TokenReader> create_reader(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset,
+	const std::string& filename = "-"
+	);
+/// Creates a writer of the format stored in vm; an empty filename or "-" writes std::cout.
+boost::shared_ptr<Corpus2::TokenWriter> create_writer(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset,
+	const std::string& filename = "-"
+	);
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
diff --git a/swig/CMakeLists.txt b/swig/CMakeLists.txt
index 978ece174dff98284913c1d162a075b38422948d..b61084c9d56e15d6dec10f99ce8b0280b6b6e35e 100644
--- a/swig/CMakeLists.txt
+++ b/swig/CMakeLists.txt
@@ -2,15 +2,12 @@
 
 PROJECT(corpus2SwigWrap)
 
-find_package(Corpus2 1.0.8 REQUIRED)
-set(CORPUS2_LIBS ${Corpus2_LIBRARY})
-
-find_package(PwrUtils 1.0.1 REQUIRED)
-set(PWRUTILS_LIBS ${PwrUtils_LIBRARY})
+set(CORPUS2_LIBS corpus2)  # plain library name instead of a find_package result; presumably the in-tree target
+set(PWRUTILS_LIBS pwrutils)  # same for pwrutils
 set(CORPUS2_PWR_LIBS ${CORPUS2_LIBS} ${PWRUTILS_LIBS})
 
-include_directories (${Libcorpus2_SOURCE_DIR} "../libcorpus2")
-include_directories (${Libpwrutils_SOURCE_DIR} "../libpwrutils")
+include_directories (${corpus2_SOURCE_DIR})
+include_directories (${pwrutils_SOURCE_DIR})
 
 link_directories(${Libcorpus2_BINARY_DIR})
 
@@ -31,6 +28,12 @@ message(STATUS "INFO: " "python lib: ${PYTHON_INSTDIR}" )
 INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_PATH})
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 
+if(CMAKE_COMPILER_IS_GNUCXX)  # the flags below are set for GCC only
+	set(CMAKE_CXX_FLAGS "-ansi $ENV{CXXFLAGS}")  # overwrites any previous value of CMAKE_CXX_FLAGS
+	set(CMAKE_CXX_FLAGS_DEBUG "-O0 -DDEBUG -ggdb3 -ansi $ENV{CXXFLAGS}")
+	set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -fno-omit-frame-pointer")  # NOTE(review): no $ENV{CXXFLAGS} here, unlike the two above
+endif(CMAKE_COMPILER_IS_GNUCXX)
+
 # -----------------------------------------------------------------------------
 # -----------------------------------------------------------------------------
 
diff --git a/swig/boost_shared_ptr.i b/swig/boost_shared_ptr.i
index 7803b22154c79cba53773442e3387ca0e466f7da..6f3ec42a0061504e01d2802c46156964fa3928fe 100644
--- a/swig/boost_shared_ptr.i
+++ b/swig/boost_shared_ptr.i
@@ -14,6 +14,14 @@ namespace boost {
     shared_ptr();
     shared_ptr(T * p);
     T* operator->();
+    T* get();
+    // Python truth test: a wrapper is true exactly when get() does not return None
+%pythoncode %{
+  def __bool__(self):
+     return self.get() is not None
+  __nonzero__=__bool__
+%}
+
   private:
     T * px;
     int pn;
diff --git a/swig/libcorpuschunk.i b/swig/libcorpuschunk.i
index baa3fc1e715e4a2aa2b401ce545ce300432bf1c9..df798640ec75a6ac7f0ddfd113c4aeb3050f82ce 100644
--- a/swig/libcorpuschunk.i
+++ b/swig/libcorpuschunk.i
@@ -46,6 +46,7 @@ namespace Corpus2 {
   };
 }
 
+%template(ChunkPtrVector) std::vector<boost::shared_ptr<Corpus2::Chunk> >;
 using namespace std;
 using namespace Corpus2;
 
diff --git a/swig/libcorpustokenreader.i b/swig/libcorpustokenreader.i
index 93043ff8446bdb1adc53001c4da9781f2399dae1..ef2f16cb1e7fd19f01560be26fa64b1a518d5e33 100644
--- a/swig/libcorpustokenreader.i
+++ b/swig/libcorpustokenreader.i
@@ -4,6 +4,7 @@
 %module libcorpustokenreader
 %{
   #include <libcorpus2/io/reader.h>
+  #include <libcorpus2/io/helpers.h>
 %}
 
 %include "libcorpustag.i"
@@ -17,13 +18,15 @@
 
 %nodefaultctor Corpus2::TokenReader;
 %template(TokenReaderPtr) boost::shared_ptr<Corpus2::TokenReader>;
+%template(TokenPtr) boost::shared_ptr<Corpus2::Token>;
 // %template(StdStringVector) std::vector<std::string>;
 // %template(ChunkPtr) boost::shared_ptr<Corpus2::Chunk>;
-
+typedef boost::shared_ptr<Corpus2::Token> TokenPtr;
 namespace Corpus2 {
   class TokenReader {
   public:
     typedef boost::shared_ptr<TokenReader> TokenReaderPtr;
+    //typedef boost::shared_ptr<Token> TokenPtr;
 
     /* --------------------------------------------------------------------- */
     explicit TokenReader(const Tagset& tagset);
@@ -38,6 +41,7 @@ namespace Corpus2 {
         return NULL;
       }
     }
+    %feature("autodoc", "1");
     static TokenReaderPtr create_path_reader(
       const std::string& class_id,
       const Tagset& tagset,
@@ -51,13 +55,14 @@ namespace Corpus2 {
         return NULL;
       }
     }
+    %feature("autodoc", "1");
     static TokenReaderPtr create_stream_reader(
       const std::string& class_id,
       const Tagset& tagset,
       std::istream& stream);
 
     /* --------------------------------------------------------------------- */
-    virtual Token* get_next_token() = 0;
+    /* virtual Token* get_next_token() = 0; */
     virtual Sentence::Ptr get_next_sentence() = 0;
     virtual boost::shared_ptr<Chunk> get_next_chunk() = 0;
 
@@ -73,6 +78,18 @@ namespace Corpus2 {
     static std::string reader_help(const std::string& class_id);
     static std::vector<std::string> available_reader_types_help();
   };
+
+  %extend TokenReader {
+    /* Replaces the native get_next_token: the raw Token* is handed to a shared_ptr, which then owns it */
+    boost::shared_ptr<Corpus2::Token> get_next_token() {
+      return boost::shared_ptr<Corpus2::Token>(self->get_next_token());
+    }
+  }
+
+%feature("autodoc", "1");
+  std::vector<boost::shared_ptr<Chunk> > read_chunks_from_utf8_string(
+    const std::string& data, const Tagset& tagset, const std::string& format);
+ 
 }
 
 using namespace std;