From 1084636d8bf7cc6a03914eefef5ad0463fc6015d Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Fri, 20 May 2011 14:59:07 +0200
Subject: [PATCH] RFT writer latin2 flag

---
 libcorpus2/io/rft.cpp | 17 +++++++++++++++--
 libcorpus2/io/rft.h   |  3 +++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp
index ead4021..1624404 100644
--- a/libcorpus2/io/rft.cpp
+++ b/libcorpus2/io/rft.cpp
@@ -25,7 +25,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 namespace Corpus2 {
 
 bool RftWriter::registered = TokenWriter::register_writer<RftWriter>(
-		"rft", "mbt,nowarn,colon,alltags,opt");
+		"rft", "mbt,nowarn,colon,alltags,opt,latin2");
 
 RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
 		const string_range_vector& params)
@@ -46,6 +46,8 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
 			opt_ = true;
 		} else if (p == "colon") {
 			colon_ = true;
+		} else if (p == "latin2") {
+			encoding_ = p;
 		}
 
 	}
@@ -53,7 +55,18 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
 
 void RftWriter::write_token(const Token& t)
 {
-	os() << t.orth_utf8();
+	if (encoding_.empty()) {
+		os() << t.orth_utf8();
+	} else {
+		char buf[256];
+		int len = t.orth().extract(0, t.orth().length(), buf, 255, encoding_.c_str());
+		if (len < 256) {
+			os() << buf;
+		} else {
+			std::cerr << "Characetr encoding error in codepage rft output\n";
+			os() << "???";
+		}
+	}
 	if (t.lexemes().empty()) {
 		if (warn_on_no_lexemes_) {
 			std::cerr << "No lexemes for token!";
diff --git a/libcorpus2/io/rft.h b/libcorpus2/io/rft.h
index 394df97..b87b5dd 100644
--- a/libcorpus2/io/rft.h
+++ b/libcorpus2/io/rft.h
@@ -64,6 +64,9 @@ private:
 
 	/// Dialect flag: output all lexemes, not just the preferred one
 	bool alltags_;
+
+	/// Dialect flag: use non-utf8 encoding
+	std::string encoding_;
 };
 
 class RftReader : public BufferedSentenceReader
-- 
GitLab