From 1084636d8bf7cc6a03914eefef5ad0463fc6015d Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Fri, 20 May 2011 14:59:07 +0200 Subject: [PATCH] RFT writer latin2 flag --- libcorpus2/io/rft.cpp | 17 +++++++++++++++-- libcorpus2/io/rft.h | 3 +++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp index ead4021..1624404 100644 --- a/libcorpus2/io/rft.cpp +++ b/libcorpus2/io/rft.cpp @@ -25,7 +25,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. namespace Corpus2 { bool RftWriter::registered = TokenWriter::register_writer<RftWriter>( - "rft", "mbt,nowarn,colon,alltags,opt"); + "rft", "mbt,nowarn,colon,alltags,opt,latin2"); RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, const string_range_vector& params) @@ -46,6 +46,8 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, opt_ = true; } else if (p == "colon") { colon_ = true; + } else if (p == "latin2") { + encoding_ = p; } } @@ -53,7 +55,18 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, void RftWriter::write_token(const Token& t) { - os() << t.orth_utf8(); + if (encoding_.empty()) { + os() << t.orth_utf8(); + } else { + char buf[256]; + int len = t.orth().extract(0, t.orth().length(), buf, 255, encoding_.c_str()); + if (len < 256) { + os() << buf; + } else { + std::cerr << "Characetr encoding error in codepage rft output\n"; + os() << "???"; + } + } if (t.lexemes().empty()) { if (warn_on_no_lexemes_) { std::cerr << "No lexemes for token!"; diff --git a/libcorpus2/io/rft.h b/libcorpus2/io/rft.h index 394df97..b87b5dd 100644 --- a/libcorpus2/io/rft.h +++ b/libcorpus2/io/rft.h @@ -64,6 +64,9 @@ private: /// Dialect flag: output all lexemes, not just the preferred one bool alltags_; + + /// Dialect flag: use non-utf8 encoding + std::string encoding_; }; class RftReader : public BufferedSentenceReader -- GitLab