diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp
index ead40217893a217749ec7e8b62a945c5dfdb36e6..1624404a32ad7cff11e7578433939d86e7cc0bb0 100644
--- a/libcorpus2/io/rft.cpp
+++ b/libcorpus2/io/rft.cpp
@@ -25,7 +25,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 namespace Corpus2 {
 
 bool RftWriter::registered = TokenWriter::register_writer<RftWriter>(
-		"rft", "mbt,nowarn,colon,alltags,opt");
+		"rft", "mbt,nowarn,colon,alltags,opt,latin2");
 
 RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
 		const string_range_vector& params)
@@ -46,6 +46,8 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
 			opt_ = true;
 		} else if (p == "colon") {
 			colon_ = true;
+		} else if (p == "latin2") {
+			encoding_ = p;
 		}
 	}
 
@@ -53,7 +55,24 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
 
 void RftWriter::write_token(const Token& t)
 {
-	os() << t.orth_utf8();
+	if (encoding_.empty()) {
+		os() << t.orth_utf8();
+	} else {
+		// Convert the orth to the requested codepage. extract() returns the
+		// full converted length (excluding NUL) even when it does not fit,
+		// and only NUL-terminates when there is room, so write exactly len
+		// bytes and only when the whole result fit in the buffer.
+		char buf[256];
+		const int capacity = static_cast<int>(sizeof(buf));
+		const int len = t.orth().extract(0, t.orth().length(), buf,
+				sizeof(buf), encoding_.c_str());
+		if (len >= 0 && len <= capacity) {
+			os().write(buf, len);
+		} else {
+			std::cerr << "Character encoding error in codepage rft output\n";
+			os() << "???";
+		}
+	}
 	if (t.lexemes().empty()) {
 		if (warn_on_no_lexemes_) {
 			std::cerr << "No lexemes for token!";
diff --git a/libcorpus2/io/rft.h b/libcorpus2/io/rft.h
index 394df9720346576e5ddf394a7ccdfaf61029f122..b87b5dd616b9df8ef614965cfdc994ea2f40c193 100644
--- a/libcorpus2/io/rft.h
+++ b/libcorpus2/io/rft.h
@@ -64,6 +64,9 @@ private:
 
 	/// Dialect flag: output all lexemes, not just the preferred one
 	bool alltags_;
+
+	/// Output encoding name (e.g. "latin2"); empty means orths are written as UTF-8
+	std::string encoding_;
 };
 
 class RftReader : public BufferedSentenceReader