Skip to content
Snippets Groups Projects
Commit 1084636d authored by ilor's avatar ilor
Browse files

RFT writer latin2 flag

parent bca77ee2
No related branches found
No related tags found
No related merge requests found
......@@ -25,7 +25,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
namespace Corpus2 {
bool RftWriter::registered = TokenWriter::register_writer<RftWriter>(
"rft", "mbt,nowarn,colon,alltags,opt");
"rft", "mbt,nowarn,colon,alltags,opt,latin2");
RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
const string_range_vector& params)
......@@ -46,6 +46,8 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
opt_ = true;
} else if (p == "colon") {
colon_ = true;
} else if (p == "latin2") {
encoding_ = p;
}
}
......@@ -53,7 +55,18 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
void RftWriter::write_token(const Token& t)
{
if (encoding_.empty()) {
os() << t.orth_utf8();
} else {
char buf[256];
int len = t.orth().extract(0, t.orth().length(), buf, 255, encoding_.c_str());
if (len < 256) {
os() << buf;
} else {
std::cerr << "Characetr encoding error in codepage rft output\n";
os() << "???";
}
}
if (t.lexemes().empty()) {
if (warn_on_no_lexemes_) {
std::cerr << "No lexemes for token!";
......
......@@ -64,6 +64,9 @@ private:
/// Dialect flag: output all lexemes, not just the preferred one
bool alltags_;
/// Output codepage name used when converting the orth (set by the "latin2" param); empty means UTF-8 output
std::string encoding_;
};
class RftReader : public BufferedSentenceReader
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment