Skip to content
Snippets Groups Projects
Commit 1084636d authored by ilor's avatar ilor
Browse files

RFT writer latin2 flag

parent bca77ee2
No related merge requests found
...@@ -25,7 +25,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. ...@@ -25,7 +25,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
namespace Corpus2 { namespace Corpus2 {
bool RftWriter::registered = TokenWriter::register_writer<RftWriter>( bool RftWriter::registered = TokenWriter::register_writer<RftWriter>(
"rft", "mbt,nowarn,colon,alltags,opt"); "rft", "mbt,nowarn,colon,alltags,opt,latin2");
RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
const string_range_vector& params) const string_range_vector& params)
...@@ -46,6 +46,8 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, ...@@ -46,6 +46,8 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
opt_ = true; opt_ = true;
} else if (p == "colon") { } else if (p == "colon") {
colon_ = true; colon_ = true;
} else if (p == "latin2") {
encoding_ = p;
} }
} }
...@@ -53,7 +55,18 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, ...@@ -53,7 +55,18 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
void RftWriter::write_token(const Token& t) void RftWriter::write_token(const Token& t)
{ {
os() << t.orth_utf8(); if (encoding_.empty()) {
os() << t.orth_utf8();
} else {
char buf[256];
int len = t.orth().extract(0, t.orth().length(), buf, 255, encoding_.c_str());
if (len < 256) {
os() << buf;
} else {
std::cerr << "Characetr encoding error in codepage rft output\n";
os() << "???";
}
}
if (t.lexemes().empty()) { if (t.lexemes().empty()) {
if (warn_on_no_lexemes_) { if (warn_on_no_lexemes_) {
std::cerr << "No lexemes for token!"; std::cerr << "No lexemes for token!";
......
...@@ -64,6 +64,9 @@ private: ...@@ -64,6 +64,9 @@ private:
/// Dialect flag: output all lexemes, not just the preferred one /// Dialect flag: output all lexemes, not just the preferred one
bool alltags_; bool alltags_;
/// Dialect flag: use non-utf8 encoding
std::string encoding_;
}; };
class RftReader : public BufferedSentenceReader class RftReader : public BufferedSentenceReader
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment