Skip to content
Snippets Groups Projects
Commit 1084636d authored by ilor's avatar ilor
Browse files

RFT writer latin2 flag

parent bca77ee2
Branches
No related tags found
No related merge requests found
...@@ -25,7 +25,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. ...@@ -25,7 +25,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
namespace Corpus2 { namespace Corpus2 {
bool RftWriter::registered = TokenWriter::register_writer<RftWriter>( bool RftWriter::registered = TokenWriter::register_writer<RftWriter>(
"rft", "mbt,nowarn,colon,alltags,opt"); "rft", "mbt,nowarn,colon,alltags,opt,latin2");
RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
const string_range_vector& params) const string_range_vector& params)
...@@ -46,6 +46,8 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, ...@@ -46,6 +46,8 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
opt_ = true; opt_ = true;
} else if (p == "colon") { } else if (p == "colon") {
colon_ = true; colon_ = true;
} else if (p == "latin2") {
encoding_ = p;
} }
} }
...@@ -53,7 +55,18 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, ...@@ -53,7 +55,18 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
void RftWriter::write_token(const Token& t) void RftWriter::write_token(const Token& t)
{ {
if (encoding_.empty()) {
os() << t.orth_utf8(); os() << t.orth_utf8();
} else {
char buf[256];
int len = t.orth().extract(0, t.orth().length(), buf, 255, encoding_.c_str());
if (len < 256) {
os() << buf;
} else {
std::cerr << "Characetr encoding error in codepage rft output\n";
os() << "???";
}
}
if (t.lexemes().empty()) { if (t.lexemes().empty()) {
if (warn_on_no_lexemes_) { if (warn_on_no_lexemes_) {
std::cerr << "No lexemes for token!"; std::cerr << "No lexemes for token!";
......
...@@ -64,6 +64,9 @@ private: ...@@ -64,6 +64,9 @@ private:
/// Dialect flag: output all lexemes, not just the preferred one /// Dialect flag: output all lexemes, not just the preferred one
bool alltags_; bool alltags_;
/// Dialect option: name of the non-UTF-8 output encoding (empty means UTF-8 output)
std::string encoding_;
}; };
class RftReader : public BufferedSentenceReader class RftReader : public BufferedSentenceReader
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment