Skip to content
Snippets Groups Projects
Commit 5f30bdd8 authored by Adam Radziszewski
Browse files

RFT format dialect for TiMBL/MBT

parent 6154cd1f
Branches
No related merge requests found
...@@ -24,17 +24,21 @@ or FITNESS FOR A PARTICULAR PURPOSE. ...@@ -24,17 +24,21 @@ or FITNESS FOR A PARTICULAR PURPOSE.
namespace Corpus2 { namespace Corpus2 {
bool RftWriter::registered = TokenWriter::register_writer<RftWriter>( bool RftWriter::registered = TokenWriter::register_writer<RftWriter>(
"rft", ""); "rft", "mbt");
RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
const string_range_vector& params) const string_range_vector& params)
: TokenWriter(os, tagset, params), warn_on_no_lexemes_(true) : TokenWriter(os, tagset, params), warn_on_no_lexemes_(true)
, mbt_dialect_(false)
{ {
foreach (const string_range& param, params) { foreach (const string_range& param, params) {
std::string p = boost::copy_range<std::string>(param); std::string p = boost::copy_range<std::string>(param);
if (p == "nowarn") { if (p == "nowarn") {
warn_on_no_lexemes_ = false; warn_on_no_lexemes_ = false;
} }
else if (p == "mbt") {
mbt_dialect_ = true;
}
} }
} }
...@@ -48,7 +52,9 @@ void RftWriter::write_token(const Token& t) ...@@ -48,7 +52,9 @@ void RftWriter::write_token(const Token& t)
} else { } else {
const Lexeme& pref = t.get_preferred_lexeme(tagset()); const Lexeme& pref = t.get_preferred_lexeme(tagset());
std::string tag_str = tagset().tag_to_no_opt_string(pref.tag()); std::string tag_str = tagset().tag_to_no_opt_string(pref.tag());
os() << boost::algorithm::replace_all_copy(tag_str, ":", "."); os () << (mbt_dialect_
? tag_str // when MBT-compliant, suppress colon substitution
: boost::algorithm::replace_all_copy(tag_str, ":", "."));
} }
os() << "\n"; os() << "\n";
} }
...@@ -58,6 +64,9 @@ void RftWriter::write_sentence(const Sentence& s) ...@@ -58,6 +64,9 @@ void RftWriter::write_sentence(const Sentence& s)
foreach (const Token* t, s.tokens()) { foreach (const Token* t, s.tokens()) {
write_token(*t); write_token(*t);
} }
if (mbt_dialect_) {
os() << "<utt>";
}
os() << "\n"; os() << "\n";
} }
...@@ -68,8 +77,10 @@ void RftWriter::write_chunk(const Chunk& c) ...@@ -68,8 +77,10 @@ void RftWriter::write_chunk(const Chunk& c)
} }
} }
RftReader::RftReader(const Tagset& tagset, std::istream& is, bool disamb) RftReader::RftReader(const Tagset& tagset, std::istream& is, bool disamb,
bool mbt_dialect)
: BufferedSentenceReader(tagset), is_(is), disamb_(disamb) : BufferedSentenceReader(tagset), is_(is), disamb_(disamb)
, mbt_dialect_(mbt_dialect)
{ {
} }
...@@ -79,7 +90,8 @@ Sentence::Ptr RftReader::actual_next_sentence() ...@@ -79,7 +90,8 @@ Sentence::Ptr RftReader::actual_next_sentence()
Sentence::Ptr s; Sentence::Ptr s;
while (is().good()) { while (is().good()) {
std::getline(is(), line); std::getline(is(), line);
if (line.empty()) { if (line.empty()
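|| (mbt_dialect_ && line.find_first_of("<utt>") == 0)) { // TODO: check -- find_first_of() matches ANY single character from the set {'<','u','t','>'}, so this is true for every line that merely starts with one of those characters (e.g. a token whose orth begins with 'u' or 't'), not only for the literal "<utt>" marker; a prefix test such as line.compare(0, 5, "<utt>") == 0 is probably what is intended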
return s; return s;
} else { } else {
size_t tab = line.find('\t'); size_t tab = line.find('\t');
...@@ -88,7 +100,9 @@ Sentence::Ptr RftReader::actual_next_sentence() ...@@ -88,7 +100,9 @@ Sentence::Ptr RftReader::actual_next_sentence()
} else { } else {
std::string orth = line.substr(0, tab); std::string orth = line.substr(0, tab);
std::string tag_string = line.substr(tab + 1); std::string tag_string = line.substr(tab + 1);
boost::algorithm::replace_all(tag_string, ".", ":"); if (!mbt_dialect_) {
boost::algorithm::replace_all(tag_string, ".", ":");
}
Tag tag = tagset().parse_simple_tag(tag_string); Tag tag = tagset().parse_simple_tag(tag_string);
Token* t = new Token(); Token* t = new Token();
t->set_orth(UnicodeString::fromUTF8(orth)); t->set_orth(UnicodeString::fromUTF8(orth));
......
...@@ -48,12 +48,15 @@ public: ...@@ -48,12 +48,15 @@ public:
private: private:
bool warn_on_no_lexemes_; bool warn_on_no_lexemes_;
/// Whether using TiMBL/MBT variant (slightly different than RFT per se).
bool mbt_dialect_;
}; };
class RftReader : public BufferedSentenceReader class RftReader : public BufferedSentenceReader
{ {
public: public:
RftReader(const Tagset& tagset, std::istream& is, bool disamb); RftReader(const Tagset& tagset, std::istream& is, bool disamb,
bool mbt_dialect = false); // TODO move to some sort of params
std::istream& is() { std::istream& is() {
return is_; return is_;
...@@ -66,6 +69,8 @@ protected: ...@@ -66,6 +69,8 @@ protected:
std::istream& is_; std::istream& is_;
bool disamb_; bool disamb_;
/// Whether using TiMBL/MBT variant (slightly different than RFT per se).
bool mbt_dialect_;
}; };
......
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment