diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp
index 424fd0cecaafda2a340a8cf69c2e96eab3493ab0..9a566fca4d0cd8a05addfa4d01245f379b085e90 100644
--- a/libcorpus2/io/rft.cpp
+++ b/libcorpus2/io/rft.cpp
@@ -24,17 +24,21 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 namespace Corpus2 {
 
 bool RftWriter::registered = TokenWriter::register_writer<RftWriter>(
-		"rft", "");
+		"rft", "mbt");
 
 RftWriter::RftWriter(std::ostream& os, const Tagset& tagset,
 		const string_range_vector& params)
 	: TokenWriter(os, tagset, params), warn_on_no_lexemes_(true)
+	, mbt_dialect_(false)
 {
 	foreach (const string_range& param, params) {
 		std::string p = boost::copy_range<std::string>(param);
 		if (p == "nowarn") {
 			warn_on_no_lexemes_ = false;
 		}
+		else if (p == "mbt") {
+			mbt_dialect_ = true;
+		}
 	}
 }
 
@@ -48,7 +52,9 @@ void RftWriter::write_token(const Token& t)
 	} else {
 		const Lexeme& pref = t.get_preferred_lexeme(tagset());
 		std::string tag_str = tagset().tag_to_no_opt_string(pref.tag());
-		os() << boost::algorithm::replace_all_copy(tag_str, ":", ".");
+		os() << (mbt_dialect_
+			? tag_str // MBT dialect: emit the tag unchanged, no ':' -> '.' substitution
+			: boost::algorithm::replace_all_copy(tag_str, ":", "."));
 	}
 	os() << "\n";
 }
@@ -58,6 +64,9 @@ void RftWriter::write_sentence(const Sentence& s)
 	foreach (const Token* t, s.tokens()) {
 		write_token(*t);
 	}
+	if (mbt_dialect_) {
+		os() << "<utt>";
+	}
 	os() << "\n";
 }
 
@@ -68,8 +77,10 @@ void RftWriter::write_chunk(const Chunk& c)
 	}
 }
 
-RftReader::RftReader(const Tagset& tagset, std::istream& is, bool disamb)
+RftReader::RftReader(const Tagset& tagset, std::istream& is, bool disamb,
+		bool mbt_dialect)
 	: BufferedSentenceReader(tagset), is_(is), disamb_(disamb)
+	, mbt_dialect_(mbt_dialect)
 {
 }
 
@@ -79,7 +90,8 @@ Sentence::Ptr RftReader::actual_next_sentence()
 	Sentence::Ptr s;
 	while (is().good()) {
 		std::getline(is(), line);
-		if (line.empty()) {
+		if (line.empty()
+			|| (mbt_dialect_ && line.compare(0, 5, "<utt>") == 0)) { // whole "<utt>" prefix, not any one of its chars
 			return s;
 		} else {
 			size_t tab = line.find('\t');
@@ -88,7 +100,9 @@ Sentence::Ptr RftReader::actual_next_sentence()
 			} else {
 				std::string orth = line.substr(0, tab);
 				std::string tag_string = line.substr(tab + 1);
-				boost::algorithm::replace_all(tag_string, ".", ":");
+				if (!mbt_dialect_) {
+					boost::algorithm::replace_all(tag_string, ".", ":");
+				}
 				Tag tag = tagset().parse_simple_tag(tag_string);
 				Token* t = new Token();
 				t->set_orth(UnicodeString::fromUTF8(orth));
diff --git a/libcorpus2/io/rft.h b/libcorpus2/io/rft.h
index fcb0d6b6f406f40cbe5b676aa33e6610cd1d8b98..057a2b44c37024a2ca00117a399d0161be5c5b10 100644
--- a/libcorpus2/io/rft.h
+++ b/libcorpus2/io/rft.h
@@ -48,12 +48,15 @@ public:
 
 private:
 	bool warn_on_no_lexemes_;
+	/// TiMBL/MBT dialect: tags are written without ':' -> '.' substitution and "<utt>" ends each sentence.
+	bool mbt_dialect_;
 };
 
 class RftReader : public BufferedSentenceReader
 {
 public:
-	RftReader(const Tagset& tagset, std::istream& is, bool disamb);
+	RftReader(const Tagset& tagset, std::istream& is, bool disamb,
+		bool mbt_dialect = false); // TODO move to some sort of params
 
 	std::istream& is() {
 		return is_;
@@ -66,6 +69,8 @@ protected:
 
 	std::istream& is_;
 
 	bool disamb_;
+	/// TiMBL/MBT dialect: tags are read without '.' -> ':' substitution and a "<utt>" line ends a sentence.
+	bool mbt_dialect_;
 };
 