diff --git a/libcorpus2/io/plainwriter.cpp b/libcorpus2/io/plainwriter.cpp index 225afe450d266e5f16373061238551d20626ef46..7ab72e7fc103182785362cfe9efc0a722a288165 100644 --- a/libcorpus2/io/plainwriter.cpp +++ b/libcorpus2/io/plainwriter.cpp @@ -20,38 +20,63 @@ or FITNESS FOR A PARTICULAR PURPOSE. namespace Corpus2 { bool PlainWriter::registered = TokenWriter::register_writer<PlainWriter>( - "plain"); + "plain", "nows,no_disamb_info,disamb_only,ds"); PlainWriter::PlainWriter(std::ostream& os, const Tagset& tagset, const string_range_vector& params) - : TokenWriter(os, tagset, params) + : TokenWriter(os, tagset, params), ws_(true), disamb_(true) + , disamb_only_(false) { + foreach (const string_range& param, params) { + std::string p = boost::copy_range<std::string>(param); + if (p == "nows") { + ws_ = false; + } else if (p == "no_disamb_info") { + disamb_ = false; + } else if (p == "disamb_only") { + disamb_only_ = true; + } else if (p == "ds") { + disamb_ = false; + disamb_only_ = true; + } + } } void PlainWriter::write_token(const Token &t) { - os() << t.orth_utf8() << "\n"; + os() << t.orth_utf8(); + if (ws_) { + os() << "\t" << PwrNlp::Whitespace::to_string(t.wa()); + } + os() << "\n"; foreach (const Lexeme& lex, t.lexemes()) { - os() << "\t" << lex.lemma_utf8() << "\t" - << tagset().tag_to_string(lex.tag()) << "\n"; + if (!disamb_only_ || lex.is_disamb()) { + os() << "\t" << lex.lemma_utf8() << "\t" + << tagset().tag_to_string(lex.tag()); + if (disamb_) { + if (lex.is_disamb()) { + os() << "\t"; + os() << "disamb"; + } + } + os() << "\n"; + } } } void PlainWriter::write_sentence(const Sentence &s) { - os() << "[[[\n"; foreach (const Token* t, s.tokens()) { write_token(*t); } - os() << "]]]\n"; + os() << "\n"; } void PlainWriter::write_chunk(const Chunk& c) { - os() << "[[[<<<\n\n"; foreach (const boost::shared_ptr<Sentence>& s, c.sentences()) { write_sentence(*s); } - os() << ">>>]]]\n\n"; + os() << "\n"; } } /* end ns Corpus2 */ diff --git a/libcorpus2/io/plainwriter.h b/libcorpus2/io/plainwriter.h index 63c5f87df2588f98a1537ed20165cbaffe32473d..2a852bef8a8fe2ecfd34ff6f5c98677ab65388fa 100644 --- a/libcorpus2/io/plainwriter.h +++ b/libcorpus2/io/plainwriter.h @@ -34,6 +34,13 @@ public: void write_chunk(const Chunk& c); static bool registered; + +private: + bool ws_; + + bool disamb_; + + bool disamb_only_; }; } /* end ns Corpus2 */