diff --git a/libcorpus2/io/fastxces.cpp b/libcorpus2/io/fastxces.cpp index 25d4a8c74097cbc128fb9c0d40caf757007b26be..a4813bbc53ef4af326eca28ae3368ba0ded5c1b0 100644 --- a/libcorpus2/io/fastxces.cpp +++ b/libcorpus2/io/fastxces.cpp @@ -104,7 +104,7 @@ private: if (!sent_->empty()) { chunk_->append(sent_); } - sent_ = boost::make_shared<Sentence>(); + sent_ = base_reader_.make_sentence(); tok_->set_wa(PwrNlp::Whitespace::Newline); } else { if (!chunk_->empty()) { diff --git a/libcorpus2/io/reader.cpp b/libcorpus2/io/reader.cpp index 0c75d9a29f9473a4f864bce0ad4c38787326ddb0..51777bf976ea06db7495bdf5512f3f4d6c67b93f 100644 --- a/libcorpus2/io/reader.cpp +++ b/libcorpus2/io/reader.cpp @@ -17,12 +17,14 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <libcorpus2/io/reader.h> #include <boost/make_shared.hpp> #include <boost/algorithm/string.hpp> +#include <libcorpus2/ann/annotatedsentence.h> #include <sstream> namespace Corpus2 { TokenReader::TokenReader(const Tagset& tagset) - : tagset_(tagset), tag_parse_mode_(Tagset::ParseDefault) + : tagset_(tagset), tag_parse_mode_(Tagset::ParseDefault), + use_annotated_sentences_(false) /* plain Sentence objects are created until the "ann" option is set */ { } @@ -41,6 +43,8 @@ void TokenReader::set_option(const std::string &option) } else if (option == "strict") { tag_parse_mode_ = static_cast<Tagset::ParseMode>( Tagset::ParseDefault | (tag_parse_mode_ & Tagset::ParseFailWithIgn)); + } else if (option == "ann") { + use_annotated_sentences_ = true; /* from now on make_sentence() creates AnnotatedSentence objects */ } else { throw Corpus2Error("Unknown option passed to reader: " + option); } @@ -56,11 +60,22 @@ std::string TokenReader::get_option(const std::string &option) const } else if (option == "strict") { return (tag_parse_mode_ & ~Tagset::ParseFailWithIgn) == Tagset::ParseDefault ? option : ""; + } else if (option == "ann") { /* yields the option name when annotated sentences are enabled, an empty string otherwise */ + return use_annotated_sentences_ ? 
option : ""; } else { return "unknown"; } } +/** Sentence factory for readers: returns a default-constructed AnnotatedSentence when the "ann" option has been set on this reader, and a default-constructed Sentence otherwise. Either way the result is handed out as a shared pointer to the Sentence base type. */ boost::shared_ptr<Sentence> TokenReader::make_sentence() const +{ + if (use_annotated_sentences_) { + return boost::make_shared<AnnotatedSentence>(); + } else { + return boost::make_shared<Sentence>(); + } +} + boost::shared_ptr<TokenReader> TokenReader::create_path_reader( const std::string& class_id_params, const Tagset& tagset, diff --git a/libcorpus2/io/reader.h b/libcorpus2/io/reader.h index b0b3a8bcc70b440ba5bc3f9a16638674ac78d000..4f52aac410b116fdee0ceec9b1c49ffc46d1eab0 100644 --- a/libcorpus2/io/reader.h +++ b/libcorpus2/io/reader.h @@ -152,12 +152,17 @@ public: tag_parse_mode_ = mode; } + /** Creates a new sentence object for the reader to fill: an AnnotatedSentence if the "ann" option is set, a plain Sentence otherwise. */ boost::shared_ptr<Sentence> make_sentence() const; + private: /// Tagset used by the Reader const Tagset& tagset_; /// Tag parse mode Tagset::ParseMode tag_parse_mode_; + + /// Flag to force creation of sentences as AnnotatedSentences + bool use_annotated_sentences_; }; namespace detail { diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp index b35ed8627bb7314ffd0ffed1b86f4b647d157e77..73003a6b8911258a1015e095f3b8008c85738734 100644 --- a/libcorpus2/io/rft.cpp +++ b/libcorpus2/io/rft.cpp @@ -148,7 +148,7 @@ Sentence::Ptr RftReader::actual_next_sentence() t->set_orth(UnicodeString::fromUTF8(orth)); t->set_wa(PwrNlp::Whitespace::Space); if (!s) { - s = boost::make_shared<Sentence>(); + s = make_sentence(); t->set_wa(PwrNlp::Whitespace::Newline); } t->add_lexeme(Lexeme(t->orth(), tag)); diff --git a/libcorpus2/io/xmlreader.cpp b/libcorpus2/io/xmlreader.cpp index 9fd41fdc92ccebf1c9199b0192d88d78169dca00..5b34b5cdf52f9c20b10609d1605fbfbed1d44a0c 100644 --- a/libcorpus2/io/xmlreader.cpp +++ b/libcorpus2/io/xmlreader.cpp @@ -144,7 +144,7 @@ void XmlReader::start_sentence(const AttributeList &attributes) if (type != "s") { throw XcesError("Sub level <chunk> not type=\"s\""); } - sent_ = boost::make_shared<Corpus2::Sentence>(); + sent_ = base_reader_.make_sentence(); state_ = STATE_SENTENCE; }