From 05a68caf4c4f9883863eee441adbb0804c847b68 Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Thu, 21 Apr 2011 18:42:02 +0200 Subject: [PATCH] add ,ann option in readers that makes them use AnnotatedSentence --- libcorpus2/io/fastxces.cpp | 2 +- libcorpus2/io/reader.cpp | 17 ++++++++++++++++- libcorpus2/io/reader.h | 5 +++++ libcorpus2/io/rft.cpp | 2 +- libcorpus2/io/xmlreader.cpp | 2 +- 5 files changed, 24 insertions(+), 4 deletions(-) diff --git a/libcorpus2/io/fastxces.cpp b/libcorpus2/io/fastxces.cpp index 25d4a8c..a4813bb 100644 --- a/libcorpus2/io/fastxces.cpp +++ b/libcorpus2/io/fastxces.cpp @@ -104,7 +104,7 @@ private: if (!sent_->empty()) { chunk_->append(sent_); } - sent_ = boost::make_shared<Sentence>(); + sent_ = base_reader_.make_sentence(); tok_->set_wa(PwrNlp::Whitespace::Newline); } else { if (!chunk_->empty()) { diff --git a/libcorpus2/io/reader.cpp b/libcorpus2/io/reader.cpp index 0c75d9a..51777bf 100644 --- a/libcorpus2/io/reader.cpp +++ b/libcorpus2/io/reader.cpp @@ -17,12 +17,14 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <libcorpus2/io/reader.h> #include <boost/make_shared.hpp> #include <boost/algorithm/string.hpp> +#include <libcorpus2/ann/annotatedsentence.h> #include <sstream> namespace Corpus2 { TokenReader::TokenReader(const Tagset& tagset) - : tagset_(tagset), tag_parse_mode_(Tagset::ParseDefault) + : tagset_(tagset), tag_parse_mode_(Tagset::ParseDefault), + use_annotated_sentences_(false) { } @@ -41,6 +43,8 @@ void TokenReader::set_option(const std::string &option) } else if (option == "strict") { tag_parse_mode_ = static_cast<Tagset::ParseMode>( Tagset::ParseDefault | (tag_parse_mode_ & Tagset::ParseFailWithIgn)); + } else if (option == "ann") { + use_annotated_sentences_ = true; } else { throw Corpus2Error("Unknown option passed to reader: " + option); } @@ -56,11 +60,22 @@ std::string TokenReader::get_option(const std::string &option) const } else if (option == "strict") { return (tag_parse_mode_ & ~Tagset::ParseFailWithIgn) == Tagset::ParseDefault ? option : ""; + } else if (option == "ann") { + return use_annotated_sentences_ ? option : ""; } else { return "unknown"; } } +boost::shared_ptr<Sentence> TokenReader::make_sentence() const +{ + if (use_annotated_sentences_) { + return boost::make_shared<AnnotatedSentence>(); + } else { + return boost::make_shared<Sentence>(); + } +} + boost::shared_ptr<TokenReader> TokenReader::create_path_reader( const std::string& class_id_params, const Tagset& tagset, diff --git a/libcorpus2/io/reader.h b/libcorpus2/io/reader.h index b0b3a8b..4f52aac 100644 --- a/libcorpus2/io/reader.h +++ b/libcorpus2/io/reader.h @@ -152,12 +152,17 @@ public: tag_parse_mode_ = mode; } + boost::shared_ptr<Sentence> make_sentence() const; + private: /// Tagset used by the Reader const Tagset& tagset_; /// Tag parse mode Tagset::ParseMode tag_parse_mode_; + + /// Flag to force creation of sentences as AnnotatedSentences + bool use_annotated_sentences_; }; namespace detail { diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp index b35ed86..73003a6 100644 --- a/libcorpus2/io/rft.cpp +++ b/libcorpus2/io/rft.cpp @@ -148,7 +148,7 @@ Sentence::Ptr RftReader::actual_next_sentence() t->set_orth(UnicodeString::fromUTF8(orth)); t->set_wa(PwrNlp::Whitespace::Space); if (!s) { - s = boost::make_shared<Sentence>(); + s = make_sentence(); t->set_wa(PwrNlp::Whitespace::Newline); } t->add_lexeme(Lexeme(t->orth(), tag)); diff --git a/libcorpus2/io/xmlreader.cpp b/libcorpus2/io/xmlreader.cpp index 9fd41fd..5b34b5c 100644 --- a/libcorpus2/io/xmlreader.cpp +++ b/libcorpus2/io/xmlreader.cpp @@ -144,7 +144,7 @@ void XmlReader::start_sentence(const AttributeList &attributes) if (type != "s") { throw XcesError("Sub level <chunk> not type=\"s\""); } - sent_ = boost::make_shared<Corpus2::Sentence>(); + sent_ = base_reader_.make_sentence(); state_ = STATE_SENTENCE; } -- GitLab