// Patch summary (libcorpus2 CclReader / CclReaderImpl):
//  - Adds a bool `autogen_sent_id` argument to both CclReader constructors (defaulting to false in cclreader.h) and forwards it to CclReaderImpl.
//  - The flag can also be switched on with set_option("autogen_sent_id"); set_option only ever sets it to true. get_option("autogen_sent_id") returns "autogen_sent_id" when the flag is set and "" otherwise.
//  - In CclReaderImpl::start_sentence, when the flag is set and `id` is empty, `id` becomes SENT_ID_PREFFIX ("sentence") followed by the pre-incremented counter sent_number_. The counter is set to 0 in the constructor, so the first generated id is "sentence1".
//  - Chunk identifiers are not generated; start_chunk only gains a TODO.
// NOTE(review): SENT_ID_PREFFIX looks like a misspelling of PREFIX; it is a private member of CclReaderImpl -- consider renaming.
// NOTE(review): the visible code does not check generated ids against ids already present in the input -- presumably a clash with an explicit "sentenceN" id is possible; confirm.
diff --git a/libcorpus2/io/cclreader.cpp b/libcorpus2/io/cclreader.cpp index f796eb21018b94e432a301d1238ff73c2a008a54..37d6cd2e999c306e6e9f1a9613804db5b02cb87c 100644 --- a/libcorpus2/io/cclreader.cpp +++ b/libcorpus2/io/cclreader.cpp @@ -24,6 +24,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <libcorpus2/ann/annotatedsentence.h> #include <cstdlib> #include <fstream> +#include <sstream> namespace Corpus2 { @@ -35,10 +36,18 @@ class CclReaderImpl : public XmlReader public: CclReaderImpl(const TokenReader& base_reader, std::deque< boost::shared_ptr<Chunk> >& obuf, - bool disamb_only, bool disamb_sh); + bool disamb_only, bool disamb_sh, bool autogen_sent_id); ~CclReaderImpl(); + void set_autogen_sent_id(bool autogen_sent_id) { + autogen_sent_id_ = autogen_sent_id; + } + + bool get_autogen_sent_id() const { + return autogen_sent_id_; + } + protected: bool process_start_element(const Glib::ustring & name, const AttributeList& attributes); @@ -70,19 +79,28 @@ protected: token_ann_t token_anns_; std::set<std::string> token_ann_heads_; + +private: + /// flag enabling autogeneration of sentence identifiers (default is false -- + /// sentence identifiers will not be generated) + bool autogen_sent_id_; + unsigned int sent_number_; ///< Counter for automatically generated sentence identifiers; pre-incremented before use, so generated numbers start at 1 + static const std::string SENT_ID_PREFFIX; }; +const std::string CclReaderImpl::SENT_ID_PREFFIX = "sentence"; CclReader::CclReader(const Tagset& tagset, std::istream& is, - bool disamb_only, bool disamb_sh) + bool disamb_only, bool disamb_sh, bool autogen_sent_id) : BufferedChunkReader(tagset), - impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh)) + impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh, autogen_sent_id)) { this->is_ = &is; } -CclReader::CclReader(const Tagset& tagset, const std::string& filename, bool disamb_only, bool disamb_sh) +CclReader::CclReader(const Tagset& tagset, const std::string& filename, + bool disamb_only, bool disamb_sh, bool autogen_sent_id) : 
BufferedChunkReader(tagset), - impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh)) + impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh, autogen_sent_id)) { this->is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in)); @@ -113,12 +131,14 @@ void CclReader::ensure_more() CclReaderImpl::CclReaderImpl(const TokenReader& base_reader, std::deque< boost::shared_ptr<Chunk> >& obuf, - bool disamb_only, bool disamb_sh) + bool disamb_only, bool disamb_sh, bool autogen_sent_id) : XmlReader(base_reader, obuf) { XmlReader::set_disamb_only(disamb_only); XmlReader::set_disamb_sh(disamb_sh); sentence_tag_name_ = "sentence"; + sent_number_ = 0; + autogen_sent_id_ = autogen_sent_id; } CclReaderImpl::~CclReaderImpl() @@ -127,6 +147,7 @@ CclReaderImpl::~CclReaderImpl() void CclReaderImpl::start_chunk(const AttributeList& attributes) { + // TODO: Autogenerating chunk identifiers chunk_ = boost::make_shared<Chunk>(); std::string type = get_type_from_attributes(attributes); if (type == "s") { @@ -150,6 +171,11 @@ void CclReaderImpl::start_sentence(const AttributeList &attributes) break; } } + if (id.empty() && autogen_sent_id_) { + std::ostringstream ss; + ss << ++sent_number_; + id = CclReaderImpl::SENT_ID_PREFFIX + ss.str(); + } ann_sent_ = boost::make_shared<AnnotatedSentence>(id); sent_ = ann_sent_; @@ -250,7 +276,10 @@ void CclReader::set_option(const std::string& option) impl_->set_warn_on_inconsistent(false); } else if (option == "disamb_only") { impl_->set_disamb_only(true); - } else { + } else if (option == "autogen_sent_id") { + impl_->set_autogen_sent_id(true); + } + else { BufferedChunkReader::set_option(option); } } @@ -261,6 +290,8 @@ std::string CclReader::get_option(const std::string& option) const return impl_->get_disamb_only() ? option : ""; } else if (option == "no_warn_inconsistent") { return impl_->get_warn_on_inconsistent() ? 
option : ""; + } else if (option == "autogen_sent_id") { + return impl_->get_autogen_sent_id() ? "autogen_sent_id" : ""; } return BufferedChunkReader::get_option(option); } diff --git a/libcorpus2/io/cclreader.h b/libcorpus2/io/cclreader.h index 067957bff25d294e87f42100e382b9c8214c2805..daac508b14b64d50aef9e8de5da07d85ead09ff3 100644 --- a/libcorpus2/io/cclreader.h +++ b/libcorpus2/io/cclreader.h @@ -31,10 +31,12 @@ class CclReader : public BufferedChunkReader { public: CclReader(const Tagset& tagset, std::istream& is, - bool disamb_only = false, bool disamb_sh = false); + bool disamb_only = false, bool disamb_sh = false, + bool autogen_sent_id = false); CclReader(const Tagset& tagset, const std::string& filename, - bool disamb_only = false, bool disamb_sh = false); + bool disamb_only = false, bool disamb_sh = false, + bool autogen_sent_id = false); ~CclReader();