Add optional auto-generation of chunk identifiers to the CCL reader.

Summary of the change:
  * CclReader / CclReaderImpl take a new `autogen_chunk_id` flag (default
    false). It can also be switched on with set_option("autogen_chunk_id")
    and queried with get_option("autogen_chunk_id").
  * CclReaderImpl::start_chunk() looks up the `id` attribute of the chunk
    element. When no non-empty id is found and the flag is set, the id
    becomes "chunk" followed by a running counter (first value: 1).
  * XmlReader gains get_id_from_attributes(), which returns the value of the
    `id` attribute or an empty string when there is none.
  * DocumentReader forwards the new option and gains get_option(), which is
    also declared in the SWIG interface.

Review notes:
  * The constructor declarations in cclreader.h now name the new parameter
    `autogen_chunk_id` (was `autogen_chunk_id_`), matching the definitions
    and the sibling `autogen_sent_id`; the trailing underscore is the
    convention used for data members in these files.
  * Two comments in added lines were reworded (chunk members in
    CclReaderImpl, doc comment of DocumentReader::get_option). No hunk line
    count changes.
  * NOTE(review): start_chunk() stores the `id` attribute even when the id
    is empty (no id in the input and auto-generation switched off) - please
    confirm that an empty `id` attribute is intended in that case.
  * NOTE(review): line breaks, tab indentation and the position of blank
    context lines were reconstructed from a whitespace-collapsed copy of
    this patch so that every hunk matches its line counts. Verify against
    the tree before applying (`git apply --ignore-whitespace` may help).
    The `index` blob ids below are those of the original patch.

diff --git a/libcorpus2/io/cclreader.cpp b/libcorpus2/io/cclreader.cpp
index 37d6cd2e999c306e6e9f1a9613804db5b02cb87c..117526b1194e97b95e836738bbfecef4f134b2a3 100644
--- a/libcorpus2/io/cclreader.cpp
+++ b/libcorpus2/io/cclreader.cpp
@@ -36,7 +36,8 @@ class CclReaderImpl : public XmlReader
 public:
 	CclReaderImpl(const TokenReader& base_reader,
 		std::deque< boost::shared_ptr<Chunk> >& obuf,
-		bool disamb_only, bool disamb_sh, bool autogen_sent_id);
+		bool disamb_only, bool disamb_sh, bool autogen_sent_id,
+		bool autogen_chunk_id);
 
 	~CclReaderImpl();
 
@@ -48,6 +49,14 @@ public:
 		return autogen_sent_id_;
 	}
 
+	void set_autogen_chunk_id(bool autogen_chunk_id) {
+		autogen_chunk_id_ = autogen_chunk_id;
+	}
+
+	bool get_autogen_chunk_id() const {
+		return autogen_chunk_id_;
+	}
+
 protected:
 	bool process_start_element(const Glib::ustring & name,
 		const AttributeList& attributes);
@@ -86,21 +95,30 @@ private:
 	bool autogen_sent_id_;
 	unsigned int sent_number_; /// Sentence number, automatically generated
 	static const std::string SENT_ID_PREFFIX;
+	// Chunk counterparts of the sentence id auto-generation members above
+	bool autogen_chunk_id_;
+	unsigned int chunk_number_; /// Chunk number, automatically generated
+	static const std::string CHUNK_ID_PREFFIX;
 };
 const std::string CclReaderImpl::SENT_ID_PREFFIX = "sentence";
+const std::string CclReaderImpl::CHUNK_ID_PREFFIX = "chunk";
 
 CclReader::CclReader(const Tagset& tagset, std::istream& is,
-		bool disamb_only, bool disamb_sh, bool autogen_sent_id)
+		bool disamb_only, bool disamb_sh, bool autogen_sent_id,
+		bool autogen_chunk_id)
 	: BufferedChunkReader(tagset),
-	impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh, autogen_sent_id))
+	impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh,
+		autogen_sent_id, autogen_chunk_id))
 {
 	this->is_ = &is;
 }
 
 CclReader::CclReader(const Tagset& tagset, const std::string& filename,
-		bool disamb_only, bool disamb_sh, bool autogen_sent_id)
+		bool disamb_only, bool disamb_sh, bool autogen_sent_id,
+		bool autogen_chunk_id)
 	: BufferedChunkReader(tagset),
-	impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh, autogen_sent_id))
+	impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh,
+		autogen_sent_id, autogen_chunk_id))
 {
 	this->is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));
 
@@ -131,14 +149,17 @@ void CclReader::ensure_more()
 
 CclReaderImpl::CclReaderImpl(const TokenReader& base_reader,
 		std::deque< boost::shared_ptr<Chunk> >& obuf,
-		bool disamb_only, bool disamb_sh, bool autogen_sent_id)
+		bool disamb_only, bool disamb_sh, bool autogen_sent_id,
+		bool autogen_chunk_id)
 	: XmlReader(base_reader, obuf)
 {
 	XmlReader::set_disamb_only(disamb_only);
 	XmlReader::set_disamb_sh(disamb_sh);
 	sentence_tag_name_ = "sentence";
 	sent_number_ = 0;
+	chunk_number_ = 0;
 	autogen_sent_id_ = autogen_sent_id;
+	autogen_chunk_id_ = autogen_chunk_id;
 }
 
 CclReaderImpl::~CclReaderImpl()
@@ -147,8 +168,16 @@ CclReaderImpl::~CclReaderImpl()
 
 void CclReaderImpl::start_chunk(const AttributeList& attributes)
 {
-	// TODO: Autogenerating chunk identifiers
+	std::string id = get_id_from_attributes(attributes);
+	if (id.empty() && autogen_chunk_id_) {
+		std::ostringstream ss;
+		ss << ++chunk_number_;
+		id = CclReaderImpl::CHUNK_ID_PREFFIX + ss.str();
+	}
 
+	chunk_ = boost::make_shared<Chunk>();
+	chunk_->set_attribute("id", id);
+
 	std::string type = get_type_from_attributes(attributes);
 	if (type == "s") {
 		throw XcesError("Trying to parse XCES as CCL (<chunk type=\"s\">)");
@@ -278,6 +307,8 @@ void CclReader::set_option(const std::string& option)
 		impl_->set_disamb_only(true);
 	} else if (option == "autogen_sent_id") {
 		impl_->set_autogen_sent_id(true);
+	} else if (option == "autogen_chunk_id") {
+		impl_->set_autogen_chunk_id(true);
 	}
 	else {
 		BufferedChunkReader::set_option(option);
@@ -292,6 +323,8 @@ std::string CclReader::get_option(const std::string& option) const
 		return impl_->get_warn_on_inconsistent() ? option : "";
 	} else if (option == "autogen_sent_id") {
 		return impl_->get_autogen_sent_id() ? "autogen_sent_id" : "";
+	} else if (option == "autogen_chunk_id") {
+		return impl_->get_autogen_chunk_id() ? "autogen_chunk_id" : "";
 	}
 	return BufferedChunkReader::get_option(option);
 }
diff --git a/libcorpus2/io/cclreader.h b/libcorpus2/io/cclreader.h
index daac508b14b64d50aef9e8de5da07d85ead09ff3..59c8e8b1b85ad4cd42bdb071f2da4570a2551a0e 100644
--- a/libcorpus2/io/cclreader.h
+++ b/libcorpus2/io/cclreader.h
@@ -32,11 +32,11 @@ class CclReader : public BufferedChunkReader
 public:
 	CclReader(const Tagset& tagset, std::istream& is,
 			bool disamb_only = false, bool disamb_sh = false,
-			bool autogen_sent_id = false);
+			bool autogen_sent_id = false, bool autogen_chunk_id = false);
 
 	CclReader(const Tagset& tagset, const std::string& filename,
 			bool disamb_only = false, bool disamb_sh = false,
-			bool autogen_sent_id = false);
+			bool autogen_sent_id = false, bool autogen_chunk_id = false);
 
 	~CclReader();
 
diff --git a/libcorpus2/io/docreader.cpp b/libcorpus2/io/docreader.cpp
index cccb62583ecb230d48d5c0cf1cc2e30ba747b2d5..ca038e7170574a0cd8f3cfe64a1b31dcdb3eb5aa 100644
--- a/libcorpus2/io/docreader.cpp
+++ b/libcorpus2/io/docreader.cpp
@@ -61,7 +61,19 @@ namespace Corpus2 {
 	{
 		if (option == "autogen_sent_id") {
 			ccl_reader_->set_option("autogen_sent_id");
+		} else if (option == "autogen_chunk_id") {
+			ccl_reader_->set_option("autogen_chunk_id");
 		}
 	}
 
+	std::string DocumentReader::get_option(const std::string& option) const {
+		if (option == "autogen_sent_id") {
+			return ccl_reader_->get_option("autogen_sent_id");
+		}
+		else if (option == "autogen_chunk_id") {
+			return ccl_reader_->get_option("autogen_chunk_id");
+		}
+		return "";
+	}
+
 } /* end ns Corpus2 */
diff --git a/libcorpus2/io/docreader.h b/libcorpus2/io/docreader.h
index fe683c9cfd44b981b0d0a8b0c5a47a92114af9c6..65422e2534c0e2725b9433579f9325b61ece4a91 100644
--- a/libcorpus2/io/docreader.h
+++ b/libcorpus2/io/docreader.h
@@ -59,6 +59,11 @@ public:
 	 */
 	void set_option(const std::string& option);
 
+	/**
+	 * @return what the underlying CCL reader reports for "autogen_sent_id" / "autogen_chunk_id"; an empty string for any other option
+	 */
+	std::string get_option(const std::string& option) const;
+
 private:
 	/**
 	 * Makes CclReader and RelationReader for given paths to files.
diff --git a/libcorpus2/io/xmlreader.cpp b/libcorpus2/io/xmlreader.cpp
index 5b34b5cdf52f9c20b10609d1605fbfbed1d44a0c..773eb857cb85273f6298cc0d683d4ef0bd6beb72 100644
--- a/libcorpus2/io/xmlreader.cpp
+++ b/libcorpus2/io/xmlreader.cpp
@@ -52,6 +52,16 @@ std::string XmlReader::get_type_from_attributes(const AttributeList& attributes)
 	return type;
 }
 
+std::string XmlReader::get_id_from_attributes(const AttributeList& attributes) const
+{
+	foreach (const Attribute& a, attributes) {
+		if (a.name == "id") {
+			return a.value;
+		}
+	}
+	return "";
+}
+
 
 void XmlReader::on_start_element(const Glib::ustring &name,
 		const AttributeList& attributes)
diff --git a/libcorpus2/io/xmlreader.h b/libcorpus2/io/xmlreader.h
index d9234d4c38fd8e437f15bd8bebf7ab17dea8ad16..4517e511c6881969d8d7d21305c67e158190e317 100644
--- a/libcorpus2/io/xmlreader.h
+++ b/libcorpus2/io/xmlreader.h
@@ -49,6 +49,7 @@ public:
 
 protected:
 	std::string get_type_from_attributes(const AttributeList& attributes) const;
+	std::string get_id_from_attributes(const AttributeList& attributes) const;
 
 	void on_start_element(const Glib::ustring & name,
 			const AttributeList& attributes);
diff --git a/swig/documentreader.i b/swig/documentreader.i
index 69229e8265c99d60496f8997b66fa2a16d9b62ba..6a538f840ece454a738a28c033d5aba46e4388ee 100644
--- a/swig/documentreader.i
+++ b/swig/documentreader.i
@@ -34,6 +34,7 @@ namespace Corpus2 {
 
     boost::shared_ptr<Document> read();
     void set_option(const std::string& option);
+    std::string get_option(const std::string& option) const;
 
     /* --------------------------------------------------------------------- */
     ~DocumentReader();