diff --git a/CMakeLists.txt b/CMakeLists.txt index 98ced3ce6d3123d37c7849c42b64b39e73d68431..f7fe7ba4061e0b124c1e69bd6309a081c0c35281 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ PROJECT(Corpus2Library) set(corpus2_ver_major "1") -set(corpus2_ver_minor "4") +set(corpus2_ver_minor "5") set(corpus2_ver_patch "0") cmake_minimum_required(VERSION 2.8.0) diff --git a/libcorpus2/io/cclreader.cpp b/libcorpus2/io/cclreader.cpp index ba113bbe53895eec9d1335517c44b7ff9a9591f6..36f821aeb416a05793e2cf46dc261489b01ff2e4 100644 --- a/libcorpus2/io/cclreader.cpp +++ b/libcorpus2/io/cclreader.cpp @@ -36,25 +36,26 @@ class CclReaderImpl : public XmlReader public: CclReaderImpl(const TokenReader& base_reader, std::deque< boost::shared_ptr<Chunk> >& obuf, - bool disamb_only, bool disamb_sh, bool autogen_sent_id, - bool autogen_chunk_id); + bool disamb_only, bool disamb_sh); ~CclReaderImpl(); - void set_autogen_sent_id(bool autogen_sent_id) { - autogen_sent_id_ = autogen_sent_id; + void set_autogen_sent_id(bool) { + // won't work + // now this is always true for each reader } bool get_autogen_sent_id() const { - return autogen_sent_id_; + return true; // always true by design } - void set_autogen_chunk_id(bool autogen_chunk_id) { - autogen_chunk_id_ = autogen_chunk_id; + void set_autogen_chunk_id(bool) { + // won't work + // left for backwards compatibility } bool get_autogen_chunk_id() const { - return autogen_chunk_id_; + return true; // always true by design } protected: @@ -88,27 +89,13 @@ protected: token_ann_t token_anns_; std::set<std::string> token_ann_heads_; - -private: - /// marker for autogenerating sentence identifiers (default is false -- - /// sentences identifiers will not be generated) - bool autogen_sent_id_; - unsigned int sent_number_; /// Sentence number, automatically generated - static const std::string SENT_ID_PREFFIX; - // and same for chunk... 
- bool autogen_chunk_id_; - unsigned int chunk_number_; - static const std::string CHUNK_ID_PREFFIX; }; -const std::string CclReaderImpl::SENT_ID_PREFFIX = "s"; -const std::string CclReaderImpl::CHUNK_ID_PREFFIX = "ch"; CclReader::CclReader(const Tagset& tagset, std::istream& is, bool disamb_only, bool disamb_sh, bool autogen_sent_id, bool autogen_chunk_id) : BufferedChunkReader(tagset), - impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh, - autogen_sent_id, autogen_chunk_id)) + impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh)) { this->is_ = &is; } @@ -117,8 +104,7 @@ CclReader::CclReader(const Tagset& tagset, const std::string& filename, bool disamb_only, bool disamb_sh, bool autogen_sent_id, bool autogen_chunk_id) : BufferedChunkReader(tagset), - impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh, - autogen_sent_id, autogen_chunk_id)) + impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh)) { this->is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in)); @@ -149,17 +135,12 @@ void CclReader::ensure_more() CclReaderImpl::CclReaderImpl(const TokenReader& base_reader, std::deque< boost::shared_ptr<Chunk> >& obuf, - bool disamb_only, bool disamb_sh, bool autogen_sent_id, - bool autogen_chunk_id) + bool disamb_only, bool disamb_sh) : XmlReader(base_reader, obuf) { XmlReader::set_disamb_only(disamb_only); XmlReader::set_disamb_sh(disamb_sh); sentence_tag_name_ = "sentence"; - sent_number_ = 0; - chunk_number_ = 0; - autogen_sent_id_ = autogen_sent_id; - autogen_chunk_id_ = autogen_chunk_id; } CclReaderImpl::~CclReaderImpl() @@ -169,11 +150,6 @@ CclReaderImpl::~CclReaderImpl() void CclReaderImpl::start_chunk(const AttributeList& attributes) { std::string id = get_id_from_attributes(attributes); - if (id.empty() && autogen_chunk_id_) { - std::ostringstream ss; - ss << ++chunk_number_; - id = CclReaderImpl::CHUNK_ID_PREFFIX + ss.str(); - } chunk_ = boost::make_shared<Chunk>(); 
chunk_->set_attribute("id", id); @@ -200,11 +176,6 @@ void CclReaderImpl::start_sentence(const AttributeList &attributes) break; } } - if (id.empty() && autogen_sent_id_) { - std::ostringstream ss; - ss << ++sent_number_; - id = CclReaderImpl::SENT_ID_PREFFIX + ss.str(); - } ann_sent_ = boost::make_shared<AnnotatedSentence>(id); sent_ = ann_sent_; @@ -306,9 +277,9 @@ void CclReader::set_option(const std::string& option) } else if (option == "disamb_only") { impl_->set_disamb_only(true); } else if (option == "autogen_sent_id") { - impl_->set_autogen_sent_id(true); + // no action, left for backwards compatibility } else if (option == "autogen_chunk_id") { - impl_->set_autogen_chunk_id(true); + // no action, left for backwards compatibility } else if (option == "no_warn_unexpected_xml") { impl_->set_warn_on_unexpected(false); } @@ -324,9 +295,9 @@ std::string CclReader::get_option(const std::string& option) const } else if (option == "no_warn_inconsistent") { return impl_->get_warn_on_inconsistent() ? "" : option; } else if (option == "autogen_sent_id") { - return impl_->get_autogen_sent_id() ? option : ""; + return option; // left for backward compatibility } else if (option == "autogen_chunk_id") { - return impl_->get_autogen_chunk_id() ? option : ""; + return option; // left for backward compatibility } else if (option == "no_warn_unexpected_xml") { return impl_->get_warn_on_unexpected() ? 
"" : option; } diff --git a/libcorpus2/io/reader.cpp b/libcorpus2/io/reader.cpp index 8e66fc71f92716bdb3b35de0113181b174971146..d75dd236ab666d18faefc40b43c74a1ff6e01b38 100644 --- a/libcorpus2/io/reader.cpp +++ b/libcorpus2/io/reader.cpp @@ -49,12 +49,17 @@ TokenReader::TokenReader(const Tagset& tagset) : tagset_(tagset), tag_parse_mode_(Tagset::ParseDefault), use_annotated_sentences_(false) { + sent_number_ = 0; + chunk_number_ = 0; } TokenReader::~TokenReader() { } +const std::string TokenReader::SENT_ID_PREFFIX = "s"; +const std::string TokenReader::CHUNK_ID_PREFFIX = "ch"; + void TokenReader::set_option(const std::string &option) { if (option == "ign") { @@ -99,6 +104,38 @@ boost::shared_ptr<Sentence> TokenReader::make_sentence() const } } + +bool TokenReader::name_sent(boost::shared_ptr<Sentence> sent) +{ + if (!sent) { + return false; + } + if (sent->id().empty()) { + std::stringstream id_string; + id_string << SENT_ID_PREFFIX << (++sent_number_); + sent->set_id(id_string.str()); + return true; + } + return false; +} + +bool TokenReader::name_chunk(boost::shared_ptr<Chunk> chunk) +{ + if (!chunk) { + return false; + } + if (chunk->has_attribute("id") && !chunk->get_attribute("id").empty()) { + // already a non-empty id + return false; + } + else { + std::stringstream id_string; + id_string << CHUNK_ID_PREFFIX << (++chunk_number_); + chunk->set_attribute("id", id_string.str()); + return true; + } +} + namespace { std::string guess_plugin_name(const std::string& reader_class_id, int idx) { @@ -260,6 +297,7 @@ Sentence::Ptr BufferedChunkReader::get_next_sentence() } else { Sentence::Ptr s = sentence_buf_.front(); sentence_buf_.pop_front(); + name_sent(s); return s; } } @@ -272,6 +310,12 @@ boost::shared_ptr<Chunk> BufferedChunkReader::get_next_chunk() } else { boost::shared_ptr<Chunk> t = chunk_buf_.front(); chunk_buf_.pop_front(); + if (t) { + name_chunk(t); + BOOST_FOREACH(Sentence::Ptr s, t->sentences()) { + name_sent(s); + } + } return t; } } @@ -309,9 
+353,12 @@ Sentence::Ptr BufferedSentenceReader::get_next_sentence() if (sentence_buf_ != NULL) { Sentence::Ptr s = sentence_buf_; sentence_buf_.reset(); + name_sent(s); return s; } else { - return actual_next_sentence(); + Sentence::Ptr s = actual_next_sentence(); + name_sent(s); + return s; } } @@ -332,6 +379,7 @@ boost::shared_ptr<Chunk> BufferedSentenceReader::get_next_chunk() if (s) { sentence_buf_ = s; } + name_chunk(c); return c; } } diff --git a/libcorpus2/io/reader.h b/libcorpus2/io/reader.h index dd20bf087b668039f9e93cc1f13032d8dd5124e4..1241f8f480265ef4d28b7830c0f0d8d78c0e6ff3 100644 --- a/libcorpus2/io/reader.h +++ b/libcorpus2/io/reader.h @@ -162,6 +162,24 @@ public: boost::shared_ptr<Sentence> make_sentence() const; +protected: + unsigned int sent_number_; /// Sentence number, automatically generated + unsigned int chunk_number_; /// Chunk number, automatically generated + static const std::string SENT_ID_PREFFIX; /// Sent id prefix + static const std::string CHUNK_ID_PREFFIX; /// Chunk id prefix + + /** + * If sentence has no name/id, will auto-generate one. + * Returns true if an id was generated, false if the sentence is null or already had one. + */ + bool name_sent(boost::shared_ptr<Sentence> sent); + + /** + * If chunk has no name/id, will auto-generate one. + * Returns true if an id was generated, false if the chunk is null or already had one. + */ + bool name_chunk(boost::shared_ptr<Chunk> chunk); + private: /// Tagset used by the Reader const Tagset& tagset_; @@ -281,7 +299,7 @@ bool TokenReader::register_path_reader(const std::string& class_id, * Convenience class for readers that keep a buffer of chunks. Sentence * and token accessors are based upon the chunk buffer. * - * A dervied class only neds to override ensure_more with a function that + * A derived class only needs to override ensure_more with a function that * fills the chunk buffer. */ class BufferedChunkReader : public TokenReader @@ -322,6 +340,11 @@ protected: * * Note that the chunk accessor might well read the entire input and return * one huge chunk with all the sentences. 
+ * + * By default the reader will split sequences of sentences into chunks + * (i.e. paragraphs) before each token that has the preceding whitespace + * amount set to PwrNlp::Whitespace::ManyNewlines. This behaviour is + * controlled by the chunkify_ flag. */ class BufferedSentenceReader : public TokenReader {