diff --git a/libcorpus2/ann/annotatedsentence.cpp b/libcorpus2/ann/annotatedsentence.cpp index 6a7bd60e30512e2b33a93b7d3b04060fab79768c..68b561f92d1af65a3412bb6dbf420cf5c1bca184 100644 --- a/libcorpus2/ann/annotatedsentence.cpp +++ b/libcorpus2/ann/annotatedsentence.cpp @@ -24,6 +24,27 @@ Sentence::Ptr AnnotatedSentence::clone_shared() const return copy; } +boost::shared_ptr<AnnotatedSentence> AnnotatedSentence::wrap_sentence( + const boost::shared_ptr<Sentence>& s) +{ + boost::shared_ptr<AnnotatedSentence> a = boost::make_shared<AnnotatedSentence>(); + foreach (Token* t, s->tokens()) { /* NOTE(review): s is dereferenced without a null check */ + a->append(t); /* the very same Token pointer is handed over, no copy is made */ + } + s->release_tokens(); /* presumably makes s forget its token pointers without deleting them, leaving a as the only owner -- confirm against Sentence::release_tokens */ + return a; +} + +boost::shared_ptr<AnnotatedSentence> AnnotatedSentence::wrap_sentence_clone( + const boost::shared_ptr<Sentence>& s) +{ + boost::shared_ptr<AnnotatedSentence> a = boost::make_shared<AnnotatedSentence>(); + foreach (Token* t, s->tokens()) { /* NOTE(review): s is dereferenced without a null check */ + a->append(t->clone()); /* a receives the result of clone(); s keeps t */ + } + return a; +} + void AnnotatedSentence::create_channel(const std::string& name) { channels_.insert(std::make_pair(name, AnnotationChannel(tokens_.size()))); diff --git a/libcorpus2/ann/annotatedsentence.h b/libcorpus2/ann/annotatedsentence.h index 86f0d257f0b2464417d63b27de11ccce29381614..db2d4eade8c42a946a8db84071a096f057eba027 100644 --- a/libcorpus2/ann/annotatedsentence.h +++ b/libcorpus2/ann/annotatedsentence.h @@ -9,6 +9,9 @@ namespace Corpus2 { class AnnotationView; +/** + * Exception class for use when a requested annotation channel does not exist + */ class MissingAnnotationChannel : public Corpus2Error { public: @@ -22,22 +25,56 @@ public: } }; - +/** + * A class describing Sentences with additional information in the form of + * annotation channels. + * + * Note: channels are not automatically resized. The sentence should not have + * tokens added or removed after annotation channels are created. 
+ */ class AnnotatedSentence : public Corpus2::Sentence { public: + /** + * Create an empty AnnotatedSentence with no tokens and no channels + */ AnnotatedSentence(); ~AnnotatedSentence(); - Ptr clone_shared() const; - + Sentence::Ptr clone_shared() const; + + /** + * Create an AnnotatedSentence from a Sentence, grabbing all the tokens + * directly (afterwards the source Sentence has no tokens). + */ + static boost::shared_ptr<AnnotatedSentence> wrap_sentence( + const boost::shared_ptr<Sentence>& s); + + /** + * Create an AnnotatedSentence from a Sentence, cloning all the tokens. + * The source Sentence is not modified. + */ + static boost::shared_ptr<AnnotatedSentence> wrap_sentence_clone( + const boost::shared_ptr<Sentence>& s); + + /** + * Create an annotation channel named name in this annotated sentence. + * If the channel already exists, nothing happens. + */ void create_channel(const std::string& name); + /** + * @return true iff this sentence has an annotation channel named name + */ bool has_channel(const std::string& name) const { return channels_.find(name) != channels_.end(); } + /** + * Return the annotation channel by name or throw MissingAnnotationChannel + * if there is no such channel + */ AnnotationChannel& get_channel(const std::string& name) { chan_map_t::iterator i = channels_.find(name); if (i == channels_.end()) { @@ -46,6 +83,9 @@ public: return i->second; } + /** + * Const version of get_channel + */ const AnnotationChannel& get_channel(const std::string& name) const { chan_map_t::const_iterator i = channels_.find(name); if (i == channels_.end()) { @@ -55,11 +95,20 @@ public: } private: + /// typedef for the channels typedef std::map<std::string, AnnotationChannel> chan_map_t; + /// the actual channels chan_map_t channels_; }; +/** + * Create an AnnotationView pseudo-sentence from an AnnotatedSentence that + * behaves like a sentence viewed through an annotation. 
+ * + * This is a free function, not a member of AnnotatedSentence, because it is + * mandatory that the AnnotatedSentence be passed via a shared_ptr. + */ boost::shared_ptr<AnnotationView> create_view( const boost::shared_ptr<AnnotatedSentence>& s, const std::string& ann_name); diff --git a/libcorpus2/ann/channel.cpp b/libcorpus2/ann/channel.cpp index 601b65cbce90e9af28ffe6b25bd299dee7407810..5ff16f92bc8d83b2b5189b67ab0871a5e153a48c 100644 --- a/libcorpus2/ann/channel.cpp +++ b/libcorpus2/ann/channel.cpp @@ -48,6 +48,25 @@ void AnnotationChannel::make_segments_from_iob() } } +int AnnotationChannel::renumber_segments() +{ + std::map<int, int> re; /* old segment index -> new consecutive index */ + int next = 0; /* last new index handed out so far */ + for (size_t i = 0; i < segments_.size(); ++i) { + if (segments_[i] > 0) { /* entries <= 0 are left untouched */ + std::map<int, int>::const_iterator ci = re.find(segments_[i]); + if (ci != re.end()) { + segments_[i] = ci->second; + } else { + ++next; /* first occurrence: new indices are assigned in order of appearance, starting at 1 */ + re.insert(std::make_pair(segments_[i], next)); + segments_[i] = next; + } + } + } + return next; /* number of distinct positive indices seen, 0 if there were none */ +} + IOB::Enum AnnotationChannel::get_iob_at(int idx) { if (idx >= 0 && idx < static_cast<int>(iobs_.size())) { @@ -78,6 +97,7 @@ std::vector<Annotation> AnnotationChannel::make_annotation_vector() const rv[s].head_index = i; } } + std::sort(rv.begin(), rv.end(), AnnotationHeadCompare()); /* NOTE(review): the erase() call that follows passes a single iterator, so it removes one element rather than the whole tail returned by remove_if -- confirm whether rv.end() is missing as its second argument */ rv.erase(std::remove_if(rv.begin(), rv.end(), boost::bind(&Annotation::empty, _1))); return rv; diff --git a/libcorpus2/ann/channel.h b/libcorpus2/ann/channel.h index eb8618f74db23074d900fc736861146067f72c85..0637638a46b95ecdff866f5a3581b9df8d7d6fdf 100644 --- a/libcorpus2/ann/channel.h +++ b/libcorpus2/ann/channel.h @@ -4,47 +4,112 @@ #include <libcorpus2/ann/iob.h> #include <vector> #include <boost/utility.hpp> +#include <algorithm> +#include <map> namespace Corpus2 { + +/** + * A general indexed annotation spanning a possibly disjoint group of + * tokens, with a distinguished 'head' token. 
+ */ struct Annotation { Annotation() : indices(), head_index(-1) {} std::vector<int> indices; int head_index; - bool empty() const { return indices.empty(); } + bool empty() const { + return indices.empty(); + } + bool sane() const { /* true for an empty annotation, or when head_index is one of indices */ + return empty() || + std::find(indices.begin(), indices.end(), head_index) != indices.end(); + } +}; + +struct AnnotationHeadCompare /* orders annotations by ascending head_index */ +{ + bool operator()(const Annotation& a1, const Annotation& a2) { /* NOTE(review): not declared const, so it cannot be invoked through a const comparator object */ + return a1.head_index < a2.head_index; + } }; +/** + * An annotation channel in IOB format and segment index format, with manual + * sync between the two. + */ class AnnotationChannel { public: + /** + * Empty ctor + */ AnnotationChannel(); + /** + * Ctor for an annotation of a specific size + */ explicit AnnotationChannel(int size); + /** + * Discard IOB annotation information, regenerate it from the segment info. + */ void make_iob_from_segments(); + /** + * Discard indexed annotation info, regenerate from IOB. + */ void make_segments_from_iob(); + /** + * Number segments from 1 so there are no omitted indices, return the + * highest used index + */ + int renumber_segments(); + + /** + * Create a vector of Annotation objects, each corresponding to + * an annotation, with the annotations possibly being disjoint. + */ std::vector<Annotation> make_annotation_vector() const; + /** + * Create a vector of Annotation objects, each corresponding to + * an annotation, forcing the annotations to be continuous (disjoint + * annotations are split) + */ std::vector<Annotation> make_continuous_annotation_vector() const; + /** + * The segment-index array accessor + */ const std::vector<int>& segments() const { return segments_; } + /** + * The IOB data vector + */ const std::vector<IOB::Enum>& iobs() const { return iobs_; } + /** + * IOB getter, returns IOB::O if idx is out of range + */ IOB::Enum get_iob_at(int idx); + /** + * IOB setter, out of range indices are not processed. 
+ */ void set_iob_at(int idx, IOB::Enum iob); private: + /// segment indices std::vector<int> segments_; + /// IOB data std::vector<IOB::Enum> iobs_; }; diff --git a/libcorpus2/ann/iob.h b/libcorpus2/ann/iob.h index 68d94f2805e3f6cf65b7d3c492dd419d85099c42..7be4086926855240d7dacdd77f3ff8d2d2bc0f4a 100644 --- a/libcorpus2/ann/iob.h +++ b/libcorpus2/ann/iob.h @@ -8,6 +8,10 @@ namespace Corpus2 { namespace IOB { /** * A simple enumeration for IOB annotation chunk tagging + * O indicates not part of a chunk + * B indicates beginning of a new chunk + * I indicates continuation of a chunk started by a preceding BI* sequence + * The only invalid sequence is O followed by I */ enum Enum { O = 0, @@ -16,6 +20,7 @@ namespace IOB { PostLast }; /// Convert an enum value to an uppercase I, O or B string + /// Returns ? on invalid value const char* to_string(Enum iob); /// Create an enum value from an I, O or B string Enum from_string(const std::string& s); diff --git a/libcorpus2/ann/view.cpp b/libcorpus2/ann/view.cpp index 68047870c59ad1796e8ecd7aa58e673f6d491e61..ac06b1e2a82029130700a2b3fc5371f61426ca9e 100644 --- a/libcorpus2/ann/view.cpp +++ b/libcorpus2/ann/view.cpp @@ -25,6 +25,11 @@ Sentence::Ptr AnnotationView::clone_shared() const return copy; } +void AnnotationView::release_original() +{ + original_.reset(); +} + void AnnotationView::commit() { commit_to(original_, ann_name_); @@ -34,6 +39,9 @@ void AnnotationView::commit_to( const boost::shared_ptr<AnnotatedSentence> &original, const std::string &ann_name) { + if (!original) { + throw AnnotationViewOutOfSync("null-pointer-to-original"); + } if (!original->has_channel(ann_name)) { throw MissingAnnotationChannel(ann_name); } diff --git a/libcorpus2/ann/view.h b/libcorpus2/ann/view.h index d6790962daa694dd9bcbbcbde9550c01a964cd0a..45e75fecc9739314b74008d1fa8de1ea98217dc9 100644 --- a/libcorpus2/ann/view.h +++ b/libcorpus2/ann/view.h @@ -7,9 +7,38 @@ namespace Corpus2 { class AnnotatedSentence; +/** + * Exception class 
for signalling OOS between a View and a Sentence during + * e.g. commits + */ +class AnnotationViewOutOfSync : public Corpus2Error +{ +public: + AnnotationViewOutOfSync(const std::string& reason) + : Corpus2Error("AnnotationView out of sync with base sentence: " + reason) + { + } + + ~AnnotationViewOutOfSync() throw() + { + } +}; + +/** + * A class representing an AnnotatedSentence viewed through the segmentation + * some Annotation specifies. + * + * It has its own Tokens but can update the original Sentence via commit(). + */ class AnnotationView : public Sentence { public: + /** + * Create an AnnotationView from a sentence and an annotation name. + * The AnnotationView keeps a handle to the sentence. + * + * This should not be used directly, use create_view. + */ AnnotationView(const boost::shared_ptr<AnnotatedSentence>& original, const std::string& ann_name_); @@ -17,10 +46,27 @@ public: Ptr clone_shared() const; + /** + * Push changes to the original Sentence. Throws AnnotationViewOutOfSync once release_original() has been called. + */ void commit(); + /** + * Push changes to a different Sentence, but same channel (NOTE(review): no definition of this one-argument overload is visible in the view.cpp hunks of this patch -- confirm it exists) + */ + void commit_to(const boost::shared_ptr<AnnotatedSentence>& original); + + /** + * Push changes to an arbitrary channel in an arbitrary sentence + */ void commit_to(const boost::shared_ptr<AnnotatedSentence>& original, const std::string& ann_name); + + /** + * Release the original Sentence. Use this when you no longer expect + * to need to go back or commit() changes. 
+ */ + void release_original(); private: boost::shared_ptr<AnnotatedSentence> original_; const std::string ann_name_; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d0f1de97eafde2af57d3fe013ddda8e2e3b2b711..874b60a5a4bb419e23bc7a65fbee11e8ceeb530d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -6,6 +6,7 @@ add_definitions(-DLIBCORPUS2_TEST_DATA_DIR="${PROJECT_SOURCE_DIR}/") add_executable( tests main.cpp + ann_basic.cpp basic.cpp tag_split.cpp tagset_parse.cpp diff --git a/tests/ann_basic.cpp b/tests/ann_basic.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e564a39663ca38c948cea3b84d776681cec51802 --- /dev/null +++ b/tests/ann_basic.cpp @@ -0,0 +1,45 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE and COPYING files for more details. 
+*/ + +#include <boost/test/unit_test.hpp> +#include <boost/make_shared.hpp> +#include <libcorpus2/ann/annotatedsentence.h> + +BOOST_AUTO_TEST_SUITE(ann) + +BOOST_AUTO_TEST_CASE( wrap ) /* wrap_sentence must move the tokens: s ends up empty and a holds the token */ +{ + Corpus2::Sentence::Ptr s = boost::make_shared<Corpus2::Sentence>(); + s->append(new Corpus2::Token(UnicodeString::fromUTF8("orth"), PwrNlp::Whitespace::None)); + Corpus2::Sentence::Ptr a = Corpus2::AnnotatedSentence::wrap_sentence(s); + BOOST_CHECK(s->empty()); + BOOST_REQUIRE(!a->empty()); + BOOST_CHECK_EQUAL(a->tokens()[0]->orth_utf8(), "orth"); +} + +BOOST_AUTO_TEST_CASE( wrap_clone ) /* wrap_sentence_clone must copy: editing the token in s afterwards does not affect a */ +{ + Corpus2::Sentence::Ptr s = boost::make_shared<Corpus2::Sentence>(); + s->append(new Corpus2::Token(UnicodeString::fromUTF8("orth"), PwrNlp::Whitespace::None)); + Corpus2::Sentence::Ptr a = Corpus2::AnnotatedSentence::wrap_sentence_clone(s); + BOOST_REQUIRE(!s->empty()); + s->tokens()[0]->set_orth(UnicodeString::fromUTF8("aaa")); + BOOST_REQUIRE(!a->empty()); + BOOST_CHECK_EQUAL(a->tokens()[0]->orth_utf8(), "orth"); +} + + +BOOST_AUTO_TEST_SUITE_END()