diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index 9f77877bf927ad9d183089448f8ff1dcb35d6170..fc8bc051c7daa1d3145954a436b0b174466925e0 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -41,6 +41,10 @@ link_directories(${Boost_LIBRARY_DIRS})
 set(LIBS ${LIBS} ${Boost_LIBRARIES})
 
 SET(libcorpus2_STAT_SRC
+	ann/annotatedsentence.cpp
+	ann/channel.cpp
+	ann/iob.cpp
+	ann/view.cpp
 	chunk.cpp
 	exception.cpp
 	lexeme.cpp
diff --git a/libcorpus2/ann/annotatedsentence.cpp b/libcorpus2/ann/annotatedsentence.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6a7bd60e30512e2b33a93b7d3b04060fab79768c
--- /dev/null
+++ b/libcorpus2/ann/annotatedsentence.cpp
@@ -0,0 +1,72 @@
+#include <libcorpus2/ann/annotatedsentence.h>
+#include <libcorpus2/ann/view.h>
+#include <boost/make_shared.hpp>
+#include <algorithm>
+#include <iterator>
+
+namespace Corpus2 {
+
+AnnotatedSentence::AnnotatedSentence()
+	: Sentence(), channels_()
+{
+}
+
+AnnotatedSentence::~AnnotatedSentence()
+{
+}
+
+/// Deep copy: every token is cloned and the channel map is copied.
+Sentence::Ptr AnnotatedSentence::clone_shared() const
+{
+	boost::shared_ptr<AnnotatedSentence> copy;
+	copy = boost::make_shared<AnnotatedSentence>();
+	foreach (const Token* t, tokens_) {
+		copy->append(t->clone());
+	}
+	copy->channels_ = channels_;
+	return copy;
+}
+
+/// Add a channel sized to the current token count. std::map::insert
+/// leaves an already existing channel of the same name untouched.
+void AnnotatedSentence::create_channel(const std::string& name)
+{
+	channels_.insert(std::make_pair(name, AnnotationChannel(tokens_.size())));
+}
+
+
+/// Build a view with one merged token per annotation of the named channel.
+/// Throws MissingAnnotationChannel when the sentence has no such channel.
+boost::shared_ptr<AnnotationView> create_view(
+	const boost::shared_ptr<AnnotatedSentence>& s,
+	const std::string& ann_name)
+{
+	const AnnotationChannel& chan = s->get_channel(ann_name);
+	std::vector<Annotation> ann = chan.make_annotation_vector();
+	boost::shared_ptr<AnnotationView> view;
+	view = boost::make_shared<AnnotationView>(s, ann_name);
+	foreach (const Annotation& a, ann) {
+		// An annotation without tokens has no first token or head to
+		// read; indexing indices[0] on it would be undefined behaviour.
+		if (a.empty()) {
+			continue;
+		}
+		UnicodeString orth;
+		orth = s->tokens()[a.indices[0]]->orth();
+		for (size_t idxi = 1; idxi < a.indices.size(); ++idxi) {
+			int idx = a.indices[idxi];
+			orth += PwrNlp::Whitespace::to_whitespace(s->tokens()[idx]->wa());
+			orth += s->tokens()[idx]->orth();
+		}
+		// The merged token takes the whitespace of the first token and
+		// the lexemes of the head token.
+		Token* t = new Token(orth, s->tokens()[a.indices[0]]->wa());
+		Token* head_token = s->tokens()[a.head_index];
+		std::copy(head_token->lexemes().begin(), head_token->lexemes().end(),
+			std::back_inserter(t->lexemes()));
+		view->append(t);
+	}
+	return view;
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/ann/annotatedsentence.h b/libcorpus2/ann/annotatedsentence.h
new file mode 100644
index 0000000000000000000000000000000000000000..86f0d257f0b2464417d63b27de11ccce29381614
--- /dev/null
+++ b/libcorpus2/ann/annotatedsentence.h
@@ -0,0 +1,76 @@
+#ifndef LIBCORPUS2_ANN_ANNOTATEDSENTENCE_H
+#define LIBCORPUS2_ANN_ANNOTATEDSENTENCE_H
+
+#include <libcorpus2/sentence.h>
+#include <libcorpus2/exception.h>
+#include <libcorpus2/ann/channel.h>
+#include <map>
+#include <string>
+
+namespace Corpus2 {
+
+class AnnotationView;
+
+/// Thrown when an annotation channel is requested by a name that is not present.
+class MissingAnnotationChannel : public Corpus2Error
+{
+public:
+	MissingAnnotationChannel(const std::string& name)
+		: Corpus2Error("Annotation channel missing: " + name)
+	{
+	}
+
+	~MissingAnnotationChannel() throw()
+	{
+	}
+};
+
+
+/// A Sentence that additionally keeps named annotation channels.
+class AnnotatedSentence : public Corpus2::Sentence
+{
+public:
+	AnnotatedSentence();
+
+	~AnnotatedSentence();
+
+	Ptr clone_shared() const;
+
+	void create_channel(const std::string& name);
+
+	bool has_channel(const std::string& name) const {
+		return channels_.find(name) != channels_.end();
+	}
+
+	/// Channel accessor; throws MissingAnnotationChannel for an unknown name
+	AnnotationChannel& get_channel(const std::string& name) {
+		chan_map_t::iterator i = channels_.find(name);
+		if (i == channels_.end()) {
+			throw MissingAnnotationChannel(name);
+		}
+		return i->second;
+	}
+
+	/// Channel accessor, const; throws MissingAnnotationChannel for an unknown name
+	const AnnotationChannel& get_channel(const std::string& name) const {
+		chan_map_t::const_iterator i = channels_.find(name);
+		if (i == channels_.end()) {
+			throw MissingAnnotationChannel(name);
+		}
+		return i->second;
+	}
+
+private:
+	typedef std::map<std::string, AnnotationChannel> chan_map_t;
+
+	chan_map_t channels_;
+};
+
+boost::shared_ptr<AnnotationView> create_view(
+	const boost::shared_ptr<AnnotatedSentence>& s,
+	const std::string& ann_name);
+
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_ANN_ANNOTATEDSENTENCE_H
diff --git a/libcorpus2/ann/channel.cpp b/libcorpus2/ann/channel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..601b65cbce90e9af28ffe6b25bd299dee7407810
--- /dev/null
+++ b/libcorpus2/ann/channel.cpp
@@ -0,0 +1,101 @@
+#include <libcorpus2/ann/channel.h>
+#include <algorithm>
+#include <boost/bind.hpp>
+
+namespace Corpus2 {
+
+AnnotationChannel::AnnotationChannel()
+	: segments_(), iobs_()
+{
+}
+
+AnnotationChannel::AnnotationChannel(int size)
+	: segments_(size), iobs_(size)
+{
+}
+
+/// Rebuild IOB tags from segment indices. Segment 0 means "outside" (O);
+/// the first token of a run with the same non-zero index gets B, the rest I.
+void AnnotationChannel::make_iob_from_segments()
+{
+	int prev_seg = 0;
+	for (size_t i = 0; i < segments_.size(); ++i) {
+		if (segments_[i] == 0) {
+			iobs_[i] = IOB::O;
+		} else if (segments_[i] != prev_seg) {
+			iobs_[i] = IOB::B;
+		} else {
+			iobs_[i] = IOB::I;
+		}
+		// Track every token, O included, so a run resumed after a gap starts with B.
+		prev_seg = segments_[i];
+	}
+}
+
+/// Rebuild segment indices from IOB tags, numbering chunks from 1. An I tag
+/// that does not continue a chunk is rewritten to B and opens a new chunk.
+void AnnotationChannel::make_segments_from_iob()
+{
+	int sid = 0;
+	bool i_can_has = false;
+	for (size_t i = 0; i < segments_.size(); ++i) {
+		if (iobs_[i] == IOB::O) {
+			segments_[i] = 0;
+			i_can_has = false;
+		} else {
+			if (iobs_[i] == IOB::B || !i_can_has) {
+				iobs_[i] = IOB::B;
+				++sid;
+			}
+			segments_[i] = sid;
+			i_can_has = true;
+		}
+	}
+}
+
+/// IOB tag at idx, or IOB::O when idx is out of range.
+IOB::Enum AnnotationChannel::get_iob_at(int idx) const
+{
+	if (idx >= 0 && idx < static_cast<int>(iobs_.size())) {
+		return iobs_[idx];
+	} else {
+		return IOB::O;
+	}
+}
+
+/// Set the IOB tag at idx; an out-of-range idx is silently ignored.
+void AnnotationChannel::set_iob_at(int idx, IOB::Enum iob)
+{
+	if (idx >= 0 && idx < static_cast<int>(iobs_.size())) {
+		iobs_[idx] = iob;
+	}
+}
+
+/// Group token indices by segment: one Annotation per positive segment index
+/// in increasing index order; the head is the segment's first token.
+std::vector<Annotation> AnnotationChannel::make_annotation_vector() const
+{
+	std::vector<Annotation> rv;
+	for (size_t i = 0; i < segments_.size(); ++i) {
+		const int s = segments_[i];
+		if (s <= 0) {
+			// 0 marks a token outside of any annotation; negatives are skipped too
+			continue;
+		}
+		if (static_cast<size_t>(s) > rv.size()) {
+			rv.resize(s);
+		}
+		// Segment indices are 1-based, vector slots are 0-based
+		Annotation& a = rv[s - 1];
+		a.indices.push_back(static_cast<int>(i));
+		if (a.head_index == -1) {
+			a.head_index = static_cast<int>(i);
+		}
+	}
+	// Drop slots of segment indices that never occurred (erase-remove idiom)
+	rv.erase(std::remove_if(rv.begin(), rv.end(),
+		boost::bind(&Annotation::empty, _1)), rv.end());
+	return rv;
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/ann/channel.h b/libcorpus2/ann/channel.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb8618f74db23074d900fc736861146067f72c85
--- /dev/null
+++ b/libcorpus2/ann/channel.h
@@ -0,0 +1,57 @@
+#ifndef LIBCORPUS2_ANN_CHANNEL_H
+#define LIBCORPUS2_ANN_CHANNEL_H
+
+#include <libcorpus2/ann/iob.h>
+#include <vector>
+#include <boost/utility.hpp>
+
+namespace Corpus2 {
+
+/// Token indices that make up one annotation, plus the index of its head token.
+struct Annotation
+{
+	Annotation() : indices(), head_index(-1) {}
+	std::vector<int> indices;
+	int head_index;
+	bool empty() const { return indices.empty(); }
+};
+
+/// Per-token annotation data kept in two parallel vectors: segment indices
+/// (0 meaning no annotation) and IOB tags.
+class AnnotationChannel
+{
+public:
+	AnnotationChannel();
+
+	explicit AnnotationChannel(int size);
+
+	void make_iob_from_segments();
+
+	void make_segments_from_iob();
+
+	std::vector<Annotation> make_annotation_vector() const;
+
+	/// NOTE(review): declared only; channel.cpp in this change has no definition.
+	std::vector<Annotation> make_continuous_annotation_vector() const;
+
+	const std::vector<int>& segments() const {
+		return segments_;
+	}
+
+	const std::vector<IOB::Enum>& iobs() const {
+		return iobs_;
+	}
+
+	IOB::Enum get_iob_at(int idx) const;
+
+	void set_iob_at(int idx, IOB::Enum iob);
+
+private:
+	std::vector<int> segments_;
+
+	std::vector<IOB::Enum> iobs_;
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_ANN_CHANNEL_H
diff --git a/libcorpus2/ann/iob.cpp b/libcorpus2/ann/iob.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ef44b49da661d88834d2e524097285ffa046691c
--- /dev/null
+++ b/libcorpus2/ann/iob.cpp
@@ -0,0 +1,21 @@
+#include <libcorpus2/ann/iob.h>
+
+namespace Corpus2 {
+
+const char* IOB::to_string(IOB::Enum iob)
+{
+	if (iob == IOB::I) return "I";
+	if (iob == IOB::O) return "O";
+	if (iob == IOB::B) return "B";
+	return "?";
+}
+
+IOB::Enum IOB::from_string(const std::string &s)
+{
+	if (s == "I") return IOB::I;
+	if (s == "O") return IOB::O;
+	if (s == "B") return IOB::B;
+	return IOB::PostLast;
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/ann/iob.h b/libcorpus2/ann/iob.h
new file mode 100644
index 0000000000000000000000000000000000000000..68d94f2805e3f6cf65b7d3c492dd419d85099c42
--- /dev/null
+++ b/libcorpus2/ann/iob.h
@@ -0,0 +1,26 @@
+#ifndef LIBCORPUS2_ANN_IOB_H
+#define LIBCORPUS2_ANN_IOB_H
+
+#include <string>
+
+namespace Corpus2 {
+
+namespace IOB {
+	/**
+	 * A simple enumeration for IOB annotation chunk tagging
+	 */
+	enum Enum {
+		O = 0,
+		B = 1,
+		I = 2,
+		PostLast
+	};
+	/// Convert an enum value to an uppercase I, O or B string ("?" for any other value)
+	const char* to_string(Enum iob);
+	/// Create an enum value from an I, O or B string (PostLast for anything else)
+	Enum from_string(const std::string& s);
+}
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_ANN_IOB_H
diff --git a/libcorpus2/ann/view.cpp b/libcorpus2/ann/view.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..68047870c59ad1796e8ecd7aa58e673f6d491e61
--- /dev/null
+++ b/libcorpus2/ann/view.cpp
@@ -0,0 +1,43 @@
+#include <libcorpus2/ann/view.h>
+#include <libcorpus2/ann/annotatedsentence.h>
+#include <boost/make_shared.hpp>
+
+namespace Corpus2 {
+
+AnnotationView::AnnotationView(
+	const boost::shared_ptr<AnnotatedSentence> &original,
+	const std::string &ann_name)
+	: Sentence(), original_(original), ann_name_(ann_name)
+{
+}
+
+AnnotationView::~AnnotationView()
+{
+}
+
+Sentence::Ptr AnnotationView::clone_shared() const
+{
+	boost::shared_ptr<AnnotationView> copy;
+	copy = boost::make_shared<AnnotationView>(original_, ann_name_);
+	foreach (const Token* t, tokens_) {
+		copy->append(t->clone());
+	}
+	return copy;
+}
+
+void AnnotationView::commit()
+{
+	commit_to(original_, ann_name_);
+}
+
+void AnnotationView::commit_to(
+	const boost::shared_ptr<AnnotatedSentence> &original,
+	const std::string &ann_name)
+{
+	if (!original->has_channel(ann_name)) {
+		throw MissingAnnotationChannel(ann_name);
+	}
+	// ...
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/ann/view.h b/libcorpus2/ann/view.h
new file mode 100644
index 0000000000000000000000000000000000000000..d6790962daa694dd9bcbbcbde9550c01a964cd0a
--- /dev/null
+++ b/libcorpus2/ann/view.h
@@ -0,0 +1,33 @@
+#ifndef LIBCORPUS2_ANN_VIEW_H
+#define LIBCORPUS2_ANN_VIEW_H
+
+#include <libcorpus2/sentence.h>
+
+namespace Corpus2 {
+
+class AnnotatedSentence;
+
+/// A Sentence tied to an AnnotatedSentence and the name of one of its channels.
+class AnnotationView : public Sentence
+{
+public:
+	AnnotationView(const boost::shared_ptr<AnnotatedSentence>& original,
+		const std::string& ann_name);
+
+	~AnnotationView();
+
+	Ptr clone_shared() const;
+
+	void commit();
+
+	/// Currently only verifies the channel exists (throws MissingAnnotationChannel).
+	void commit_to(const boost::shared_ptr<AnnotatedSentence>& original,
+		const std::string& ann_name);
+private:
+	boost::shared_ptr<AnnotatedSentence> original_;
+	const std::string ann_name_;
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_ANN_VIEW_H
diff --git a/libcorpus2/sentence.cpp b/libcorpus2/sentence.cpp
index 44fa4f2849e19fdf49fdb61c77d89ebaafb3173c..30f33e3de94a6d6514bcef47301290069bfbb0ff 100644
--- a/libcorpus2/sentence.cpp
+++ b/libcorpus2/sentence.cpp
@@ -19,6 +19,11 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 
 namespace Corpus2 {
 
+Sentence::Sentence()
+	: tokens_()
+{
+}
+
 Sentence::~Sentence()
 {
 	foreach (const Token* t, tokens_) {
diff --git a/libcorpus2/sentence.h b/libcorpus2/sentence.h
index 1cc97928d3a62f55caa874f54d3f00e8576c620c..2a08e84674c047a897e18b396bb4f63d837a2311 100644
--- a/libcorpus2/sentence.h
+++ b/libcorpus2/sentence.h
@@ -30,15 +30,12 @@ public:
 	typedef boost::shared_ptr<const Sentence> ConstPtr;
 
 	/// Empty constructor
-	Sentence()
-		: tokens_()
-	{
-	}
+	Sentence();
 
-	Ptr clone_shared() const;
+	virtual Ptr clone_shared() const;
 
 	/// Destructor
-	~Sentence();
+	virtual ~Sentence();
 
 	void release_tokens();
 
@@ -82,7 +79,7 @@ public:
 		return tokens_[0];
 	}
 
-private:
+protected:
 	/// The tokens this sentence contains and owns
 	std::vector<Token*> tokens_;
 };