diff --git a/libcorpus2/ann/channel.cpp b/libcorpus2/ann/channel.cpp
index 394dbb0d9886edd02c9217154535c37b4af2c978..0138795f4644a1d35a04e13ae73a5a895f2f3505 100644
--- a/libcorpus2/ann/channel.cpp
+++ b/libcorpus2/ann/channel.cpp
@@ -3,6 +3,7 @@
 #include <algorithm>
 #include <boost/bind.hpp>
 #include <sstream>
+#include <iostream>
 #include <set>
 
 namespace Corpus2 {
@@ -76,6 +77,27 @@ int AnnotationChannel::renumber_segments()
 	}
 	return next;
 }
+/**
+ * Return the lowest positive segment index that no token in this channel
+ * currently uses.
+ */
+int AnnotationChannel::get_new_segment_index() const
+{
+	// One flag per candidate index 0..n (n = number of tokens). At most n
+	// distinct indices can be in use, so the answer never exceeds n + 1 and
+	// out-of-range values can safely be ignored.
+	// TODO: cache this?
+	std::vector<bool> used(segments_.size() + 1);
+	foreach (size_t sid, segments_) {
+		if (sid < used.size()) {
+			used[sid] = true;
+		}
+	}
+	// Index 0 is reserved for tokens that are outside any segment.
+	int first = 1;
+	while ((first < static_cast<int>(used.size())) && used[first]) ++first;
+	return first;
+}
 
 int AnnotationChannel::get_segment_at(int idx) const
 {
@@ -188,6 +210,37 @@ std::string AnnotationChannel::dump_heads() const
 	return ss.str();
 }
 
+/**
+ * Dump segment and head information as one character per token: '_' for
+ * an unsegmented token ('#' if it is nonetheless flagged as a head), and
+ * otherwise a letter identifying the segment ('a' for segment 1, 'b' for
+ * segment 2, ...), upper-cased when the token is a head.
+ *
+ * Only 26 letters are available, so segment indices outside 1..26 wrap
+ * around (segment 27 prints as 'a' again) rather than producing
+ * punctuation or unprintable bytes.
+ */
+std::string AnnotationChannel::dump_alpha() const
+{
+	const int alphabet_size = 26;
+	std::stringstream ss;
+	for (int i = 0; i < size(); ++i) {
+		if (segments_[i] == 0) {
+			ss << (heads_[i] ? '#' : '_');
+		} else {
+			// Map index 1 to offset 0 and normalise so that the offset is
+			// always in 0..25, also for negative indices.
+			int offset = (segments_[i] - 1) % alphabet_size;
+			if (offset < 0) {
+				offset += alphabet_size;
+			}
+			const char base = heads_[i] ? 'A' : 'a';
+			ss << static_cast<char>(base + offset);
+		}
+	}
+	return ss.str();
+}
+
 void AnnotationChannel::do_counts(int& annotations, int& disjoint, int& unannotated) const
 {
 	std::set<int> used_sids;
diff --git a/libcorpus2/ann/channel.h b/libcorpus2/ann/channel.h
index 2c3a76f65458dcd23c5af8ffe26a202f45436d65..d4b02bcfe7a015c98ead04a1ce57e19966268830 100644
--- a/libcorpus2/ann/channel.h
+++ b/libcorpus2/ann/channel.h
@@ -74,6 +74,13 @@ public:
 	 */
 	int renumber_segments();
 
+	/**
+	 * Figure out an index for a new segment.
+	 * @return the lowest positive segment index not currently used by
+	 *         any token in this channel
+	 */
+	int get_new_segment_index() const;
+
 	enum AnnotationVectorMode
 	{
 		O_DISJOINT_EXCLUSIVE = 0,
@@ -148,15 +155,23 @@ public:
 	std::string dump_iob() const;
 
 	/**
-	 * Compose a string consisting of all segment indices in order.
+	 * Compose a string consisting of all segment indices in order, e.g. "01102"
 	 */
 	std::string dump_segments() const;
 
 	/**
-	 * Compose a string consisting of all head flags in order
+	 * Compose a string consisting of all head flags in order, e.g. " H H"
 	 */
 	std::string dump_heads() const;
 
+	/**
+	 * Compose a string consisting of segment/head info in alphabetic format
+	 * where capital letters denote the head flag, and underscore indicates
+	 * no segment, so e.g. "_Aa_B". A head token without a segment is shown
+	 * as '#'. Segment indices above 26 wrap around to 'a' again.
+	 */
+	std::string dump_alpha() const;
+
 	void do_counts(int& annotations, int& disjoint, int& unannotated) const;
 
 private: