From 2605b8cce84eba5880ce7da688487447ab15c1ff Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Tue, 12 Apr 2011 17:17:10 +0200 Subject: [PATCH] extend AnnotationChannel interface: add dump_alpha and get_new_segment_index --- libcorpus2/ann/channel.cpp | 35 +++++++++++++++++++++++++++++++++++ libcorpus2/ann/channel.h | 16 ++++++++++++++-- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/libcorpus2/ann/channel.cpp b/libcorpus2/ann/channel.cpp index 394dbb0..0138795 100644 --- a/libcorpus2/ann/channel.cpp +++ b/libcorpus2/ann/channel.cpp @@ -3,6 +3,7 @@ #include <algorithm> #include <boost/bind.hpp> #include <sstream> +#include <iostream> #include <set> namespace Corpus2 { @@ -76,6 +77,19 @@ int AnnotationChannel::renumber_segments() } return next; } +int AnnotationChannel::get_new_segment_index() const +{ + //returns the lowest index >= 1 not present in segments_ (rescanned on every call); cache this? + std::vector<bool> used(segments_.size() + 1); + foreach (size_t sid, segments_) { + if (sid < used.size()) { + used[sid] = true; + } + } + int first = 1; + while ((first < static_cast<int>(used.size())) && used[first]) ++first; + return first; +} int AnnotationChannel::get_segment_at(int idx) const { @@ -188,6 +202,27 @@ std::string AnnotationChannel::dump_heads() const return ss.str(); } +std::string AnnotationChannel::dump_alpha() const +{ + std::stringstream ss; + for (int i = 0; i < size(); ++i) { + if (segments_[i] == 0) { + if (heads_[i]) { + ss << '#'; + } else { + ss << '_'; + } + } else { + if (heads_[i]) { + ss << static_cast<unsigned char>('A' - 1 + segments_[i]); + } else { + ss << static_cast<unsigned char>('a' - 1 + segments_[i]); + } + } + } + return ss.str(); +} + void AnnotationChannel::do_counts(int& annotations, int& disjoint, int& unannotated) const { std::set<int> used_sids; diff --git a/libcorpus2/ann/channel.h b/libcorpus2/ann/channel.h index 2c3a76f..d4b02bc 100644 --- a/libcorpus2/ann/channel.h +++ b/libcorpus2/ann/channel.h @@ -74,6 +74,11 @@ public: */ int renumber_segments(); + /** + * Figure 
out an index for a new segment + */ + int get_new_segment_index() const; + enum AnnotationVectorMode { O_DISJOINT_EXCLUSIVE = 0, @@ -148,15 +153,22 @@ public: std::string dump_iob() const; /** - * Compose a string consisting of all segment indices in order. + * Compose a string consisting of all segment indices in order, e.g. "01102" */ std::string dump_segments() const; /** - * Compose a string consisting of all head flags in order + * Compose a string consisting of all head flags in order, e.g. " H H" */ std::string dump_heads() const; + /** + * Compose a string with one character per position: segment index N becomes the N-th letter + * (1 is 'a'), upper-cased when the head flag is set; index 0 (no segment) becomes '_', or '#' + * when the head flag is set, e.g. "_Aa_B". Indices above 26 are not checked and go past 'z'/'Z'. + */ + std::string dump_alpha() const; + void do_counts(int& annotations, int& disjoint, int& unannotated) const; private: -- GitLab