Skip to content
Snippets Groups Projects
Commit 2415ad64 authored by ilor's avatar ilor
Browse files

Annotation updates and two very basic tests

parent 68de7799
No related branches found
No related tags found
No related merge requests found
......@@ -24,6 +24,27 @@ Sentence::Ptr AnnotatedSentence::clone_shared() const
return copy;
}
boost::shared_ptr<AnnotatedSentence> AnnotatedSentence::wrap_sentence(
        const boost::shared_ptr<Sentence>& s)
{
    // Start from an AnnotatedSentence with no tokens and no channels.
    boost::shared_ptr<AnnotatedSentence> wrapped =
        boost::make_shared<AnnotatedSentence>();

    // Append the very same Token pointers (no copies are made) ...
    foreach (Token* tok, s->tokens()) {
        wrapped->append(tok);
    }

    // ... and only then tell the source sentence to let go of them.
    s->release_tokens();
    return wrapped;
}
boost::shared_ptr<AnnotatedSentence> AnnotatedSentence::wrap_sentence_clone(
        const boost::shared_ptr<Sentence>& s)
{
    // Start from an AnnotatedSentence with no tokens and no channels.
    boost::shared_ptr<AnnotatedSentence> copy =
        boost::make_shared<AnnotatedSentence>();

    // Append a clone of every token; nothing is called on s that releases
    // or removes its tokens.
    foreach (Token* tok, s->tokens()) {
        copy->append(tok->clone());
    }
    return copy;
}
void AnnotatedSentence::create_channel(const std::string& name)
{
channels_.insert(std::make_pair(name, AnnotationChannel(tokens_.size())));
......
......@@ -9,6 +9,9 @@ namespace Corpus2 {
class AnnotationView;
/**
* Exception class for use when a requested annotation channel does not exist
*/
class MissingAnnotationChannel : public Corpus2Error
{
public:
......@@ -22,22 +25,56 @@ public:
}
};
/**
* A class describing Sentences with additional information in the form of
* annotation channels.
*
* Note: channels are not automatically resized. The sentence should not have
* tokens added or removed after annotation channels are created.
*/
class AnnotatedSentence : public Corpus2::Sentence
{
public:
/**
* Create an empty AnnotatedSentence with no tokens and no channels
*/
AnnotatedSentence();
~AnnotatedSentence();
Ptr clone_shared() const;
Sentence::Ptr clone_shared() const;
/**
* Create an AnnotatedSentence from a Sentence, grabbing all the tokens
* directly (afterwards the source Sentence has no tokens).
*/
static boost::shared_ptr<AnnotatedSentence> wrap_sentence(
const boost::shared_ptr<Sentence>& s);
/**
* Create an AnnotatedSentence from a Sentence, cloning all the tokens.
* The source Sentence is not modified.
*/
static boost::shared_ptr<AnnotatedSentence> wrap_sentence_clone(
const boost::shared_ptr<Sentence>& s);
/**
* Create an annotation channel named name in this annotated sentence.
* If the channel already exists, nothing happens.
*/
void create_channel(const std::string& name);
/**
* @return true iff this sentence has an annotation channel named name
*/
bool has_channel(const std::string& name) const {
return channels_.find(name) != channels_.end();
}
/**
* Return the annotation channel by name or throw MissingAnnotationChannel
* if there is no such channel
*/
AnnotationChannel& get_channel(const std::string& name) {
chan_map_t::iterator i = channels_.find(name);
if (i == channels_.end()) {
......@@ -46,6 +83,9 @@ public:
return i->second;
}
/**
* Const version of get_channel
*/
const AnnotationChannel& get_channel(const std::string& name) const {
chan_map_t::const_iterator i = channels_.find(name);
if (i == channels_.end()) {
......@@ -55,11 +95,20 @@ public:
}
private:
/// typedef for the channels
typedef std::map<std::string, AnnotationChannel> chan_map_t;
/// the actual channels
chan_map_t channels_;
};
/**
* Create an AnnotationView pseudo-sentence from an AnnotatedSentence that
* behaves like a sentence viewed through an annotation.
*
* This is a free function, not a member of AnnotatedSentence, because it is
* mandatory that the AnnotatedSentence be passed via a shared_ptr.
*/
boost::shared_ptr<AnnotationView> create_view(
const boost::shared_ptr<AnnotatedSentence>& s,
const std::string& ann_name);
......
......@@ -48,6 +48,25 @@ void AnnotationChannel::make_segments_from_iob()
}
}
int AnnotationChannel::renumber_segments()
{
    // Maps each positive segment index seen so far to its replacement.
    std::map<int, int> remap;
    int highest = 0;
    for (std::vector<int>::iterator seg = segments_.begin();
            seg != segments_.end(); ++seg) {
        if (*seg <= 0) {
            // Non-positive entries are left exactly as they are.
            continue;
        }
        // insert() keeps an already-present mapping untouched and reports
        // through .second whether a new one was added, so a first sighting
        // claims the next free number and a repeat reuses the stored one.
        const std::pair<std::map<int, int>::iterator, bool> slot =
            remap.insert(std::make_pair(*seg, highest + 1));
        if (slot.second) {
            ++highest;
        }
        *seg = slot.first->second;
    }
    return highest;
}
IOB::Enum AnnotationChannel::get_iob_at(int idx)
{
if (idx >= 0 && idx < static_cast<int>(iobs_.size())) {
......@@ -78,6 +97,7 @@ std::vector<Annotation> AnnotationChannel::make_annotation_vector() const
rv[s].head_index = i;
}
}
std::sort(rv.begin(), rv.end(), AnnotationHeadCompare());
// Erase-remove: remove_if only moves the non-empty annotations to the front
// and returns the new logical end. The second argument is required so that
// the whole tail [new_end, end) is erased; the one-iterator overload would
// erase a single element and is undefined when nothing was removed
// (new_end == end()).
rv.erase(std::remove_if(rv.begin(), rv.end(),
        boost::bind(&Annotation::empty, _1)), rv.end());
return rv;
......
......@@ -4,47 +4,112 @@
#include <libcorpus2/ann/iob.h>
#include <vector>
#include <boost/utility.hpp>
#include <algorithm>
#include <map>
namespace Corpus2 {
/**
* A general indexed annotation spanning a possibly disjoint group of
* tokens, with a distinguished 'head' token.
*/
struct Annotation
{
    /// Create an empty annotation: no token indices, head_index set to -1
    Annotation() : indices(), head_index(-1) {}

    /// indices of the tokens that make up this annotation
    std::vector<int> indices;

    /// index of the distinguished head token (-1 after default construction)
    int head_index;

    /// @return true if the annotation has no token indices
    bool empty() const {
        return indices.empty();
    }

    /// @return true if the annotation is empty, or if head_index is one of
    /// the values stored in indices
    bool sane() const {
        return empty() ||
            std::find(indices.begin(), indices.end(), head_index) != indices.end();
    }
};
/// Comparison functor ordering Annotation objects by ascending head_index.
struct AnnotationHeadCompare
{
    /// @return true if a1's head_index is strictly smaller than a2's.
    /// Const-qualified so the functor can also be invoked through a const
    /// object or reference; it has no state to modify.
    bool operator()(const Annotation& a1, const Annotation& a2) const {
        return a1.head_index < a2.head_index;
    }
};
/**
* An annotation channel in IOB format and segment index format, with manual
* sync between the two.
*/
class AnnotationChannel
{
public:
/**
* Default constructor
*/
AnnotationChannel();
/**
* Ctor for an annotation channel of a specific size.
* AnnotatedSentence::create_channel passes the sentence's token count here.
*/
explicit AnnotationChannel(int size);
/**
* Discard IOB annotation information, regenerate it from the segment info.
*/
void make_iob_from_segments();
/**
* Discard indexed annotation info, regenerate from IOB.
*/
void make_segments_from_iob();
/**
* Renumber the positive segment indices to 1, 2, ... in order of first
* appearance, so there are no omitted indices. Entries that are not
* positive are left unchanged, and only the segment indices are modified
* (the IOB data is not regenerated).
*
* @return the highest index in use after renumbering (0 if there are no
* positive segment indices)
*/
int renumber_segments();
/**
* Create a vector of Annotation objects, each corresponding to
* an annotation, with the annotations possibly being disjoint.
*/
std::vector<Annotation> make_annotation_vector() const;
/**
* Create a vector of Annotation objects, each corresponding to
* an annotation, forcing the annotations to be continuous (disjoint
* annotations are split)
*/
std::vector<Annotation> make_continuous_annotation_vector() const;
/**
* The segment-index array accessor (read-only reference to the member)
*/
const std::vector<int>& segments() const {
return segments_;
}
/**
* The IOB data vector (read-only reference to the member)
*/
const std::vector<IOB::Enum>& iobs() const {
return iobs_;
}
/**
* IOB getter, returns IOB::O if idx is out of range
*/
IOB::Enum get_iob_at(int idx);
/**
* IOB setter, out of range indices are not processed.
*/
void set_iob_at(int idx, IOB::Enum iob);
private:
/// segment indices
std::vector<int> segments_;
/// IOB data
std::vector<IOB::Enum> iobs_;
};
......
......@@ -8,6 +8,10 @@ namespace Corpus2 {
namespace IOB {
/**
* A simple enumeration for IOB annotation chunk tagging
* O indicates not part of a chunk
* B indicates beginning of a new chunk
* I indicates continuation of a chunk started by a preceding BI* sequence
* The only invalid sequence is O followed by I
*/
enum Enum {
O = 0,
......@@ -16,6 +20,7 @@ namespace IOB {
PostLast
};
/// Convert an enum value to an uppercase I, O or B string
/// Returns ? on invalid value
const char* to_string(Enum iob);
/// Create an enum value from an I, O or B string
Enum from_string(const std::string& s);
......
......@@ -25,6 +25,11 @@ Sentence::Ptr AnnotationView::clone_shared() const
return copy;
}
void AnnotationView::release_original()
{
original_.reset();
}
void AnnotationView::commit()
{
commit_to(original_, ann_name_);
......@@ -34,6 +39,9 @@ void AnnotationView::commit_to(
const boost::shared_ptr<AnnotatedSentence> &original,
const std::string &ann_name)
{
if (!original) {
throw AnnotationViewOutOfSync("null-pointer-to-original");
}
if (!original->has_channel(ann_name)) {
throw MissingAnnotationChannel(ann_name);
}
......
......@@ -7,9 +7,38 @@ namespace Corpus2 {
class AnnotatedSentence;
/**
* Exception class for signalling OOS between a View and a Sentence during
* e.g. commits
*/
class AnnotationViewOutOfSync : public Corpus2Error
{
public:
    /// Build the error; reason is appended to a fixed message prefix and the
    /// result is handed to the Corpus2Error base.
    AnnotationViewOutOfSync(const std::string& reason)
        : Corpus2Error("AnnotationView out of sync with base sentence: " + reason)
    {}

    /// Non-throwing destructor
    ~AnnotationViewOutOfSync() throw() {}
};
/**
* A class representing an AnnotatedSentence viewed through the segmentation
* some Annotation specifies.
*
* It has its own Tokens but can update the original Sentence via commit().
*/
class AnnotationView : public Sentence
{
public:
/**
* Create an AnnotationView from a sentence and an annotation name.
* The AnnotationView keeps a handle to the sentence.
*
* This should not be used directly, use create_view.
*/
AnnotationView(const boost::shared_ptr<AnnotatedSentence>& original,
const std::string& ann_name_);
......@@ -17,10 +46,27 @@ public:
Ptr clone_shared() const;
/**
* Push changes to the original Sentence.
*/
void commit();
/**
* Push changes to a different Sentence, but same channel
*/
void commit_to(const boost::shared_ptr<AnnotatedSentence>& original);
/**
* Push changes to an arbitrary channel in an arbitrary sentence
*/
void commit_to(const boost::shared_ptr<AnnotatedSentence>& original,
const std::string& ann_name);
/**
* Release the original Sentence. Use this when you no longer expect
* to need to go back or commit() changes.
*/
void release_original();
private:
boost::shared_ptr<AnnotatedSentence> original_;
const std::string ann_name_;
......
......@@ -6,6 +6,7 @@ add_definitions(-DLIBCORPUS2_TEST_DATA_DIR="${PROJECT_SOURCE_DIR}/")
add_executable( tests
main.cpp
ann_basic.cpp
basic.cpp
tag_split.cpp
tagset_parse.cpp
......
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libcorpus2 project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#include <boost/test/unit_test.hpp>
#include <boost/make_shared.hpp>
#include <libcorpus2/ann/annotatedsentence.h>
BOOST_AUTO_TEST_SUITE(ann)
BOOST_AUTO_TEST_CASE( wrap )
{
    // A plain sentence holding a single token with orth "orth".
    Corpus2::Sentence::Ptr plain = boost::make_shared<Corpus2::Sentence>();
    Corpus2::Token* tok = new Corpus2::Token(
        UnicodeString::fromUTF8("orth"), PwrNlp::Whitespace::None);
    plain->append(tok);

    Corpus2::Sentence::Ptr wrapped =
        Corpus2::AnnotatedSentence::wrap_sentence(plain);

    // After wrapping, the source must be empty and the wrapper must hold
    // the token.
    BOOST_CHECK(plain->empty());
    BOOST_REQUIRE(!wrapped->empty());
    BOOST_CHECK_EQUAL(wrapped->tokens()[0]->orth_utf8(), "orth");
}
BOOST_AUTO_TEST_CASE( wrap_clone )
{
    // A plain sentence holding a single token with orth "orth".
    Corpus2::Sentence::Ptr plain = boost::make_shared<Corpus2::Sentence>();
    Corpus2::Token* tok = new Corpus2::Token(
        UnicodeString::fromUTF8("orth"), PwrNlp::Whitespace::None);
    plain->append(tok);

    Corpus2::Sentence::Ptr cloned =
        Corpus2::AnnotatedSentence::wrap_sentence_clone(plain);

    // The source keeps its token; changing that token afterwards must not
    // show up in the cloned sentence.
    BOOST_REQUIRE(!plain->empty());
    plain->tokens()[0]->set_orth(UnicodeString::fromUTF8("aaa"));
    BOOST_REQUIRE(!cloned->empty());
    BOOST_CHECK_EQUAL(cloned->tokens()[0]->orth_utf8(), "orth");
}
BOOST_AUTO_TEST_SUITE_END()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment