Skip to content
Snippets Groups Projects
Commit 381668c5 authored by ilor's avatar ilor
Browse files

Rough draft of annotation internals

parent a820f05c
Branches
No related tags found
No related merge requests found
...@@ -41,6 +41,10 @@ link_directories(${Boost_LIBRARY_DIRS}) ...@@ -41,6 +41,10 @@ link_directories(${Boost_LIBRARY_DIRS})
set(LIBS ${LIBS} ${Boost_LIBRARIES}) set(LIBS ${LIBS} ${Boost_LIBRARIES})
SET(libcorpus2_STAT_SRC SET(libcorpus2_STAT_SRC
ann/annotatedsentence.cpp
ann/channel.cpp
ann/iob.cpp
ann/view.cpp
chunk.cpp chunk.cpp
exception.cpp exception.cpp
lexeme.cpp lexeme.cpp
......
#include <libcorpus2/ann/annotatedsentence.h>
#include <libcorpus2/ann/view.h>
#include <boost/make_shared.hpp>
namespace Corpus2 {
/// Construct an annotated sentence with no tokens and no annotation channels.
AnnotatedSentence::AnnotatedSentence()
    : Sentence()
    , channels_()
{
}

/// Destructor; no cleanup is performed here beyond the members' and the
/// base class' own destructors.
AnnotatedSentence::~AnnotatedSentence()
{
}
/**
 * Create a copy of this sentence held by a shared pointer.
 *
 * Each token is duplicated through Token::clone() and appended to the copy,
 * then the whole channel map is copied over.
 *
 * @return the new AnnotatedSentence as a Sentence::Ptr
 */
Sentence::Ptr AnnotatedSentence::clone_shared() const
{
    boost::shared_ptr<AnnotatedSentence> duplicate =
        boost::make_shared<AnnotatedSentence>();
    typedef std::vector<Token*>::const_iterator token_iter;
    for (token_iter it = tokens_.begin(); it != tokens_.end(); ++it) {
        const Token* tok = *it;
        duplicate->append(tok->clone());
    }
    duplicate->channels_ = channels_;
    return duplicate;
}
void AnnotatedSentence::create_channel(const std::string& name)
{
channels_.insert(std::make_pair(name, AnnotationChannel(tokens_.size())));
}
/**
 * Build an AnnotationView of a sentence for one annotation channel.
 *
 * For every annotation returned by the channel's make_annotation_vector()
 * one token is appended to the view:
 *  - its orth is the orth of the first annotated token followed, for each
 *    further token, by that token's wa() converted with
 *    PwrNlp::Whitespace::to_whitespace() and then that token's orth,
 *  - its wa() is taken from the first annotated token,
 *  - its lexemes are copied from the annotation's head token.
 *
 * NOTE(review): each annotation is assumed to be non-empty and to carry a
 * valid head_index; neither is checked here.
 *
 * @param s        sentence to build the view of
 * @param ann_name name of the annotation channel to use
 * @return the newly created view
 * @throws MissingAnnotationChannel (from get_channel) when the sentence has
 *         no channel called ann_name
 */
boost::shared_ptr<AnnotationView> create_view(
    const boost::shared_ptr<AnnotatedSentence>& s,
    const std::string& ann_name)
{
    const AnnotationChannel& chan = s->get_channel(ann_name);
    const std::vector<Annotation> annotations = chan.make_annotation_vector();
    boost::shared_ptr<AnnotationView> view =
        boost::make_shared<AnnotationView>(s, ann_name);
    foreach (const Annotation& a, annotations) {
        Token* first_token = s->tokens()[a.indices[0]];
        UnicodeString merged_orth;
        merged_orth = first_token->orth();
        // Glue the remaining tokens on, each preceded by its own whitespace.
        for (std::vector<int>::const_iterator it = a.indices.begin() + 1;
                it != a.indices.end(); ++it) {
            Token* part = s->tokens()[*it];
            merged_orth += PwrNlp::Whitespace::to_whitespace(part->wa());
            merged_orth += part->orth();
        }
        Token* merged = new Token(merged_orth, first_token->wa());
        Token* head_token = s->tokens()[a.head_index];
        std::copy(head_token->lexemes().begin(), head_token->lexemes().end(),
            std::back_inserter(merged->lexemes()));
        view->append(merged);
    }
    return view;
}
} /* end ns Corpus2 */
#ifndef LIBCORPUS2_ANN_ANNOTATEDSENTENCE_H
#define LIBCORPUS2_ANN_ANNOTATEDSENTENCE_H
#include <libcorpus2/sentence.h>
#include <libcorpus2/exception.h>
#include <libcorpus2/ann/channel.h>
namespace Corpus2 {
class AnnotationView;
/**
 * Exception thrown when an annotation channel is requested by a name that
 * the sentence does not have.
 */
class MissingAnnotationChannel : public Corpus2Error
{
public:
    /**
     * @param name name of the channel that could not be found; appended to
     *             the error message
     */
    MissingAnnotationChannel(const std::string& name)
        : Corpus2Error("Annotation channel missing: " + name)
    {
    }

    /// Non-throwing destructor.
    ~MissingAnnotationChannel() throw()
    {
    }
};
/**
 * A Sentence that additionally stores named annotation channels.
 *
 * NOTE(review): std::map is used below but <map> is not included directly
 * by this header; presumably it arrives through another include -- confirm.
 */
class AnnotatedSentence : public Corpus2::Sentence
{
public:
    /// Create a sentence with no tokens and no channels.
    AnnotatedSentence();

    ~AnnotatedSentence();

    /// Create a copy of this sentence, including its channels.
    Ptr clone_shared() const;

    /// Add a channel called name, sized to the current token count.
    void create_channel(const std::string& name);

    /// @return true when a channel called name exists
    bool has_channel(const std::string& name) const {
        return channels_.count(name) != 0;
    }

    /**
     * Mutable access to a channel.
     * @throws MissingAnnotationChannel when there is no channel called name
     */
    AnnotationChannel& get_channel(const std::string& name) {
        const chan_map_t::iterator found = channels_.find(name);
        if (found != channels_.end()) {
            return found->second;
        }
        throw MissingAnnotationChannel(name);
    }

    /**
     * Read-only access to a channel.
     * @throws MissingAnnotationChannel when there is no channel called name
     */
    const AnnotationChannel& get_channel(const std::string& name) const {
        const chan_map_t::const_iterator found = channels_.find(name);
        if (found != channels_.end()) {
            return found->second;
        }
        throw MissingAnnotationChannel(name);
    }

private:
    /// Channel name -> channel
    typedef std::map<std::string, AnnotationChannel> chan_map_t;

    /// The annotation channels of this sentence, keyed by name
    chan_map_t channels_;
};
/**
 * Create an AnnotationView of the sentence s for the annotation channel
 * called ann_name.
 *
 * @param s        the annotated sentence to create the view of
 * @param ann_name name of the annotation channel the view is based on
 * @return shared pointer to the new view
 */
boost::shared_ptr<AnnotationView> create_view(
const boost::shared_ptr<AnnotatedSentence>& s,
const std::string& ann_name);
} /* end ns Corpus2 */
#endif // LIBCORPUS2_ANN_ANNOTATEDSENTENCE_H
#include <libcorpus2/ann/channel.h>
#include <algorithm>
#include <boost/bind.hpp>
namespace Corpus2 {
/// Create an empty channel (no segment ids, no IOB tags).
AnnotationChannel::AnnotationChannel()
    : segments_()
    , iobs_()
{
}

/**
 * Create a channel for size tokens.
 *
 * Both vectors get size value-initialized elements, i.e. segment id 0 and
 * the IOB enumerator with value 0 for every position.
 *
 * @param size number of token positions in the channel
 */
AnnotationChannel::AnnotationChannel(int size)
    : segments_(size)
    , iobs_(size)
{
}
/**
 * Recompute the IOB tags from the segment ids.
 *
 * A position with segment id 0 is tagged O. A position whose non-zero
 * segment id differs from the id of the directly preceding position is
 * tagged B (start of a segment); a position repeating the preceding id is
 * tagged I.
 *
 * The previous id is tracked for every position, including O positions, so
 * a segment that resumes after a gap starts with B again. This matches
 * make_segments_from_iob(), where an I that follows an O is treated as B.
 */
void AnnotationChannel::make_iob_from_segments()
{
    int prev_seg = 0;
    for (size_t i = 0; i < segments_.size(); ++i) {
        const int seg = segments_[i];
        if (seg == 0) {
            iobs_[i] = IOB::O;
        } else if (seg != prev_seg) {
            iobs_[i] = IOB::B;
        } else {
            iobs_[i] = IOB::I;
        }
        // Must be updated on every iteration, otherwise no B/I distinction
        // can ever be made.
        prev_seg = seg;
    }
}
/**
 * Recompute the segment ids from the IOB tags.
 *
 * O positions get segment id 0. Every B opens a new segment with the next
 * consecutive id (starting at 1). Any other non-O tag continues the current
 * segment, unless it directly follows an O or is the first position; in
 * that case it is rewritten to B in the IOB vector and opens a new segment
 * as well.
 */
void AnnotationChannel::make_segments_from_iob()
{
    int current_id = 0;
    bool inside_segment = false;
    const size_t count = segments_.size();
    for (size_t pos = 0; pos != count; ++pos) {
        if (iobs_[pos] == IOB::O) {
            inside_segment = false;
            segments_[pos] = 0;
            continue;
        }
        const bool opens_segment = (iobs_[pos] == IOB::B) || !inside_segment;
        if (opens_segment) {
            // Normalize a stray continuation tag into a proper begin tag.
            iobs_[pos] = IOB::B;
            ++current_id;
        }
        inside_segment = true;
        segments_[pos] = current_id;
    }
}
/**
 * Read the IOB tag at a position.
 *
 * @param idx position to read
 * @return the tag stored at idx, or IOB::O when idx is negative or not
 *         smaller than the number of stored tags
 */
IOB::Enum AnnotationChannel::get_iob_at(int idx)
{
    const int tag_count = static_cast<int>(iobs_.size());
    if (idx < 0 || idx >= tag_count) {
        return IOB::O;
    }
    return iobs_[idx];
}
/**
 * Store an IOB tag at a position.
 *
 * Out-of-range positions (negative, or not smaller than the number of
 * stored tags) are silently ignored.
 *
 * @param idx position to write
 * @param iob tag to store
 */
void AnnotationChannel::set_iob_at(int idx, IOB::Enum iob)
{
    const int tag_count = static_cast<int>(iobs_.size());
    if (idx < 0 || idx >= tag_count) {
        return;
    }
    iobs_[idx] = iob;
}
std::vector<Annotation> AnnotationChannel::make_annotation_vector() const
{
std::vector<Annotation> rv;
int smax = 0;
for (size_t i = 0; i < segments_.size(); ++i) {
int s = segments_[i];
if (s > smax) {
rv.resize(smax = s);
}
rv[s].indices.push_back(i);
if (rv[s].head_index == -1) {
rv[s].head_index = i;
}
}
rv.erase(std::remove_if(rv.begin(), rv.end(),
boost::bind(&Annotation::empty, _1)));
return rv;
}
} /* end ns Corpus2 */
#ifndef LIBCORPUS2_ANN_CHANNEL_H
#define LIBCORPUS2_ANN_CHANNEL_H
#include <libcorpus2/ann/iob.h>
#include <vector>
#include <boost/utility.hpp>
namespace Corpus2 {
/**
 * One annotation: the positions it covers plus the position of its head.
 */
struct Annotation
{
    /// Create an annotation covering nothing, with head_index set to -1.
    Annotation()
        : indices()
        , head_index(-1)
    {
    }

    /// Positions covered by this annotation
    std::vector<int> indices;

    /// Position of the head; -1 until one is assigned
    int head_index;

    /// @return true when the annotation covers no positions
    bool empty() const
    {
        return indices.size() == 0;
    }
};
/**
 * A single annotation channel: one segment id and one IOB tag per position.
 */
class AnnotationChannel
{
public:
    /// Create an empty channel.
    AnnotationChannel();

    /// Create a channel with size positions.
    explicit AnnotationChannel(int size);

    /// Recompute the IOB tags from the segment ids.
    void make_iob_from_segments();

    /// Recompute the segment ids from the IOB tags.
    void make_segments_from_iob();

    /// Gather the positions of each segment into Annotation objects.
    std::vector<Annotation> make_annotation_vector() const;

    // NOTE(review): no definition of this function is visible alongside
    // the other member definitions -- confirm it exists elsewhere.
    std::vector<Annotation> make_continuous_annotation_vector() const;

    /// Read-only access to the segment ids
    const std::vector<int>& segments() const { return segments_; }

    /// Read-only access to the IOB tags
    const std::vector<IOB::Enum>& iobs() const { return iobs_; }

    /// IOB tag at idx, or IOB::O when idx is out of range
    IOB::Enum get_iob_at(int idx);

    /// Set the IOB tag at idx; does nothing when idx is out of range
    void set_iob_at(int idx, IOB::Enum iob);

private:
    /// Segment id per position
    std::vector<int> segments_;

    /// IOB tag per position
    std::vector<IOB::Enum> iobs_;
};
} /* end ns Corpus2 */
#endif // LIBCORPUS2_ANN_CHANNEL_H
#include <libcorpus2/ann/iob.h>
namespace Corpus2 {
/**
 * Convert an IOB tag to its one-letter name.
 *
 * @param iob tag to convert
 * @return "I", "O" or "B"; "?" for any other value
 */
const char* IOB::to_string(IOB::Enum iob)
{
    switch (iob) {
    case IOB::I:
        return "I";
    case IOB::O:
        return "O";
    case IOB::B:
        return "B";
    default:
        return "?";
    }
}
/**
 * Parse a one-letter IOB tag name.
 *
 * @param s string to parse; only the exact strings "I", "O" and "B" are
 *          recognized
 * @return the matching tag, or IOB::PostLast for any other string
 */
IOB::Enum IOB::from_string(const std::string &s)
{
    if (s.size() == 1) {
        switch (s[0]) {
        case 'I':
            return IOB::I;
        case 'O':
            return IOB::O;
        case 'B':
            return IOB::B;
        default:
            break;
        }
    }
    return IOB::PostLast;
}
} /* end ns Corpus2 */
#ifndef LIBCORPUS2_ANN_IOB_H
#define LIBCORPUS2_ANN_IOB_H
#include <string>
namespace Corpus2 {
namespace IOB {
    /**
     * Tags for IOB annotation chunk tagging.
     *
     * PostLast is one past the last real tag; from_string() returns it for
     * input it does not recognize.
     */
    enum Enum {
        O = 0,
        B = 1,
        I = 2,
        PostLast = 3
    };

    /// Name of a tag as an uppercase "I", "O" or "B" string
    const char* to_string(Enum iob);

    /// Tag named by an "I", "O" or "B" string
    Enum from_string(const std::string& s);
}
} /* end ns Corpus2 */
#endif // LIBCORPUS2_ANN_IOB_H
#include <libcorpus2/ann/view.h>
#include <libcorpus2/ann/annotatedsentence.h>
#include <boost/make_shared.hpp>
namespace Corpus2 {
/**
 * Create an empty view bound to a sentence and a channel name.
 *
 * @param original the annotated sentence this view refers to
 * @param ann_name name of the annotation channel this view refers to
 */
AnnotationView::AnnotationView(
    const boost::shared_ptr<AnnotatedSentence> &original,
    const std::string &ann_name)
    : Sentence()
    , original_(original)
    , ann_name_(ann_name)
{
}

/// Destructor; nothing beyond member and base class destruction happens here.
AnnotationView::~AnnotationView()
{
}
/**
 * Create a copy of this view held by a shared pointer.
 *
 * The copy refers to the same original sentence and channel name; the
 * view's own tokens are duplicated through Token::clone().
 *
 * @return the new AnnotationView as a Sentence::Ptr
 */
Sentence::Ptr AnnotationView::clone_shared() const
{
    boost::shared_ptr<AnnotationView> duplicate =
        boost::make_shared<AnnotationView>(original_, ann_name_);
    typedef std::vector<Token*>::const_iterator token_iter;
    for (token_iter it = tokens_.begin(); it != tokens_.end(); ++it) {
        const Token* tok = *it;
        duplicate->append(tok->clone());
    }
    return duplicate;
}
void AnnotationView::commit()
{
commit_to(original_, ann_name_);
}
/**
 * Commit this view to a channel of the given sentence.
 *
 * Only the existence check is present so far; the rest of the body is
 * still a placeholder.
 *
 * @param original sentence to commit to
 * @param ann_name name of the channel to commit to
 * @throws MissingAnnotationChannel when original has no channel ann_name
 */
void AnnotationView::commit_to(
    const boost::shared_ptr<AnnotatedSentence> &original,
    const std::string &ann_name)
{
    const bool channel_present = original->has_channel(ann_name);
    if (!channel_present) {
        throw MissingAnnotationChannel(ann_name);
    }
    // ...
}
} /* end ns Corpus2 */
#ifndef LIBCORPUS2_ANN_VIEW_H
#define LIBCORPUS2_ANN_VIEW_H
#include <libcorpus2/sentence.h>
namespace Corpus2 {
class AnnotatedSentence;
/**
 * A Sentence built from one annotation channel of an AnnotatedSentence.
 *
 * The view remembers the sentence and channel name it was constructed with
 * so that commit() can pass them on to commit_to().
 */
class AnnotationView : public Sentence
{
public:
    /**
     * Create an empty view.
     *
     * @param original the annotated sentence this view refers to
     * @param ann_name name of the annotation channel this view refers to
     */
    AnnotationView(const boost::shared_ptr<AnnotatedSentence>& original,
        const std::string& ann_name);

    ~AnnotationView();

    /// Create a copy of this view bound to the same sentence and channel.
    Ptr clone_shared() const;

    /// Commit to the sentence and channel given at construction.
    void commit();

    /**
     * Commit to an explicitly given sentence and channel.
     * @throws MissingAnnotationChannel when original lacks channel ann_name
     */
    void commit_to(const boost::shared_ptr<AnnotatedSentence>& original,
        const std::string& ann_name);

private:
    /// The sentence this view was created for
    boost::shared_ptr<AnnotatedSentence> original_;

    /// Name of the annotation channel this view was created for
    const std::string ann_name_;
};
} /* end ns Corpus2 */
#endif // LIBCORPUS2_ANN_VIEW_H
...@@ -19,6 +19,11 @@ or FITNESS FOR A PARTICULAR PURPOSE. ...@@ -19,6 +19,11 @@ or FITNESS FOR A PARTICULAR PURPOSE.
namespace Corpus2 { namespace Corpus2 {
/// Create a sentence that contains no tokens.
Sentence::Sentence()
    : tokens_()
{
}
Sentence::~Sentence() Sentence::~Sentence()
{ {
foreach (const Token* t, tokens_) { foreach (const Token* t, tokens_) {
......
...@@ -30,15 +30,12 @@ public: ...@@ -30,15 +30,12 @@ public:
typedef boost::shared_ptr<const Sentence> ConstPtr; typedef boost::shared_ptr<const Sentence> ConstPtr;
/// Empty constructor /// Empty constructor
Sentence() Sentence();
: tokens_()
{
}
Ptr clone_shared() const; virtual Ptr clone_shared() const;
/// Destructor /// Destructor
~Sentence(); virtual ~Sentence();
void release_tokens(); void release_tokens();
...@@ -82,7 +79,7 @@ public: ...@@ -82,7 +79,7 @@ public:
return tokens_[0]; return tokens_[0];
} }
private: protected:
/// The tokens this sentence contains and owns /// The tokens this sentence contains and owns
std::vector<Token*> tokens_; std::vector<Token*> tokens_;
}; };
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment