Skip to content
Snippets Groups Projects
Commit 51ab9ed1 authored by Adam Radziszewski
Browse files

support token metadata in CCL reader & writer

parent a0fa6b8a
Branches
No related merge requests found
......@@ -20,6 +20,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include <libxml++/libxml++.h>
#include <libxml2/libxml/parser.h>
#include <boost/make_shared.hpp>
#include <boost/algorithm/string.hpp>
#include <libcorpus2/ann/annotatedsentence.h>
#include <cstdlib>
#include <fstream>
......@@ -53,12 +54,15 @@ protected:
void finish_token();
static const int STATE_ANN = 901;
static const int STATE_REL = 902;
static const int STATE_REL = 902; // currently unused
static const int STATE_PROP = 910;
boost::shared_ptr<AnnotatedSentence> ann_sent_;
std::string ann_chan_;
std::string prop_key_;
bool ann_head_;
typedef std::map<std::string, int> token_ann_t;
......@@ -171,6 +175,17 @@ bool CclReaderImpl::process_start_element(const Glib::ustring & name,
throw XcesError("<ann> with no channel name");
}
return true;
} else if (state_ == STATE_TOK && name == "prop") {
state_ = STATE_PROP;
grab_characters_ = true;
clear_buf();
prop_key_ = "";
foreach (const Attribute& a, attributes) {
if (a.name == "key") {
prop_key_ = a.value;
}
}
return true;
} else {
return false;
}
......@@ -191,6 +206,17 @@ bool CclReaderImpl::process_end_element(const Glib::ustring & name)
token_ann_heads_.insert(ann_chan_);
}
}
state_ = STATE_TOK;
return true;
} else if (state_ == STATE_PROP && name == "prop") {
std::string prop_val = get_buf();
boost::algorithm::trim(prop_val);
grab_characters_ = false;
if (!tok_->get_metadata()) {
tok_->create_metadata();
}
tok_->get_metadata()->set_attribute(prop_key_, prop_val);
state_ = STATE_TOK;
return true;
} else {
......
......@@ -2,6 +2,7 @@
#include <libpwrutils/foreach.h>
#include <libcorpus2/ann/annotatedsentence.h>
#include <libcorpus2/io/xcescommon.h>
#include <libcorpus2/tokenmetadata.h>
namespace Corpus2 {
......@@ -49,9 +50,18 @@ void CclWriter::write_sentence_int(const Sentence &s)
os() << v.second.get_segment_at(idx);
os() << "</ann>\n";
}
TokenMetaData* md = t->get_metadata();
if (md) {
foreach (const TokenMetaData::attr_map_t::value_type& v, md->attributes()) {
osi() << "<prop key=\"" << v.first << "\"" << ">";
os() << v.second << "</prop>\n";
}
}
if (use_indent_) indent_less();
osi() << "</tok>\n";
} else {
// TODO: currently writing of token metadata is supported only when
// we've got an AnnotatedSentence.
XmlWriter::write_token(*t);
}
}
......
......@@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include <libcorpus2/lexeme.h>
#include <libcorpus2/tagset.h>
#include <libcorpus2/tokenmetadata.h>
#include <libpwrutils/util.h>
#include <libpwrutils/whitespace.h>
......@@ -35,7 +36,7 @@ namespace Corpus2 {
/// Forward declaration of optional Token metadata class
class TokenMetaData;
//class TokenMetaData;
/**
* A single token with morphological analyses.
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment