From 51ab9ed1ab3c2a04bb031f70d5ff04965f738a8a Mon Sep 17 00:00:00 2001 From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl> Date: Mon, 14 Nov 2011 13:42:15 +0100 Subject: [PATCH] support token metadata in ccl rdr & writer --- libcorpus2/io/cclreader.cpp | 28 +++++++++++++++++++++++++++- libcorpus2/io/cclwriter.cpp | 10 ++++++++++ libcorpus2/token.h | 3 ++- 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/libcorpus2/io/cclreader.cpp b/libcorpus2/io/cclreader.cpp index 4559ed8..49fcc03 100644 --- a/libcorpus2/io/cclreader.cpp +++ b/libcorpus2/io/cclreader.cpp @@ -20,6 +20,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <libxml++/libxml++.h> #include <libxml2/libxml/parser.h> #include <boost/make_shared.hpp> +#include <boost/algorithm/string.hpp> #include <libcorpus2/ann/annotatedsentence.h> #include <cstdlib> #include <fstream> @@ -53,12 +54,15 @@ protected: void finish_token(); static const int STATE_ANN = 901; - static const int STATE_REL = 902; + static const int STATE_REL = 902; // currently unused + static const int STATE_PROP = 910; boost::shared_ptr<AnnotatedSentence> ann_sent_; std::string ann_chan_; + std::string prop_key_; + bool ann_head_; typedef std::map<std::string, int> token_ann_t; @@ -171,6 +175,17 @@ bool CclReaderImpl::process_start_element(const Glib::ustring & name, throw XcesError("<ann> with no channel name"); } return true; + } else if (state_ == STATE_TOK && name == "prop") { + state_ = STATE_PROP; + grab_characters_ = true; + clear_buf(); + prop_key_ = ""; + foreach (const Attribute& a, attributes) { + if (a.name == "key") { + prop_key_ = a.value; + } + } + return true; } else { return false; } @@ -191,6 +206,17 @@ bool CclReaderImpl::process_end_element(const Glib::ustring & name) token_ann_heads_.insert(ann_chan_); } } + state_ = STATE_TOK; + return true; + } else if (state_ == STATE_PROP && name == "prop") { + std::string prop_val = get_buf(); + boost::algorithm::trim(prop_val); + grab_characters_ = false; + if 
(!tok_->get_metadata()) { + tok_->create_metadata(); + } + tok_->get_metadata()->set_attribute(prop_key_, prop_val); + state_ = STATE_TOK; return true; } else { diff --git a/libcorpus2/io/cclwriter.cpp b/libcorpus2/io/cclwriter.cpp index 6370cf9..221d6a3 100644 --- a/libcorpus2/io/cclwriter.cpp +++ b/libcorpus2/io/cclwriter.cpp @@ -2,6 +2,7 @@ #include <libpwrutils/foreach.h> #include <libcorpus2/ann/annotatedsentence.h> #include <libcorpus2/io/xcescommon.h> +#include <libcorpus2/tokenmetadata.h> namespace Corpus2 { @@ -49,9 +50,18 @@ void CclWriter::write_sentence_int(const Sentence &s) os() << v.second.get_segment_at(idx); os() << "</ann>\n"; } + TokenMetaData* md = t->get_metadata(); + if (md) { + foreach (const TokenMetaData::attr_map_t::value_type& v, md->attributes()) { + osi() << "<prop key=\"" << v.first << "\"" << ">"; + os() << v.second << "</prop>\n"; + } + } if (use_indent_) indent_less(); osi() << "</tok>\n"; } else { + // TODO: currently writing of token metadata is supported only when + // we've got an AnnotatedSentence. XmlWriter::write_token(*t); } } diff --git a/libcorpus2/token.h b/libcorpus2/token.h index 4da41e3..d183450 100644 --- a/libcorpus2/token.h +++ b/libcorpus2/token.h @@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <libcorpus2/lexeme.h> #include <libcorpus2/tagset.h> +#include <libcorpus2/tokenmetadata.h> #include <libpwrutils/util.h> #include <libpwrutils/whitespace.h> @@ -35,7 +36,7 @@ namespace Corpus2 { /// Forward declaration of optional Token metadata class -class TokenMetaData; +//class TokenMetaData; /** * A single token with morphological analyses. -- GitLab