From 51ab9ed1ab3c2a04bb031f70d5ff04965f738a8a Mon Sep 17 00:00:00 2001
From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl>
Date: Mon, 14 Nov 2011 13:42:15 +0100
Subject: [PATCH] support token metadata in ccl reader & writer

---
 libcorpus2/io/cclreader.cpp | 28 +++++++++++++++++++++++++++-
 libcorpus2/io/cclwriter.cpp | 10 ++++++++++
 libcorpus2/token.h          |  3 ++-
 3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/libcorpus2/io/cclreader.cpp b/libcorpus2/io/cclreader.cpp
index 4559ed8..49fcc03 100644
--- a/libcorpus2/io/cclreader.cpp
+++ b/libcorpus2/io/cclreader.cpp
@@ -20,6 +20,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 #include <libxml++/libxml++.h>
 #include <libxml2/libxml/parser.h>
 #include <boost/make_shared.hpp>
+#include <boost/algorithm/string.hpp>
 #include <libcorpus2/ann/annotatedsentence.h>
 #include <cstdlib>
 #include <fstream>
@@ -53,12 +54,15 @@ protected:
 	void finish_token();
 
 	static const int STATE_ANN = 901;
-	static const int STATE_REL = 902;
+	static const int STATE_REL = 902; // currently unused
+	static const int STATE_PROP = 910;
 
 	boost::shared_ptr<AnnotatedSentence> ann_sent_;
 
 	std::string ann_chan_;
 
+	std::string prop_key_;
+
 	bool ann_head_;
 
 	typedef std::map<std::string, int> token_ann_t;
@@ -171,6 +175,17 @@ bool CclReaderImpl::process_start_element(const Glib::ustring & name,
 			throw XcesError("<ann> with no channel name");
 		}
 		return true;
+	} else if (state_ == STATE_TOK && name == "prop") {
+		state_ = STATE_PROP;
+		grab_characters_ = true;
+		clear_buf();
+		prop_key_ = "";
+		foreach (const Attribute& a, attributes) {
+			if (a.name == "key") {
+				prop_key_ = a.value;
+			}
+		}
+		return true;
 	} else {
 		return false;
 	}
@@ -191,6 +206,17 @@ bool CclReaderImpl::process_end_element(const Glib::ustring & name)
 				token_ann_heads_.insert(ann_chan_);
 			}
 		}
+		state_ = STATE_TOK;
+		return true;
+	} else if (state_ == STATE_PROP && name == "prop") {
+		std::string prop_val = get_buf();
+		boost::algorithm::trim(prop_val);
+		grab_characters_ = false;
+		if (!tok_->get_metadata()) {
+			tok_->create_metadata();
+		}
+		tok_->get_metadata()->set_attribute(prop_key_, prop_val);
+
 		state_ = STATE_TOK;
 		return true;
 	} else {
diff --git a/libcorpus2/io/cclwriter.cpp b/libcorpus2/io/cclwriter.cpp
index 6370cf9..221d6a3 100644
--- a/libcorpus2/io/cclwriter.cpp
+++ b/libcorpus2/io/cclwriter.cpp
@@ -2,6 +2,7 @@
 #include <libpwrutils/foreach.h>
 #include <libcorpus2/ann/annotatedsentence.h>
 #include <libcorpus2/io/xcescommon.h>
+#include <libcorpus2/tokenmetadata.h>
 
 namespace Corpus2 {
 
@@ -49,9 +50,18 @@ void CclWriter::write_sentence_int(const Sentence &s)
 				os() << v.second.get_segment_at(idx);
 				os() << "</ann>\n";
 			}
+			TokenMetaData* md = t->get_metadata();
+			if (md) {
+				foreach (const TokenMetaData::attr_map_t::value_type& v, md->attributes()) {
+					osi() << "<prop key=\"" << v.first << "\"" << ">";
+					os() << v.second << "</prop>\n";
+				}
+			}
 			if (use_indent_) indent_less();
 			osi() << "</tok>\n";
 		} else {
+			// TODO: currently writing of token metadata is supported only when
+			// we've got an AnnotatedSentence.
 			XmlWriter::write_token(*t);
 		}
 	}
diff --git a/libcorpus2/token.h b/libcorpus2/token.h
index 4da41e3..d183450 100644
--- a/libcorpus2/token.h
+++ b/libcorpus2/token.h
@@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 
 #include <libcorpus2/lexeme.h>
 #include <libcorpus2/tagset.h>
+#include <libcorpus2/tokenmetadata.h>
 
 #include <libpwrutils/util.h>
 #include <libpwrutils/whitespace.h>
@@ -35,7 +36,7 @@ namespace Corpus2 {
 
 
 /// Forward declaration of optional Token metadata class
-class TokenMetaData;
+//class TokenMetaData;
 
 /**
  * A single token with morphological analyses.
-- 
GitLab