From e73283fa901da50b747ff806675c23bf2fa6ce53 Mon Sep 17 00:00:00 2001
From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl>
Date: Mon, 18 Jul 2011 16:24:02 +0200
Subject: [PATCH] compact_string repr for Value

---
 libwccl/values/strset.cpp | 52 +++++++++++++++++++++++++++++++++++++++
 libwccl/values/strset.h   |  6 +++++
 libwccl/values/tset.cpp   | 14 +++++++++++
 libwccl/values/tset.h     |  3 +++
 libwccl/values/value.h    | 16 ++++++++++++
 swig/value.i              |  1 +
 6 files changed, 92 insertions(+)

diff --git a/libwccl/values/strset.cpp b/libwccl/values/strset.cpp
index 9c869ef..5eebd40 100644
--- a/libwccl/values/strset.cpp
+++ b/libwccl/values/strset.cpp
@@ -48,6 +48,58 @@ UnicodeString StrSet::to_raw_string_u() const
 	return u;
 }
 
+std::string StrSet::to_compact_string(const Corpus2::Tagset& /* tagset */)
+		const
+{
+	// The empty set is rendered as a lone hyphen.
+	if (set_.empty()) {
+		return "-";
+	}
+	// Every item goes through PwrNlp::to_utf8, has its hyphens, dots, spaces
+	// and tabs replaced by backslash-u escape text, and is wrapped in double
+	// quotes; consecutive items are joined with a single hyphen.
+	std::stringstream out;
+	const char* separator = "";
+	for (value_type::const_iterator pos = set_.begin();
+			pos != set_.end(); ++pos) {
+		std::string text = PwrNlp::to_utf8(*pos);
+		boost::algorithm::replace_all(text, "-", "\\u002d");
+		boost::algorithm::replace_all(text, ".", "\\u002e");
+		boost::algorithm::replace_all(text, " ", "\\u0020");
+		boost::algorithm::replace_all(text, "\t", "\\u0009");
+		out << separator << '\"' << text << '\"';
+		separator = "-";
+	}
+	return out.str();
+}
+
+UnicodeString StrSet::to_compact_string_u(const Corpus2::Tagset& /* tagset */)
+		const
+{
+	// UnicodeString twin of to_compact_string(): "-" for an empty set,
+	// otherwise each item escaped, double-quoted and joined with '-'.
+	// NOTE(review): ICU documents UNICODE_STRING for invariant characters
+	// only, and backslash is not in that set -- confirm for target platforms.
+	UnicodeString result;
+	if (set_.empty()) {
+		result.append(UNICODE_STRING("-", 1));
+		return result;
+	}
+	for (value_type::const_iterator pos = set_.begin();
+			pos != set_.end(); ++pos) {
+		if (pos != set_.begin()) {
+			result.append(UNICODE_STRING("-", 1));
+		}
+		UnicodeString text = *pos;
+		text.findAndReplace(UNICODE_STRING("-", 1), UNICODE_STRING("\\u002d", 6));
+		text.findAndReplace(UNICODE_STRING(".", 1), UNICODE_STRING("\\u002e", 6));
+		text.findAndReplace(UNICODE_STRING(" ", 1), UNICODE_STRING("\\u0020", 6));
+		text.findAndReplace(UNICODE_STRING("\t", 1), UNICODE_STRING("\\u0009", 6));
+		result.append(UNICODE_STRING("\"", 1)).append(text).append(UNICODE_STRING("\"", 1));
+	}
+	return result;
+}
+
 bool StrSet::intersects(const StrSet &other) const {
 	if (empty() || other.empty()) {
 		return false;
diff --git a/libwccl/values/strset.h b/libwccl/values/strset.h
index e1abb89..0de7a5d 100644
--- a/libwccl/values/strset.h
+++ b/libwccl/values/strset.h
@@ -99,6 +99,12 @@ public:
 	/// Value override
 	UnicodeString to_raw_string_u() const;
 
+	/// Value override; items quoted, escaped and '-'-joined, "-" if empty
+	std::string to_compact_string(const Corpus2::Tagset& tagset) const;
+
+	/// Value override; UnicodeString counterpart of to_compact_string()
+	UnicodeString to_compact_string_u(const Corpus2::Tagset& tagset) const;
+
 private:
 	value_type set_;
 };
diff --git a/libwccl/values/tset.cpp b/libwccl/values/tset.cpp
index 499fb5a..27760fb 100644
--- a/libwccl/values/tset.cpp
+++ b/libwccl/values/tset.cpp
@@ -1,6 +1,8 @@
 #include <libwccl/values/tset.h>
 #include <libpwrutils/foreach.h>
 #include <libpwrutils/bitset.h>
+
+#include <boost/algorithm/string.hpp>
 #include <sstream>
 
 namespace Wccl {
@@ -46,4 +48,16 @@ void TSet::insert_symbol(const Corpus2::Tagset& tagset, const std::string& s)
 	tag_.combine_with(tagset.parse_symbol(s));
 }
 
+std::string TSet::to_compact_string(const Corpus2::Tagset& tagset)
+		const
+{
+	// Null tag yields "-"; otherwise commas of the symbol string become '-'.
+	std::string compact("-");
+	if (!tag_.is_null()) {
+		compact = tagset.tag_to_symbol_string(tag_);
+		boost::algorithm::replace_all(compact, ",", "-");
+	}
+	return compact;
+}
+
 } /* end ns Wccl */
diff --git a/libwccl/values/tset.h b/libwccl/values/tset.h
index 73ed1f3..41ff34f 100644
--- a/libwccl/values/tset.h
+++ b/libwccl/values/tset.h
@@ -115,6 +115,9 @@ public:
 
 	std::string to_raw_string() const;
 
+	/// Value override; symbol string with ',' turned into '-', "-" if null
+	std::string to_compact_string(const Corpus2::Tagset& tagset) const;
+
 private:
 	Corpus2::Tag tag_;
 };
diff --git a/libwccl/values/value.h b/libwccl/values/value.h
index abf068d..702175c 100644
--- a/libwccl/values/value.h
+++ b/libwccl/values/value.h
@@ -71,6 +71,22 @@ public:
 		return UnicodeString::fromUTF8(to_raw_string());
 	}
 
+	/**
+	  * Compact string representation, intended for terse output where some
+	  * ambiguity is acceptable: sets are written as hyphen-separated items
+	  * (sorted) with no brackets, so the value's type cannot be inferred
+	  * unambiguously from the result. This base version just hands back the
+	  * raw string and ignores the tagset.
+	  */
+	virtual std::string to_compact_string(const Corpus2::Tagset& /* tagset */) const {
+		return to_raw_string();
+	}
+
+	virtual UnicodeString to_compact_string_u(const Corpus2::Tagset& tagset) const {
+		const std::string utf8 = to_compact_string(tagset); // std::string form first
+		return UnicodeString::fromUTF8(utf8);
+	}
+
 protected:
 	Value() {}
 };
diff --git a/swig/value.i b/swig/value.i
index 30f8eef..671c3c9 100644
--- a/swig/value.i
+++ b/swig/value.i
@@ -29,6 +29,7 @@ namespace Wccl {
 
     virtual std::string to_string(const Corpus2::Tagset& /*tagset*/) const;
     virtual std::string to_raw_string() const = 0;
+    virtual std::string to_compact_string(const Corpus2::Tagset& /*tagset*/) const;
   };
 
 }
-- 
GitLab