diff --git a/libwccl/values/strset.cpp b/libwccl/values/strset.cpp
index 9c869ef90978e00a0ee6c3c6455473f3e7fe3510..5eebd4042e5185f466b091e03739f998f5504b86 100644
--- a/libwccl/values/strset.cpp
+++ b/libwccl/values/strset.cpp
@@ -48,6 +48,70 @@ UnicodeString StrSet::to_raw_string_u() const
 	return u;
 }
 
+/**
+ * UTF-8 flavour of the compact representation: "-" for an empty set,
+ * otherwise the quoted, escaped items joined with hyphens.
+ *
+ * Produced by converting the result of to_compact_string_u(), so the
+ * escaping rules live in a single place and the two flavours cannot drift
+ * apart.
+ */
+std::string StrSet::to_compact_string(const Corpus2::Tagset& tagset)
+	const
+{
+	return PwrNlp::to_utf8(to_compact_string_u(tagset));
+}
+
+/**
+ * Compact representation as a UnicodeString. An empty set yields "-";
+ * otherwise every item is wrapped in double quotes and the items are joined
+ * with hyphens, in the iteration order of the underlying set.
+ *
+ * Inside an item, hyphen, full stop, space and tab are replaced with a
+ * backslash followed by u002d, u002e, u0020 and u0009 respectively.
+ * No other character (including double quote and backslash) is escaped.
+ */
+UnicodeString StrSet::to_compact_string_u(const Corpus2::Tagset& /* tagset */)
+	const
+{
+	// Constant pieces are built once per call instead of once per item.
+	const UnicodeString hyphen = UNICODE_STRING("-", 1);
+	if (set_.empty()) {
+		return hyphen;
+	}
+
+	const UnicodeString quote = UNICODE_STRING("\"", 1);
+	// plain[i] is rewritten to escaped[i]; rules are applied in this order.
+	const UnicodeString plain[] = {
+		hyphen,
+		UNICODE_STRING(".", 1),
+		UNICODE_STRING(" ", 1),
+		UNICODE_STRING("\t", 1)
+	};
+	const UnicodeString escaped[] = {
+		UNICODE_STRING("\\u002d", 6),
+		UNICODE_STRING("\\u002e", 6),
+		UNICODE_STRING("\\u0020", 6),
+		UNICODE_STRING("\\u0009", 6)
+	};
+	const unsigned n_rules = sizeof(plain) / sizeof(plain[0]);
+
+	UnicodeString out;
+	for (value_type::const_iterator it = set_.begin(); it != set_.end(); ++it) {
+		if (it != set_.begin()) {
+			out.append(hyphen);
+		}
+		UnicodeString item = *it;
+		for (unsigned i = 0; i < n_rules; ++i) {
+			item.findAndReplace(plain[i], escaped[i]);
+		}
+		out.append(quote);
+		out.append(item);
+		out.append(quote);
+	}
+	return out;
+}
+
 bool StrSet::intersects(const StrSet &other) const {
 	if (empty() || other.empty()) {
 		return false;
diff --git a/libwccl/values/strset.h b/libwccl/values/strset.h
index e1abb89004ea866bc38efb93956d8788f370ca63..0de7a5de17adc7aa12cb33b9f429cca6499e02ee 100644
--- a/libwccl/values/strset.h
+++ 
b/libwccl/values/strset.h
@@ -99,6 +99,12 @@ public:
 	/// Value override
 	UnicodeString to_raw_string_u() const;
 
+	/// Value override: "-" if empty, else quoted escaped items joined by '-'
+	std::string to_compact_string(const Corpus2::Tagset& tagset) const;
+
+	/// Value override: UnicodeString counterpart of to_compact_string()
+	UnicodeString to_compact_string_u(const Corpus2::Tagset& tagset) const;
+
 private:
 	value_type set_;
 };
diff --git a/libwccl/values/tset.cpp b/libwccl/values/tset.cpp
index 499fb5ae9148f2b248045bcc0549f15b291271b5..27760fbf1dae7daa0d0d91beb65fcb1d855d2bec 100644
--- a/libwccl/values/tset.cpp
+++ b/libwccl/values/tset.cpp
@@ -1,6 +1,8 @@
 #include <libwccl/values/tset.h>
 #include <libpwrutils/foreach.h>
 #include <libpwrutils/bitset.h>
+
+#include <boost/algorithm/string.hpp>
 #include <sstream>
 
 namespace Wccl {
@@ -46,4 +48,19 @@ void TSet::insert_symbol(const Corpus2::Tagset& tagset, const std::string& s)
 	tag_.combine_with(tagset.parse_symbol(s));
 }
 
+/**
+ * Compact rendering of the tag: "-" when the tag is null, otherwise the
+ * symbol string produced by the tagset with every comma replaced by a
+ * hyphen.
+ */
+std::string TSet::to_compact_string(const Corpus2::Tagset& tagset) const
+{
+	if (!tag_.is_null()) {
+		std::string symbols = tagset.tag_to_symbol_string(tag_);
+		boost::algorithm::replace_all(symbols, ",", "-");
+		return symbols;
+	}
+	return "-";
+}
+
 } /* end ns Wccl */
diff --git a/libwccl/values/tset.h b/libwccl/values/tset.h
index 73ed1f3326d73633982f2a446b6709d70dd461f1..41ff34f88568a50fb370235cf408061bd79d2c8c 100644
--- a/libwccl/values/tset.h
+++ b/libwccl/values/tset.h
@@ -115,6 +115,9 @@ public:
 
 	std::string to_raw_string() const;
 
+	/// Value override: "-" for a null tag, else symbol string with ',' -> '-'
+	std::string to_compact_string(const Corpus2::Tagset& tagset) const;
+
 private:
 	Corpus2::Tag tag_;
 };
diff --git a/libwccl/values/value.h b/libwccl/values/value.h
index abf068d4144218bf0f131cd25222beda4da8d018..702175c757e675f0cbcb9530d0e5e563b6080f15 100644
--- a/libwccl/values/value.h
+++ b/libwccl/values/value.h
@@ -71,6 +71,22 @@ public:
 		return UnicodeString::fromUTF8(to_raw_string());
 	}
 
+	/**
+	 * Compact string representation: sets are represented as hyphen-separated
+	 * strings (sorted) with no brackets. 
The representation is suitable for
+	 * generating compact output where some degree of ambiguity is allowed
+	 * (note that type can't be unambiguously inferred from such strings).
+	 * Unless a subclass overrides it, the result equals to_raw_string().
+	 */
+	virtual std::string to_compact_string(const Corpus2::Tagset&) const {
+		return to_raw_string();
+	}
+
+	/// UnicodeString variant: to_compact_string() converted with fromUTF8.
+	virtual UnicodeString to_compact_string_u(const Corpus2::Tagset& tagset) const {
+		return UnicodeString::fromUTF8(to_compact_string(tagset));
+	}
+
 protected:
 	Value() {}
 };
diff --git a/swig/value.i b/swig/value.i
index 30f8eefb203b114d7905733953ae326f53a33887..671c3c981ff1da336aa3b19064e6ab0cadb67f11 100644
--- a/swig/value.i
+++ b/swig/value.i
@@ -29,6 +29,7 @@ namespace Wccl {
 
     virtual std::string to_string(const Corpus2::Tagset& /*tagset*/) const;
     virtual std::string to_raw_string() const = 0;
+    virtual std::string to_compact_string(const Corpus2::Tagset& /*tagset*/) const;
   };
 }
 