From c9a46af06061e9da82fb203ec02336b549da05c6 Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Thu, 4 Nov 2010 17:32:01 +0100
Subject: [PATCH] Add Tagset::parse_symbol and tag_to_symbol_string and related
 functions. Bumps version of corpus2.

---
 libcorpus2/CMakeLists.txt |  2 +-
 libcorpus2/tagset.cpp     | 44 +++++++++++++++++++++++++++++++++++++++
 libcorpus2/tagset.h       | 35 +++++++++++++++++++++++++++++++
 tests/tag_split.cpp       | 20 ++++++++++++++++++
 4 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index e6873f9..ae39d3b 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -3,7 +3,7 @@ PROJECT(corpus2)
 
 set(corpus2_ver_major "0")
 set(corpus2_ver_minor "1")
-set(corpus2_ver_patch "0")
+set(corpus2_ver_patch "1")
 
 
 if(NOT LIBCORPUS2_SRC_DATA_DIR)
diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp
index 538d2c2..8fbb296 100644
--- a/libcorpus2/tagset.cpp
+++ b/libcorpus2/tagset.cpp
@@ -95,6 +95,20 @@ std::string Tagset::id_string(const Tag& tag) const
 	return ss.str();
 }
 
+Tag Tagset::parse_symbol(const std::string& s) const
+{
+	mask_t m = get_pos_mask(s); // first try the symbol as a POS name
+	if (m.any()) { // was none(): that discarded every POS match
+		return Tag(m);
+	}
+	m = get_attribute_mask(s); // then as an attribute name (all its values)
+	if (m.any()) {
+		return Tag(0, m);
+	}
+	m = get_value_mask(s); // finally as a value name; no match leaves m empty
+	return Tag(0, m);
+}
+
 void Tagset::parse_tag(const string_range &s, bool allow_extra,
 		boost::function<void(const Tag &)> sink) const
 {
@@ -369,6 +383,36 @@ std::string Tagset::tag_to_no_opt_string(const Tag &tag) const
 	return ss.str();
 }
 
+std::vector<std::string> Tagset::tag_to_symbol_string_vector(const Tag& tag,
+		bool compress_attributes) const
+{
+	std::vector<std::string> symbols;
+	foreach (mask_t pos_bit, PwrNlp::set_bits(tag.get_pos())) {
+		symbols.push_back(get_pos_name(pos_bit));
+	}
+	mask_t remaining = tag.get_values();
+	for (idx_t a = 0; compress_attributes && a < attribute_count(); ++a) {
+		const mask_t whole_attr = get_attribute_mask(a);
+		if ((remaining & whole_attr) != whole_attr) {
+			continue; // attribute only partially set: keep its value names
+		}
+		// Fully set attribute: emit its name and drop its bits from the rest.
+		symbols.push_back(get_attribute_name(a));
+		remaining ^= whole_attr;
+	}
+	foreach (mask_t value_bit, PwrNlp::set_bits(remaining)) {
+		symbols.push_back(get_value_name(value_bit));
+	}
+	return symbols;
+}
+
+std::string Tagset::tag_to_symbol_string(const Tag& tag,
+		bool compress_attributes) const
+{
+	using boost::algorithm::join; // symbol names are glued with a bare comma
+	return join(tag_to_symbol_string_vector(tag, compress_attributes), ",");
+}
+
 size_t Tagset::tag_size(const Tag& tag) const
 {
 	size_t s = PwrNlp::count_bits_set(tag.get_pos());
diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h
index 44cfbf2..572e5b4 100644
--- a/libcorpus2/tagset.h
+++ b/libcorpus2/tagset.h
@@ -104,6 +104,19 @@ public:
 	 */
 	static Tagset from_data(const char*);
 
+	/**
+	 * Parse a single tagset symbol and return the corresponding (partial) tag.
+	 *
+	 * Pos and value names result in a single-bit-set tag, attribute names
+	 * result in a tag with all values from that attribute set.
+	 *
+	 * The resulting tags will usually be invalid as standalone tags, so
+	 * there is no validation performed.
+	 *
+	 * An invalid string will result in a null tag being returned.
+	 */
+	Tag parse_symbol(const std::string& s) const;
+
 	/**
 	 * Tag parsing -- functional version, whole tag string.
 	 *
@@ -250,6 +263,28 @@ public:
 	 */
 	std::string tag_to_no_opt_string(const Tag &tag) const;
 
+	/**
+	 * Create and return a string representation of the symbols contained
+	 * within a tag when treated as separate tagset symbols.
+	 *
+	 * There will be one string for each POS set in the tag, and enough symbols
+	 * to cover all the values. If compress_attributes is false, there will be
+	 * one value name per value set. If compress_attributes is true, whenever an
+	 * attribute has all of its values set in the tag, the name of the attribute
+	 * will be used instead of separate names of the attribute's values.
+	 */
+	std::vector<std::string> tag_to_symbol_string_vector(const Tag& tag,
+			bool compress_attributes = true) const;
+
+	/**
+	 * Return a comma-separated string representation of all symbols contained
+	 * within a tag; compress_attributes is passed on unchanged.
+	 *
+	 * @see tag_to_symbol_string_vector.
+	 */
+	std::string tag_to_symbol_string(const Tag& tag,
+			bool compress_attributes = true) const;
+
 	/**
 	 * Compute the number of singular tags that can be represented by the given
 	 * tag, with the following restrictions:
diff --git a/tests/tag_split.cpp b/tests/tag_split.cpp
index 7197653..b666049 100644
--- a/tests/tag_split.cpp
+++ b/tests/tag_split.cpp
@@ -196,6 +196,26 @@ BOOST_FIXTURE_TEST_CASE( tag_size, F )
 	BOOST_CHECK(tt == t);
 }
 
+BOOST_FIXTURE_TEST_CASE( s, F )
+{
+	// Each single tag renders as its symbols joined by commas.
+	Corpus2::Tag first = tagset->parse_simple_tag("some:tag:data", false);
+	Corpus2::Tag second = tagset->parse_simple_tag("same:tog:data", false);
+	BOOST_CHECK_EQUAL(tagset->tag_to_symbol_string(first), "some,tag,data");
+	BOOST_CHECK_EQUAL(tagset->tag_to_symbol_string(second), "same,tog,data");
+
+	// The combined tag is compared order-insensitively: both sides are sorted.
+	Corpus2::Tag both = first.get_combined(second);
+	std::vector<std::string> actual = tagset->tag_to_symbol_string_vector(both);
+	std::sort(actual.begin(), actual.end());
+	const char* const names[] = { "some", "same", "tog", "tag", "data" };
+	std::vector<std::string> expected(names, names + sizeof(names) / sizeof(*names));
+	std::sort(expected.begin(), expected.end());
+	BOOST_CHECK_EQUAL_COLLECTIONS(
+			actual.begin(), actual.end(),
+			expected.begin(), expected.end());
+}
+
 BOOST_AUTO_TEST_SUITE_END()
 
 BOOST_AUTO_TEST_CASE(bs_split)
-- 
GitLab