From 631c5f872339b7553562c44d472e09b5c2721ad0 Mon Sep 17 00:00:00 2001
From: Adam Wardynski <award@.(B-4.4.46a)>
Date: Thu, 9 Dec 2010 14:53:04 +0100
Subject: [PATCH] matching_categories(Tag) method for TSet.

---
 libwccl/values/tset.cpp |  7 +++++++
 libwccl/values/tset.h   | 10 ++++++++++
 tests/values.cpp        | 21 +++++++++++++++++++++
 3 files changed, 38 insertions(+)

diff --git a/libwccl/values/tset.cpp b/libwccl/values/tset.cpp
index 79971c9..499fb5a 100644
--- a/libwccl/values/tset.cpp
+++ b/libwccl/values/tset.cpp
@@ -1,5 +1,6 @@
 #include <libwccl/values/tset.h>
 #include <libpwrutils/foreach.h>
+#include <libpwrutils/bitset.h>
 #include <sstream>
 
 namespace Wccl {
@@ -34,6 +35,12 @@ int TSet::categories_count(const Corpus2::Tagset& tagset) const
 	return cats;
 }
 
+int TSet::matching_categories(const Corpus2::Tag& tag) const
+{
+	const Corpus2::Tag& masked = tag_.get_masked(tag);
+	return PwrNlp::count_bits_set(masked.get_pos()) + PwrNlp::count_bits_set(masked.get_values());
+}
+
 void TSet::insert_symbol(const Corpus2::Tagset& tagset, const std::string& s)
 {
 	tag_.combine_with(tagset.parse_symbol(s));
diff --git a/libwccl/values/tset.h b/libwccl/values/tset.h
index e0aa99f..11f2279 100644
--- a/libwccl/values/tset.h
+++ b/libwccl/values/tset.h
@@ -89,6 +89,16 @@ public:
 	 */
 	int categories_count(const Corpus2::Tagset& tagset) const;
 
+	/**
+	 * @return Number of categories of the supplied tag (part of speech
+	 *         included) that match this symbol set.
+	 * @warning Assumes the supplied tag has at most one value per category;
+	 *          otherwise the returned count will be incorrect.
+	 * @note The symbol set may have partially defined categories. Only values
+	 *       present in this symbol set count when matching values in the tag.
+	 */
+	int matching_categories(const Corpus2::Tag& tag) const;
+
 	void combine_with(const Corpus2::Tag& other) {
 		tag_.combine_with(other);
 	}
diff --git a/tests/values.cpp b/tests/values.cpp
index 6bcaa81..f15247b 100644
--- a/tests/values.cpp
+++ b/tests/values.cpp
@@ -68,40 +68,61 @@ BOOST_AUTO_TEST_CASE(tset_ops)
 {
 	TSet s1, s2;
 	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");
+	Corpus2::Tag subst_tag = tagset.parse_tag("subst:sg:nom:f", false)[0];
+	Corpus2::Tag adj_tag = tagset.parse_tag("adj:pl:acc:m3:pos", false)[0];
+
 	BOOST_CHECK(s1.equals(s2));
 	BOOST_CHECK(s1.is_subset_of(s2));
 	BOOST_CHECK(s2.is_subset_of(s1));
 	BOOST_CHECK(!s1.intersects(s2));
 	BOOST_CHECK_EQUAL(0, s1.categories_count(tagset));
+	BOOST_CHECK_EQUAL(0, s1.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(0, s1.matching_categories(adj_tag));
 	s1.insert_symbol(tagset, "subst");
 	BOOST_CHECK_EQUAL(1, s1.categories_count(tagset));
+	BOOST_CHECK_EQUAL(1, s1.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(0, s1.matching_categories(adj_tag));
 	BOOST_CHECK(!s1.equals(s2));
 	BOOST_CHECK(!s1.is_subset_of(s2));
 	BOOST_CHECK(s2.is_subset_of(s1));
 	BOOST_CHECK(!s1.intersects(s2));
 	s2.insert_symbol(tagset, "pl");
 	BOOST_CHECK_EQUAL(1, s2.categories_count(tagset));
+	BOOST_CHECK_EQUAL(0, s2.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(1, s2.matching_categories(adj_tag));
 	BOOST_CHECK(!s1.equals(s2));
 	BOOST_CHECK(!s1.is_subset_of(s2));
 	BOOST_CHECK(!s2.is_subset_of(s1));
 	BOOST_CHECK(!s1.intersects(s2));
 	s2.insert_symbol(tagset, "subst");
 	BOOST_CHECK_EQUAL(2, s2.categories_count(tagset));
+	BOOST_CHECK_EQUAL(1, s2.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(1, s2.matching_categories(adj_tag));
 	BOOST_CHECK(!s1.equals(s2));
 	BOOST_CHECK(s1.is_subset_of(s2));
 	BOOST_CHECK(!s2.is_subset_of(s1));
 	BOOST_CHECK(s1.intersects(s2));
 	s1.insert_symbol(tagset, "pl");
+	BOOST_CHECK_EQUAL(2, s1.categories_count(tagset));
+	BOOST_CHECK_EQUAL(1, s1.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(1, s1.matching_categories(adj_tag));
 	BOOST_CHECK(s1.equals(s2));
 	BOOST_CHECK(s1.is_subset_of(s2));
 	BOOST_CHECK(s2.is_subset_of(s1));
 	BOOST_CHECK(s1.intersects(s2));
 	s1.insert_symbol(tagset, "sg");
 	BOOST_CHECK_EQUAL(2, s1.categories_count(tagset));
+	BOOST_CHECK_EQUAL(2, s1.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(1, s1.matching_categories(adj_tag));
 	s1.insert_symbol(tagset, "f");
 	BOOST_CHECK_EQUAL(3, s1.categories_count(tagset));
+	BOOST_CHECK_EQUAL(3, s1.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(1, s1.matching_categories(adj_tag));
 	s1.insert_symbol(tagset, "adj");
 	BOOST_CHECK_EQUAL(3, s1.categories_count(tagset));
+	BOOST_CHECK_EQUAL(3, s1.matching_categories(subst_tag)); // "adj" adds no match for a subst tag
+	BOOST_CHECK_EQUAL(2, s1.matching_categories(adj_tag));
+
 }
 
 BOOST_AUTO_TEST_CASE(position_ops)
-- 
GitLab