From a519febc2afa7f86e21b97b3521e3bb9ca11922f Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Wed, 20 Oct 2010 14:37:00 +0200
Subject: [PATCH] move count_bits_set to pwrutils, put lowest_bit alias there,
 implement Tagset::tag_size and Tagset::tag_is_singular

---
 libcorpus2/tag.cpp         | 18 ++++------------
 libcorpus2/tagset.cpp      | 24 +++++++++++++++++++++
 libcorpus2/tagset.h        | 44 ++++++++++++++++++++++++++++++++++++++
 libpwrutils/CMakeLists.txt |  2 +-
 libpwrutils/util.h         | 21 ++++++++++++++++++
 tests/tag_split.cpp        | 26 +++++++++++++++++++++-
 6 files changed, 119 insertions(+), 16 deletions(-)

diff --git a/libcorpus2/tag.cpp b/libcorpus2/tag.cpp
index c387505..8b7a4b6 100644
--- a/libcorpus2/tag.cpp
+++ b/libcorpus2/tag.cpp
@@ -2,36 +2,26 @@
 #include <libcorpus2/tagsetmanager.h>
 
 #include <libpwrutils/foreach.h>
+#include <libpwrutils/util.h>
 
 #include <cstring>
 #include <sstream>
 
 #include <boost/functional/hash.hpp>
-#include <boost/pending/lowest_bit.hpp>
+
 #include <bitset>
 
 namespace Corpus2 {
 
-
-template <typename T>
-int count_bits_set(T v)
-{
-	// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-	v = v - ((v >> 1) & (T)~(T)0/3);                           // temp
-	v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);      // temp
-	v = (v + (v >> 4)) & (T)~(T)0/255*15;                      // temp
-	return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * CHAR_BIT; // count
-}
-
 int Tag::pos_count() const
 {
-	return count_bits_set(pos_);
+	return PwrNlp::count_bits_set(pos_);
 }
 
 int Tag::get_pos_index() const
 {
 	if (pos_ == 0) return -1;
-	return boost::lowest_bit(pos_);
+	return PwrNlp::lowest_bit(pos_);
 }
 
 std::string Tag::raw_dump() const
diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp
index 51bce4a..e03a78e 100644
--- a/libcorpus2/tagset.cpp
+++ b/libcorpus2/tagset.cpp
@@ -6,6 +6,7 @@
 #include <libcorpus2/tagsetparser.h>
 
 #include <libpwrutils/foreach.h>
+#include <libpwrutils/util.h>
 
 #include <boost/algorithm/string.hpp>
 #include <boost/strong_typedef.hpp>
@@ -337,6 +338,29 @@ std::string Tagset::tag_to_no_opt_string(const Tag &tag) const
 	return ss.str();
 }
 
+size_t Tagset::tag_size(const Tag& tag) const
+{
+	size_t s = PwrNlp::count_bits_set(tag.get_pos());
+	foreach (mask_t attribute_mask, all_attribute_masks()) {
+		mask_t values = tag.get_values_for(attribute_mask);
+		size_t x = PwrNlp::count_bits_set(values);
+		if (x > 1) {
+			s *= x;
+		}
+	}
+	return s;
+}
+
+bool Tagset::tag_is_singular(const Tag& tag) const
+{
+	if (PwrNlp::count_bits_set(tag.get_pos()) != 1) return false;
+	foreach (mask_t attribute_mask, all_attribute_masks()) {
+		mask_t values = tag.get_values_for(attribute_mask);
+		if (PwrNlp::count_bits_set(values) > 1) return false;
+	}
+	return true;
+}
+
 idx_t Tagset::get_pos_index(const string_range& pos) const
 {
 	return pos_dict_.get_id(pos);
diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h
index dec4013..366b0b0 100644
--- a/libcorpus2/tagset.h
+++ b/libcorpus2/tagset.h
@@ -250,6 +250,10 @@ public:
 	 */
 	std::string tag_to_no_opt_string(const Tag &tag) const;
 
+	size_t tag_size(const Tag& tag) const;
+
+	bool tag_is_singular(const Tag& tag) const;
+
 	/// POS name <-> index dictionary getter
 	const SymbolDictionary<idx_t>& pos_dictionary() const {
 		return pos_dict_;
@@ -401,6 +405,46 @@ public:
 	/// get the original index of the POS in the tagset definition
 	int get_original_pos_index(idx_t pos) const;
 
+
+	struct mask_iterator
+	{
+		typedef mask_t value_type;
+		typedef std::forward_iterator_tag iterator_category;
+		typedef int difference_type;
+		typedef const mask_t *pointer;
+		typedef const mask_t &reference;
+		mask_iterator(const mask_iterator &i): i_(i.i_) {}
+		mask_iterator(const mask_t& i) : i_(i) {}
+		// Iterator over mask_t values: ++ shifts the held mask left by one bit, -- shifts it right.
+		mask_iterator &operator++() { i_ <<= 1; return *this; }
+		mask_iterator operator++(int) { mask_iterator old(*this); i_ <<= 1; return old; }
+		mask_iterator &operator--() { i_ >>= 1; return *this; }
+		mask_iterator operator--(int) { mask_iterator old(*this); i_ >>= 1; return old; }
+		// Postfix forms step *this and return a copy of the pre-step position.
+		const mask_t &operator*() const { return i_; }
+
+		bool operator==(const mask_iterator &i) const { return i_ == i.i_; }
+		bool operator!=(const mask_iterator &i) const { return i_ != i.i_; }
+
+	private:
+		mask_t i_;
+	};
+
+	boost::iterator_range<mask_iterator> all_pos_masks() const {
+		return boost::iterator_range<mask_iterator>(static_cast<mask_t>(1),
+				static_cast<mask_t>(1) << pos_count());
+	}
+
+	boost::iterator_range<mask_iterator> all_value_masks() const {
+		return boost::iterator_range<mask_iterator>(static_cast<mask_t>(1),
+				static_cast<mask_t>(1) << value_count());
+	}
+
+	const std::vector<mask_t>& all_attribute_masks() const {
+		return attribute_masks_;
+	}
+
+
 private:
 	/// Temporary solution to allow splitting the parser into a separate
 	/// class
diff --git a/libpwrutils/CMakeLists.txt b/libpwrutils/CMakeLists.txt
index f6c221d..b7e1370 100644
--- a/libpwrutils/CMakeLists.txt
+++ b/libpwrutils/CMakeLists.txt
@@ -4,7 +4,7 @@ PROJECT(pwrutils)
 
 set(pwrutils_ver_major "0")
 set(pwrutils_ver_minor "0")
-set(pwrutils_ver_patch "1")
+set(pwrutils_ver_patch "2")
 
 set(LIBPWRUTILS_VERSION
 	"${pwrutils_ver_major}.${pwrutils_ver_minor}.${pwrutils_ver_patch}")
diff --git a/libpwrutils/util.h b/libpwrutils/util.h
index b1bb7fb..c1b6da9 100644
--- a/libpwrutils/util.h
+++ b/libpwrutils/util.h
@@ -21,6 +21,9 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 
 #include <iostream>
 #include <string>
+#include <climits>
+
+#include <boost/pending/lowest_bit.hpp>
 
 namespace PwrNlp {
 
@@ -76,6 +79,24 @@ void utf8_string_to_uchar_container(const std::string& s,
 	}
 }
 
+/**
+ * Count set bits in an integral type. NOTE(review): presumably meant for unsigned T -- confirm.
+ * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ */
+template <typename T>
+int count_bits_set(T v)
+{
+	v = v - ((v >> 1) & (T)~(T)0/3);                              // count bits within each 2-bit pair
+	v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);         // sum pairs into 4-bit fields
+	v = (v + (v >> 4)) & (T)~(T)0/255*15;                         // sum 4-bit fields into per-byte counts
+	return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * CHAR_BIT; // add all bytes into the top byte, shift it down
+}
+
+/**
+ * Get index of lowest set bit in an integral type
+ */
+using boost::lowest_bit;
+
 } /* end ns PwrNlp */
 
 #endif // PWRNLP_UTIL_H
diff --git a/tests/tag_split.cpp b/tests/tag_split.cpp
index c0d93c3..eb34403 100644
--- a/tests/tag_split.cpp
+++ b/tests/tag_split.cpp
@@ -12,7 +12,7 @@ struct F {
 			"A tag tog other a3 \n"
 			"B data thing tag-thing thang\n"
 			"C a b c \n"
-			"[POS]\n some A B [C]\n";
+			"[POS]\n some A B [C]\n same A B\n";
 		tagset.reset(new Corpus2::Tagset(tagset_string));
 	}
 	boost::shared_ptr<Corpus2::Tagset> tagset;
@@ -158,4 +158,28 @@ BOOST_FIXTURE_TEST_CASE( underscore_dots, F )
 
 	check_split(tag, r);
 }
+
+
+BOOST_FIXTURE_TEST_CASE( tag_size, F )
+{
+	Corpus2::Tag t = tagset->parse_simple_tag("some:tag:data", false);
+	Corpus2::Tag t2 = tagset->parse_simple_tag("some:tog", false);
+	Corpus2::Tag t3 = tagset->parse_simple_tag("same", false);
+	BOOST_CHECK(tagset->tag_is_singular(t));
+	BOOST_CHECK_EQUAL(tagset->tag_size(t), 1);
+	BOOST_CHECK(tagset->tag_is_singular(t2));
+	BOOST_CHECK_EQUAL(tagset->tag_size(t2), 1);
+	BOOST_CHECK(tagset->tag_is_singular(t3));
+	BOOST_CHECK_EQUAL(tagset->tag_size(t3), 1);
+	t.add_values(t2.get_values());
+	BOOST_CHECK(!tagset->tag_is_singular(t));
+	BOOST_CHECK_EQUAL(tagset->tag_size(t), 2);
+	t.add_pos(t3.get_pos());
+	BOOST_CHECK(!tagset->tag_is_singular(t));
+	BOOST_CHECK_EQUAL(tagset->tag_size(t), 4);
+	Corpus2::Tag t4 = tagset->parse_simple_tag("same:other:thang", true);
+	t.add_values(t4.get_values() & tagset->get_attribute_mask(std::string("A")));
+	BOOST_CHECK_EQUAL(tagset->tag_size(t), 6);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
-- 
GitLab