From a519febc2afa7f86e21b97b3521e3bb9ca11922f Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Wed, 20 Oct 2010 14:37:00 +0200
Subject: [PATCH] move count_bits_set to pwrutils, put lowest_bit alias there,
 implement Tagset::tag_size and Tagset::tag_is_singular

What this patch does:

- libpwrutils/util.h: count_bits_set() moves here from libcorpus2/tag.cpp
  and boost::lowest_bit is made available as PwrNlp::lowest_bit; Tag now
  calls both through the PwrNlp namespace.
- Tagset::tag_size(): the number of set POS bits multiplied by the number
  of set value bits of every attribute that has more than one value set.
- Tagset::tag_is_singular(): true when exactly one POS bit is set and no
  attribute has more than one value bit set.
- Tagset gains mask_iterator (steps a single-bit mask by shifting) and the
  ranges all_pos_masks(), all_value_masks() and all_attribute_masks().
- The libpwrutils patch version is bumped from 1 to 2.
- tests/tag_split.cpp: the fixture tagset gets a second POS ("same") and a
  tag_size test case covering both new Tagset functions.

---
 libcorpus2/tag.cpp         | 18 ++++------------
 libcorpus2/tagset.cpp      | 24 +++++++++++++++++++++
 libcorpus2/tagset.h        | 44 ++++++++++++++++++++++++++++++++++++++
 libpwrutils/CMakeLists.txt |  2 +-
 libpwrutils/util.h         | 21 ++++++++++++++++++
 tests/tag_split.cpp        | 26 +++++++++++++++++++++-
 6 files changed, 119 insertions(+), 16 deletions(-)

diff --git a/libcorpus2/tag.cpp b/libcorpus2/tag.cpp
index c387505..8b7a4b6 100644
--- a/libcorpus2/tag.cpp
+++ b/libcorpus2/tag.cpp
@@ -2,36 +2,26 @@
 #include <libcorpus2/tagsetmanager.h>
 
 #include <libpwrutils/foreach.h>
+#include <libpwrutils/util.h>
 
 #include <cstring>
 #include <sstream>
 
 #include <boost/functional/hash.hpp>
-#include <boost/pending/lowest_bit.hpp>
+
 #include <bitset>
 
 namespace Corpus2 {
 
-
-template <typename T>
-int count_bits_set(T v)
-{
-	// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-	v = v - ((v >> 1) & (T)~(T)0/3); // temp
-	v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); // temp
-	v = (v + (v >> 4)) & (T)~(T)0/255*15; // temp
-	return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * CHAR_BIT; // count
-}
-
 int Tag::pos_count() const
 {
-	return count_bits_set(pos_);
+	return PwrNlp::count_bits_set(pos_);
 }
 
 int Tag::get_pos_index() const
 {
 	if (pos_ == 0) return -1;
-	return boost::lowest_bit(pos_);
+	return PwrNlp::lowest_bit(pos_);
 }
 
 std::string Tag::raw_dump() const
diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp
index 51bce4a..e03a78e 100644
--- a/libcorpus2/tagset.cpp
+++ b/libcorpus2/tagset.cpp
@@ -6,6 +6,7 @@
 #include <libcorpus2/tagsetparser.h>
 
 #include <libpwrutils/foreach.h>
+#include <libpwrutils/util.h>
 
 #include <boost/algorithm/string.hpp>
 #include <boost/strong_typedef.hpp>
@@ -337,6 +338,29 @@ std::string Tagset::tag_to_no_opt_string(const Tag &tag) const
 	return ss.str();
 }
 
+size_t Tagset::tag_size(const Tag& tag) const
+{
+	size_t s = PwrNlp::count_bits_set(tag.get_pos());
+	foreach (mask_t attribute_mask, all_attribute_masks()) {
+		mask_t values = tag.get_values_for(attribute_mask);
+		size_t x = PwrNlp::count_bits_set(values);
+		if (x > 1) {
+			s *= x;
+		}
+	}
+	return s;
+}
+
+bool Tagset::tag_is_singular(const Tag& tag) const
+{
+	if (PwrNlp::count_bits_set(tag.get_pos()) != 1) return false;
+	foreach (mask_t attribute_mask, all_attribute_masks()) {
+		mask_t values = tag.get_values_for(attribute_mask);
+		if (PwrNlp::count_bits_set(values) > 1) return false;
+	}
+	return true;
+}
+
 idx_t Tagset::get_pos_index(const string_range& pos) const
 {
 	return pos_dict_.get_id(pos);
diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h
index dec4013..366b0b0 100644
--- a/libcorpus2/tagset.h
+++ b/libcorpus2/tagset.h
@@ -250,6 +250,10 @@ public:
 	 */
 	std::string tag_to_no_opt_string(const Tag &tag) const;
 
+	size_t tag_size(const Tag& tag) const;
+
+	bool tag_is_singular(const Tag& tag) const;
+
 	/// POS name <-> index dictionary getter
 	const SymbolDictionary<idx_t>& pos_dictionary() const {
 		return pos_dict_;
@@ -401,6 +405,46 @@ public:
 
 	/// get the original index of the POS in the tagset definition
 	int get_original_pos_index(idx_t pos) const;
+
+	struct mask_iterator
+	{
+		typedef mask_t value_type;
+		typedef std::forward_iterator_tag iterator_category;
+		typedef int difference_type;
+		typedef const mask_t *pointer;
+		typedef const mask_t &reference;
+		mask_iterator(const mask_iterator &i): i_(i.i_) {}
+		mask_iterator(const mask_t& i) : i_(i) {}
+
+		mask_iterator &operator++() { i_ <<= 1; return *this; }
+		mask_iterator operator++(int) { mask_iterator prev(*this); i_ <<= 1; return prev; }
+		mask_iterator &operator--() { i_ >>= 1; return *this; }
+		mask_iterator operator--(int) { mask_iterator prev(*this); i_ >>= 1; return prev; }
+
+		const mask_t &operator*() const { return i_; }
+
+		bool operator==(const mask_iterator &i) const { return i_ == i.i_; }
+		bool operator!=(const mask_iterator &i) const { return i_ != i.i_; }
+
+	private:
+		mask_t i_;
+	};
+
+	boost::iterator_range<mask_iterator> all_pos_masks() const {
+		return boost::iterator_range<mask_iterator>(static_cast<mask_t>(1),
+			static_cast<mask_t>(1) << pos_count());
+	}
+
+	boost::iterator_range<mask_iterator> all_value_masks() const {
+		return boost::iterator_range<mask_iterator>(static_cast<mask_t>(1),
+			static_cast<mask_t>(1) << value_count());
+	}
+
+	const std::vector<mask_t>& all_attribute_masks() const {
+		return attribute_masks_;
+	}
+
+
 private:
 	/// Temporary solution to allow splitting the parser into a separate
 	/// class
diff --git a/libpwrutils/CMakeLists.txt b/libpwrutils/CMakeLists.txt
index f6c221d..b7e1370 100644
--- a/libpwrutils/CMakeLists.txt
+++ b/libpwrutils/CMakeLists.txt
@@ -4,7 +4,7 @@ PROJECT(pwrutils)
 
 set(pwrutils_ver_major "0")
 set(pwrutils_ver_minor "0")
-set(pwrutils_ver_patch "1")
+set(pwrutils_ver_patch "2")
 
 
 set(LIBPWRUTILS_VERSION "${pwrutils_ver_major}.${pwrutils_ver_minor}.${pwrutils_ver_patch}")
diff --git a/libpwrutils/util.h b/libpwrutils/util.h
index b1bb7fb..c1b6da9 100644
--- a/libpwrutils/util.h
+++ b/libpwrutils/util.h
@@ -21,6 +21,9 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 
 #include <iostream>
 #include <string>
+#include <climits>
+
+#include <boost/pending/lowest_bit.hpp>
 
 namespace PwrNlp {
 
@@ -76,6 +79,24 @@ void utf8_string_to_uchar_container(const std::string& s,
 	}
 }
 
+/**
+ * Count set bits in an integral type.
+ * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ */
+template <typename T>
+int count_bits_set(T v)
+{
+	v = v - ((v >> 1) & (T)~(T)0/3); // temp
+	v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); // temp
+	v = (v + (v >> 4)) & (T)~(T)0/255*15; // temp
+	return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * CHAR_BIT; // count
+}
+
+/**
+ * Get index of lowest set bit in an integral type
+ */
+using boost::lowest_bit;
+
 } /* end ns PwrNlp */
 
 #endif // PWRNLP_UTIL_H
diff --git a/tests/tag_split.cpp b/tests/tag_split.cpp
index c0d93c3..eb34403 100644
--- a/tests/tag_split.cpp
+++ b/tests/tag_split.cpp
@@ -12,7 +12,7 @@ struct F {
 			"A tag tog other a3 \n"
 			"B data thing tag-thing thang\n"
 			"C a b c \n"
-			"[POS]\n some A B [C]\n";
+			"[POS]\n some A B [C]\n same A B\n";
 		tagset.reset(new Corpus2::Tagset(tagset_string));
 	}
 	boost::shared_ptr<Corpus2::Tagset> tagset;
@@ -158,4 +158,28 @@ BOOST_FIXTURE_TEST_CASE( underscore_dots, F )
 	check_split(tag, r);
 }
 
+
+
+BOOST_FIXTURE_TEST_CASE( tag_size, F )
+{
+	Corpus2::Tag t = tagset->parse_simple_tag("some:tag:data", false);
+	Corpus2::Tag t2 = tagset->parse_simple_tag("some:tog", false);
+	Corpus2::Tag t3 = tagset->parse_simple_tag("same", false);
+	BOOST_CHECK(tagset->tag_is_singular(t));
+	BOOST_CHECK_EQUAL(tagset->tag_size(t), 1);
+	BOOST_CHECK(tagset->tag_is_singular(t2));
+	BOOST_CHECK_EQUAL(tagset->tag_size(t2), 1);
+	BOOST_CHECK(tagset->tag_is_singular(t3));
+	BOOST_CHECK_EQUAL(tagset->tag_size(t3), 1);
+	t.add_values(t2.get_values());
+	BOOST_CHECK(!tagset->tag_is_singular(t));
+	BOOST_CHECK_EQUAL(tagset->tag_size(t), 2);
+	t.add_pos(t3.get_pos());
+	BOOST_CHECK(!tagset->tag_is_singular(t));
+	BOOST_CHECK_EQUAL(tagset->tag_size(t), 4);
+	Corpus2::Tag t4 = tagset->parse_simple_tag("same:other:thang", true);
+	t.add_values(t4.get_values() & tagset->get_attribute_mask(std::string("A")));
+	BOOST_CHECK_EQUAL(tagset->tag_size(t), 6);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
-- 
GitLab