Skip to content
Snippets Groups Projects
Commit a519febc authored by ilor's avatar ilor
Browse files

move count_bits_set to pwrutils, put lowest_bit alias there, implement...

move count_bits_set to pwrutils, put lowest_bit alias there, implement Tagset::tag_size and Tagset::tag_is_singular
parent 60c8b999
No related branches found
No related tags found
No related merge requests found
......@@ -2,36 +2,26 @@
#include <libcorpus2/tagsetmanager.h>
#include <libpwrutils/foreach.h>
#include <libpwrutils/util.h>
#include <cstring>
#include <sstream>
#include <boost/functional/hash.hpp>
#include <boost/pending/lowest_bit.hpp>
#include <bitset>
namespace Corpus2 {
/**
 * Count the bits set in a value of integral type T.
 *
 * Unsigned types use the branch-free parallel count described at
 * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
 * (valid for widths up to 128 bits).
 *
 * Signed types are counted bit by bit instead: the parallel method derives
 * its masks from ~0 by integer division, and for a signed T that value is
 * -1, so every mask would collapse to zero and the result would always be 0.
 *
 * @param v value whose set bits are counted
 * @return number of 1 bits in v
 */
template <typename T>
int count_bits_set(T v)
{
    const T all_ones = (T)~(T)0;
    if (!(all_ones > (T)0)) {
        // Signed T. Right-shifting a negative value is arithmetic on the
        // supported compilers (and guaranteed since C++20), so bit `bit`
        // of the representation ends up in the lowest position.
        const int width = static_cast<int>(sizeof(T) * CHAR_BIT);
        int count = 0;
        for (int bit = 0; bit < width; ++bit) {
            if ((v >> bit) & 1) {
                ++count;
            }
        }
        return count;
    }
    // Unsigned T: sum bits in pairs, then nibbles, then bytes.
    v = v - ((v >> 1) & (all_ones / 3)); // temp
    v = (v & (all_ones / 15 * 3)) + ((v >> 2) & (all_ones / 15 * 3)); // temp
    v = (v + (v >> 4)) & (all_ones / 255 * 15); // temp
    // Multiplying by 0x0101... accumulates all byte sums in the top byte.
    return (T)(v * (all_ones / 255)) >> (sizeof(T) - 1) * CHAR_BIT; // count
}
/// Number of bits set in pos_.
int Tag::pos_count() const
{
    // One call only: a second return statement after the first could never
    // execute. The helper comes from libpwrutils/util.h.
    return PwrNlp::count_bits_set(pos_);
}
/// Index of the lowest bit set in pos_, or -1 when pos_ has no bit set.
int Tag::get_pos_index() const
{
    // Zero is handled here, so lowest_bit is never called with it.
    if (pos_ == 0) {
        return -1;
    }
    // One call only: a second return statement after the first could never
    // execute. PwrNlp::lowest_bit is the alias for boost::lowest_bit.
    return PwrNlp::lowest_bit(pos_);
}
std::string Tag::raw_dump() const
......
......@@ -6,6 +6,7 @@
#include <libcorpus2/tagsetparser.h>
#include <libpwrutils/foreach.h>
#include <libpwrutils/util.h>
#include <boost/algorithm/string.hpp>
#include <boost/strong_typedef.hpp>
......@@ -337,6 +338,29 @@ std::string Tagset::tag_to_no_opt_string(const Tag &tag) const
return ss.str();
}
/// Multiplies the number of bits set in the tag's POS mask by the number of
/// values set for every attribute that has more than one value set.
size_t Tagset::tag_size(const Tag& tag) const
{
    size_t combinations = PwrNlp::count_bits_set(tag.get_pos());
    foreach (mask_t attr, all_attribute_masks()) {
        mask_t chosen = tag.get_values_for(attr);
        const size_t chosen_count = PwrNlp::count_bits_set(chosen);
        // Attributes with zero or one value set leave the product unchanged.
        if (chosen_count <= 1) {
            continue;
        }
        combinations *= chosen_count;
    }
    return combinations;
}
/// True when the tag's POS mask has exactly one bit set and no attribute has
/// more than one value set.
bool Tagset::tag_is_singular(const Tag& tag) const
{
    const bool single_pos = (PwrNlp::count_bits_set(tag.get_pos()) == 1);
    if (!single_pos) {
        return false;
    }
    foreach (mask_t attr, all_attribute_masks()) {
        mask_t chosen = tag.get_values_for(attr);
        // Two or more values for one attribute means the tag is not singular.
        if (PwrNlp::count_bits_set(chosen) >= 2) {
            return false;
        }
    }
    return true;
}
idx_t Tagset::get_pos_index(const string_range& pos) const
{
return pos_dict_.get_id(pos);
......
......@@ -250,6 +250,10 @@ public:
*/
std::string tag_to_no_opt_string(const Tag &tag) const;
/// Product of the number of bits set in tag.get_pos() and, for every
/// attribute mask with more than one value set in the tag, the number of
/// values set for that attribute.
size_t tag_size(const Tag& tag) const;
/// True when tag.get_pos() has exactly one bit set and no attribute mask
/// has more than one value set in the tag.
bool tag_is_singular(const Tag& tag) const;
/// POS name <-> index dictionary getter
const SymbolDictionary<idx_t>& pos_dictionary() const {
return pos_dict_;
......@@ -401,6 +405,46 @@ public:
/// get the original index of the POS in the tagset definition
int get_original_pos_index(idx_t pos) const;
/**
 * Iterator over single-bit masks. It stores one mask_t value; incrementing
 * shifts that value one bit to the left, decrementing one bit to the right.
 * Two iterators compare equal when their stored masks are equal.
 */
struct mask_iterator
{
    typedef mask_t value_type;
    typedef std::forward_iterator_tag iterator_category;
    typedef int difference_type;
    typedef const mask_t *pointer;
    typedef const mask_t &reference;

    mask_iterator(const mask_iterator &i): i_(i.i_) {}

    /// Deliberately implicit: the range getters that build an
    /// iterator_range<mask_iterator> pass plain mask_t values as endpoints.
    mask_iterator(const mask_t& i) : i_(i) {}

    /// Pre-increment: advance to the next higher bit.
    mask_iterator &operator++() { i_ <<= 1; return *this; }

    /// Post-increment: advance this iterator and return its previous value.
    /// (Returning a shifted copy without modifying *this would leave `it++`
    /// stuck on the same mask forever.)
    mask_iterator operator++(int) {
        mask_iterator previous(*this);
        i_ <<= 1;
        return previous;
    }

    /// Pre-decrement: step back to the next lower bit.
    mask_iterator &operator--() { i_ >>= 1; return *this; }

    /// Post-decrement: step this iterator back and return its previous value.
    mask_iterator operator--(int) {
        mask_iterator previous(*this);
        i_ >>= 1;
        return previous;
    }

    const mask_t &operator*() const { return i_; }

    bool operator==(const mask_iterator &i) const { return i_ == i.i_; }
    bool operator!=(const mask_iterator &i) const { return i_ != i.i_; }

private:
    /// Current mask value.
    mask_t i_;
};
/// Range of single-bit masks running from bit 0 up to, but not including,
/// bit pos_count().
/// NOTE(review): the shift assumes pos_count() is smaller than the bit width
/// of mask_t -- confirm.
boost::iterator_range<mask_iterator> all_pos_masks() const {
    const mask_t first_mask = static_cast<mask_t>(1);
    const mask_t end_mask = first_mask << pos_count();
    return boost::iterator_range<mask_iterator>(first_mask, end_mask);
}
/// Range of single-bit masks running from bit 0 up to, but not including,
/// bit value_count().
/// NOTE(review): the shift assumes value_count() is smaller than the bit
/// width of mask_t -- confirm.
boost::iterator_range<mask_iterator> all_value_masks() const {
    const mask_t first_mask = static_cast<mask_t>(1);
    const mask_t end_mask = first_mask << value_count();
    return boost::iterator_range<mask_iterator>(first_mask, end_mask);
}
/// Read-only access to the stored attribute masks (attribute_masks_).
/// Returned by reference, so no copy is made.
const std::vector<mask_t>& all_attribute_masks() const {
return attribute_masks_;
}
private:
/// Temporary solution to allow splitting the parser into a separate
/// class
......
......@@ -4,7 +4,7 @@ PROJECT(pwrutils)
# Version components of the pwrutils library.
set(pwrutils_ver_major "0")
set(pwrutils_ver_minor "0")
# NOTE(review): the patch component is set twice in this view; in CMake the
# later set() overrides the earlier one, so the effective value is "2".
set(pwrutils_ver_patch "1")
set(pwrutils_ver_patch "2")
# Full version string, assembled as major.minor.patch.
set(LIBPWRUTILS_VERSION
"${pwrutils_ver_major}.${pwrutils_ver_minor}.${pwrutils_ver_patch}")
......
......@@ -21,6 +21,9 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include <iostream>
#include <string>
#include <climits>
#include <boost/pending/lowest_bit.hpp>
namespace PwrNlp {
......@@ -76,6 +79,24 @@ void utf8_string_to_uchar_container(const std::string& s,
}
}
/**
 * Count set bits in an integral type.
 *
 * Unsigned types use the branch-free parallel count described at
 * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
 * (valid for widths up to 128 bits).
 *
 * Signed types are counted bit by bit instead: the parallel method derives
 * its masks from ~0 by integer division, and for a signed T that value is
 * -1, so every mask would collapse to zero and the result would always be 0.
 *
 * @param v value whose set bits are counted
 * @return number of 1 bits in v
 */
template <typename T>
int count_bits_set(T v)
{
    const T all_ones = (T)~(T)0;
    if (!(all_ones > (T)0)) {
        // Signed T. Right-shifting a negative value is arithmetic on the
        // supported compilers (and guaranteed since C++20), so bit `bit`
        // of the representation ends up in the lowest position.
        const int width = static_cast<int>(sizeof(T) * CHAR_BIT);
        int count = 0;
        for (int bit = 0; bit < width; ++bit) {
            if ((v >> bit) & 1) {
                ++count;
            }
        }
        return count;
    }
    // Unsigned T: sum bits in pairs, then nibbles, then bytes.
    v = v - ((v >> 1) & (all_ones / 3)); // temp
    v = (v & (all_ones / 15 * 3)) + ((v >> 2) & (all_ones / 15 * 3)); // temp
    v = (v + (v >> 4)) & (all_ones / 255 * 15); // temp
    // Multiplying by 0x0101... accumulates all byte sums in the top byte.
    return (T)(v * (all_ones / 255)) >> (sizeof(T) - 1) * CHAR_BIT; // count
}
/**
 * Get index of lowest set bit in an integral type.
 *
 * This is boost::lowest_bit (from <boost/pending/lowest_bit.hpp>) made
 * available inside the PwrNlp namespace.
 * NOTE(review): behavior for a zero argument is not visible here -- confirm
 * before calling it with 0.
 */
using boost::lowest_bit;
} /* end ns PwrNlp */
#endif // PWRNLP_UTIL_H
......@@ -12,7 +12,7 @@ struct F {
"A tag tog other a3 \n"
"B data thing tag-thing thang\n"
"C a b c \n"
"[POS]\n some A B [C]\n";
"[POS]\n some A B [C]\n same A B\n";
tagset.reset(new Corpus2::Tagset(tagset_string));
}
boost::shared_ptr<Corpus2::Tagset> tagset;
......@@ -158,4 +158,28 @@ BOOST_FIXTURE_TEST_CASE( underscore_dots, F )
check_split(tag, r);
}
// Exercises Tagset::tag_size and Tagset::tag_is_singular on the fixture
// tagset, first on freshly parsed tags and then on a tag that is widened
// step by step with more attribute values and another POS.
BOOST_FIXTURE_TEST_CASE( tag_size, F )
{
Corpus2::Tag t = tagset->parse_simple_tag("some:tag:data", false);
Corpus2::Tag t2 = tagset->parse_simple_tag("some:tog", false);
Corpus2::Tag t3 = tagset->parse_simple_tag("same", false);
// Each freshly parsed tag is expected to be singular with size 1.
BOOST_CHECK(tagset->tag_is_singular(t));
BOOST_CHECK_EQUAL(tagset->tag_size(t), 1);
BOOST_CHECK(tagset->tag_is_singular(t2));
BOOST_CHECK_EQUAL(tagset->tag_size(t2), 1);
BOOST_CHECK(tagset->tag_is_singular(t3));
BOOST_CHECK_EQUAL(tagset->tag_size(t3), 1);
// "tag" and "tog" are both values of attribute A in the fixture, so after
// merging t2's values attribute A holds two values: size 1 * 2 = 2.
t.add_values(t2.get_values());
BOOST_CHECK(!tagset->tag_is_singular(t));
BOOST_CHECK_EQUAL(tagset->tag_size(t), 2);
// Adding the POS of t3 ("same") gives two POS bits: size 2 * 2 = 4.
t.add_pos(t3.get_pos());
BOOST_CHECK(!tagset->tag_is_singular(t));
BOOST_CHECK_EQUAL(tagset->tag_size(t), 4);
Corpus2::Tag t4 = tagset->parse_simple_tag("same:other:thang", true);
// Only t4's values under attribute A's mask are merged ("other" is an A
// value in the fixture), making three A values: size 2 * 3 = 6.
t.add_values(t4.get_values() & tagset->get_attribute_mask(std::string("A")));
BOOST_CHECK_EQUAL(tagset->tag_size(t), 6);
}
BOOST_AUTO_TEST_SUITE_END()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment