// Reviewer commentary. This is free text placed before the first "diff --git"
// header; it is not part of any hunk.
//
// What the patch does
//  * libcorpus2/tag.h: mask_t changes from boost::uint64_t to
//    PwrNlp::bitset<64>, and <libpwrutils/bitset.h> is included.
//  * libcorpus2/tag.cpp: Tag::operator< is only re-wrapped over three lines;
//    the expression itself is unchanged.
//  * libcorpus2/tagset.cpp, libcorpus2/token.cpp, tagset-tool/main.cpp:
//    implicit truth tests on masks become explicit .any() / .none() calls,
//    boost::lowest_bit(invalid) becomes PwrNlp::lowest_bit(invalid), and the
//    foreach variable in tagset_query_cb changes from idx_t to mask_t.
//  * libpwrutils/bitset.h (new file): count_bits_set for integral types and
//    for bitsets, lowest_bit for bitsets / unsigned long / unsigned long long,
//    plus hash_value and operator< for bitsets declared inside namespace std.
//  * libpwrutils/util.h: count_bits_set, "using boost::lowest_bit" and the
//    <boost/pending/lowest_bit.hpp> include are removed (moved to bitset.h).
//  * tests/basic.cpp: Tag t1 is now built from a single Corpus2::mask_t(0).
//
// NOTE(review): bitset.h uses CHAR_BIT and std::numeric_limits, but its include
//   list shows neither <climits> nor <limits>; presumably they arrive
//   transitively -- confirm, or include them directly.
// NOTE(review): lowest_bit(const bitset<S>&) calls _Find_first(), which the
//   patch itself marks as GCC specific. libstdc++ documents it as returning the
//   bitset size when no bit is set, while the integral overloads return
//   static_cast<size_t>(-1) for zero -- confirm callers do not rely on one
//   particular "no bit set" value.
// NOTE(review): in the integral lowest_bit overloads the argument is unsigned,
//   so "t <= 0" can only ever mean "t == 0".
// NOTE(review): hash_value and operator< are new function templates added to
//   namespace std. The C++ standard only allows specializations of existing
//   standard templates there; new declarations are undefined behaviour.
// NOTE(review): the generic operator< compares the least significant
//   unsigned-long word first, so for bitsets wider than one word the order is
//   not numeric; the single-word specialization compares to_ulong() values and
//   is numeric. Presumably only a consistent order is needed (Tag::operator<
//   compares values_) -- confirm nothing depends on the old uint64_t ordering.
// NOTE(review): in Tagset::make_tag, PwrNlp::lowest_bit(invalid) returns a
//   size_t that is then stored in a mask_t named first_invalid and passed to
//   get_value_name(). That looks like a bit index being used as a mask, and the
//   removed boost::lowest_bit line has the same shape -- confirm what
//   get_value_name() expects.
// NOTE(review): in this copy the hunk bodies are joined onto a few long lines,
//   while the hunk headers (e.g. "-39,7 +39,9") expect one source line per
//   patch line; presumably the line breaks must be restored before it applies.
diff --git a/libcorpus2/tag.cpp b/libcorpus2/tag.cpp index 8b7a4b601a5aaa2ade0daa2dd5b23adb9b7ed16e..dd621335d8e66f984d8fbdf0e7a1accf2b64d8a2 100644 --- a/libcorpus2/tag.cpp +++ b/libcorpus2/tag.cpp @@ -39,7 +39,9 @@ std::string Tag::raw_dump() const bool Tag::operator<(const Tag& other) const { - return pos_ < other.pos_ || (pos_ == other.pos_ && values_ < other.values_); + return pos_ < other.pos_ || + (pos_ == other.pos_ && + values_ < other.values_); } bool Tag::operator ==(const Tag& other) const diff --git a/libcorpus2/tag.h b/libcorpus2/tag.h index d11e676f3d732c6eeb1619cd0b6f593f23868d9b..716188d10c7bcd3bb75363936335beea1f53bffe 100644 --- a/libcorpus2/tag.h +++ b/libcorpus2/tag.h @@ -7,6 +7,7 @@ #include <boost/cstdint.hpp> #include <boost/strong_typedef.hpp> #include <boost/operators.hpp> +#include <libpwrutils/bitset.h> namespace Corpus2 { @@ -14,7 +15,7 @@ class Tagset; /// Typedefs for the string -> index mappings typedef boost::int8_t idx_t; -typedef boost::uint64_t mask_t; +typedef PwrNlp::bitset<64> mask_t; BOOST_STRONG_TYPEDEF(boost::uint32_t, tagset_idx_t); /** diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp index 403b45bba12f54fcda40e5e637dfc6ed40b7dfd2..68a827aa25208680435ed18276d458328a30dd9f 100644 --- a/libcorpus2/tagset.cpp +++ b/libcorpus2/tagset.cpp @@ -144,7 +144,7 @@ void Tagset::parse_tag(const string_range_vector &fields, bool allow_extra, std::vector<mask_t> values; foreach (string_range& dot, dots) { mask_t v = get_value_mask(boost::copy_range<std::string>(dot)); - if (!v) { + if (v.none()) { throw TagParseError("Unknown attribute value", boost::copy_range<std::string>(r), "", id_string()); @@ -229,8 +229,8 @@ Tag Tagset::make_tag(idx_t pos_idx, mask_t values, bool allow_extra) const { mask_t valid_values = get_pos_value_mask(pos_idx); mask_t invalid = values & ~valid_values; - if (invalid && !allow_extra) { - mask_t first_invalid = boost::lowest_bit(invalid); + if (invalid.any() && !allow_extra) { + mask_t first_invalid = 
PwrNlp::lowest_bit(invalid); throw TagParseError("Attribute not valid for this POS", get_value_name(first_invalid), get_pos_name(pos_idx), id_string()); @@ -242,7 +242,7 @@ Tag Tagset::make_tag(idx_t pos_idx, mask_t values, bool allow_extra) const Tag Tagset::make_ign_tag() const { mask_t ign_pos_mask = get_pos_mask("ign"); - assert(ign_pos_mask); + assert(ign_pos_mask.any()); return Tag(ign_pos_mask); } @@ -301,9 +301,9 @@ std::string Tagset::tag_to_string(const Tag &tag) const const std::vector<idx_t>& attrs = get_pos_attributes(pos_idx); foreach (const idx_t& a, attrs) { mask_t value = tag.get_values_for(get_attribute_mask(a)); - if (pos_requires_attribute(pos_idx, a) || value) { + if (pos_requires_attribute(pos_idx, a) || value.any()) { ss << ":"; - if (value) { + if (value.any()) { ss << get_value_name(value); } } @@ -312,7 +312,7 @@ std::string Tagset::tag_to_string(const Tag &tag) const for (idx_t a = 0; a < attribute_dict_.size(); ++a) { if (!pos_has_attribute(pos_idx, a)) { mask_t value = tag.get_values_for(get_attribute_mask(a)); - if (value) { + if (value.any()) { ss << ":" << get_value_name(value); } } @@ -329,7 +329,7 @@ std::string Tagset::tag_to_no_opt_string(const Tag &tag) const foreach (const idx_t& a, attrs) { mask_t value = tag.get_values_for(get_attribute_mask(a)); ss << ":"; - if (value) { + if (value.any()) { ss << get_value_name(value); } else { ss << get_attribute_name(a); @@ -365,7 +365,7 @@ std::vector<Tag> Tagset::split_tag(const Tag& tag) const { std::vector<Tag> tags; mask_t pos = tag.get_pos(); - while (pos) { + while (pos.any()) { idx_t pos_idx = PwrNlp::lowest_bit(pos); mask_t pos_mask = static_cast<mask_t>(1) << pos_idx; pos ^= pos_mask; @@ -375,11 +375,11 @@ std::vector<Tag> Tagset::split_tag(const Tag& tag) const for (idx_t a = 0; a < attribute_count(); ++a) { mask_t ma = get_attribute_mask(a); mask_t v = tag.get_values_for(ma); - if (ma) { + if (ma.any()) { bool dup = false; size_t sz = tags.size(); foreach (mask_t vm, 
get_attribute_values(a)) { - if (v & vm) { + if ((v & vm).any()) { if (dup) { std::copy(tags.begin(), tags.begin() + sz, std::back_inserter(tags)); } diff --git a/libcorpus2/token.cpp b/libcorpus2/token.cpp index e69436ffe75d8bfc2f886af9eb55c5ed6c04ab2f..1b07c3fd3fbec6f48561674aa6f215ad63668f20 100644 --- a/libcorpus2/token.cpp +++ b/libcorpus2/token.cpp @@ -94,7 +94,7 @@ bool Token::orth_pos_match(mask_t pos, const UnicodeString &orth) const if (orth.length() > 0) { if (orth.caseCompare(orth_, 0) != 0) return false; } - if (pos) { + if (pos.any()) { foreach (const Lexeme& lex, lexemes_) { if (lex.tag().get_pos() != pos) return false; } diff --git a/libpwrutils/bitset.h b/libpwrutils/bitset.h new file mode 100644 index 0000000000000000000000000000000000000000..a0a2be6f47c939cb802b99a35ffcd9888de8eda8 --- /dev/null +++ b/libpwrutils/bitset.h @@ -0,0 +1,111 @@ +#ifndef PWRNLP_BITSET_H +#define PWRNLP_BITSET_H + +#include <libpwrutils/foreach.h> +#include <boost/range.hpp> +#include <bitset> +#include <boost/functional/hash.hpp> +#include <boost/pending/lowest_bit.hpp> + + +namespace PwrNlp { + +using std::bitset; + +static const size_t ulong_bits = sizeof(unsigned long) * CHAR_BIT; + +typedef bitset<ulong_bits> word_bitset; + + +/** + * Count set bits in a integral type. 
+ * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + */ +template <typename T> inline +int count_bits_set(T v) +{ + v = v - ((v >> 1) & (T)~(T)0/3); // temp + v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); // temp + v = (v + (v >> 4)) & (T)~(T)0/255*15; // temp + return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * CHAR_BIT; // count +} + +template <size_t S> inline +size_t count_bits_set(const std::bitset<S>& b) +{ + return b.count(); +} + +template <size_t S> inline +size_t lowest_bit(const bitset<S>& b) +{ + // GCC specific + return b._Find_first(); +} + +/** + * Get index of lowest set bit in an integral type + */ +inline size_t lowest_bit(const unsigned long long& t) +{ + if (t <= 0) return static_cast<size_t>(-1); + return boost::lowest_bit(t); +} + +inline size_t lowest_bit(const unsigned long& t) +{ + if (t <= 0) return static_cast<size_t>(-1); + return boost::lowest_bit(t); +} + +} /* end ns PwrNlp */ + +namespace std { + +template<size_t S> inline +size_t hash_value(bitset<S> b) +{ + size_t seed = 0; + const bitset<S> mask(std::numeric_limits<unsigned long>::max()); + while (b.any()) { + boost::hash_combine(seed, (b & mask).to_ulong()); + b >>= PwrNlp::ulong_bits; + } + return seed; +} + +template<> inline +size_t hash_value(bitset<PwrNlp::ulong_bits> b) +{ + size_t seed = 0; + boost::hash_combine(seed, b.to_ulong()); + return seed; +} + +template<size_t S> inline +bool operator<(bitset<S> left, bitset<S> right) +{ + const bitset<S> mask(std::numeric_limits<unsigned long>::max()); + while (left.any()) { + unsigned long l1 = (left & mask).to_ulong(); + unsigned long r1 = (right & mask).to_ulong(); + if (l1 < r1) { + return true; + } else if (l1 > r1) { + return false; + } + left >>= PwrNlp::ulong_bits; + right >>= PwrNlp::ulong_bits; + } + return right.any(); +} + +template<> inline +bool operator<(bitset<PwrNlp::ulong_bits> left, bitset<PwrNlp::ulong_bits> right) +{ + return left.to_ulong() < right.to_ulong(); +} + +} + 
+#endif // PWRNLP_BITSET_H diff --git a/libpwrutils/util.h b/libpwrutils/util.h index c1b6da9047c2fcdaf8094acc13e990f23cec0c9f..52fdf1cac9bc480b52015f930bb82b0bc045cd1a 100644 --- a/libpwrutils/util.h +++ b/libpwrutils/util.h @@ -23,8 +23,6 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <string> #include <climits> -#include <boost/pending/lowest_bit.hpp> - namespace PwrNlp { /** @@ -79,23 +77,7 @@ void utf8_string_to_uchar_container(const std::string& s, } } -/** - * Count set bits in a integral type. - * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel - */ -template <typename T> -int count_bits_set(T v) -{ - v = v - ((v >> 1) & (T)~(T)0/3); // temp - v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); // temp - v = (v + (v >> 4)) & (T)~(T)0/255*15; // temp - return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * CHAR_BIT; // count -} -/** - * Get index of lowest set bit in an integral type - */ -using boost::lowest_bit; } /* end ns PwrNlp */ diff --git a/tagset-tool/main.cpp b/tagset-tool/main.cpp index efabbf93209956e6a97e0fcd85d5120353043ffd..0462e8e8f31d92b33fdac47e5864c75ab29091f3 100644 --- a/tagset-tool/main.cpp +++ b/tagset-tool/main.cpp @@ -121,12 +121,12 @@ void tagset_query_cb(const Corpus2::Tagset& tagset, const std::string& s) } } std::cout << "\n"; - } else if (val) { + } else if (val.any()) { Corpus2::idx_t a = tagset.get_value_attribute_index(val); std::cout << s << " -> value -> attribute "; std::cout << tagset.attribute_dictionary().get_string(a); std::cout << " ."; - foreach (Corpus2::idx_t v, tagset.get_attribute_values(a)) { + foreach (Corpus2::mask_t v, tagset.get_attribute_values(a)) { std::cout << " " << tagset.get_value_name(v); } std::cout << "\nIn POSes:"; diff --git a/tests/basic.cpp b/tests/basic.cpp index e2117c39ed1043d1a20e40dcbe757cd4450ce9a0..d8a8e8ffba451d45b09748443d820a147316e4a2 100644 --- a/tests/basic.cpp +++ b/tests/basic.cpp @@ -20,7 +20,7 @@ BOOST_AUTO_TEST_CASE( token_dup_lexemes ) { 
Corpus2::Token t(UnicodeString::fromUTF8("ZZ"), PwrNlp::Whitespace::ManySpaces); //Corpus2::Tagset tagset(tagsetstr1); - Corpus2::Tag t1(Corpus2::tagset_idx_t(0), Corpus2::idx_t(0)); + Corpus2::Tag t1(Corpus2::mask_t(0)); Corpus2::Lexeme l1(UnicodeString::fromUTF8("aaa"), t1); Corpus2::Lexeme l2(UnicodeString::fromUTF8("bbb"), t1); BOOST_CHECK(!t.check_duplicate_lexemes());