diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt index 3cf132509924ddcdf060b5915fee5a6f04323cce..e6873f90dbf24a466b871192bf2aa983a9089f76 100644 --- a/libcorpus2/CMakeLists.txt +++ b/libcorpus2/CMakeLists.txt @@ -2,8 +2,8 @@ PROJECT(corpus2) set(corpus2_ver_major "0") -set(corpus2_ver_minor "0") -set(corpus2_ver_patch "2") +set(corpus2_ver_minor "1") +set(corpus2_ver_patch "0") if(NOT LIBCORPUS2_SRC_DATA_DIR) diff --git a/libcorpus2/lexeme.cpp b/libcorpus2/lexeme.cpp index cd5361328ed333c2f5b30770c86cc8004325203f..7c1a8355611102516621c68354e9523994833096 100644 --- a/libcorpus2/lexeme.cpp +++ b/libcorpus2/lexeme.cpp @@ -19,7 +19,7 @@ Lexeme Lexeme::create(const UnicodeString& lemma, const Tag& tag) bool Lexeme::is_null() const { - return lemma().length() == 0 || !tag().has_valid_tagset(); + return lemma().length() == 0 || tag().is_null(); } bool Lexeme::operator<(const Lexeme& other) const diff --git a/libcorpus2/tag.cpp b/libcorpus2/tag.cpp index d3f6fe4506cbb46e09cb3278015273c33603bf32..945b5969de08c1d57d4144bdf3bb8f796f025620 100644 --- a/libcorpus2/tag.cpp +++ b/libcorpus2/tag.cpp @@ -2,74 +2,58 @@ #include <libcorpus2/tagsetmanager.h> #include <libpwrutils/foreach.h> +#include <libpwrutils/util.h> #include <cstring> #include <sstream> #include <boost/functional/hash.hpp> -namespace Corpus2 { - -Tag::Tag() - : pos_id_(-1), tagset_id_(-1) -{ -} +#include <bitset> -Tag::Tag(tagset_idx_t tagset_id, pos_idx_t pos) - : pos_id_(pos), tagset_id_(tagset_id) -{ -} +namespace Corpus2 { -Tag::Tag(tagset_idx_t tagset_id, pos_idx_t pos, - const std::vector<value_idx_t> &values) - : pos_id_(pos), values_(values), tagset_id_(tagset_id) +int Tag::pos_count() const { + return PwrNlp::count_bits_set(pos_); } -bool Tag::has_valid_tagset() const +int Tag::get_pos_index() const { - return tagset_id_ != static_cast<tagset_idx_t>(-1) - && TagsetManagerSingleton::Instance().get_cache_entry(tagset_id()); + if (pos_ == 0) return -1; + return PwrNlp::lowest_bit(pos_); } 
std::string Tag::raw_dump() const { std::ostringstream ss; ss << "["; - ss << static_cast<int>(tagset_id_) << "#" << static_cast<int>(pos_id_); - foreach (value_idx_t v, values_) { - ss << ":" << static_cast<int>(v) ; - } + std::bitset<sizeof(mask_t) * CHAR_BIT> binaryp(pos_); + std::bitset<sizeof(mask_t) * CHAR_BIT> binaryv(values_); + //ss << static_cast<int>(tagset_id_); + ss << "" << pos_; + ss << ":" << values_; ss << "]"; return ss.str(); } bool Tag::operator<(const Tag& other) const { - return tagset_id_ < other.tagset_id_ - || (tagset_id_ == other.tagset_id_ - && (pos_id_ < other.pos_id_ - || (pos_id_ == other.pos_id_ - && (values_.size() < other.values_.size() - || (values_.size() == other.values_.size() - && memcmp(&values_[0], &other.values_[0], - std::min(values_.size(), - other.values_.size())) < 0))))); + return pos_ < other.pos_ || + (pos_ == other.pos_ && + values_ < other.values_); } bool Tag::operator ==(const Tag& other) const { - return tagset_id_ == other.tagset_id_ - && pos_id_ == other.pos_id_ - && values_ == other.values_; + return pos_ == other.pos_ && values_ == other.values_; } size_t hash_value(const Tag& tag) { std::size_t seed = 0; - boost::hash_combine(seed, tag.pos_id_); - boost::hash_combine(seed, tag.tagset_id_); - boost::hash_combine(seed, tag.values_); + boost::hash_combine(seed, tag.get_pos()); + boost::hash_combine(seed, tag.get_values()); return seed; } diff --git a/libcorpus2/tag.h b/libcorpus2/tag.h index 6db43d2aac0f6aaa74ff44d956ad18d4e824d229..716188d10c7bcd3bb75363936335beea1f53bffe 100644 --- a/libcorpus2/tag.h +++ b/libcorpus2/tag.h @@ -3,20 +3,19 @@ #include <string> #include <vector> - +#include <cassert> #include <boost/cstdint.hpp> #include <boost/strong_typedef.hpp> #include <boost/operators.hpp> +#include <libpwrutils/bitset.h> namespace Corpus2 { class Tagset; /// Typedefs for the string -> index mappings -typedef boost::uint8_t idx_t; -BOOST_STRONG_TYPEDEF(idx_t, pos_idx_t); -BOOST_STRONG_TYPEDEF(idx_t, 
attribute_idx_t); -BOOST_STRONG_TYPEDEF(idx_t, value_idx_t); +typedef boost::int8_t idx_t; +typedef PwrNlp::bitset<64> mask_t; BOOST_STRONG_TYPEDEF(boost::uint32_t, tagset_idx_t); /** @@ -31,52 +30,100 @@ BOOST_STRONG_TYPEDEF(boost::uint32_t, tagset_idx_t); * This allows more sanity checking, esp. during tagset conversion. */ class Tag - : boost::equality_comparable<Tag>, boost::less_than_comparable<Tag> +// : boost::equality_comparable<Tag>, boost::less_than_comparable<Tag> { public: /// Empty tag constructor - Tag(); + Tag() + : pos_(0), values_(0) + { + } /// Tagset-and-POS (no values) constructor - Tag(tagset_idx_t tagset_id, pos_idx_t pos); + explicit Tag(mask_t pos) + : pos_(pos), values_(0) + { + } /// Tagset-POS-values constructor - Tag(tagset_idx_t tagset_id, pos_idx_t pos, - const std::vector<value_idx_t>& values); + Tag(mask_t pos, mask_t values) + : pos_(pos), values_(values) + { + } + + bool is_null() const { + return pos_ == 0 && values_ == 0; + } + + int pos_count() const; + + int get_pos_index() const; /// POS (part-of-speech) accesor - pos_idx_t pos_id() const { - return pos_id_; + mask_t get_pos() const { + return pos_; } /// POS setter - void set_pos_id(pos_idx_t v) { - pos_id_ = v; + void set_pos(mask_t v) { + pos_ = v; + } + + void add_pos(mask_t v) { + pos_ |= v; } /// values accesor - const std::vector<value_idx_t>& values() const { + mask_t get_values() const { return values_; } + mask_t get_values_for(mask_t mask) const { + return values_ & mask; + } + /// values accesor -- nonconst reference - std::vector<value_idx_t>& values() { - return values_; + void set_values(mask_t v) { + values_ = v; } - /// debug aid, dump the tag's internal numeric representation - std::string raw_dump() const; + void add_values(mask_t v) { + values_ |= v; + } + + void add_values_masked(mask_t value, mask_t mask) { + //values_ = (values_ & ~mask) | (value & mask); + //see http://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge + values_ = values_ ^ 
((values_ ^ value) & mask); + } + + Tag& combine_with(const Tag& other) { + pos_ |= other.pos_; + values_ |= other.values_; + return *this; + } + + Tag get_combined(const Tag& other) const { + Tag t(*this); + return t.combine_with(other); + } - /// tagset id accesor - tagset_idx_t tagset_id() const { - return tagset_id_; + Tag& mask_with(const Tag& other) { + pos_ &= other.pos_; + values_ &= other.values_; + return *this; } - bool has_valid_tagset() const; + Tag get_masked(const Tag& other) const { + Tag t(*this); + return t.mask_with(other); + } + + /// debug aid, dump the tag's internal numeric representation + std::string raw_dump() const; /** - * Tag comparison. Tags sort by tagset id, then pos id, and finally - * value-by-value. Boost is used to provide other comparison operators. + * Tag comparison. */ bool operator<(const Tag& other) const; @@ -87,17 +134,15 @@ public: private: /// the POS id - pos_idx_t pos_id_; -\ - /// the values - std::vector<value_idx_t> values_; + mask_t pos_; - /// the tagset id - tagset_idx_t tagset_id_; + /// the values + mask_t values_; - friend size_t hash_value(const Tag &tag); }; +size_t hash_value(const Tag &tag); + } /* end ns Corpus2 */ diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp index dd193b397e6061e29f6bb75cf3427b141d58cf06..538d2c240a764256ec904bd25caa4f3a73c0fc43 100644 --- a/libcorpus2/tagset.cpp +++ b/libcorpus2/tagset.cpp @@ -6,11 +6,13 @@ #include <libcorpus2/tagsetparser.h> #include <libpwrutils/foreach.h> +#include <libpwrutils/util.h> #include <boost/algorithm/string.hpp> #include <boost/strong_typedef.hpp> #include <boost/algorithm/string.hpp> #include <boost/bind.hpp> +#include <boost/pending/lowest_bit.hpp> #include <sstream> #include <iostream> @@ -71,12 +73,11 @@ Tagset::Tagset() { } -Tagset::Tagset(const char *s) - : id_(++next_id_) +Tagset Tagset::from_data(const char *s) { std::stringstream ss; ss << s; - *this = TagsetParser::load_ini(ss); + return TagsetParser::load_ini(ss); } std::string 
Tagset::id_string() const @@ -104,19 +105,19 @@ void Tagset::parse_tag(const string_range &s, bool allow_extra, namespace { void append_to_multi_tag( - std::vector< std::vector<value_idx_t> > & current, - const std::vector<value_idx_t> & to_add) + std::vector< mask_t > & current, + const std::vector<mask_t> & to_add, mask_t to_add_attr) { - foreach (std::vector<value_idx_t>& o, current) { - o.push_back(to_add[0]); - } size_t current_size = current.size(); for (size_t ai = 1; ai < to_add.size(); ++ai) { for (size_t oi = 0; oi < current_size; ++oi) { current.push_back(current[oi]); - current.back().back() = to_add[ai]; + current.back() = (current.back() & ~to_add_attr) | to_add[ai]; } } + for (size_t i = 0; i < current_size; ++i) { + current[i] |= to_add[0]; + } } } @@ -126,41 +127,54 @@ void Tagset::parse_tag(const string_range_vector &fields, bool allow_extra, if (fields.empty()) { throw TagParseError("No POS", "", "", id_string()); } - pos_idx_t pos_id = pos_dict_.get_id(fields[0]); - if (!pos_dict_.is_id_valid(pos_id)) { + idx_t pos_idx = get_pos_index(fields[0]); + if (pos_idx < 0) { throw TagParseError("Invalid POS", boost::copy_range<std::string>(fields[0]), "", id_string()); } - std::vector< std::vector<value_idx_t> > opts(1); + std::vector< mask_t > all_variants; + all_variants.push_back(0); for (size_t fi = 1; fi < fields.size(); ++fi) { const string_range& r = fields[fi]; if (r.size() != 1 || *r.begin() != '_') { string_range_vector dots; boost::algorithm::split(dots, r, boost::is_any_of(".")); - std::vector<value_idx_t> values; + std::vector<mask_t> values; + mask_t amask; foreach (string_range& dot, dots) { - value_idx_t v = value_dict_.get_id(dot); - if (!value_dict_.is_id_valid(v)) { + mask_t v = get_value_mask(boost::copy_range<std::string>(dot)); + mask_t curr = get_attribute_mask(get_value_attribute(v)); + + + if (amask.none()) { + amask = curr; + } else if (amask != curr) { + throw TagParseError("Values from two attributes split by dot", + 
boost::copy_range<std::string>(r), "", + id_string()); + } + if (v.none()) { throw TagParseError("Unknown attribute value", boost::copy_range<std::string>(r), "", id_string()); } values.push_back(v); } - append_to_multi_tag(opts, values); + append_to_multi_tag(all_variants, values, amask); } else if (!r.empty()) { // underscore handling - if (fi - 1 >= pos_attributes_[pos_id].size()) { + if (fi - 1 >= pos_attributes_[pos_idx].size()) { throw TagParseError( "Underscore beyond last attribute for this POS", "", "", id_string()); } - attribute_idx_t attr = pos_attributes_[pos_id][fi - 1]; - append_to_multi_tag(opts, attribute_values_[attr]); + idx_t attr = pos_attributes_[pos_idx][fi - 1]; + mask_t amask = get_attribute_mask(attr); + append_to_multi_tag(all_variants, attribute_values_[attr], amask); } // else empty, do nothing } - foreach (std::vector<value_idx_t>& opt, opts) { - sink(make_tag(pos_id, opt, allow_extra)); + foreach (mask_t variant, all_variants) { + sink(make_tag(pos_idx, variant, allow_extra)); } } @@ -196,108 +210,101 @@ Tag Tagset::parse_simple_tag(const string_range_vector &ts, throw TagParseError("Empty POS+attribute list", "", "", id_string()); } - pos_idx_t pos_id = pos_dict_.get_id(ts[0]); - if (!pos_dict_.is_id_valid(pos_id)) { + idx_t pos_idx = get_pos_index(ts[0]); + if (pos_idx < 0) { throw TagParseError("Invalid POS", boost::copy_range<std::string>(ts[0]), "", id_string()); } - const std::vector<bool>& valid_attrs_mask = - get_pos_valid_attributes(pos_id); - Tag tag(id_, pos_id); - std::vector<value_idx_t> vvv(attribute_dict_.size(), - static_cast<value_idx_t>(0)); - tag.values().swap(vvv); - + mask_t values = 0; for (size_t i = 1; i < ts.size(); ++i) { if (!ts[i].empty()) { - value_idx_t val_id = value_dict_.get_id(ts[i]); - if (!value_dict_.is_id_valid(val_id)) { - attribute_idx_t a = attribute_dict_.get_id(ts[i]); - if (attribute_dict_.is_id_valid(a)) { - tag.values()[a] = 0; + mask_t val = 
get_value_mask(boost::copy_range<std::string>(ts[i])); + if (val == 0) { + mask_t a = get_attribute_mask(ts[i]); + if (a != 0) { + values &= (~a); } else { throw TagParseError("Unknown attribute value", boost::copy_range<std::string>(ts[i]), "", id_string()); } } else { - attribute_idx_t attr_id = get_value_attribute(val_id); - if (valid_attrs_mask[attr_id] || allow_extra) { - tag.values()[attr_id] = val_id; - } + mask_t a = get_attribute_mask(get_value_attribute(val)); + values = (values & ~a) | val; } } } - return tag; + + return make_tag(pos_idx, values, allow_extra); } -Tag Tagset::make_tag(pos_idx_t pos, const std::vector<value_idx_t>& values, - bool allow_extra) const +Tag Tagset::make_tag(idx_t pos_idx, mask_t values, bool allow_extra) const { - const std::vector<bool>& valid_attrs_mask = - get_pos_valid_attributes(pos); - Tag tag(id_, pos); - std::vector<value_idx_t> vvv(attribute_dict_.size(), - static_cast<value_idx_t>(0)); - tag.values().swap(vvv); - - for (size_t i = 0; i < values.size(); ++i) { - value_idx_t val_id = values[i]; - attribute_idx_t attr_id = get_value_attribute(val_id); - if (valid_attrs_mask[attr_id] || allow_extra) { - tag.values()[attr_id] = val_id; - } else { - throw TagParseError("Attribute not valid for this POS", - attribute_dict_.get_string(attr_id), - pos_dict_.get_string(pos), id_string()); - } + mask_t required_values = get_pos_required_mask(pos_idx); + //std::cerr << values << "\n"; + //std::cerr << required_values << "\n"; + //std::cerr << (required_values & values) << "\n"; + //std::cerr << PwrNlp::count_bits_set(required_values & values) + // << " of " << pos_required_attributes_idx_[pos_idx].size() << "\n"; + size_t has_req = PwrNlp::count_bits_set(required_values & values); + if (has_req != pos_required_attributes_idx_[pos_idx].size()) { + throw TagParseError("Required attribute missing", + tag_to_string(Tag(get_pos_mask(pos_idx), values)), + get_pos_name(pos_idx), id_string()); + } + mask_t valid_values = 
get_pos_value_mask(pos_idx); + mask_t invalid = values & ~valid_values; + if (invalid.any() && !allow_extra) { + mask_t first_invalid = PwrNlp::lowest_bit(invalid); + throw TagParseError("Attribute not valid for this POS", + get_value_name(first_invalid), + get_pos_name(pos_idx), id_string()); } - return tag; + // check singularity? + return Tag(get_pos_mask(pos_idx), values); } Tag Tagset::make_ign_tag() const { - pos_idx_t ign_pos = pos_dictionary().get_id("ign"); - assert(pos_dictionary().is_id_valid(ign_pos)); - Tag tag(id_, ign_pos); - tag.values().resize(attribute_dict_.size(), - static_cast<value_idx_t>(0)); - return tag; + mask_t ign_pos_mask = get_pos_mask("ign"); + assert(ign_pos_mask.any()); + return Tag(ign_pos_mask); } bool Tagset::validate_tag(const Tag &t, bool allow_extra, std::ostream* os) const { - if (!pos_dict_.is_id_valid(t.pos_id())) { + if (t.pos_count() != 1) { if (os) { - (*os) << " POS not valid : " << (int) t.pos_id(); + (*os) << " POS not singular : " << t.pos_count(); } return false; } - std::vector<bool> valid = get_pos_valid_attributes(t.pos_id()); - std::vector<bool> required = get_pos_required_attributes(t.pos_id()); - if (t.values().size() < attribute_dict_.size()) { + size_t ts = tag_size(t); + if (ts != 1) { if (os) { - (*os) << " Values size below tagset attribute count: " - << t.values().size() << "<" << attribute_dict_.size(); + (*os) << " Tag not singular : " << ts; } return false; } - if (!allow_extra && t.values().size() > attribute_dict_.size()) { + + idx_t pos_idx = t.get_pos_index(); + if (!pos_dict_.is_id_valid(pos_idx)) { if (os) { - (*os) << " Values size above tagset attribute count" - << t.values().size() << ">" << attribute_dict_.size(); + (*os) << " POS not valid : " << (int)pos_idx; } return false; } - for (attribute_idx_t i = static_cast<attribute_idx_t>(0); - i < t.values().size(); ++i) { - value_idx_t v = t.values()[i]; - if (v == 0) { + std::vector<bool> valid = get_pos_attributes_flag(pos_idx); + 
std::vector<bool> required = get_pos_required_attributes(pos_idx); + + for (idx_t i = 0; i < attribute_count(); ++i) { + mask_t value = t.get_values_for(get_attribute_mask(i)); + if (value == 0) { if (required[i]) { if (os) { - (*os) << " Required attribuite " - << attribute_dictionary().get_string(i) + (*os) << " red attribuite " + << get_attribute_name(i) << " missing"; } return false; @@ -306,27 +313,9 @@ bool Tagset::validate_tag(const Tag &t, bool allow_extra, if (!valid[i] && !allow_extra) { if (os) { (*os) << " Extra attribute value: " - << value_dictionary().get_string(v) + << get_value_name(value) << " (attribute " - << attribute_dictionary().get_string(i) << ")"; - } - return false; - } - if (!value_dict_.is_id_valid(v)) { - if (os) { - (*os) << " Invalid value at attribite " - << attribute_dictionary().get_string(i); - } - return false; - } - attribute_idx_t a = value_attribute_[v]; - if (a != i) { - if (os) { - (*os) << " Value does not match attribute, got " - << value_dictionary().get_string(v) << " (" - << attribute_dictionary().get_string(a) << ") in" - << attribute_dictionary().get_string(i) - << "'s position"; + << get_attribute_name(i) << ")"; } return false; } @@ -338,23 +327,25 @@ bool Tagset::validate_tag(const Tag &t, bool allow_extra, std::string Tagset::tag_to_string(const Tag &tag) const { std::ostringstream ss; - ss << pos_dict_.get_string(tag.pos_id()); - const std::vector<attribute_idx_t>& attrs = - get_pos_attributes(tag.pos_id()); - foreach (const attribute_idx_t& a, attrs) { - if (pos_required_attributes_[tag.pos_id()][a] || - tag.values()[a] > 0) { + idx_t pos_idx = tag.get_pos_index(); + ss << get_pos_name(pos_idx); + const std::vector<idx_t>& attrs = get_pos_attributes(pos_idx); + foreach (const idx_t& a, attrs) { + mask_t value = tag.get_values_for(get_attribute_mask(a)); + if (pos_requires_attribute(pos_idx, a) || value.any()) { ss << ":"; - if (tag.values()[a] > 0) { - ss << value_dict_.get_string(tag.values()[a]); + if 
(value.any()) { + ss << get_value_name(value); } } } // print extra attributes - for (size_t i = 0; i < attribute_dict_.size(); ++i) { - if (tag.values()[i] > 0 && - !pos_valid_attributes_[tag.pos_id()][i]) { - ss << ":" << value_dict_.get_string(tag.values()[i]); + for (idx_t a = 0; a < attribute_count(); ++a) { + if (!pos_has_attribute(pos_idx, a)) { + mask_t value = tag.get_values_for(get_attribute_mask(a)); + if (value.any()) { + ss << ":" << get_value_name(value); + } } } return ss.str(); @@ -363,65 +354,247 @@ std::string Tagset::tag_to_string(const Tag &tag) const std::string Tagset::tag_to_no_opt_string(const Tag &tag) const { std::ostringstream ss; - ss << pos_dict_.get_string(tag.pos_id()); - const std::vector<attribute_idx_t>& attrs = - get_pos_attributes(tag.pos_id()); - foreach (const attribute_idx_t& a, attrs) { + idx_t pos_idx = tag.get_pos_index(); + ss << get_pos_name(pos_idx); + const std::vector<idx_t>& attrs = get_pos_attributes(pos_idx); + foreach (const idx_t& a, attrs) { + mask_t value = tag.get_values_for(get_attribute_mask(a)); ss << ":"; - if (tag.values()[a] > 0) { - ss << value_dict_.get_string(tag.values()[a]); + if (value.any()) { + ss << get_value_name(value); } else { - ss << attribute_dict_.get_string(a); + ss << get_attribute_name(a); } } return ss.str(); } -attribute_idx_t Tagset::get_value_attribute(value_idx_t id) const +size_t Tagset::tag_size(const Tag& tag) const { - if (!value_dict_.is_id_valid(id)) { - std::stringstream ss; - ss << "get_value_attribute fail " << (int)id; - throw Corpus2Error(ss.str()); + size_t s = PwrNlp::count_bits_set(tag.get_pos()); + foreach (mask_t attribute_mask, all_attribute_masks()) { + mask_t values = tag.get_values_for(attribute_mask); + size_t x = PwrNlp::count_bits_set(values); + if (x > 1) { + s *= x; + } } - return value_attribute_[id]; + return s; } -const std::vector<value_idx_t>& Tagset::get_attribute_values( - attribute_idx_t a) const +bool Tagset::tag_is_singular(const Tag& tag) const { 
- assert(attribute_dict_.is_id_valid(a)); - return attribute_values_[a]; + if (PwrNlp::count_bits_set(tag.get_pos()) != 1) return false; + foreach (mask_t attribute_mask, all_attribute_masks()) { + mask_t values = tag.get_values_for(attribute_mask); + if (PwrNlp::count_bits_set(values) > 1) return false; + } + return true; } -const std::vector<attribute_idx_t>& Tagset::get_pos_attributes( - pos_idx_t pos) const +std::vector<Tag> Tagset::split_tag(const Tag& tag) const +{ + std::vector<Tag> tags; + mask_t pos = tag.get_pos(); + while (pos.any()) { + idx_t pos_idx = PwrNlp::lowest_bit(pos); + mask_t pos_mask = static_cast<mask_t>(1) << pos_idx; + pos ^= pos_mask; + tags.push_back(Tag(pos_mask)); + } + + for (idx_t a = 0; a < attribute_count(); ++a) { + mask_t ma = get_attribute_mask(a); + mask_t v = tag.get_values_for(ma); + if (ma.any()) { + bool dup = false; + size_t sz = tags.size(); + foreach (mask_t vm, get_attribute_values(a)) { + if ((v & vm).any()) { + if (dup) { + for (size_t i = 0; i < sz; ++i) { + tags.push_back(tags[i]); + } + } + dup = true; + for (size_t i = 0; i < sz; ++i) { + tags[i].add_values(vm); + } + } + } + } + } + return tags; +} + +idx_t Tagset::get_pos_index(const string_range& pos) const +{ + return pos_dict_.get_id(pos); +} + +const std::string& Tagset::get_pos_name(idx_t pos) const +{ + return pos_dict_.get_string(pos); +} + +const std::string& Tagset::get_pos_name(mask_t pos) const +{ + return pos_dict_.get_string(get_pos_index(pos)); +} + +mask_t Tagset::get_pos_mask(const string_range& pos) const +{ + return get_pos_mask(get_pos_index(pos)); +} + +mask_t Tagset::get_pos_mask(idx_t pos) const +{ + if (pos >= 0) { + return static_cast<mask_t>(1) << pos; + } else { + return 0; + } +} + +idx_t Tagset::get_pos_index(mask_t pos) const +{ + if (pos.none()) { + return -1; + } else { + return PwrNlp::lowest_bit(pos); + } +} + +idx_t Tagset::get_attribute_index(const string_range& a) const +{ + return attribute_dict_.get_id(a); +} + +const 
std::string& Tagset::get_attribute_name(idx_t a) const +{ + return attribute_dict_.get_string(a); +} + +const std::vector<mask_t>& Tagset::get_attribute_values(idx_t a) const +{ + static std::vector<mask_t> null_vec; + if (a < 0 || a >= attribute_count()) { + return null_vec; + } else { + return attribute_values_[a]; + } +} + +mask_t Tagset::get_attribute_mask(idx_t a) const +{ + if (a < 0 || a >= attribute_count()) { + return 0; + } else { + return attribute_masks_[a]; + } +} + +mask_t Tagset::get_attribute_mask(const string_range& a) const +{ + return get_attribute_mask(get_attribute_index(a)); +} + +mask_t Tagset::get_value_mask(const std::string& v) const +{ + std::map<std::string, mask_t>::const_iterator ci; + ci = string_to_value_mask_.find(v); + if (ci == string_to_value_mask_.end()) { + return 0; + } else { + return ci->second; + } +} + +const std::string& Tagset::get_value_name(mask_t v) const +{ + static std::string nullstr; + std::map<mask_t, std::string>::const_iterator ci; + ci = value_mask_to_string_.find(v); + if (ci == value_mask_to_string_.end()) { + return nullstr; + } else { + return ci->second; + } +} + +idx_t Tagset::get_value_attribute(mask_t v) const +{ + std::map<mask_t, idx_t>::const_iterator ci; + ci = value_mask_to_attribute_index_.find(v); + if (ci == value_mask_to_attribute_index_.end()) { + return -1; + } else { + return ci->second; + } +} + +const std::vector<idx_t>& Tagset::get_pos_attributes(idx_t pos) const { assert(pos_dict_.is_id_valid(pos)); return pos_attributes_[pos]; } -const std::vector<bool>& Tagset::get_pos_valid_attributes( - pos_idx_t pos) const +const std::vector<bool>& Tagset::get_pos_attributes_flag( + idx_t pos) const { assert(pos_dict_.is_id_valid(pos)); return pos_valid_attributes_[pos]; } const std::vector<bool>& Tagset::get_pos_required_attributes( - pos_idx_t pos) const + idx_t pos) const { assert(pos_dict_.is_id_valid(pos)); return pos_required_attributes_[pos]; } +bool Tagset::pos_requires_attribute(idx_t pos, 
idx_t attribute) const +{ + return pos_required_attributes_[pos][attribute]; +} + +bool Tagset::pos_has_attribute(idx_t pos, idx_t attribute) const +{ + return pos_valid_attributes_[pos][attribute]; +} + +mask_t Tagset::get_pos_value_mask(idx_t pos) const +{ + return pos_valid_value_masks_[pos]; +} + +mask_t Tagset::get_pos_required_mask(idx_t pos) const +{ + return pos_required_value_masks_[pos]; +} + +int Tagset::pos_count() const +{ + return pos_dict_.size(); +} + +int Tagset::attribute_count() const +{ + return attribute_dict_.size(); +} + +int Tagset::value_count() const +{ + return value_mask_to_string_.size(); +} + size_t Tagset::size() const { size_t sum = 0; for (size_t p = 0; p < pos_dict_.size(); ++p) { size_t pos_size = 1; for (size_t i = 0; i < pos_attributes_[p].size(); ++i) { - attribute_idx_t a = pos_attributes_[p][i]; + idx_t a = pos_attributes_[p][i]; if (pos_required_attributes_[p][a]) { pos_size *= attribute_values_[a].size(); } else { @@ -461,14 +634,14 @@ void Tagset::lexemes_into_token(Token& tok, const UnicodeString& lemma, } } -size_t Tagset::get_original_pos_index(pos_idx_t pos) const +int Tagset::get_original_pos_index(idx_t pos) const { - std::map<pos_idx_t, size_t>::const_iterator i = + std::map<idx_t, int>::const_iterator i = original_pos_indices_.find(pos); if (i != original_pos_indices_.end()) { return i->second; } else { - return static_cast<size_t>(-1); + return -1; } } diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h index 1ec350e12d8559ed9b5d8e2306450b067996cd81..44cfbf25bb4effbc1906e070ef545cc57a1a7d2b 100644 --- a/libcorpus2/tagset.h +++ b/libcorpus2/tagset.h @@ -99,10 +99,10 @@ public: Tagset(); /** - * Tagset convenience constructor, parse a string as if it were the + * Tagset convenience creation function, parse a string as if it were the * contents of a tagset ini file */ - explicit Tagset(const char*); + static Tagset from_data(const char*); /** * Tag parsing -- functional version, whole tag string. 
@@ -210,8 +210,7 @@ public: * The values are assumed to be valid in this tagset, but are checked * for correctness with regards to the POS. */ - Tag make_tag(pos_idx_t pos, const std::vector<value_idx_t>& values, - bool allow_extra) const; + Tag make_tag(idx_t pos, mask_t values, bool allow_extra) const; /** * Convenience function for creating a 'ign' (ignored) tag within this @@ -251,39 +250,144 @@ public: */ std::string tag_to_no_opt_string(const Tag &tag) const; + /** + * Compute the number of singular tags that can be represented by the given + * tag, with the following restrictions: + * - the tags must be sub-tags of the given tag + * - the tags must have a value for every attribute where the given tag has + * a non-zero value + * + * @returns 0 if the tag is null, 1 if the tag is singular, otherwise the + * number of different singular tags conforming to the + * restrictions above that can be constructed from the given tag. + */ + size_t tag_size(const Tag& tag) const; + + /** + * Check if a tag actually represents only one tag. + * + * A tag is singular if it: + * - has exactly one POS bit set + * - has at most one bit set in each attribute. + * Note that the tag might be invalid, this is not checked. + */ + bool tag_is_singular(const Tag& tag) const; + + /** + * Split a tag into a vector of singular tags. Validity is not checked. + * + * @see tag_size + * + * @returns a vector of tags, each of which is singular, with size equal + * to tag_size called on the tag being split. Each returned tag + * in the resulting vector is a sub-tag of the original tag, and + * all not-empty attributes of the original tag are not empty in + * the split tag as well. 
+ */ + std::vector<Tag> split_tag(const Tag& tag) const; + /// POS name <-> index dictionary getter - const SymbolDictionary<pos_idx_t>& pos_dictionary() const { + const SymbolDictionary<idx_t>& pos_dictionary() const { return pos_dict_; } /// attribute name <-> index dictionary getter - const SymbolDictionary<attribute_idx_t>& attribute_dictionary() const { + const SymbolDictionary<idx_t>& attribute_dictionary() const { return attribute_dict_; } - /// value name <-> index dictionary getter - const SymbolDictionary<value_idx_t>& value_dictionary() const { - return value_dict_; - } + /// POS name -> index mapping + /// @returns -1 on invalid name + idx_t get_pos_index(const string_range& pos) const; + + /// POS index -> name + /// @returns empty string on invalid index + const std::string& get_pos_name(idx_t pos) const; + + /// POS mask -> name + /// @returns empty string on invalid index + const std::string& get_pos_name(mask_t pos) const; + + /// POS name -> mask mapping + /// @return null mask on invalid name + mask_t get_pos_mask(const string_range& pos) const; + + /// POS index -> mask mapping + /// @return null mask on invalid index + mask_t get_pos_mask(idx_t pos) const; + + /// POS mask -> index mapping + /// @return -1 on empty mask, unspecified if more than one POS is set + idx_t get_pos_index(mask_t pos) const; + + + /// Attribute name -> index mapping + /// @returns -1 on invalid name + idx_t get_attribute_index(const string_range& a) const; + + /// Attribute index -> name + /// @returns empty string on invalid index + const std::string& get_attribute_name(idx_t pos) const; + + /// Value mask -> attribute index mapping. 
+ /// if the value mask contains values from more than one attribute, + /// behavior is not well defined + /// @return -1 on invalid mask + idx_t get_value_attribute(mask_t v) const; - /// Getter for the value -> attribute mapping - attribute_idx_t get_value_attribute(value_idx_t id) const; + /// Attribute index -> vector of valid value masks mapping + /// @return empty vector on invalid index + const std::vector<mask_t>& get_attribute_values(idx_t a) const; + + /// Attribute index -> combined value mask + /// @return null mask on invalid index + mask_t get_attribute_mask(idx_t a) const; + + /// Attribute name -> combined value mask + /// @return null mask on invalid name + mask_t get_attribute_mask(const string_range& a) const; + + + /// Value name -> mask + /// @returns null mask on invalid name + mask_t get_value_mask(const std::string& v) const; + + /// Value mask -> name + /// @returns empty string on invalid mask + const std::string& get_value_name(mask_t v) const; - /// Getter for the attribute -> valid values mapping - const std::vector<value_idx_t>& get_attribute_values( - attribute_idx_t a) const; /// Getter for the pos -> valid attributes (in order) mapping - const std::vector<attribute_idx_t>& get_pos_attributes( - pos_idx_t pos) const; + /// Valid attributes are both the required and optional attributes. + /// Generally the optional ones should be after the required ones. 
+ const std::vector<idx_t>& get_pos_attributes(idx_t pos) const; /// Getter for the pos -> valid attributes flag vector - const std::vector<bool>& get_pos_valid_attributes( - pos_idx_t pos) const; + const std::vector<bool>& get_pos_attributes_flag(idx_t pos) const; /// Getter for the pos -> required attributes flag vector - const std::vector<bool>& get_pos_required_attributes( - pos_idx_t pos) const; + const std::vector<bool>& get_pos_required_attributes(idx_t pos) const; + + /// @returns true if the given pos _requires_ the given attribute + bool pos_requires_attribute(idx_t pos, idx_t attribute) const; + + /// @returns true if the given pos _allows_ the given attribute + bool pos_has_attribute(idx_t pos, idx_t attribute) const; + + /// Getter for a mask covering all valid values for a given pos + mask_t get_pos_value_mask(idx_t pos) const; + + /// Getter for a mask covering all required attributes of a pos + mask_t get_pos_required_mask(idx_t pos) const; + + /// The number of POSes in this tagset + int pos_count() const; + + /// The number of attributes in this tagset + int attribute_count() const; + + /// The number of values in this tagset + int value_count() const; /** * Tagset cardinality counter -- the number of different valid tags @@ -341,7 +445,52 @@ public: } /// get the original index of the POS in the tagset definition - size_t get_original_pos_index(pos_idx_t pos) const; + int get_original_pos_index(idx_t pos) const; + + /// Helper iterator class for the mask ranges + struct mask_iterator + { + typedef mask_t value_type; + typedef std::forward_iterator_tag iterator_category; + typedef int difference_type; + typedef const mask_t *pointer; + typedef const mask_t &reference; + mask_iterator(const mask_iterator &i): i_(i.i_) {} + mask_iterator(const mask_t& i) : i_(i) {} + + mask_iterator &operator++() { i_ <<= 1; return *this; } + mask_iterator operator++(int) { return mask_iterator(i_ << 1); } + mask_iterator &operator--() { i_ >>= 1; return *this; } + 
mask_iterator operator--(int) { return mask_iterator(i_ >> 1); } + + const mask_t &operator*() const { return i_; } + + bool operator==(const mask_iterator &i) const { return i_ == i.i_; } + bool operator!=(const mask_iterator &i) const { return i_ != i.i_; } + + private: + mask_t i_; + }; + + /// Range getter for all the valid POS masks, in order, compatible with + /// boost's foreach + /// It is possible to use a foreach (mask_t m, tagset.all_*_masks()) {...} + boost::iterator_range<mask_iterator> all_pos_masks() const { + return boost::iterator_range<mask_iterator>(static_cast<mask_t>(1), + static_cast<mask_t>(1) << pos_count()); + } + + /// Range getter for all valid value masks + boost::iterator_range<mask_iterator> all_value_masks() const { + return boost::iterator_range<mask_iterator>(static_cast<mask_t>(1), + static_cast<mask_t>(1) << value_count()); + } + + /// Getter attribute masks + const std::vector<mask_t>& all_attribute_masks() const { + return attribute_masks_; + } + private: /// Temporary solution to allow splitting the parser into a separate @@ -358,34 +507,49 @@ private: static tagset_idx_t next_id_; /// String - number dictionary for the POS names - SymbolDictionary<pos_idx_t> pos_dict_; + SymbolDictionary<idx_t> pos_dict_; /// String - number dictionary for the attribute names - SymbolDictionary<attribute_idx_t> attribute_dict_; + SymbolDictionary<idx_t> attribute_dict_; - /// String - number dictionary for the attribute values - SymbolDictionary<value_idx_t> value_dict_; + /// Value names to masks + std::map<std::string, mask_t> string_to_value_mask_; + + /// Value masks to names + std::map<mask_t, std::string> value_mask_to_string_; /// The original indices of the POSes in the tagset definition - std::map<pos_idx_t, size_t> original_pos_indices_; + std::map<idx_t, int> original_pos_indices_; + + /// mapping from attribute indices to valid value masks + std::vector< std::vector<mask_t> > attribute_values_; - /// mapping from attribute indices 
to valid value indices - std::vector< std::vector<value_idx_t> > attribute_values_; + /// Attribute index to combined value mask + std::vector<mask_t> attribute_masks_; - /// reverse mapping, from a value index to the respective attribute + /// reverse mapping, from a value mask to the respective attribute /// index (values are assumed to be unique and not shared between /// attributes) - std::vector<attribute_idx_t> value_attribute_; + std::map<mask_t, idx_t> value_mask_to_attribute_index_; /// POS to valid attribute indices mapping /// The order of the attributes is important, as it affects string /// output and the behavior of the _ special character in parsing - std::vector< std::vector<attribute_idx_t> > pos_attributes_; + std::vector< std::vector<idx_t> > pos_attributes_; + + /// POS to required attribute indices + std::vector< std::vector<idx_t> > pos_required_attributes_idx_; + + /// POS to combined valid attribute value mask + std::vector<mask_t> pos_valid_value_masks_; + + /// POS to combined required attribute value mask + std::vector<mask_t> pos_required_value_masks_; - /// Flags for attributes which are valid for a given POS + /// Flags for attribute indices which are valid for a given POS std::vector< std::vector<bool> > pos_valid_attributes_; - /// Flags for attributes which are required for a given POS + /// Flags for attribute indices which are required for a given POS std::vector< std::vector<bool> > pos_required_attributes_; }; diff --git a/libcorpus2/tagsetparser.cpp b/libcorpus2/tagsetparser.cpp index 3d167cd172ada4f20e2a2bec8597507d20a6b3de..3cf0e9edabdde05b6af6600127e4bb089079012e 100644 --- a/libcorpus2/tagsetparser.cpp +++ b/libcorpus2/tagsetparser.cpp @@ -36,7 +36,7 @@ Tagset TagsetParser::load_ini(std::istream &is) std::set<std::string> symbols(values); typedef std::map< std::string, std::deque<std::string> > vmap_t; vmap_t vmap; - typedef std::map< std::string, std::vector<attribute_idx_t> > pmap_t; + typedef std::map< std::string, 
std::vector<idx_t> > pmap_t; pmap_t pmap; typedef std::map< std::string, std::vector<bool> > reqmap_t; reqmap_t reqmap; @@ -76,26 +76,32 @@ Tagset TagsetParser::load_ini(std::istream &is) } } - std::vector<std::string> vec; - std::copy(values.begin(), values.end(), - std::inserter(vec, vec.begin())); - if (vec[0] != "@null") { - throw TagsetParseError("First value not '@null'", line_no, vec[0]); + if (*values.begin() != "@null") { + throw TagsetParseError("First value not '@null'", line_no, + *values.begin()); } - tagset.value_dict_.load_sorted_data(vec); - vec.clear(); - tagset.value_attribute_.resize(values.size()); + mask_t current_value = 1; + std::vector<std::string> vec; + idx_t current_attribute_index = 0; foreach (const vmap_t::value_type v, vmap) { + mask_t attribute_mask = 0; vec.push_back(v.first); tagset.attribute_values_.resize( tagset.attribute_values_.size() + 1); foreach (const std::string& s, v.second) { - tagset.attribute_values_.back().push_back( - tagset.value_dict_.get_id(s)); - value_idx_t v = tagset.value_dict_.get_id(s); - tagset.value_attribute_[v] = vec.size() - 1; + tagset.attribute_values_.back().push_back(current_value); + tagset.value_mask_to_attribute_index_.insert( + std::make_pair(current_value, current_attribute_index)); + tagset.string_to_value_mask_.insert( + std::make_pair(s, current_value)); + tagset.value_mask_to_string_.insert( + std::make_pair(current_value, s)); + attribute_mask |= current_value; + current_value <<= 1; } + tagset.attribute_masks_.push_back(attribute_mask); + ++current_attribute_index; } tagset.attribute_dict_.load_sorted_data(vec); @@ -111,7 +117,7 @@ Tagset TagsetParser::load_ini(std::istream &is) throw TagsetParseError("Duplicate symbol", line_no, v[0]); } poses_plain.push_back(v[0]); - std::vector<attribute_idx_t>& pattrs = pmap[v[0]]; + std::vector<idx_t>& pattrs = pmap[v[0]]; std::vector<bool>& req_mask = reqmap[v[0]]; req_mask.resize(tagset.attribute_dict_.size()); v.pop_front(); @@ -122,7 +128,7 @@ 
Tagset TagsetParser::load_ini(std::istream &is) required = false; s = s.substr(1, s.size() - 2); } - attribute_idx_t a = tagset.attribute_dict_.get_id(s); + idx_t a = tagset.attribute_dict_.get_id(s); if (!tagset.attribute_dict_.is_id_valid(a)) { throw TagsetParseError("Attribute name invalid", line_no, s); @@ -137,20 +143,31 @@ Tagset TagsetParser::load_ini(std::istream &is) vec.clear(); foreach (const pmap_t::value_type v, pmap) { vec.push_back(v.first); + mask_t valid(0); + mask_t required(0); tagset.pos_attributes_.push_back(v.second); + tagset.pos_required_attributes_idx_.resize( + tagset.pos_required_attributes_idx_.size() + 1); tagset.pos_valid_attributes_.push_back( std::vector<bool>(tagset.attribute_values_.size(), false)); - foreach (attribute_idx_t a, v.second) { + foreach (idx_t a, v.second) { + valid |= tagset.get_attribute_mask(a); + if (reqmap[v.first][a]) { + required |= tagset.get_attribute_mask(a); + tagset.pos_required_attributes_idx_.back().push_back(a); + } tagset.pos_valid_attributes_.back()[a] = true; } tagset.pos_required_attributes_.push_back(reqmap[v.first]); + tagset.pos_valid_value_masks_.push_back(valid); + tagset.pos_required_value_masks_.push_back(required); } tagset.pos_dict_.load_sorted_data(vec); if (tagset.pos_dict_.size() == 0) { - throw TagsetParseError("No POS in tagset", 0, ""); + throw TagsetParseError("No POS in tagset", line_no, ""); } for (size_t i = 0; i < poses_plain.size(); ++i) { - pos_idx_t p = tagset.pos_dictionary().get_id(poses_plain[i]); + idx_t p = tagset.pos_dictionary().get_id(poses_plain[i]); tagset.original_pos_indices_.insert(std::make_pair(p,i)); } @@ -161,20 +178,20 @@ void TagsetParser::save_ini(const Tagset &tagset, std::ostream &os) { os << "# Autogenerated by Corpus2\n\n"; os << "[ATTR]\n"; - attribute_idx_t a(0); + idx_t a(0); while (tagset.attribute_dict_.is_id_valid(a)) { os << tagset.attribute_dict_.get_string(a) << "\t= "; - foreach (value_idx_t v, tagset.get_attribute_values(a)) { - os << 
tagset.value_dict_.get_string(v) << " "; + foreach (mask_t m, tagset.get_attribute_values(a)) { + os << tagset.get_value_name(m) << " "; } os << "\n"; ++a; } os << "\n[POS]\n"; - pos_idx_t p(0); + idx_t p(0); while (tagset.pos_dict_.is_id_valid(p)) { os << tagset.pos_dict_.get_string(p) << "\t= "; - foreach (attribute_idx_t a, tagset.get_pos_attributes(p)) { + foreach (idx_t a, tagset.get_pos_attributes(p)) { if (tagset.pos_required_attributes_[p][a]) { os << tagset.attribute_dict_.get_string(a) << " "; } else { diff --git a/libcorpus2/token.cpp b/libcorpus2/token.cpp index ac86d5c6886992ca0f5fd80bc4bb3430e356babb..1b07c3fd3fbec6f48561674aa6f215ad63668f20 100644 --- a/libcorpus2/token.cpp +++ b/libcorpus2/token.cpp @@ -37,9 +37,9 @@ struct preferred_lexeme_cmp bool operator()(const Lexeme& l1, const Lexeme& l2) const { return (!l1.is_disamb() && l2.is_disamb()) || (l1.is_disamb() == l2.is_disamb() - && (tagset->get_original_pos_index(l1.tag().pos_id()) > - tagset->get_original_pos_index(l2.tag().pos_id()) - || (l1.tag().pos_id() == l2.tag().pos_id() + && (tagset->get_original_pos_index(l1.tag().get_pos_index()) > + tagset->get_original_pos_index(l2.tag().get_pos_index()) + || (l1.tag().get_pos() == l2.tag().get_pos() && l1 < l2))); } }; @@ -89,14 +89,14 @@ bool Token::remove_duplicate_lexemes() return old_size != lexemes_.size(); } -bool Token::orth_pos_match(pos_idx_t pos, const UnicodeString &orth) const +bool Token::orth_pos_match(mask_t pos, const UnicodeString &orth) const { if (orth.length() > 0) { if (orth.caseCompare(orth_, 0) != 0) return false; } - if (pos != static_cast<pos_idx_t>(-1)) { + if (pos.any()) { foreach (const Lexeme& lex, lexemes_) { - if (lex.tag().pos_id() != pos) return false; + if (lex.tag().get_pos() != pos) return false; } } return true; diff --git a/libcorpus2/token.h b/libcorpus2/token.h index 6ea73cdc878daf9d556878bbc87cbd114dbea91f..2961eb7e32a6b7d99c5ea7e16e1ba98a4ff61747 100644 --- a/libcorpus2/token.h +++ b/libcorpus2/token.h @@ 
-117,7 +117,7 @@ public: * @returns true if the orth and lexemes pass the check, false * otherwise */ - bool orth_pos_match(pos_idx_t pos, const UnicodeString& orth) const; + bool orth_pos_match(mask_t pos, const UnicodeString& orth) const; private: /// The orth (actual encountered form) diff --git a/libcorpus2/util/symboldictionary.h b/libcorpus2/util/symboldictionary.h index f703a8f9899b470261a921e82991c4b98e59f593..f0027a95a472213f7b5da65365c7d71ac0e974bd 100644 --- a/libcorpus2/util/symboldictionary.h +++ b/libcorpus2/util/symboldictionary.h @@ -39,7 +39,7 @@ public: bool is_id_valid(IndexT idx) const; /// Getter for the size of this dictionary - size_t size() const; + size_t size() const; /** * Get the index for a given string identifier, const char* version. @@ -158,7 +158,7 @@ template <typename IndexT> const std::string& SymbolDictionary<IndexT>::get_string(IndexT id) const { size_t idx = static_cast<size_t>(id); - if (id < data_.size()) { + if (idx < data_.size()) { return data_[idx]; } else { return nullstr; diff --git a/libcorpus2/version.in b/libcorpus2/version.in index eacb822dc141beeb2d4bcdbbb11f1df08d411e59..87022b142a71294f04a51fad2b566e5e7c41d613 100644 --- a/libcorpus2/version.in +++ b/libcorpus2/version.in @@ -1,9 +1,9 @@ #ifndef LIBCORPUS2_VERSION_H #define LIBCORPUS2_VERSION_H -#define LIBCORPUS2_VERSION_MAJOR @ver_major@ -#define LIBCORPUS2_VERSION_MINOR @ver_minor@ -#define LIBCORPUS2_VERSION_PATCH @ver_patch@ +#define LIBCORPUS2_VERSION_MAJOR @corpus2_ver_major@ +#define LIBCORPUS2_VERSION_MINOR @corpus2_ver_minor@ +#define LIBCORPUS2_VERSION_PATCH @corpus2_ver_patch@ #define LIBCORPUS2_VERSION "@LIBCORPUS2_VERSION@" #endif diff --git a/libpwrutils/CMakeLists.txt b/libpwrutils/CMakeLists.txt index f6c221d3e2c9c495603043bbcda95ca8933d7a14..b7e137017971a96234fb7dc87fcf81b176bbeb6b 100644 --- a/libpwrutils/CMakeLists.txt +++ b/libpwrutils/CMakeLists.txt @@ -4,7 +4,7 @@ PROJECT(pwrutils) set(pwrutils_ver_major "0") set(pwrutils_ver_minor "0") 
-set(pwrutils_ver_patch "1") +set(pwrutils_ver_patch "2") set(LIBPWRUTILS_VERSION "${pwrutils_ver_major}.${pwrutils_ver_minor}.${pwrutils_ver_patch}") diff --git a/libpwrutils/bitset.h b/libpwrutils/bitset.h new file mode 100644 index 0000000000000000000000000000000000000000..fa4ddc3a420613a0e4994fdb01d8295738da006d --- /dev/null +++ b/libpwrutils/bitset.h @@ -0,0 +1,112 @@ +#ifndef PWRNLP_BITSET_H +#define PWRNLP_BITSET_H + +#include <libpwrutils/foreach.h> +#include <boost/range.hpp> +#include <bitset> +#include <boost/functional/hash.hpp> +#include <boost/pending/lowest_bit.hpp> +#include <climits> + + +namespace PwrNlp { + +using std::bitset; + +static const size_t ulong_bits = sizeof(unsigned long) * CHAR_BIT; + +typedef bitset<ulong_bits> word_bitset; + + +/** + * Count set bits in an integral type. + * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + */ +template <typename T> inline +int count_bits_set(T v) +{ + v = v - ((v >> 1) & (T)~(T)0/3); // temp + v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); // temp + v = (v + (v >> 4)) & (T)~(T)0/255*15; // temp + return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * CHAR_BIT; // count +} + +template <size_t S> inline +size_t count_bits_set(const std::bitset<S>& b) +{ + return b.count(); +} + +template <size_t S> inline +size_t lowest_bit(const bitset<S>& b) +{ + // GCC specific + return b._Find_first(); +} + +/** + * Get index of lowest set bit in an integral type + */ +inline size_t lowest_bit(const unsigned long long& t) +{ + if (t <= 0) return static_cast<size_t>(-1); + return boost::lowest_bit(t); +} + +inline size_t lowest_bit(const unsigned long& t) +{ + if (t <= 0) return static_cast<size_t>(-1); + return boost::lowest_bit(t); +} + +} /* end ns PwrNlp */ + +namespace std { + +template<size_t S> inline +size_t hash_value(bitset<S> b) +{ + size_t seed = 0; + const bitset<S> mask(std::numeric_limits<unsigned long>::max()); + while (b.any()) { + boost::hash_combine(seed, (b & 
mask).to_ulong()); + b >>= PwrNlp::ulong_bits; + } + return seed; +} + +template<> inline +size_t hash_value(bitset<PwrNlp::ulong_bits> b) +{ + size_t seed = 0; + boost::hash_combine(seed, b.to_ulong()); + return seed; +} + +template<size_t S> inline +bool operator<(bitset<S> left, bitset<S> right) +{ + const bitset<S> mask(std::numeric_limits<unsigned long>::max()); + while (left.any()) { + unsigned long l1 = (left & mask).to_ulong(); + unsigned long r1 = (right & mask).to_ulong(); + if (l1 < r1) { + return true; + } else if (l1 > r1) { + return false; + } + left >>= PwrNlp::ulong_bits; + right >>= PwrNlp::ulong_bits; + } + return right.any(); +} + +template<> inline +bool operator<(bitset<PwrNlp::ulong_bits> left, bitset<PwrNlp::ulong_bits> right) +{ + return left.to_ulong() < right.to_ulong(); +} + +} + +#endif // PWRNLP_BITSET_H diff --git a/libpwrutils/util.h b/libpwrutils/util.h index b1bb7fb3e6db2283e77e2ed520629977e6d4835c..52fdf1cac9bc480b52015f930bb82b0bc045cd1a 100644 --- a/libpwrutils/util.h +++ b/libpwrutils/util.h @@ -21,6 +21,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. 
#include <iostream> #include <string> +#include <climits> namespace PwrNlp { @@ -76,6 +77,8 @@ void utf8_string_to_uchar_container(const std::string& s, } } + + } /* end ns PwrNlp */ #endif // PWRNLP_UTIL_H diff --git a/tagset-tool/main.cpp b/tagset-tool/main.cpp index 553bc4db2caa43ed0efef2b60c584935a5941faa..4ae2cdb3b7841d323c605ce4a3e8607646ab2bb9 100644 --- a/tagset-tool/main.cpp +++ b/tagset-tool/main.cpp @@ -74,9 +74,13 @@ void libedit_read_loop(boost::function<void (const std::string&)>& line_cb) void tagset_info(const Corpus2::Tagset& tagset) { std::cerr << "Corpus2::Tagset loaded: " - << tagset.pos_dictionary().size() << " POSes, " - << tagset.attribute_dictionary().size() << " attributes, " - << tagset.value_dictionary().size() << " values\n"; + << tagset.pos_count() << " POSes, " + << tagset.attribute_count() << " attributes, " + << tagset.value_count() << " values ["; + for (Corpus2::idx_t a = 0; a < tagset.attribute_count(); ++a) { + std::cerr << tagset.get_attribute_values(a).size() << " "; + } + std::cerr << "\n"; std::cerr << "Size is " << tagset.size() << " (extra size is " << tagset.size_extra() << ")\n"; std::cerr << "POSes: "; @@ -86,50 +90,61 @@ void tagset_info(const Corpus2::Tagset& tagset) std::cerr << "\n"; } -void tagset_query_cb(const Corpus2::Tagset& tagset, const std::string& s) +void tagset_query_cb(const Corpus2::Tagset& tagset, const std::string& s, + bool internals) { - Corpus2::pos_idx_t pos = tagset.pos_dictionary().get_id(s); - Corpus2::attribute_idx_t atr = tagset.attribute_dictionary().get_id(s); - Corpus2::value_idx_t val = tagset.value_dictionary().get_id(s); - if (tagset.pos_dictionary().is_id_valid(pos)) { + Corpus2::idx_t pos = tagset.pos_dictionary().get_id(s); + Corpus2::idx_t atr = tagset.attribute_dictionary().get_id(s); + Corpus2::mask_t val = tagset.get_value_mask(s); + if (pos >= 0) { + if (internals) { + std::cout << tagset.get_pos_mask(pos) << " (" << (int)pos << ")\n"; + } std::cout << s << " -> POS ->" ; - 
foreach (Corpus2::attribute_idx_t a, tagset.get_pos_attributes(pos)) { + foreach (Corpus2::idx_t a, tagset.get_pos_attributes(pos)) { std::string astr = tagset.attribute_dictionary().get_string(a); - if (tagset.get_pos_required_attributes(pos)[a]) { + if (tagset.pos_requires_attribute(pos, a)) { std::cout << " " << astr; } else { std::cout << " [" << astr << "]"; } } std::cout << "\n"; - } else if (tagset.attribute_dictionary().is_id_valid(atr)) { + } else if (atr > 0) { + if (internals) { + std::cout << tagset.get_attribute_mask(atr) << " (" << (int)atr << ")\n"; + } std::cout << s << " -> attribute ->"; - foreach (Corpus2::value_idx_t v, tagset.get_attribute_values(atr)) { - std::cout << " " << tagset.value_dictionary().get_string(v); + foreach (Corpus2::mask_t v, tagset.get_attribute_values(atr)) { + std::cout << " " << tagset.get_value_name(v); } std::cout << "\nIn POSes:"; - for (Corpus2::pos_idx_t p = (Corpus2::pos_idx_t)(0); p < tagset.pos_dictionary().size(); ++p) { - if (tagset.get_pos_valid_attributes(p)[atr]) { + for (Corpus2::idx_t p = 0; p < tagset.pos_count(); ++p) { + if (tagset.pos_has_attribute(p,atr)) { std::cout << " " << tagset.pos_dictionary().get_string(p); - if (!tagset.get_pos_required_attributes(p)[atr]) { + if (!tagset.pos_requires_attribute(p, atr)) { std::cout << "?"; } } } std::cout << "\n"; - } else if (tagset.value_dictionary().is_id_valid(val)) { - Corpus2::attribute_idx_t a = tagset.get_value_attribute(val); + } else if (val.any()) { + Corpus2::idx_t a = tagset.get_value_attribute(val); + if (internals) { + std::cout << val << " (" << PwrNlp::lowest_bit(val) << ")\n"; + std::cout << tagset.get_attribute_mask(a) << " (" << (int)a << ")\n"; + } std::cout << s << " -> value -> attribute "; std::cout << tagset.attribute_dictionary().get_string(a); std::cout << " ."; - foreach (Corpus2::value_idx_t v, tagset.get_attribute_values(a)) { - std::cout << " " << tagset.value_dictionary().get_string(v); + foreach (Corpus2::mask_t v, 
tagset.get_attribute_values(a)) { + std::cout << " " << tagset.get_value_name(v); } std::cout << "\nIn POSes:"; - for (Corpus2::pos_idx_t p = (Corpus2::pos_idx_t)(0); p < tagset.pos_dictionary().size(); ++p) { - if (tagset.get_pos_valid_attributes(p)[a]) { + for (Corpus2::idx_t p = 0; p < tagset.pos_count(); ++p) { + if (tagset.pos_has_attribute(p, a)) { std::cout << " " << tagset.pos_dictionary().get_string(p); - if (!tagset.get_pos_required_attributes(p)[a]) { + if (!tagset.pos_requires_attribute(p, a)) { std::cout << "?"; } } @@ -140,7 +155,8 @@ void tagset_query_cb(const Corpus2::Tagset& tagset, const std::string& s) } } -void tag_parse_cb(const Corpus2::Tagset& tagset, bool validate, bool sort, const std::string& s) +void tag_parse_cb(const Corpus2::Tagset& tagset, bool validate, bool sort, + const std::string& s, bool internals) { try { Corpus2::Token t; @@ -152,6 +168,9 @@ void tag_parse_cb(const Corpus2::Tagset& tagset, bool validate, bool sort, const if (validate) { tagset.validate_tag(lex.tag(), false, &ss); } + if (internals) { + ss << "\n" << lex.tag().raw_dump() << ""; + } out.push_back(ss.str()); } if (sort) { @@ -167,7 +186,7 @@ void tag_parse_cb(const Corpus2::Tagset& tagset, bool validate, bool sort, const int main(int argc, char** argv) { std::string tagset_load, tagset_save; - bool quiet = false; + bool quiet = false, internals = false; bool parse = false, validate = false, sort = false; using boost::program_options::value; @@ -181,6 +200,8 @@ int main(int argc, char** argv) "Suppress startup info\n") ("parse,p", value(&parse)->zero_tokens(), "Parse complex tag strings mode") + ("internals,i", value(&internals)->zero_tokens(), + "Output internal representations") ("validate,v", value(&validate)->zero_tokens(), "Validate parsed tags") ("sort,s", value(&sort)->zero_tokens(), @@ -232,13 +253,13 @@ int main(int argc, char** argv) std::cerr << "(Tag parse mode)\n"; } _prompt = "tag-parse> "; - f = boost::bind(&tag_parse_cb, boost::ref(tagset), 
validate, sort, _1); + f = boost::bind(&tag_parse_cb, boost::ref(tagset), validate, sort, _1, internals); } else { if (!quiet) { std::cerr << "(Tagset query mode)\n"; } _prompt = "tagset-query> "; - f = boost::bind(&tagset_query_cb, boost::ref(tagset), _1); + f = boost::bind(&tagset_query_cb, boost::ref(tagset), _1, internals); } #ifdef HAVE_LIBEDIT libedit_read_loop(f); diff --git a/tests/basic.cpp b/tests/basic.cpp index d9651327aca94202a2fe8232d1f6a8c7e6081ec5..d8a8e8ffba451d45b09748443d820a147316e4a2 100644 --- a/tests/basic.cpp +++ b/tests/basic.cpp @@ -20,7 +20,7 @@ BOOST_AUTO_TEST_CASE( token_dup_lexemes ) { Corpus2::Token t(UnicodeString::fromUTF8("ZZ"), PwrNlp::Whitespace::ManySpaces); //Corpus2::Tagset tagset(tagsetstr1); - Corpus2::Tag t1(Corpus2::tagset_idx_t(0), Corpus2::pos_idx_t(0)); + Corpus2::Tag t1(Corpus2::mask_t(0)); Corpus2::Lexeme l1(UnicodeString::fromUTF8("aaa"), t1); Corpus2::Lexeme l2(UnicodeString::fromUTF8("bbb"), t1); BOOST_CHECK(!t.check_duplicate_lexemes()); diff --git a/tests/tag_split.cpp b/tests/tag_split.cpp index c0d93c35bf5f825c8eb08497991a724f8ae517b0..1f4ca44dc1bf89e6fed6fd98d8d615b10dc32af0 100644 --- a/tests/tag_split.cpp +++ b/tests/tag_split.cpp @@ -12,8 +12,9 @@ struct F { "A tag tog other a3 \n" "B data thing tag-thing thang\n" "C a b c \n" - "[POS]\n some A B [C]\n"; - tagset.reset(new Corpus2::Tagset(tagset_string)); + "[POS]\n some A B [C]\n same A B \n P3 [A] [B]\n"; + tagset.reset(new Corpus2::Tagset()); + *tagset = Corpus2::Tagset::from_data(tagset_string); } boost::shared_ptr<Corpus2::Tagset> tagset; @@ -22,10 +23,15 @@ struct F { std::set<std::string> actual; std::vector<Corpus2::Tag> tags; Corpus2::Token t; - tagset->lexemes_into_token(t, UnicodeString(), s); + try { + tagset->lexemes_into_token(t, UnicodeString(), s); + } catch (Corpus2::TagParseError& e) { + throw; + } + foreach (const Corpus2::Lexeme& lex, t.lexemes()) { const Corpus2::Tag& tag = lex.tag(); - BOOST_WARN(tagset->validate_tag(tag, false)); + 
BOOST_WARN(tagset->validate_tag(tag, false, &std::cerr)); actual.insert(tagset->tag_to_string(tag)); tags.push_back(tag); } @@ -89,9 +95,9 @@ BOOST_FIXTURE_TEST_CASE( dots_plus, F ) BOOST_FIXTURE_TEST_CASE( missing, F ) { - const char tag[] = "some:data"; + const char tag[] = "P3:data"; std::set<std::string> r; - r.insert("some::data"); + r.insert("P3:data"); check_split(tag, r); } @@ -158,4 +164,35 @@ BOOST_FIXTURE_TEST_CASE( underscore_dots, F ) check_split(tag, r); } + + +BOOST_FIXTURE_TEST_CASE( tag_size, F ) +{ + Corpus2::Tag t = tagset->parse_simple_tag("some:tag:data", false); + Corpus2::Tag t2 = tagset->parse_simple_tag("some:tog:data", false); + Corpus2::Tag t3 = tagset->parse_simple_tag("same:tag:data", false); + BOOST_CHECK(tagset->tag_is_singular(t)); + BOOST_CHECK_EQUAL(tagset->tag_size(t), 1); + BOOST_CHECK(tagset->tag_is_singular(t2)); + BOOST_CHECK_EQUAL(tagset->tag_size(t2), 1); + BOOST_CHECK(tagset->tag_is_singular(t3)); + BOOST_CHECK_EQUAL(tagset->tag_size(t3), 1); + t.add_values(t2.get_values()); + BOOST_CHECK(!tagset->tag_is_singular(t)); + BOOST_CHECK_EQUAL(tagset->tag_size(t), 2); + t.add_pos(t3.get_pos()); + BOOST_CHECK(!tagset->tag_is_singular(t)); + BOOST_CHECK_EQUAL(tagset->tag_size(t), 4); + Corpus2::Tag t4 = tagset->parse_simple_tag("same:other:thang", true); + t.add_values(t4.get_values() & tagset->get_attribute_mask(std::string("A"))); + BOOST_CHECK_EQUAL(tagset->tag_size(t), 6); + std::vector<Corpus2::Tag> tags = tagset->split_tag(t); + BOOST_CHECK_EQUAL(tags.size(), 6); + Corpus2::Tag tt; + foreach (Corpus2::Tag t, tags) { + tt.combine_with(t); + } + BOOST_CHECK(tt == t); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/tagset_parse.cpp b/tests/tagset_parse.cpp index 892e072945c95a86f4654f71c7dea018803a70fe..d5faa7d1792748ac4adc94545f99d22a5371da45 100644 --- a/tests/tagset_parse.cpp +++ b/tests/tagset_parse.cpp @@ -26,19 +26,23 @@ BOOST_AUTO_TEST_CASE( empty ) BOOST_AUTO_TEST_CASE( minimal ) { + Corpus2::Tagset t; try { - 
parse(PRE POSA); + t = parse(PRE POSA); } catch (Corpus2::TagsetParseError& e) { BOOST_FAIL(e.info()); } + BOOST_CHECK_EQUAL(t.pos_count(), 1); } BOOST_AUTO_TEST_CASE( minimal_nonewline ) { + Corpus2::Tagset t; try { - parse(PRE "[POS]\n POS1"); + t = parse(PRE "[POS]\n POS1"); } catch (Corpus2::TagsetParseError& e) { BOOST_FAIL(e.info()); } + BOOST_CHECK_EQUAL(t.pos_count(), 1); } BOOST_AUTO_TEST_CASE( dupe_val )