diff --git a/libcorpus2/lexeme.cpp b/libcorpus2/lexeme.cpp index cd5361328ed333c2f5b30770c86cc8004325203f..7c1a8355611102516621c68354e9523994833096 100644 --- a/libcorpus2/lexeme.cpp +++ b/libcorpus2/lexeme.cpp @@ -19,7 +19,7 @@ Lexeme Lexeme::create(const UnicodeString& lemma, const Tag& tag) bool Lexeme::is_null() const { - return lemma().length() == 0 || !tag().has_valid_tagset(); + return lemma().length() == 0 || tag().is_null(); } bool Lexeme::operator<(const Lexeme& other) const diff --git a/libcorpus2/tag.cpp b/libcorpus2/tag.cpp index d3f6fe4506cbb46e09cb3278015273c33603bf32..7bc7d9b26ab4cc91b640c78302e1908ea06476a1 100644 --- a/libcorpus2/tag.cpp +++ b/libcorpus2/tag.cpp @@ -7,69 +7,58 @@ #include <sstream> #include <boost/functional/hash.hpp> +#include <boost/pending/lowest_bit.hpp> namespace Corpus2 { -Tag::Tag() - : pos_id_(-1), tagset_id_(-1) -{ -} -Tag::Tag(tagset_idx_t tagset_id, pos_idx_t pos) - : pos_id_(pos), tagset_id_(tagset_id) +template <typename T> +int count_bits_set(T v) { + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + v = v - ((v >> 1) & (T)~(T)0/3); // temp + v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); // temp + v = (v + (v >> 4)) & (T)~(T)0/255*15; // temp + return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * CHAR_BIT; // count } -Tag::Tag(tagset_idx_t tagset_id, pos_idx_t pos, - const std::vector<value_idx_t> &values) - : pos_id_(pos), values_(values), tagset_id_(tagset_id) +int Tag::pos_count() const { + return count_bits_set(pos_); } -bool Tag::has_valid_tagset() const +int Tag::get_pos_index() const { - return tagset_id_ != static_cast<tagset_idx_t>(-1) - && TagsetManagerSingleton::Instance().get_cache_entry(tagset_id()); + if (pos_ == 0) return -1; + return boost::lowest_bit(pos_); } std::string Tag::raw_dump() const { std::ostringstream ss; ss << "["; - ss << static_cast<int>(tagset_id_) << "#" << static_cast<int>(pos_id_); - foreach (value_idx_t v, values_) { - ss << ":" << 
static_cast<int>(v) ; - } + //ss << static_cast<int>(tagset_id_); + ss << "#" << (void*)(pos_); + ss << ":" << (void*)(values_) ; ss << "]"; return ss.str(); } bool Tag::operator<(const Tag& other) const { - return tagset_id_ < other.tagset_id_ - || (tagset_id_ == other.tagset_id_ - && (pos_id_ < other.pos_id_ - || (pos_id_ == other.pos_id_ - && (values_.size() < other.values_.size() - || (values_.size() == other.values_.size() - && memcmp(&values_[0], &other.values_[0], - std::min(values_.size(), - other.values_.size())) < 0))))); + return pos_ < other.pos_ || (pos_ == other.pos_ && values_ < other.values_); } bool Tag::operator ==(const Tag& other) const { - return tagset_id_ == other.tagset_id_ - && pos_id_ == other.pos_id_ - && values_ == other.values_; + return pos_ == other.pos_ && values_ == other.values_; } size_t hash_value(const Tag& tag) { std::size_t seed = 0; - boost::hash_combine(seed, tag.pos_id_); - boost::hash_combine(seed, tag.tagset_id_); - boost::hash_combine(seed, tag.values_); + boost::hash_combine(seed, tag.get_pos()); + boost::hash_combine(seed, tag.get_values()); return seed; } diff --git a/libcorpus2/tag.h b/libcorpus2/tag.h index 6db43d2aac0f6aaa74ff44d956ad18d4e824d229..c700216b62e974a991c564b8c0bb9a1679abefed 100644 --- a/libcorpus2/tag.h +++ b/libcorpus2/tag.h @@ -3,7 +3,7 @@ #include <string> #include <vector> - +#include <cassert> #include <boost/cstdint.hpp> #include <boost/strong_typedef.hpp> #include <boost/operators.hpp> @@ -13,10 +13,8 @@ namespace Corpus2 { class Tagset; /// Typedefs for the string -> index mappings -typedef boost::uint8_t idx_t; -BOOST_STRONG_TYPEDEF(idx_t, pos_idx_t); -BOOST_STRONG_TYPEDEF(idx_t, attribute_idx_t); -BOOST_STRONG_TYPEDEF(idx_t, value_idx_t); +typedef boost::int8_t idx_t; +typedef boost::uint64_t mask_t; BOOST_STRONG_TYPEDEF(boost::uint32_t, tagset_idx_t); /** @@ -31,52 +29,100 @@ BOOST_STRONG_TYPEDEF(boost::uint32_t, tagset_idx_t); * This allows more sanity checking, esp. 
during tagset conversion. */ class Tag - : boost::equality_comparable<Tag>, boost::less_than_comparable<Tag> +// : boost::equality_comparable<Tag>, boost::less_than_comparable<Tag> { public: /// Empty tag constructor - Tag(); + Tag() + : pos_(0), values_(0) + { + } /// Tagset-and-POS (no values) constructor - Tag(tagset_idx_t tagset_id, pos_idx_t pos); + explicit Tag(mask_t pos) + : pos_(pos), values_(0) + { + } /// Tagset-POS-values constructor - Tag(tagset_idx_t tagset_id, pos_idx_t pos, - const std::vector<value_idx_t>& values); + Tag(mask_t pos, mask_t values) + : pos_(pos), values_(values) + { + } + + bool is_null() const { + return pos_ == 0 && values_ == 0; + } + + int pos_count() const; + + int get_pos_index() const; /// POS (part-of-speech) accesor - pos_idx_t pos_id() const { - return pos_id_; + mask_t get_pos() const { + return pos_; } /// POS setter - void set_pos_id(pos_idx_t v) { - pos_id_ = v; + void set_pos(mask_t v) { + pos_ = v; + } + + void add_pos(mask_t v) { + pos_ |= v; } /// values accesor - const std::vector<value_idx_t>& values() const { + mask_t get_values() const { return values_; } + mask_t get_values_for(mask_t mask) const { + return values_ & mask; + } + /// values accesor -- nonconst reference - std::vector<value_idx_t>& values() { - return values_; + void set_values(mask_t v) { + values_ = v; } - /// debug aid, dump the tag's internal numeric representation - std::string raw_dump() const; + void add_values(mask_t v) { + values_ |= v; + } + + void add_values_masked(mask_t value, mask_t mask) + { + assert(mask & value); + values_ = (values_ & ~mask) | value; + } + + Tag& combine_with(const Tag& other) { + pos_ |= other.pos_; + values_ |= other.values_; + return *this; + } + + Tag get_combined(const Tag& other) const { + Tag t(*this); + return t.combine_with(other); + } - /// tagset id accesor - tagset_idx_t tagset_id() const { - return tagset_id_; + Tag& mask_with(const Tag& other) { + pos_ &= other.pos_; + values_ &= other.values_; + 
return *this; } - bool has_valid_tagset() const; + Tag get_masked(const Tag& other) const { + Tag t(*this); + return t.mask_with(other); + } + + /// debug aid, dump the tag's internal numeric representation + std::string raw_dump() const; /** - * Tag comparison. Tags sort by tagset id, then pos id, and finally - * value-by-value. Boost is used to provide other comparison operators. + * Tag comparison. */ bool operator<(const Tag& other) const; @@ -87,17 +133,15 @@ public: private: /// the POS id - pos_idx_t pos_id_; -\ - /// the values - std::vector<value_idx_t> values_; + mask_t pos_; - /// the tagset id - tagset_idx_t tagset_id_; + /// the values + mask_t values_; - friend size_t hash_value(const Tag &tag); }; +size_t hash_value(const Tag &tag); + } /* end ns Corpus2 */ diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp index dd193b397e6061e29f6bb75cf3427b141d58cf06..82dfd31707c074310b18cad05d239ab35ab3a4a8 100644 --- a/libcorpus2/tagset.cpp +++ b/libcorpus2/tagset.cpp @@ -11,6 +11,7 @@ #include <boost/strong_typedef.hpp> #include <boost/algorithm/string.hpp> #include <boost/bind.hpp> +#include <boost/pending/lowest_bit.hpp> #include <sstream> #include <iostream> @@ -104,19 +105,19 @@ void Tagset::parse_tag(const string_range &s, bool allow_extra, namespace { void append_to_multi_tag( - std::vector< std::vector<value_idx_t> > & current, - const std::vector<value_idx_t> & to_add) + std::vector< mask_t > & current, + const std::vector<mask_t> & to_add) { - foreach (std::vector<value_idx_t>& o, current) { - o.push_back(to_add[0]); - } size_t current_size = current.size(); for (size_t ai = 1; ai < to_add.size(); ++ai) { for (size_t oi = 0; oi < current_size; ++oi) { current.push_back(current[oi]); - current.back().back() = to_add[ai]; + current.back() |= to_add[ai]; } } + for (size_t i = 0; i < current_size; ++i) { + current[i] |= to_add[0]; + } } } @@ -126,41 +127,42 @@ void Tagset::parse_tag(const string_range_vector &fields, bool allow_extra, if
(fields.empty()) { throw TagParseError("No POS", "", "", id_string()); } - pos_idx_t pos_id = pos_dict_.get_id(fields[0]); - if (!pos_dict_.is_id_valid(pos_id)) { + idx_t pos_idx = get_pos_index(fields[0]); + if (pos_idx < 0) { throw TagParseError("Invalid POS", boost::copy_range<std::string>(fields[0]), "", id_string()); } - std::vector< std::vector<value_idx_t> > opts(1); + std::vector< mask_t > all_variants; + all_variants.push_back(0); for (size_t fi = 1; fi < fields.size(); ++fi) { const string_range& r = fields[fi]; if (r.size() != 1 || *r.begin() != '_') { string_range_vector dots; boost::algorithm::split(dots, r, boost::is_any_of(".")); - std::vector<value_idx_t> values; + std::vector<mask_t> values; foreach (string_range& dot, dots) { - value_idx_t v = value_dict_.get_id(dot); - if (!value_dict_.is_id_valid(v)) { + mask_t v = get_value_mask(boost::copy_range<std::string>(dot)); + if (!v) { throw TagParseError("Unknown attribute value", boost::copy_range<std::string>(r), "", id_string()); } values.push_back(v); } - append_to_multi_tag(opts, values); + append_to_multi_tag(all_variants, values); } else if (!r.empty()) { // underscore handling - if (fi - 1 >= pos_attributes_[pos_id].size()) { + if (fi - 1 >= pos_attributes_[pos_idx].size()) { throw TagParseError( "Underscore beyond last attribute for this POS", "", "", id_string()); } - attribute_idx_t attr = pos_attributes_[pos_id][fi - 1]; - append_to_multi_tag(opts, attribute_values_[attr]); + idx_t attr = pos_attributes_[pos_idx][fi - 1]; + append_to_multi_tag(all_variants, attribute_values_[attr]); } // else empty, do nothing } - foreach (std::vector<value_idx_t>& opt, opts) { - sink(make_tag(pos_id, opt, allow_extra)); + foreach (mask_t variant, all_variants) { + sink(make_tag(pos_idx, variant, allow_extra)); } } @@ -196,108 +198,81 @@ Tag Tagset::parse_simple_tag(const string_range_vector &ts, throw TagParseError("Empty POS+attribute list", "", "", id_string()); } - pos_idx_t pos_id = 
pos_dict_.get_id(ts[0]); - if (!pos_dict_.is_id_valid(pos_id)) { + idx_t pos_idx = get_pos_index(ts[0]); + if (pos_idx < 0) { throw TagParseError("Invalid POS", boost::copy_range<std::string>(ts[0]), "", id_string()); } - const std::vector<bool>& valid_attrs_mask = - get_pos_valid_attributes(pos_id); - Tag tag(id_, pos_id); - std::vector<value_idx_t> vvv(attribute_dict_.size(), - static_cast<value_idx_t>(0)); - tag.values().swap(vvv); - + mask_t values = 0; for (size_t i = 1; i < ts.size(); ++i) { if (!ts[i].empty()) { - value_idx_t val_id = value_dict_.get_id(ts[i]); - if (!value_dict_.is_id_valid(val_id)) { - attribute_idx_t a = attribute_dict_.get_id(ts[i]); - if (attribute_dict_.is_id_valid(a)) { - tag.values()[a] = 0; + mask_t val = get_value_mask(boost::copy_range<std::string>(ts[i])); + if (val == 0) { + mask_t a = get_attribute_mask(ts[i]); + if (a != 0) { + values &= (~a); } else { throw TagParseError("Unknown attribute value", boost::copy_range<std::string>(ts[i]), "", id_string()); } } else { - attribute_idx_t attr_id = get_value_attribute(val_id); - if (valid_attrs_mask[attr_id] || allow_extra) { - tag.values()[attr_id] = val_id; - } + values |= val; } } } - return tag; + return Tag(get_pos_mask(pos_idx), values); } -Tag Tagset::make_tag(pos_idx_t pos, const std::vector<value_idx_t>& values, - bool allow_extra) const +Tag Tagset::make_tag(idx_t pos_idx, mask_t values, bool allow_extra) const { - const std::vector<bool>& valid_attrs_mask = - get_pos_valid_attributes(pos); - Tag tag(id_, pos); - std::vector<value_idx_t> vvv(attribute_dict_.size(), - static_cast<value_idx_t>(0)); - tag.values().swap(vvv); - - for (size_t i = 0; i < values.size(); ++i) { - value_idx_t val_id = values[i]; - attribute_idx_t attr_id = get_value_attribute(val_id); - if (valid_attrs_mask[attr_id] || allow_extra) { - tag.values()[attr_id] = val_id; - } else { - throw TagParseError("Attribute not valid for this POS", - attribute_dict_.get_string(attr_id), - 
pos_dict_.get_string(pos), id_string()); - } + mask_t valid_values = get_pos_value_mask(pos_idx); + mask_t invalid = values & ~valid_values; + if (invalid && !allow_extra) { + mask_t first_invalid = invalid & (~invalid + 1); /* lowest set bit, kept as a mask (not a bit index) so the name lookup matches */ + throw TagParseError("Attribute not valid for this POS", + get_value_name(first_invalid), + get_pos_name(pos_idx), id_string()); } - return tag; + // check singularity + return Tag(get_pos_mask(pos_idx), values); } Tag Tagset::make_ign_tag() const { - pos_idx_t ign_pos = pos_dictionary().get_id("ign"); - assert(pos_dictionary().is_id_valid(ign_pos)); - Tag tag(id_, ign_pos); - tag.values().resize(attribute_dict_.size(), - static_cast<value_idx_t>(0)); - return tag; + mask_t ign_pos_mask = get_pos_mask("ign"); + assert(ign_pos_mask); + return Tag(ign_pos_mask); } bool Tagset::validate_tag(const Tag &t, bool allow_extra, std::ostream* os) const { - if (!pos_dict_.is_id_valid(t.pos_id())) { - if (os) { - (*os) << " POS not valid : " << (int) t.pos_id(); - } - return false; - } - std::vector<bool> valid = get_pos_valid_attributes(t.pos_id()); - std::vector<bool> required = get_pos_required_attributes(t.pos_id()); - if (t.values().size() < attribute_dict_.size()) { + // check singularity + if (t.pos_count() != 1) { if (os) { - (*os) << " Values size below tagset attribute count: " - << t.values().size() << "<" << attribute_dict_.size(); + (*os) << " POS not singular : " << t.pos_count(); } return false; } - if (!allow_extra && t.values().size() > attribute_dict_.size()) { + + idx_t pos_idx = t.get_pos_index(); + if (!pos_dict_.is_id_valid(pos_idx)) { if (os) { - (*os) << " Values size above tagset attribute count" - << t.values().size() << ">" << attribute_dict_.size(); + (*os) << " POS not valid : " << (int)pos_idx; } return false; } - for (attribute_idx_t i = static_cast<attribute_idx_t>(0); - i < t.values().size(); ++i) { - value_idx_t v = t.values()[i]; - if (v == 0) { + std::vector<bool> valid = get_pos_attributes_flag(pos_idx); +
std::vector<bool> required = get_pos_required_attributes(pos_idx); + + for (idx_t i = 0; i < attribute_dict_.size(); ++i) { + mask_t value = t.get_values_for(get_attribute_mask(i)); + if (value == 0) { if (required[i]) { if (os) { (*os) << " Required attribuite " - << attribute_dictionary().get_string(i) + << get_attribute_name(i) << " missing"; } return false; @@ -306,27 +281,9 @@ bool Tagset::validate_tag(const Tag &t, bool allow_extra, if (!valid[i] && !allow_extra) { if (os) { (*os) << " Extra attribute value: " - << value_dictionary().get_string(v) + << get_value_name(value) << " (attribute " - << attribute_dictionary().get_string(i) << ")"; - } - return false; - } - if (!value_dict_.is_id_valid(v)) { - if (os) { - (*os) << " Invalid value at attribite " - << attribute_dictionary().get_string(i); - } - return false; - } - attribute_idx_t a = value_attribute_[v]; - if (a != i) { - if (os) { - (*os) << " Value does not match attribute, got " - << value_dictionary().get_string(v) << " (" - << attribute_dictionary().get_string(a) << ") in" - << attribute_dictionary().get_string(i) - << "'s position"; + << get_attribute_name(i) << ")"; } return false; } @@ -338,23 +295,25 @@ bool Tagset::validate_tag(const Tag &t, bool allow_extra, std::string Tagset::tag_to_string(const Tag &tag) const { std::ostringstream ss; - ss << pos_dict_.get_string(tag.pos_id()); - const std::vector<attribute_idx_t>& attrs = - get_pos_attributes(tag.pos_id()); - foreach (const attribute_idx_t& a, attrs) { - if (pos_required_attributes_[tag.pos_id()][a] || - tag.values()[a] > 0) { + idx_t pos_idx = tag.get_pos_index(); + ss << get_pos_name(pos_idx); + const std::vector<idx_t>& attrs = get_pos_attributes(pos_idx); + foreach (const idx_t& a, attrs) { + mask_t value = tag.get_values_for(get_attribute_mask(a)); + if (pos_requires_attribute(pos_idx, a) || value) { ss << ":"; - if (tag.values()[a] > 0) { - ss << value_dict_.get_string(tag.values()[a]); + if (value) { + ss << get_value_name(value); 
} } } // print extra attributes - for (size_t i = 0; i < attribute_dict_.size(); ++i) { - if (tag.values()[i] > 0 && - !pos_valid_attributes_[tag.pos_id()][i]) { - ss << ":" << value_dict_.get_string(tag.values()[i]); + for (idx_t a = 0; a < attribute_dict_.size(); ++a) { + if (!pos_has_attribute(pos_idx, a)) { + mask_t value = tag.get_values_for(get_attribute_mask(a)); + if (value) { + ss << ":" << get_value_name(value); + } } } return ss.str(); @@ -363,65 +322,184 @@ std::string Tagset::tag_to_string(const Tag &tag) const std::string Tagset::tag_to_no_opt_string(const Tag &tag) const { std::ostringstream ss; - ss << pos_dict_.get_string(tag.pos_id()); - const std::vector<attribute_idx_t>& attrs = - get_pos_attributes(tag.pos_id()); - foreach (const attribute_idx_t& a, attrs) { + idx_t pos_idx = tag.get_pos_index(); + ss << get_pos_name(pos_idx); + const std::vector<idx_t>& attrs = get_pos_attributes(pos_idx); + foreach (const idx_t& a, attrs) { + mask_t value = tag.get_values_for(get_attribute_mask(a)); ss << ":"; - if (tag.values()[a] > 0) { - ss << value_dict_.get_string(tag.values()[a]); + if (value) { + ss << get_value_name(value); } else { - ss << attribute_dict_.get_string(a); + ss << get_attribute_name(a); } } return ss.str(); } -attribute_idx_t Tagset::get_value_attribute(value_idx_t id) const +idx_t Tagset::get_pos_index(const string_range& pos) const { - if (!value_dict_.is_id_valid(id)) { - std::stringstream ss; - ss << "get_value_attribute fail " << (int)id; - throw Corpus2Error(ss.str()); + return pos_dict_.get_id(pos); +} + +const std::string& Tagset::get_pos_name(idx_t pos) const +{ + return pos_dict_.get_string(pos); +} + +mask_t Tagset::get_pos_mask(const string_range& pos) const +{ + return get_pos_mask(get_pos_index(pos)); +} + +mask_t Tagset::get_pos_mask(idx_t pos) const +{ + return (pos < 0 || pos >= 64) ? 0 : static_cast<mask_t>(1) << pos; /* null mask when out of range; widen to mask_t before shifting */ +} + +idx_t Tagset::get_attribute_index(const string_range& a) const +{ + return attribute_dict_.get_id(a); +} + +const std::string&
Tagset::get_attribute_name(idx_t a) const +{ + return attribute_dict_.get_string(a); +} + +const std::vector<mask_t>& Tagset::get_attribute_values(idx_t a) const +{ + static std::vector<mask_t> null_vec; + if (a < 0 || a >= attribute_dict_.size()) { + return null_vec; + } else { + return attribute_values_[a]; } - return value_attribute_[id]; } -const std::vector<value_idx_t>& Tagset::get_attribute_values( - attribute_idx_t a) const +mask_t Tagset::get_attribute_mask(idx_t a) const { - assert(attribute_dict_.is_id_valid(a)); - return attribute_values_[a]; + if (a < 0 || a >= attribute_dict_.size()) { + return 0; + } else { + return attribute_masks_[a]; + } } -const std::vector<attribute_idx_t>& Tagset::get_pos_attributes( - pos_idx_t pos) const +mask_t Tagset::get_attribute_mask(const string_range& a) const +{ + return get_attribute_mask(get_attribute_index(a)); +} + +mask_t Tagset::get_value_mask(const std::string& v) const +{ + std::map<std::string, mask_t>::const_iterator ci; + ci = string_to_value_mask_.find(v); + if (ci == string_to_value_mask_.end()) { + return 0; + } else { + return ci->second; + } +} + +const std::string& Tagset::get_value_name(mask_t v) const +{ + static std::string nullstr; + std::map<mask_t, std::string>::const_iterator ci; + ci = value_mask_to_string_.find(v); + if (ci == value_mask_to_string_.end()) { + return nullstr; + } else { + return ci->second; + } +} + +idx_t Tagset::get_value_attribute_index(mask_t v) const +{ + std::map<mask_t, idx_t>::const_iterator ci; + ci = value_mask_to_attribute_index_.find(v); + if (ci == value_mask_to_attribute_index_.end()) { + return -1; + } else { + return ci->second; + } +} + + + +idx_t Tagset::get_value_attribute(mask_t v) const +{ + std::map<mask_t, idx_t>::const_iterator ci; + ci = value_mask_to_attribute_index_.find(v); + if (ci == value_mask_to_attribute_index_.end()) { + return -1; + } else { + return ci->second; + } +} + +const std::vector<idx_t>& Tagset::get_pos_attributes(idx_t pos) const { 
assert(pos_dict_.is_id_valid(pos)); return pos_attributes_[pos]; } -const std::vector<bool>& Tagset::get_pos_valid_attributes( - pos_idx_t pos) const +const std::vector<bool>& Tagset::get_pos_attributes_flag( + idx_t pos) const { assert(pos_dict_.is_id_valid(pos)); return pos_valid_attributes_[pos]; } const std::vector<bool>& Tagset::get_pos_required_attributes( - pos_idx_t pos) const + idx_t pos) const { assert(pos_dict_.is_id_valid(pos)); return pos_required_attributes_[pos]; } +bool Tagset::pos_requires_attribute(idx_t pos, idx_t attribute) const +{ + return pos_required_attributes_[pos][attribute]; +} + +bool Tagset::pos_has_attribute(idx_t pos, idx_t attribute) const +{ + return pos_valid_attributes_[pos][attribute]; +} + +mask_t Tagset::get_pos_value_mask(idx_t pos) const +{ + return 0; //TODO +} + +mask_t Tagset::get_pos_required_mask(idx_t pos) const +{ + return 0; //TODO +} + +size_t Tagset::pos_count() const +{ + return pos_dict_.size(); +} + +size_t Tagset::attribute_count() const +{ + return attribute_dict_.size(); +} + +size_t Tagset::value_count() const +{ + return value_mask_to_string_.size(); +} + size_t Tagset::size() const { size_t sum = 0; for (size_t p = 0; p < pos_dict_.size(); ++p) { size_t pos_size = 1; for (size_t i = 0; i < pos_attributes_[p].size(); ++i) { - attribute_idx_t a = pos_attributes_[p][i]; + idx_t a = pos_attributes_[p][i]; if (pos_required_attributes_[p][a]) { pos_size *= attribute_values_[a].size(); } else { @@ -461,14 +539,14 @@ void Tagset::lexemes_into_token(Token& tok, const UnicodeString& lemma, } } -size_t Tagset::get_original_pos_index(pos_idx_t pos) const +int Tagset::get_original_pos_index(idx_t pos) const { - std::map<pos_idx_t, size_t>::const_iterator i = + std::map<idx_t, int>::const_iterator i = original_pos_indices_.find(pos); if (i != original_pos_indices_.end()) { return i->second; } else { - return static_cast<size_t>(-1); + return -1; } } diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h index 
1ec350e12d8559ed9b5d8e2306450b067996cd81..dec401392582755e1ec12bc5f7d0e027ac2c16ef 100644 --- a/libcorpus2/tagset.h +++ b/libcorpus2/tagset.h @@ -210,8 +210,7 @@ public: * The values are assumed to be valid in this tagset, but are checked * for correctness with regards to the POS. */ - Tag make_tag(pos_idx_t pos, const std::vector<value_idx_t>& values, - bool allow_extra) const; + Tag make_tag(idx_t pos, mask_t values, bool allow_extra) const; /** * Convenience function for creating a 'ign' (ignored) tag within this @@ -252,38 +251,97 @@ public: std::string tag_to_no_opt_string(const Tag &tag) const; /// POS name <-> index dictionary getter - const SymbolDictionary<pos_idx_t>& pos_dictionary() const { + const SymbolDictionary<idx_t>& pos_dictionary() const { return pos_dict_; } /// attribute name <-> index dictionary getter - const SymbolDictionary<attribute_idx_t>& attribute_dictionary() const { + const SymbolDictionary<idx_t>& attribute_dictionary() const { return attribute_dict_; } - /// value name <-> index dictionary getter - const SymbolDictionary<value_idx_t>& value_dictionary() const { - return value_dict_; - } - /// Getter for the value -> attribute mapping - attribute_idx_t get_value_attribute(value_idx_t id) const; + /// POS name -> index mapping + /// @returns -1 on invalid name + idx_t get_pos_index(const string_range& pos) const; + + /// POS index -> name + /// @returns empty string on invalid index + const std::string& get_pos_name(idx_t pos) const; + + /// POS name -> mask mapping + /// @return null mask on invalid name + mask_t get_pos_mask(const string_range& pos) const; + + /// POS index -> mask mapping + /// @return null mask on invalid index + mask_t get_pos_mask(idx_t pos) const; + + + /// Attribute name -> index mapping + /// @returns -1 on invalid name + idx_t get_attribute_index(const string_range& a) const; + + /// Attribute index -> name + /// @returns empty string on invalid index + const std::string& get_attribute_name(idx_t pos) const; 
+ + /// Value mask -> attribute index mapping. + /// if the value mask contains values from more than one attribute, + /// behavior is not well defined + /// @return -1 on invalid mask + idx_t get_value_attribute(mask_t v) const; + + /// Attribute index -> vector of valid value masks mapping + /// @return empty vector on invalid index + const std::vector<mask_t>& get_attribute_values(idx_t a) const; + + /// Attribute index -> combined value mask + /// @return null mask on invalid index + mask_t get_attribute_mask(idx_t a) const; + + /// Attribute name -> combined value mask + /// @return null mask on invalid name + mask_t get_attribute_mask(const string_range& a) const; + + + /// Value name -> mask + /// @returns null mask on invalid name + mask_t get_value_mask(const std::string& v) const; + + /// Value mask -> name + /// @returns empty string on invalid mask + const std::string& get_value_name(mask_t v) const; + + /// Value mask -> attribute index + /// @returns -1 on invalid mask + idx_t get_value_attribute_index(mask_t v) const; - /// Getter for the attribute -> valid values mapping - const std::vector<value_idx_t>& get_attribute_values( - attribute_idx_t a) const; /// Getter for the pos -> valid attributes (in order) mapping - const std::vector<attribute_idx_t>& get_pos_attributes( - pos_idx_t pos) const; + /// Valid attributes are both the required and optional attributes. + /// Generally the optonal ones should be after the required ones. 
+ const std::vector<idx_t>& get_pos_attributes(idx_t pos) const; /// Getter for the pos -> valid attributes flag vector - const std::vector<bool>& get_pos_valid_attributes( - pos_idx_t pos) const; + const std::vector<bool>& get_pos_attributes_flag(idx_t pos) const; /// Getter for the pos -> required attributes flag vector - const std::vector<bool>& get_pos_required_attributes( - pos_idx_t pos) const; + const std::vector<bool>& get_pos_required_attributes(idx_t pos) const; + + bool pos_requires_attribute(idx_t pos, idx_t attribute) const; + + bool pos_has_attribute(idx_t pos, idx_t attribute) const; + + mask_t get_pos_value_mask(idx_t pos) const; + + mask_t get_pos_required_mask(idx_t pos) const; + + size_t pos_count() const; + + size_t attribute_count() const; + + size_t value_count() const; /** * Tagset cardinality counter -- the number of different valid tags @@ -341,7 +399,7 @@ public: } /// get the original index of the POS in the tagset definition - size_t get_original_pos_index(pos_idx_t pos) const; + int get_original_pos_index(idx_t pos) const; private: /// Temporary solution to allow splitting the parser into a separate @@ -358,29 +416,36 @@ private: static tagset_idx_t next_id_; /// String - number dictionary for the POS names - SymbolDictionary<pos_idx_t> pos_dict_; + SymbolDictionary<idx_t> pos_dict_; /// String - number dictionary for the attribute names - SymbolDictionary<attribute_idx_t> attribute_dict_; + SymbolDictionary<idx_t> attribute_dict_; + + std::map<std::string, mask_t> string_to_value_mask_; - /// String - number dictionary for the attribute values - SymbolDictionary<value_idx_t> value_dict_; + std::map<mask_t, std::string> value_mask_to_string_; /// The original indices of the POSes in the tagset definition - std::map<pos_idx_t, size_t> original_pos_indices_; + std::map<idx_t, int> original_pos_indices_; - /// mapping from attribute indices to valid value indices - std::vector< std::vector<value_idx_t> > attribute_values_; + /// mapping 
from attribute indices to valid value masks + std::vector< std::vector<mask_t> > attribute_values_; - /// reverse mapping, from a value index to the respective attribute + std::vector<mask_t> attribute_masks_; + + /// reverse mapping, from a value mask to the respective attribute /// index (values are assumed to be unique and not shared between /// attributes) - std::vector<attribute_idx_t> value_attribute_; + std::map<mask_t, idx_t> value_mask_to_attribute_index_; /// POS to valid attribute indices mapping /// The order of the attributes is important, as it affects string /// output and the behavior of the _ special character in parsing - std::vector< std::vector<attribute_idx_t> > pos_attributes_; + std::vector< std::vector<idx_t> > pos_attributes_; + + std::vector<mask_t> pos_valid_value_masks_; + + std::vector<mask_t> pos_required_value_masks_; /// Flags for attributes which are valid for a given POS std::vector< std::vector<bool> > pos_valid_attributes_; diff --git a/libcorpus2/tagsetparser.cpp b/libcorpus2/tagsetparser.cpp index 3d167cd172ada4f20e2a2bec8597507d20a6b3de..c75f6c0e2144f85e1d32bdd0ecf789ada7e00af3 100644 --- a/libcorpus2/tagsetparser.cpp +++ b/libcorpus2/tagsetparser.cpp @@ -36,7 +36,7 @@ Tagset TagsetParser::load_ini(std::istream &is) std::set<std::string> symbols(values); typedef std::map< std::string, std::deque<std::string> > vmap_t; vmap_t vmap; - typedef std::map< std::string, std::vector<attribute_idx_t> > pmap_t; + typedef std::map< std::string, std::vector<idx_t> > pmap_t; pmap_t pmap; typedef std::map< std::string, std::vector<bool> > reqmap_t; reqmap_t reqmap; @@ -76,25 +76,26 @@ Tagset TagsetParser::load_ini(std::istream &is) } } - std::vector<std::string> vec; - std::copy(values.begin(), values.end(), - std::inserter(vec, vec.begin())); - if (vec[0] != "@null") { - throw TagsetParseError("First value not '@null'", line_no, vec[0]); + if (*values.begin() != "@null") { + throw TagsetParseError("First value not '@null'", line_no, + 
*values.begin()); } - tagset.value_dict_.load_sorted_data(vec); - vec.clear(); - tagset.value_attribute_.resize(values.size()); + mask_t current_value = 1; + std::vector<std::string> vec; foreach (const vmap_t::value_type v, vmap) { vec.push_back(v.first); tagset.attribute_values_.resize( tagset.attribute_values_.size() + 1); foreach (const std::string& s, v.second) { - tagset.attribute_values_.back().push_back( - tagset.value_dict_.get_id(s)); - value_idx_t v = tagset.value_dict_.get_id(s); - tagset.value_attribute_[v] = vec.size() - 1; + tagset.attribute_values_.back().push_back(current_value); + tagset.value_mask_to_attribute_index_.insert( + std::make_pair(current_value, vec.size() - 1)); + tagset.string_to_value_mask_.insert( + std::make_pair(s, current_value)); + tagset.value_mask_to_string_.insert( + std::make_pair(current_value, s)); + current_value <<= 1; } } tagset.attribute_dict_.load_sorted_data(vec); @@ -111,7 +112,7 @@ Tagset TagsetParser::load_ini(std::istream &is) throw TagsetParseError("Duplicate symbol", line_no, v[0]); } poses_plain.push_back(v[0]); - std::vector<attribute_idx_t>& pattrs = pmap[v[0]]; + std::vector<idx_t>& pattrs = pmap[v[0]]; std::vector<bool>& req_mask = reqmap[v[0]]; req_mask.resize(tagset.attribute_dict_.size()); v.pop_front(); @@ -122,7 +123,7 @@ Tagset TagsetParser::load_ini(std::istream &is) required = false; s = s.substr(1, s.size() - 2); } - attribute_idx_t a = tagset.attribute_dict_.get_id(s); + idx_t a = tagset.attribute_dict_.get_id(s); if (!tagset.attribute_dict_.is_id_valid(a)) { throw TagsetParseError("Attribute name invalid", line_no, s); @@ -140,7 +141,7 @@ Tagset TagsetParser::load_ini(std::istream &is) tagset.pos_attributes_.push_back(v.second); tagset.pos_valid_attributes_.push_back( std::vector<bool>(tagset.attribute_values_.size(), false)); - foreach (attribute_idx_t a, v.second) { + foreach (idx_t a, v.second) { tagset.pos_valid_attributes_.back()[a] = true; } 
tagset.pos_required_attributes_.push_back(reqmap[v.first]); @@ -150,7 +151,7 @@ Tagset TagsetParser::load_ini(std::istream &is) throw TagsetParseError("No POS in tagset", 0, ""); } for (size_t i = 0; i < poses_plain.size(); ++i) { - pos_idx_t p = tagset.pos_dictionary().get_id(poses_plain[i]); + idx_t p = tagset.pos_dictionary().get_id(poses_plain[i]); tagset.original_pos_indices_.insert(std::make_pair(p,i)); } @@ -161,20 +162,20 @@ void TagsetParser::save_ini(const Tagset &tagset, std::ostream &os) { os << "# Autogenerated by Corpus2\n\n"; os << "[ATTR]\n"; - attribute_idx_t a(0); + idx_t a(0); while (tagset.attribute_dict_.is_id_valid(a)) { os << tagset.attribute_dict_.get_string(a) << "\t= "; - foreach (value_idx_t v, tagset.get_attribute_values(a)) { - os << tagset.value_dict_.get_string(v) << " "; + foreach (mask_t m, tagset.get_attribute_values(a)) { + os << tagset.get_value_name(m) << " "; } os << "\n"; ++a; } os << "\n[POS]\n"; - pos_idx_t p(0); + idx_t p(0); while (tagset.pos_dict_.is_id_valid(p)) { os << tagset.pos_dict_.get_string(p) << "\t= "; - foreach (attribute_idx_t a, tagset.get_pos_attributes(p)) { + foreach (idx_t a, tagset.get_pos_attributes(p)) { if (tagset.pos_required_attributes_[p][a]) { os << tagset.attribute_dict_.get_string(a) << " "; } else { diff --git a/libcorpus2/token.cpp b/libcorpus2/token.cpp index ac86d5c6886992ca0f5fd80bc4bb3430e356babb..e69436ffe75d8bfc2f886af9eb55c5ed6c04ab2f 100644 --- a/libcorpus2/token.cpp +++ b/libcorpus2/token.cpp @@ -37,9 +37,9 @@ struct preferred_lexeme_cmp bool operator()(const Lexeme& l1, const Lexeme& l2) const { return (!l1.is_disamb() && l2.is_disamb()) || (l1.is_disamb() == l2.is_disamb() - && (tagset->get_original_pos_index(l1.tag().pos_id()) > - tagset->get_original_pos_index(l2.tag().pos_id()) - || (l1.tag().pos_id() == l2.tag().pos_id() + && (tagset->get_original_pos_index(l1.tag().get_pos_index()) > + tagset->get_original_pos_index(l2.tag().get_pos_index()) + || (l1.tag().get_pos() == 
l2.tag().get_pos() && l1 < l2))); } }; @@ -89,14 +89,14 @@ bool Token::remove_duplicate_lexemes() return old_size != lexemes_.size(); } -bool Token::orth_pos_match(pos_idx_t pos, const UnicodeString &orth) const +bool Token::orth_pos_match(mask_t pos, const UnicodeString &orth) const { if (orth.length() > 0) { if (orth.caseCompare(orth_, 0) != 0) return false; } - if (pos != static_cast<pos_idx_t>(-1)) { + if (pos) { foreach (const Lexeme& lex, lexemes_) { - if (lex.tag().pos_id() != pos) return false; + if (lex.tag().get_pos() != pos) return false; } } return true; diff --git a/libcorpus2/token.h b/libcorpus2/token.h index 6ea73cdc878daf9d556878bbc87cbd114dbea91f..2961eb7e32a6b7d99c5ea7e16e1ba98a4ff61747 100644 --- a/libcorpus2/token.h +++ b/libcorpus2/token.h @@ -117,7 +117,7 @@ public: * @returns true if the orth and lexemes pass the check, false * otherwise */ - bool orth_pos_match(pos_idx_t pos, const UnicodeString& orth) const; + bool orth_pos_match(mask_t pos, const UnicodeString& orth) const; private: /// The orth (actual encountered form) diff --git a/libcorpus2/util/symboldictionary.h b/libcorpus2/util/symboldictionary.h index f703a8f9899b470261a921e82991c4b98e59f593..322e33bb5f329d789cb08d42e4797c5b08308be4 100644 --- a/libcorpus2/util/symboldictionary.h +++ b/libcorpus2/util/symboldictionary.h @@ -39,7 +39,7 @@ public: bool is_id_valid(IndexT idx) const; /// Getter for the size of this dictionary - size_t size() const; + size_t size() const; /** * Get the index for a given string identifier, const char* version. 
diff --git a/tagset-tool/main.cpp b/tagset-tool/main.cpp index 553bc4db2caa43ed0efef2b60c584935a5941faa..b05090546c44fa79077c84ec22129a6dad982b87 100644 --- a/tagset-tool/main.cpp +++ b/tagset-tool/main.cpp @@ -74,9 +74,13 @@ void libedit_read_loop(boost::function<void (const std::string&)>& line_cb) void tagset_info(const Corpus2::Tagset& tagset) { std::cerr << "Corpus2::Tagset loaded: " - << tagset.pos_dictionary().size() << " POSes, " - << tagset.attribute_dictionary().size() << " attributes, " - << tagset.value_dictionary().size() << " values\n"; + << tagset.pos_count() << " POSes, " + << tagset.attribute_count() << " attributes, " + << tagset.value_count() << " values ["; + for (Corpus2::idx_t a = 0; a < tagset.attribute_count(); ++a) { + std::cerr << tagset.get_attribute_values(a).size() << " "; + } + std::cerr << "\n"; std::cerr << "Size is " << tagset.size() << " (extra size is " << tagset.size_extra() << ")\n"; std::cerr << "POSes: "; @@ -88,12 +92,12 @@ void tagset_info(const Corpus2::Tagset& tagset) void tagset_query_cb(const Corpus2::Tagset& tagset, const std::string& s) { - Corpus2::pos_idx_t pos = tagset.pos_dictionary().get_id(s); - Corpus2::attribute_idx_t atr = tagset.attribute_dictionary().get_id(s); - Corpus2::value_idx_t val = tagset.value_dictionary().get_id(s); + Corpus2::idx_t pos = tagset.pos_dictionary().get_id(s); + Corpus2::idx_t atr = tagset.attribute_dictionary().get_id(s); + Corpus2::mask_t val = tagset.get_value_mask(s); if (tagset.pos_dictionary().is_id_valid(pos)) { std::cout << s << " -> POS ->" ; - foreach (Corpus2::attribute_idx_t a, tagset.get_pos_attributes(pos)) { + foreach (Corpus2::idx_t a, tagset.get_pos_attributes(pos)) { std::string astr = tagset.attribute_dictionary().get_string(a); if (tagset.get_pos_required_attributes(pos)[a]) { std::cout << " " << astr; @@ -104,12 +108,12 @@ void tagset_query_cb(const Corpus2::Tagset& tagset, const std::string& s) std::cout << "\n"; } else if 
(tagset.attribute_dictionary().is_id_valid(atr)) { std::cout << s << " -> attribute ->"; - foreach (Corpus2::value_idx_t v, tagset.get_attribute_values(atr)) { - std::cout << " " << tagset.value_dictionary().get_string(v); + foreach (Corpus2::mask_t v, tagset.get_attribute_values(atr)) { + std::cout << " " << tagset.get_value_name(v); } std::cout << "\nIn POSes:"; - for (Corpus2::pos_idx_t p = (Corpus2::pos_idx_t)(0); p < tagset.pos_dictionary().size(); ++p) { - if (tagset.get_pos_valid_attributes(p)[atr]) { + for (Corpus2::idx_t p = 0; p < tagset.pos_dictionary().size(); ++p) { + if (tagset.get_pos_attributes(p)[atr]) { std::cout << " " << tagset.pos_dictionary().get_string(p); if (!tagset.get_pos_required_attributes(p)[atr]) { std::cout << "?"; @@ -117,17 +121,17 @@ void tagset_query_cb(const Corpus2::Tagset& tagset, const std::string& s) } } std::cout << "\n"; - } else if (tagset.value_dictionary().is_id_valid(val)) { - Corpus2::attribute_idx_t a = tagset.get_value_attribute(val); + } else if (val) { + Corpus2::idx_t a = tagset.get_value_attribute_index(val); std::cout << s << " -> value -> attribute "; std::cout << tagset.attribute_dictionary().get_string(a); std::cout << " ."; - foreach (Corpus2::value_idx_t v, tagset.get_attribute_values(a)) { - std::cout << " " << tagset.value_dictionary().get_string(v); + foreach (Corpus2::idx_t v, tagset.get_attribute_values(a)) { + std::cout << " " << tagset.get_value_name(v); } std::cout << "\nIn POSes:"; - for (Corpus2::pos_idx_t p = (Corpus2::pos_idx_t)(0); p < tagset.pos_dictionary().size(); ++p) { - if (tagset.get_pos_valid_attributes(p)[a]) { + for (Corpus2::idx_t p = 0; p < tagset.pos_dictionary().size(); ++p) { + if (tagset.get_pos_attributes(p)[a]) { std::cout << " " << tagset.pos_dictionary().get_string(p); if (!tagset.get_pos_required_attributes(p)[a]) { std::cout << "?"; diff --git a/tests/basic.cpp b/tests/basic.cpp index d9651327aca94202a2fe8232d1f6a8c7e6081ec5..e2117c39ed1043d1a20e40dcbe757cd4450ce9a0 
100644 --- a/tests/basic.cpp +++ b/tests/basic.cpp @@ -20,7 +20,7 @@ BOOST_AUTO_TEST_CASE( token_dup_lexemes ) { Corpus2::Token t(UnicodeString::fromUTF8("ZZ"), PwrNlp::Whitespace::ManySpaces); //Corpus2::Tagset tagset(tagsetstr1); - Corpus2::Tag t1(Corpus2::tagset_idx_t(0), Corpus2::pos_idx_t(0)); + Corpus2::Tag t1(Corpus2::tagset_idx_t(0), Corpus2::idx_t(0)); Corpus2::Lexeme l1(UnicodeString::fromUTF8("aaa"), t1); Corpus2::Lexeme l2(UnicodeString::fromUTF8("bbb"), t1); BOOST_CHECK(!t.check_duplicate_lexemes());