diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp index 68a827aa25208680435ed18276d458328a30dd9f..a4afb93feb7261679db1740486ebf851b0f2bf58 100644 --- a/libcorpus2/tagset.cpp +++ b/libcorpus2/tagset.cpp @@ -144,6 +144,7 @@ void Tagset::parse_tag(const string_range_vector &fields, bool allow_extra, std::vector<mask_t> values; foreach (string_range& dot, dots) { mask_t v = get_value_mask(boost::copy_range<std::string>(dot)); + //TODO ensure all in one attribute, pass mask to append_ if (v.none()) { throw TagParseError("Unknown attribute value", boost::copy_range<std::string>(r), "", @@ -159,6 +160,7 @@ void Tagset::parse_tag(const string_range_vector &fields, bool allow_extra, "", "", id_string()); } idx_t attr = pos_attributes_[pos_idx][fi - 1]; + //TODO use attr mask append_to_multi_tag(all_variants, attribute_values_[attr]); } // else empty, do nothing } @@ -218,15 +220,29 @@ Tag Tagset::parse_simple_tag(const string_range_vector &ts, id_string()); } } else { - values |= val; + mask_t a = get_attribute_mask(get_value_attribute(val)); + values = (values & ~a) | val; } } } - return Tag(get_pos_mask(pos_idx), values); + + return make_tag(pos_idx, values, allow_extra); } Tag Tagset::make_tag(idx_t pos_idx, mask_t values, bool allow_extra) const { + mask_t required_values = get_pos_value_mask(pos_idx); + //std::cerr << values << "\n"; + //std::cerr << required_values << "\n"; + //std::cerr << (required_values & values) << "\n"; + //std::cerr << PwrNlp::count_bits_set(required_values & values) + // << pos_required_attributes_idx_[pos_idx].size() << "\n"; + size_t has_req = PwrNlp::count_bits_set(required_values & values); + if (has_req != pos_required_attributes_idx_[pos_idx].size()) { + throw TagParseError("Required attribute missing", + "", + get_pos_name(pos_idx), id_string()); + } mask_t valid_values = get_pos_value_mask(pos_idx); mask_t invalid = values & ~valid_values; if (invalid.any() && !allow_extra) { @@ -267,7 +283,7 @@ bool Tagset::validate_tag(const Tag &t, bool allow_extra, std::vector<bool> valid = get_pos_attributes_flag(pos_idx); std::vector<bool> required = get_pos_required_attributes(pos_idx); - for (idx_t i = 0; i < attribute_dict_.size(); ++i) { + for (idx_t i = 0; i < attribute_count(); ++i) { mask_t value = t.get_values_for(get_attribute_mask(i)); if (value == 0) { if (required[i]) { @@ -309,7 +325,7 @@ std::string Tagset::tag_to_string(const Tag &tag) const } } // print extra attributes - for (idx_t a = 0; a < attribute_dict_.size(); ++a) { + for (idx_t a = 0; a < attribute_count(); ++a) { if (!pos_has_attribute(pos_idx, a)) { mask_t value = tag.get_values_for(get_attribute_mask(a)); if (value.any()) { @@ -431,7 +447,7 @@ const std::string& Tagset::get_attribute_name(idx_t a) const const std::vector<mask_t>& Tagset::get_attribute_values(idx_t a) const { static std::vector<mask_t> null_vec; - if (a < 0 || a >= attribute_dict_.size()) { + if (a < 0 || a >= attribute_count()) { return null_vec; } else { return attribute_values_[a]; @@ -440,7 +456,7 @@ const std::vector<mask_t>& Tagset::get_attribute_values(idx_t a) const mask_t Tagset::get_attribute_mask(idx_t a) const { - if (a < 0 || a >= attribute_dict_.size()) { + if (a < 0 || a >= attribute_count()) { return 0; } else { return attribute_masks_[a]; @@ -531,25 +547,25 @@ bool Tagset::pos_has_attribute(idx_t pos, idx_t attribute) const mask_t Tagset::get_pos_value_mask(idx_t pos) const { - return 0; //TODO + return pos_valid_value_masks_[pos]; } mask_t Tagset::get_pos_required_mask(idx_t pos) const { - return 0; //TODO + return pos_required_value_masks_[pos]; } -size_t Tagset::pos_count() const +int Tagset::pos_count() const { return pos_dict_.size(); } -size_t Tagset::attribute_count() const +int Tagset::attribute_count() const { return attribute_dict_.size(); } -size_t Tagset::value_count() const +int Tagset::value_count() const { return value_mask_to_string_.size(); } diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h index af3f417f692df2ff81ef0d84d1d63b2b9f7a4d82..51bed03d933966c3f6c7d561bdda248d9c956844 100644 --- a/libcorpus2/tagset.h +++ b/libcorpus2/tagset.h @@ -343,11 +343,11 @@ public: mask_t get_pos_required_mask(idx_t pos) const; - size_t pos_count() const; + int pos_count() const; - size_t attribute_count() const; + int attribute_count() const; - size_t value_count() const; + int value_count() const; /** * Tagset cardinality counter -- the number of different valid tags @@ -489,6 +489,8 @@ private: /// output and the behavior of the _ special character in parsing std::vector< std::vector<idx_t> > pos_attributes_; + std::vector< std::vector<idx_t> > pos_required_attributes_idx_; + std::vector<mask_t> pos_valid_value_masks_; std::vector<mask_t> pos_required_value_masks_; diff --git a/libcorpus2/tagsetparser.cpp b/libcorpus2/tagsetparser.cpp index 6e71a6169cb293dbbef7816100c55107a10e1ee7..e4d3ac93d6c1cbee476c27d7abf06a24d27d20a9 100644 --- a/libcorpus2/tagsetparser.cpp +++ b/libcorpus2/tagsetparser.cpp @@ -143,13 +143,24 @@ Tagset TagsetParser::load_ini(std::istream &is) vec.clear(); foreach (const pmap_t::value_type v, pmap) { vec.push_back(v.first); + mask_t valid(0); + mask_t required(0); tagset.pos_attributes_.push_back(v.second); + tagset.pos_required_attributes_idx_.resize( + tagset.pos_required_attributes_idx_.size() + 1); tagset.pos_valid_attributes_.push_back( std::vector<bool>(tagset.attribute_values_.size(), false)); foreach (idx_t a, v.second) { + valid |= tagset.get_attribute_mask(a); + if (reqmap[v.first][a]) { + required |= tagset.get_attribute_mask(a); + tagset.pos_required_attributes_idx_.back().push_back(a); + } tagset.pos_valid_attributes_.back()[a] = true; } tagset.pos_required_attributes_.push_back(reqmap[v.first]); + tagset.pos_valid_value_masks_.push_back(valid); + tagset.pos_required_value_masks_.push_back(required); } tagset.pos_dict_.load_sorted_data(vec); if (tagset.pos_dict_.size() == 0) {