diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp
index e03a78eba5afc7bc57805a225ab2441e591a946f..403b45bba12f54fcda40e5e637dfc6ed40b7dfd2 100644
--- a/libcorpus2/tagset.cpp
+++ b/libcorpus2/tagset.cpp
@@ -361,6 +361,52 @@ bool Tagset::tag_is_singular(const Tag& tag) const
 	return true;
 }
 
+std::vector<Tag> Tagset::split_tag(const Tag& tag) const
+{
+	std::vector<Tag> tags;
+	// One partial tag per POS bit set in the input.
+	mask_t pos = tag.get_pos();
+	while (pos) {
+		idx_t pos_idx = PwrNlp::lowest_bit(pos);
+		mask_t pos_mask = static_cast<mask_t>(1) << pos_idx;
+		pos ^= pos_mask;
+		tags.push_back(Tag(pos_mask));
+	}
+
+	for (idx_t a = 0; a < attribute_count(); ++a) {
+		mask_t ma = get_attribute_mask(a);
+		mask_t v = tag.get_values_for(ma);
+		if (!ma || !v) {
+			continue;
+		}
+		// Snapshot of the tags built so far, still without any value of
+		// this attribute. Every extra value is applied to fresh copies of
+		// the snapshot, so values of one attribute never pile up in a
+		// single tag, and nothing is copied out of tags while it grows.
+		const std::vector<Tag> base(tags);
+		bool first = true;
+		foreach (mask_t vm, get_attribute_values(a)) {
+			if (!(v & vm)) {
+				continue;
+			}
+			if (first) {
+				// First value: extend the existing tags in place.
+				for (size_t i = 0; i < base.size(); ++i) {
+					tags[i].add_values(vm);
+				}
+				first = false;
+			} else {
+				// Further values: one more copy of the snapshot each.
+				for (size_t i = 0; i < base.size(); ++i) {
+					tags.push_back(base[i]);
+					tags.back().add_values(vm);
+				}
+			}
+		}
+	}
+	return tags;
+}
+
 idx_t Tagset::get_pos_index(const string_range& pos) const
 {
 	return pos_dict_.get_id(pos);
@@ -379,7 +425,7 @@ mask_t Tagset::get_pos_mask(const string_range& pos) const
 mask_t Tagset::get_pos_mask(idx_t pos) const
 {
 	if (pos >= 0) {
-		return 1 << pos;
+		return static_cast<mask_t>(1) << pos;
 	} else {
 		return 0;
 	}
diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h
index 366b0b083e8b08394bab7019a51e2e679cd5e93c..af3f417f692df2ff81ef0d84d1d63b2b9f7a4d82 100644
--- a/libcorpus2/tagset.h
+++ b/libcorpus2/tagset.h
@@ -254,6 +254,12 @@ public:
 
 	bool tag_is_singular(const Tag& tag) const;
 
+	/// Split a tag into the singular tags it covers: one tag for every
+	/// combination of a single POS bit and, for each attribute with values
+	/// set, a single value of that attribute. The result has no elements
+	/// when the tag has no POS bits set.
+	std::vector<Tag> split_tag(const Tag& tag) const;
+
 	/// POS name <-> index dictionary getter
 	const SymbolDictionary<idx_t>& pos_dictionary() const {
 		return pos_dict_;
diff --git a/tests/tag_split.cpp b/tests/tag_split.cpp
index eb3440344e8cfdb307cd7f9bee73c37f00be68be..997e0c2fe07063c6f8749232f18b46e3969356b5 100644
--- a/tests/tag_split.cpp
+++ b/tests/tag_split.cpp
@@ -180,6 +180,22 @@ BOOST_FIXTURE_TEST_CASE( tag_size, F )
 	Corpus2::Tag t4 = tagset->parse_simple_tag("same:other:thang", true);
 	t.add_values(t4.get_values() & tagset->get_attribute_mask(std::string("A")));
 	BOOST_CHECK_EQUAL(tagset->tag_size(t), 6);
+	std::vector<Corpus2::Tag> tags = tagset->split_tag(t);
+	BOOST_CHECK_EQUAL(tags.size(), 6u);
+	Corpus2::Tag tt;
+	foreach (const Corpus2::Tag& part, tags) {
+		// Every part must be unambiguous on its own.
+		BOOST_CHECK(tagset->tag_is_singular(part));
+		tt.combine_with(part);
+	}
+	// Together the parts must give back exactly the original tag.
+	BOOST_CHECK(tt == t);
+	// No two parts may be the same tag.
+	for (size_t i = 0; i < tags.size(); ++i) {
+		for (size_t j = i + 1; j < tags.size(); ++j) {
+			BOOST_CHECK(!(tags[i] == tags[j]));
+		}
+	}
 }
 
 BOOST_AUTO_TEST_SUITE_END()