From 2e82e6534325a9f9d353f7d5ba3172410614f4cd Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Wed, 20 Oct 2010 16:10:32 +0200
Subject: [PATCH] add Tagset::split_tag

---
Review note: split_tag now builds each attribute's expansion in a separate
vector. The earlier std::copy/back_inserter into the same vector could
invalidate its own source iterators, and the first tags kept receiving every
value of an attribute. The test additionally checks that every split tag is
singular. Blob ids in the index lines are those of the original revision.

 libcorpus2/tagset.cpp | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 libcorpus2/tagset.h   |  2 ++
 tests/tag_split.cpp   |  9 +++++++++
 3 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp
index e03a78e..403b45b 100644
--- a/libcorpus2/tagset.cpp
+++ b/libcorpus2/tagset.cpp
@@ -361,6 +361,52 @@ bool Tagset::tag_is_singular(const Tag& tag) const
 	return true;
 }
 
+/**
+ * Split a tag into one tag for every combination of a single POS and a
+ * single set value of each attribute that has values set in the tag.
+ *
+ * @param tag the tag to split; it may have several POS bits set and
+ *            several values set for one attribute
+ * @return the resulting tags; empty when the tag has no POS set
+ */
+std::vector<Tag> Tagset::split_tag(const Tag& tag) const
+{
+	std::vector<Tag> tags;
+	// Start with one tag per POS bit of the input.
+	mask_t pos = tag.get_pos();
+	while (pos) {
+		idx_t pos_idx = PwrNlp::lowest_bit(pos);
+		mask_t pos_mask = static_cast<mask_t>(1) << pos_idx;
+		pos ^= pos_mask;
+		tags.push_back(Tag(pos_mask));
+	}
+
+	for (idx_t a = 0; a < attribute_count(); ++a) {
+		mask_t ma = get_attribute_mask(a);
+		mask_t v = tag.get_values_for(ma);
+		if (ma) {
+			// Expand into a separate vector: appending to tags while
+			// copying from it could invalidate the source iterators, and
+			// each result must receive only one value of this attribute.
+			std::vector<Tag> expanded;
+			foreach (mask_t vm, get_attribute_values(a)) {
+				if (v & vm) {
+					for (size_t i = 0; i < tags.size(); ++i) {
+						Tag with_value(tags[i]);
+						with_value.add_values(vm);
+						expanded.push_back(with_value);
+					}
+				}
+			}
+			// No value of this attribute is set: leave the tags alone.
+			if (!expanded.empty()) {
+				tags.swap(expanded);
+			}
+		}
+	}
+	return tags;
+}
+
 idx_t Tagset::get_pos_index(const string_range& pos) const
 {
 	return pos_dict_.get_id(pos);
@@ -379,7 +425,7 @@ mask_t Tagset::get_pos_mask(const string_range& pos) const
 mask_t Tagset::get_pos_mask(idx_t pos) const
 {
 	if (pos >= 0) {
-		return 1 << pos;
+		return static_cast<mask_t>(1) << pos;
 	} else {
 		return 0;
 	}
diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h
index 366b0b0..af3f417 100644
--- a/libcorpus2/tagset.h
+++ b/libcorpus2/tagset.h
@@ -254,6 +254,8 @@ public:
 
 	bool tag_is_singular(const Tag& tag) const;
 
+	std::vector<Tag> split_tag(const Tag& tag) const;
+
 	/// POS name <-> index dictionary getter
 	const SymbolDictionary<idx_t>& pos_dictionary() const {
 		return pos_dict_;
diff --git a/tests/tag_split.cpp b/tests/tag_split.cpp
index eb34403..997e0c2 100644
--- a/tests/tag_split.cpp
+++ b/tests/tag_split.cpp
@@ -180,6 +180,15 @@ BOOST_FIXTURE_TEST_CASE( tag_size, F )
 	Corpus2::Tag t4 = tagset->parse_simple_tag("same:other:thang", true);
 	t.add_values(t4.get_values() & tagset->get_attribute_mask(std::string("A")));
 	BOOST_CHECK_EQUAL(tagset->tag_size(t), 6);
+	std::vector<Corpus2::Tag> tags = tagset->split_tag(t);
+	BOOST_CHECK_EQUAL(tags.size(), 6);
+	Corpus2::Tag tt;
+	foreach (Corpus2::Tag part, tags) {
+		// every split result should be unambiguous on its own
+		BOOST_CHECK(tagset->tag_is_singular(part));
+		tt.combine_with(part);
+	}
+	BOOST_CHECK(tt == t);
 }
 
 BOOST_AUTO_TEST_SUITE_END()
-- 
GitLab