From 2e82e6534325a9f9d353f7d5ba3172410614f4cd Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Wed, 20 Oct 2010 16:10:32 +0200
Subject: [PATCH] add Tagset::split_tag

---
 libcorpus2/tagset.cpp | 35 ++++++++++++++++++++++++++++++++++-
 libcorpus2/tagset.h   |  2 ++
 tests/tag_split.cpp   |  7 +++++++
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp
index e03a78e..403b45b 100644
--- a/libcorpus2/tagset.cpp
+++ b/libcorpus2/tagset.cpp
@@ -361,6 +361,39 @@ bool Tagset::tag_is_singular(const Tag& tag) const
 	return true;
 }
 
+/// Split a tag into tags with one POS and at most one value per attribute.
+std::vector<Tag> Tagset::split_tag(const Tag& tag) const
+{
+	std::vector<Tag> tags;
+	mask_t pos = tag.get_pos();
+	while (pos) {
+		mask_t pos_mask = static_cast<mask_t>(1) << PwrNlp::lowest_bit(pos);
+		pos ^= pos_mask;
+		tags.push_back(Tag(pos_mask));
+	}
+
+	for (idx_t a = 0; a < attribute_count(); ++a) {
+		mask_t ma = get_attribute_mask(a);
+		mask_t v = tag.get_values_for(ma);
+		if (!ma || !v) continue;
+		// Snapshot: each value extends clean copies; no self-aliasing copy.
+		const std::vector<Tag> base(tags);
+		bool first = true;
+		foreach (mask_t vm, get_attribute_values(a)) {
+			if (v & vm) {
+				if (!first) {
+					tags.insert(tags.end(), base.begin(), base.end());
+				}
+				first = false;
+				for (size_t i = tags.size() - base.size(); i < tags.size(); ++i) {
+					tags[i].add_values(vm);
+				}
+			}
+		}
+	}
+	return tags;
+}
+
 idx_t Tagset::get_pos_index(const string_range& pos) const
 {
 	return pos_dict_.get_id(pos);
@@ -379,7 +412,7 @@ mask_t Tagset::get_pos_mask(const string_range& pos) const
 mask_t Tagset::get_pos_mask(idx_t pos) const
 {
 	if (pos >= 0) {
-		return 1 << pos;
+		return static_cast<mask_t>(1) << pos;
 	} else {
 		return 0;
 	}
diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h
index 366b0b0..af3f417 100644
--- a/libcorpus2/tagset.h
+++ b/libcorpus2/tagset.h
@@ -254,6 +254,8 @@ public:
 
 	bool tag_is_singular(const Tag& tag) const;
 
+	/// Split a tag into a vector of tags built from its POS bits and attribute values
+	std::vector<Tag> split_tag(const Tag& tag) const;
 	/// POS name <-> index dictionary getter
 	const SymbolDictionary<idx_t>& pos_dictionary() const {
 		return pos_dict_;
diff --git a/tests/tag_split.cpp b/tests/tag_split.cpp
index eb34403..997e0c2 100644
--- a/tests/tag_split.cpp
+++ b/tests/tag_split.cpp
@@ -180,6 +180,13 @@ BOOST_FIXTURE_TEST_CASE( tag_size, F )
 	Corpus2::Tag t4 = tagset->parse_simple_tag("same:other:thang", true);
 	t.add_values(t4.get_values() & tagset->get_attribute_mask(std::string("A")));
 	BOOST_CHECK_EQUAL(tagset->tag_size(t), 6);
+	std::vector<Corpus2::Tag> tags = tagset->split_tag(t);
+	BOOST_CHECK_EQUAL(tags.size(), 6);
+	Corpus2::Tag tt;
+	foreach (Corpus2::Tag t, tags) {
+		tt.combine_with(t);
+	}
+	BOOST_CHECK(tt == t);
 }
 
 BOOST_AUTO_TEST_SUITE_END()
-- 
GitLab