From 29c7ed0ea27fe8713fcb0d6fa1080ac2aa7c4ce7 Mon Sep 17 00:00:00 2001
From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl>
Date: Thu, 14 Jul 2011 16:00:26 +0200
Subject: [PATCH] helper function for processing unspec attr vals

---
 libcorpus2/tagging.cpp | 38 ++++++++++++++++++++++++++++++++
 libcorpus2/tagging.h   | 20 +++++++++++++++++
 libcorpus2/tagset.cpp  | 49 ++++++++++++++++++++++++++++++++++++++++++
 libcorpus2/tagset.h    | 18 ++++++++++++++++
 swig/tagging.i         |  8 +++++++
 swig/tagset.i          |  3 ++-
 6 files changed, 135 insertions(+), 1 deletion(-)

diff --git a/libcorpus2/tagging.cpp b/libcorpus2/tagging.cpp
index b1f4587..6730461 100644
--- a/libcorpus2/tagging.cpp
+++ b/libcorpus2/tagging.cpp
@@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 
 
 #include <libpwrutils/foreach.h>
+#include <libpwrutils/bitset.h>
 
 namespace Corpus2 {
 
@@ -52,5 +53,42 @@ Tag mask_token(const Token& token, const Tag& mask, bool disamb_only)
 	return t;
 }
 
+int mask_card(const Tag& mask) // cardinality of a mask = number of bits set in it
+{
+	return PwrNlp::count_bits_set(mask.get_pos()) // bits set in the POS part
+			+ PwrNlp::count_bits_set(mask.get_values()); // bits set in the values part
+}
+
+bool select_preferred_disamb(const Tagset& tagset, Token* token)
+{
+	// Keeps the disamb flag on the token's preferred lexeme only. Returns
+	// false (token left unmodified) when that lexeme is not disamb or when
+	// no valid lexeme index is available, e.g. for a token with no lexemes.
+	const size_t lex_idx = token->get_preferred_lexeme_index(tagset);
+	if (lex_idx >= token->lexemes().size()
+			|| !token->lexemes()[lex_idx].is_disamb()) {
+		return false;
+	}
+	for (size_t other_idx = 0; other_idx < token->lexemes().size(); ++other_idx) {
+		if (other_idx != lex_idx) {
+			token->lexemes()[other_idx].set_disamb(false);
+		}
+	}
+	return true;
+}
+
+void expand_unspec_attrs(const Tagset& tagset, Token* token)
+{
+	// replace every lexeme's tag, in place, with its expanded counterpart
+	for (size_t i = 0; i < token->lexemes().size(); ++i)
+		token->lexemes()[i].set_tag(tagset.expand_unspec_attrs(token->lexemes()[i].tag()));
+}
+
+void select_singular_tags(const Tagset& tagset, Token* token)
+{
+	// swap every lexeme's tag, in place, for its singular version
+	for (size_t i = 0; i < token->lexemes().size(); ++i)
+		token->lexemes()[i].set_tag(tagset.select_singular(token->lexemes()[i].tag()));
+}
 
 } /* end ns Corpus2 */
diff --git a/libcorpus2/tagging.h b/libcorpus2/tagging.h
index 42e4dd2..16ed943 100644
--- a/libcorpus2/tagging.h
+++ b/libcorpus2/tagging.h
@@ -42,6 +42,26 @@ Tag get_attribute_mask(const Tagset& tagset,
   */
 Tag mask_token(const Token& token, const Tag& mask, bool disamb_only);
 
+/** Returns the number of bits set in the given mask (POS plus values). */
+int mask_card(const Tag& mask);
+
+/** Keeps only the preferred lexeme (chosen by tagset definition order) marked
+  * as disamb. Returns false, leaving the token untouched, if it is not disamb.
+  */
+bool select_preferred_disamb(const Tagset& tagset, Token* token);
+
+/** For every lexeme of the token, re-encodes attributes with no value given
+  * as having each of their values set. This is to facilitate safe masking
+  * when the value in question is not to be skipped.
+  */
+void expand_unspec_attrs(const Tagset& tagset, Token* token);
+
+/** Repairs multi-value tags of all the token's lexemes. Optional attributes
+  * are cleared if multi-value. Required ones are set to lowest value given.
+  */
+void select_singular_tags(const Tagset& tagset, Token* token);
+
+
 } /* end ns Corpus2 */
 
 #endif // LIBCORPUS2_TAGGING_H
diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp
index f078d17..fd0af7b 100644
--- a/libcorpus2/tagset.cpp
+++ b/libcorpus2/tagset.cpp
@@ -571,6 +571,55 @@ std::vector<Tag> Tagset::split_tag(const Tag& tag) const
 	return tags;
 }
 
+Tag Tagset::select_singular(const Tag& tag) const
+{
+	// Builds the result from scratch: the POS goes in first, then the values
+	// of each attribute of that POS are transferred one attribute at a time.
+	Tag singular;
+	idx_t pos = tag.get_pos_index();
+	mask_t pos_bits = get_pos_mask(pos);
+	singular.set_pos(pos_bits);
+	const std::vector<idx_t>& pos_attrs = get_pos_attributes(pos);
+	foreach (const idx_t& attr, pos_attrs) {
+		mask_t attr_bits = get_attribute_mask(attr);
+		mask_t given = tag.get_values_for(attr_bits);
+		if (PwrNlp::count_bits_set(given) <= 1) {
+			// zero or one value given: transfer it unchanged
+			singular.add_values(given);
+			continue;
+		}
+		// from here on the attribute is known to carry several values
+		if (!pos_requires_attribute(pos, attr)) {
+			// optional attribute: stays unset in the result
+			continue;
+		}
+		// required attribute: only the lowest of the given bits survives
+		idx_t keep_bit = PwrNlp::lowest_bit(given);
+		// NOTE(review): get_pos_mask() is borrowed here to turn a bit index
+		// into a one-bit mask, although the index denotes an attribute value
+		// and not a POS; this presumes the call is index-agnostic -- confirm.
+		mask_t keep_mask = get_pos_mask(keep_bit);
+		singular.add_values(keep_mask);
+	}
+	return singular;
+}
+
+Tag Tagset::expand_unspec_attrs(const Tag& tag) const
+{
+	// Start from a copy so that every value already given is preserved.
+	Tag expanded(tag);
+	idx_t pos = tag.get_pos_index();
+	const std::vector<idx_t>& pos_attrs = get_pos_attributes(pos);
+	foreach (const idx_t& attr, pos_attrs) {
+		mask_t attr_bits = get_attribute_mask(attr);
+		if (tag.get_values_for(attr_bits).any()) {
+			continue; // a value is given for this attribute: nothing to add
+		}
+		expanded.combine_with(Tag(0, attr_bits)); // merge in the whole attribute mask
+	}
+	return expanded;
+}
+
 idx_t Tagset::get_pos_index(const string_range& pos) const
 {
 	return pos_dict_.get_id(pos);
diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h
index ae407a4..271a735 100644
--- a/libcorpus2/tagset.h
+++ b/libcorpus2/tagset.h
@@ -365,6 +365,24 @@ public:
 	 */
 	std::vector<Tag> split_tag(const Tag& tag) const;
 
+	/**
+	  * Creates a singular tag from the given one, which may be a "multi-tag".
+	  * POS and required attribute sets are reduced to tagset-wise first
+	  * values, while optional attributes are reduced to unspecified if
+	  * multiple values are given (left intact otherwise).
+	  */
+	Tag select_singular(const Tag& tag) const;
+
+	/**
+	  * Creates a copy of the given tag where optional or required attributes
+	  * with no value given are encoded as having every possible value set.
+	  * NOTE: this may result in technically invalid tags (multiple values set
+	  * for one attribute), yet it is convenient for some tagging scenarios to
+	  * explicitly distinguish between an irrelevant attribute and a relevant
+	  * one with no value given.
+	  */
+	Tag expand_unspec_attrs(const Tag& tag) const;
+
 	/// POS name <-> index dictionary getter
 	const SymbolDictionary<idx_t>& pos_dictionary() const {
 		return pos_dict_;
diff --git a/swig/tagging.i b/swig/tagging.i
index 21c79f1..5ae141d 100644
--- a/swig/tagging.i
+++ b/swig/tagging.i
@@ -19,6 +19,14 @@ Tag get_attribute_mask(const Tagset& tagset,
 
 Tag mask_token(const Token& token, const Tag& mask, bool disamb_only);
 
+int mask_card(const Tag& mask); // number of bits set in the mask
+
+bool select_preferred_disamb(const Tagset& tagset, Token* token); // modifies *token
+
+void expand_unspec_attrs(const Tagset& tagset, Token* token); // modifies *token
+
+void select_singular_tags(const Tagset& tagset, Token* token); // modifies *token
+
 }
 
 using namespace std;
diff --git a/swig/tagset.i b/swig/tagset.i
index d318a80..1d62038 100644
--- a/swig/tagset.i
+++ b/swig/tagset.i
@@ -91,7 +91,8 @@ namespace Corpus2 {
 
     /* --------------------------------------------------------------------- */
     std::vector<Tag> split_tag(const Tag& tag) const;
-
+    Tag select_singular(const Tag& tag) const; // returns a new tag, argument untouched
+    Tag expand_unspec_attrs(const Tag& tag) const; // returns a new tag, argument untouched
     /* --------------------------------------------------------------------- */
     int pos_count() const;
     int attribute_count() const;
-- 
GitLab