From 29c7ed0ea27fe8713fcb0d6fa1080ac2aa7c4ce7 Mon Sep 17 00:00:00 2001 From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl> Date: Thu, 14 Jul 2011 16:00:26 +0200 Subject: [PATCH] helper function for processing unspec attr vals --- libcorpus2/tagging.cpp | 38 ++++++++++++++++++++++++++++++++ libcorpus2/tagging.h | 20 +++++++++++++++++ libcorpus2/tagset.cpp | 49 ++++++++++++++++++++++++++++++++++++++++++ libcorpus2/tagset.h | 18 ++++++++++++++++ swig/tagging.i | 8 +++++++ swig/tagset.i | 3 ++- 6 files changed, 135 insertions(+), 1 deletion(-) diff --git a/libcorpus2/tagging.cpp b/libcorpus2/tagging.cpp index b1f4587..6730461 100644 --- a/libcorpus2/tagging.cpp +++ b/libcorpus2/tagging.cpp @@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <libpwrutils/foreach.h> +#include <libpwrutils/bitset.h> namespace Corpus2 { @@ -52,5 +53,42 @@ Tag mask_token(const Token& token, const Tag& mask, bool disamb_only) return t; } +int mask_card(const Tag& mask) +{ + return PwrNlp::count_bits_set(mask.get_pos()) + + PwrNlp::count_bits_set(mask.get_values()); +} + +bool select_preferred_disamb(const Tagset& tagset, + Token* token) +{ + size_t lex_idx = token->get_preferred_lexeme_index(tagset); + if(!token->lexemes()[lex_idx].is_disamb()) { + return false; + } + + for (size_t other_idx = 0; + other_idx < token->lexemes().size(); + ++other_idx) { + if (other_idx != lex_idx) { + token->lexemes()[other_idx].set_disamb(false); + } + } + return true; +} + +void expand_unspec_attrs(const Tagset& tagset, Token* token) +{ + foreach (Lexeme& lex, token->lexemes()) { + lex.set_tag(tagset.expand_unspec_attrs(lex.tag())); + } +} + +void select_singular_tags(const Tagset& tagset, Token* token) +{ + foreach (Lexeme& lex, token->lexemes()) { + lex.set_tag(tagset.select_singular(lex.tag())); + } +} } /* end ns Corpus2 */ diff --git a/libcorpus2/tagging.h b/libcorpus2/tagging.h index 42e4dd2..16ed943 100644 --- a/libcorpus2/tagging.h +++ b/libcorpus2/tagging.h @@ -42,6 +42,26 @@ Tag 
get_attribute_mask(const Tagset& tagset, */ Tag mask_token(const Token& token, const Tag& mask, bool disamb_only); +/** Returns the number of set elements belonging to the mask given. */ +int mask_card(const Tag& mask); + +/** Forces one disamb lexeme per token. The selection is based on tagset + * definition order. Returns whether any disamb lexeme was found. + */ +bool select_preferred_disamb(const Tagset& tagset, Token* token); + +/** Encodes attributes with unspecified values as each value set. + * This is to facilitate safe masking when the value in question is not to be + * skipped. + */ +void expand_unspec_attrs(const Tagset& tagset, Token* token); + +/** Repairs multivalue tags. Optional attributes will be cleared if + * multi-value. Regular attributes will be set to lowest value given. + */ +void select_singular_tags(const Tagset& tagset, Token* token); + + } /* end ns Corpus2 */ #endif // LIBCORPUS2_TAGGING_H diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp index f078d17..fd0af7b 100644 --- a/libcorpus2/tagset.cpp +++ b/libcorpus2/tagset.cpp @@ -571,6 +571,55 @@ std::vector<Tag> Tagset::split_tag(const Tag& tag) const return tags; } +Tag Tagset::select_singular(const Tag& tag) const +{ + Tag new_tag; + // force one POS + idx_t pos_idx = tag.get_pos_index(); + mask_t pos_mask = get_pos_mask(pos_idx); + new_tag.set_pos(pos_mask); + // now iterate over attrs + const std::vector<idx_t>& attrs = get_pos_attributes(pos_idx); + foreach (const idx_t& a, attrs) { + mask_t attr_mask = get_attribute_mask(a); + mask_t value = tag.get_values_for(attr_mask); + // check if the attr is multi-value + if (PwrNlp::count_bits_set(value) > 1) + { + if (pos_requires_attribute(pos_idx, a)) { + // this is a required attr, so just select first value + idx_t val_bit = PwrNlp::lowest_bit(value); + // well, this is not POS but attr value but the + // implementation is ok anyway... 
+ mask_t one_mask = get_pos_mask(val_bit); + new_tag.add_values(one_mask); + } + // else it is already null + } + else { + // leave the singular value intact + new_tag.add_values(value); + } + } + return new_tag; +} + +Tag Tagset::expand_unspec_attrs(const Tag& tag) const +{ + Tag new_tag(tag); + idx_t pos_idx = tag.get_pos_index(); + const std::vector<idx_t>& attrs = get_pos_attributes(pos_idx); + foreach (const idx_t& a, attrs) { + mask_t attr_mask = get_attribute_mask(a); + mask_t value = tag.get_values_for(attr_mask); + if (!value.any()) { // no value given + const Tag all_vals(0, attr_mask); + new_tag.combine_with(all_vals); + } + } + return new_tag; +} + idx_t Tagset::get_pos_index(const string_range& pos) const { return pos_dict_.get_id(pos); diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h index ae407a4..271a735 100644 --- a/libcorpus2/tagset.h +++ b/libcorpus2/tagset.h @@ -365,6 +365,24 @@ public: */ std::vector<Tag> split_tag(const Tag& tag) const; + /** + * Creates a singular tag from this one, possibly being a "multi-tag". + * POS and required attribute sets are reduced to tagset-wise first + * values, while optional attributes are reduced to unspecified if + * multiple values given (left intact otherwise). + */ + Tag select_singular(const Tag& tag) const; + + /** + * Creates a copy of the given tag where optional or required attributes + * with no value given are encoded as each possible value set. + * NOTE: this may result in tags technically invalid (multiple values set + * for one attribute), yet it is convenient for some tagging scenarios to + * explicitly distinguish between an irrelevant attribute and a relevant + * one with no value given. 
+ */ + Tag expand_unspec_attrs(const Tag& tag) const; + /// POS name <-> index dictionary getter const SymbolDictionary<idx_t>& pos_dictionary() const { return pos_dict_; diff --git a/swig/tagging.i b/swig/tagging.i index 21c79f1..5ae141d 100644 --- a/swig/tagging.i +++ b/swig/tagging.i @@ -19,6 +19,14 @@ Tag get_attribute_mask(const Tagset& tagset, Tag mask_token(const Token& token, const Tag& mask, bool disamb_only); +int mask_card(const Tag& mask); + +bool select_preferred_disamb(const Tagset& tagset, Token* token); + +void expand_unspec_attrs(const Tagset& tagset, Token* token); + +void select_singular_tags(const Tagset& tagset, Token* token); + } using namespace std; diff --git a/swig/tagset.i b/swig/tagset.i index d318a80..1d62038 100644 --- a/swig/tagset.i +++ b/swig/tagset.i @@ -91,7 +91,8 @@ namespace Corpus2 { /* --------------------------------------------------------------------- */ std::vector<Tag> split_tag(const Tag& tag) const; - + Tag select_singular(const Tag& tag) const; + Tag expand_unspec_attrs(const Tag& tag) const; /* --------------------------------------------------------------------- */ int pos_count() const; int attribute_count() const; -- GitLab