diff --git a/libcorpus2/tagging.cpp b/libcorpus2/tagging.cpp index b1f4587deff2189e8cc9f962ce0dbc942dfae1b7..6730461c98a04f2cd9673eedbf06d72e4d2c12b4 100644 --- a/libcorpus2/tagging.cpp +++ b/libcorpus2/tagging.cpp @@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <libpwrutils/foreach.h> +#include <libpwrutils/bitset.h> namespace Corpus2 { @@ -52,5 +53,42 @@ Tag mask_token(const Token& token, const Tag& mask, bool disamb_only) return t; } +int mask_card(const Tag& mask) +{ + return PwrNlp::count_bits_set(mask.get_pos()) + + PwrNlp::count_bits_set(mask.get_values()); +} + +bool select_preferred_disamb(const Tagset& tagset, + Token* token) +{ + size_t lex_idx = token->get_preferred_lexeme_index(tagset); + if(!token->lexemes()[lex_idx].is_disamb()) { + return false; + } + + for (size_t other_idx = 0; + other_idx < token->lexemes().size(); + ++other_idx) { + if (other_idx != lex_idx) { + token->lexemes()[other_idx].set_disamb(false); + } + } + return true; +} + +void expand_unspec_attrs(const Tagset& tagset, Token* token) +{ + foreach (Lexeme& lex, token->lexemes()) { + lex.set_tag(tagset.expand_unspec_attrs(lex.tag())); + } +} + +void select_singular_tags(const Tagset& tagset, Token* token) +{ + foreach (Lexeme& lex, token->lexemes()) { + lex.set_tag(tagset.select_singular(lex.tag())); + } +} } /* end ns Corpus2 */ diff --git a/libcorpus2/tagging.h b/libcorpus2/tagging.h index 42e4dd29d6c1376eff539cbca954d81f0bce77ad..16ed9437b916592393fa9d55b12ca04ae571ef06 100644 --- a/libcorpus2/tagging.h +++ b/libcorpus2/tagging.h @@ -42,6 +42,26 @@ Tag get_attribute_mask(const Tagset& tagset, */ Tag mask_token(const Token& token, const Tag& mask, bool disamb_only); +/** Returns the number of bits set in the given mask, counting both its POS part and its values part. */ +int mask_card(const Tag& mask); + +/** Forces one disamb lexeme per token. The selection is based on tagset + * definition order. Returns true if the preferred lexeme is marked disamb (every other lexeme then gets its disamb flag cleared); otherwise returns false and leaves the token unchanged. 
+ */ +bool select_preferred_disamb(const Tagset& tagset, Token* token); + +/** For every lexeme of the token, encodes attributes with no value given as all of their values set. + * This is to facilitate safe masking when the value in question is not to be + * skipped. + */ +void expand_unspec_attrs(const Tagset& tagset, Token* token); + +/** Repairs multi-value tags of every lexeme of the token. Optional attributes will be cleared if + * multi-value. Required attributes will be reduced to the lowest value given. + */ +void select_singular_tags(const Tagset& tagset, Token* token); + + } /* end ns Corpus2 */ #endif // LIBCORPUS2_TAGGING_H diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp index f078d170438dc11e6de305f2673204a4947899c6..fd0af7be58fdbee633ce8344cfc3a3ad4655b62a 100644 --- a/libcorpus2/tagset.cpp +++ b/libcorpus2/tagset.cpp @@ -571,6 +571,55 @@ std::vector<Tag> Tagset::split_tag(const Tag& tag) const return tags; } +Tag Tagset::select_singular(const Tag& tag) const +{ + Tag new_tag; + // force one POS + idx_t pos_idx = tag.get_pos_index(); + mask_t pos_mask = get_pos_mask(pos_idx); + new_tag.set_pos(pos_mask); + // now iterate over attrs + const std::vector<idx_t>& attrs = get_pos_attributes(pos_idx); + foreach (const idx_t& a, attrs) { + mask_t attr_mask = get_attribute_mask(a); + mask_t value = tag.get_values_for(attr_mask); + // check if the attr is multi-value + if (PwrNlp::count_bits_set(value) > 1) + { + if (pos_requires_attribute(pos_idx, a)) { + // this is a required attr, so just select first value + idx_t val_bit = PwrNlp::lowest_bit(value); + // NOTE: val_bit indexes an attribute value, not a POS; get_pos_mask() is + // reused only to turn that bit index into a mask (presumably single-bit -- TODO confirm). 
+ mask_t one_mask = get_pos_mask(val_bit); + new_tag.add_values(one_mask); + } + // else: optional attr, nothing is added for it to new_tag + } + else { + // leave the singular value intact + new_tag.add_values(value); + } + } + return new_tag; +} + +Tag Tagset::expand_unspec_attrs(const Tag& tag) const +{ + Tag new_tag(tag); + idx_t pos_idx = tag.get_pos_index(); + const std::vector<idx_t>& attrs = get_pos_attributes(pos_idx); + foreach (const idx_t& a, attrs) { + mask_t attr_mask = get_attribute_mask(a); + mask_t value = tag.get_values_for(attr_mask); + if (!value.any()) { // no value given + const Tag all_vals(0, attr_mask); + new_tag.combine_with(all_vals); + } + } + return new_tag; +} + idx_t Tagset::get_pos_index(const string_range& pos) const { return pos_dict_.get_id(pos); diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h index ae407a45be504a1670ee86d2e803e329110cf567..271a7353fd2dbe2478fde435a52321463d662547 100644 --- a/libcorpus2/tagset.h +++ b/libcorpus2/tagset.h @@ -365,6 +365,24 @@ public: */ std::vector<Tag> split_tag(const Tag& tag) const; + /** + * Creates a singular tag from the given one, which may be a "multi-tag". + * POS and required attribute sets are reduced to tagset-wise first + * values, while optional attributes are reduced to unspecified if + * multiple values given (left intact otherwise). + */ + Tag select_singular(const Tag& tag) const; + + /** + * Creates a copy of the given tag where optional or required attributes + * with no value given are encoded as each possible value set. + * NOTE: this may result in tags technically invalid (multiple values set + * for one attribute), yet it is convenient for some tagging scenarios to + * explicitly distinguish between an irrelevant attribute and a relevant + * one with no value given. 
+ */ + Tag expand_unspec_attrs(const Tag& tag) const; + /// POS name <-> index dictionary getter const SymbolDictionary<idx_t>& pos_dictionary() const { return pos_dict_; diff --git a/swig/tagging.i b/swig/tagging.i index 21c79f1fe484df8a795239711a0ce1c242eb2bd6..5ae141def36404439243a2c4d963cbdc6bcd2fdf 100644 --- a/swig/tagging.i +++ b/swig/tagging.i @@ -19,6 +19,14 @@ Tag get_attribute_mask(const Tagset& tagset, Tag mask_token(const Token& token, const Tag& mask, bool disamb_only); +int mask_card(const Tag& mask); + +bool select_preferred_disamb(const Tagset& tagset, Token* token); + +void expand_unspec_attrs(const Tagset& tagset, Token* token); + +void select_singular_tags(const Tagset& tagset, Token* token); + } using namespace std; diff --git a/swig/tagset.i b/swig/tagset.i index d318a80adf8e56c05d5c30668fac1f12b33414a4..1d620388f7a2fd5aced5cd7e5736e5b26c218882 100644 --- a/swig/tagset.i +++ b/swig/tagset.i @@ -91,7 +91,8 @@ namespace Corpus2 { /* --------------------------------------------------------------------- */ std::vector<Tag> split_tag(const Tag& tag) const; - + Tag select_singular(const Tag& tag) const; + Tag expand_unspec_attrs(const Tag& tag) const; /* --------------------------------------------------------------------- */ int pos_count() const; int attribute_count() const;