Skip to content
Snippets Groups Projects
Commit 29c7ed0e authored by Adam Radziszewski's avatar Adam Radziszewski
Browse files

helper function for processing unspec attr vals

parent 83b7d92a
Branches
No related merge requests found
......@@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include <libpwrutils/foreach.h>
#include <libpwrutils/bitset.h>
namespace Corpus2 {
......@@ -52,5 +53,42 @@ Tag mask_token(const Token& token, const Tag& mask, bool disamb_only)
return t;
}
/**
 * Cardinality of a mask: the number of bits set in its POS part plus the
 * number of bits set in its attribute-values part.
 */
int mask_card(const Tag& mask)
{
	int cardinality = PwrNlp::count_bits_set(mask.get_pos());
	cardinality += PwrNlp::count_bits_set(mask.get_values());
	return cardinality;
}
/**
 * Leaves at most one lexeme of the token marked as disamb.
 *
 * The lexeme kept is the one pointed to by
 * Token::get_preferred_lexeme_index(tagset). If that lexeme is not marked
 * as disamb, the token is left untouched and false is returned. Otherwise
 * the disamb flag is cleared on every other lexeme and true is returned.
 *
 * @param tagset tagset passed to the preferred-lexeme selection
 * @param token  token to modify in place (must not be null)
 * @return true if the preferred lexeme is disamb, false otherwise (also
 *         when no valid preferred lexeme index is available)
 */
bool select_preferred_disamb(const Tagset& tagset,
		Token* token)
{
	const size_t lex_idx = token->get_preferred_lexeme_index(tagset);
	const size_t lex_count = token->lexemes().size();
	// Never subscript with an index outside the lexeme list. This is
	// presumably the case for a token without lexemes -- the exact value
	// returned by get_preferred_lexeme_index then is not visible here.
	if (lex_idx >= lex_count) {
		return false;
	}
	if (!token->lexemes()[lex_idx].is_disamb()) {
		return false;
	}
	// the preferred lexeme stays disamb, all the others lose the flag
	for (size_t other_idx = 0; other_idx < lex_count; ++other_idx) {
		if (other_idx != lex_idx) {
			token->lexemes()[other_idx].set_disamb(false);
		}
	}
	return true;
}
void expand_unspec_attrs(const Tagset& tagset, Token* token)
{
foreach (Lexeme& lex, token->lexemes()) {
lex.set_tag(tagset.expand_unspec_attrs(lex.tag()));
}
}
void select_singular_tags(const Tagset& tagset, Token* token)
{
foreach (Lexeme& lex, token->lexemes()) {
lex.set_tag(tagset.select_singular(lex.tag()));
}
}
} /* end ns Corpus2 */
......@@ -42,6 +42,26 @@ Tag get_attribute_mask(const Tagset& tagset,
*/
Tag mask_token(const Token& token, const Tag& mask, bool disamb_only);
/** Returns the number of set elements belonging to the mask given. */
int mask_card(const Tag& mask);
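/** Forces one disamb lexeme per token. The selection is based on tagset
* definition order. Returns true if the preferred lexeme is marked as disamb
* (all other lexemes then lose the disamb flag); returns false, leaving the
* token unchanged, otherwise.
*/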
bool select_preferred_disamb(const Tagset& tagset, Token* token);
/** Encodes each attribute that has no value specified as having all of its
* values set. This is to facilitate safe masking when the value in question
* is not to be skipped.
*/
void expand_unspec_attrs(const Tagset& tagset, Token* token);
/** Repairs multivalue tags. Optional attributes will be cleared if
* multi-value. Regular attributes will be set to lowest value given.
*/
void select_singular_tags(const Tagset& tagset, Token* token);
} /* end ns Corpus2 */
#endif // LIBCORPUS2_TAGGING_H
......@@ -571,6 +571,55 @@ std::vector<Tag> Tagset::split_tag(const Tag& tag) const
return tags;
}
/**
 * Builds a singular counterpart of a possibly multi-valued tag.
 *
 * The result gets the POS mask for tag.get_pos_index(). Then, for each
 * attribute listed for that POS:
 *  - at most one value set in the input: the value is copied as is,
 *  - several values set, attribute required by the POS: only the lowest
 *    set bit is kept,
 *  - several values set, attribute not required: left unspecified.
 */
Tag Tagset::select_singular(const Tag& tag) const
{
	const idx_t pos = tag.get_pos_index();
	Tag singular;
	// exactly one POS in the result
	singular.set_pos(get_pos_mask(pos));

	const std::vector<idx_t>& pos_attrs = get_pos_attributes(pos);
	for (std::vector<idx_t>::const_iterator it = pos_attrs.begin();
			it != pos_attrs.end(); ++it) {
		const idx_t attr = *it;
		const mask_t given = tag.get_values_for(get_attribute_mask(attr));
		if (PwrNlp::count_bits_set(given) <= 1) {
			// zero or one value: nothing to repair, copy it over
			singular.add_values(given);
			continue;
		}
		if (!pos_requires_attribute(pos, attr)) {
			// optional multi-value attribute: stays unspecified
			continue;
		}
		// required multi-value attribute: keep the first value only
		idx_t first_bit = PwrNlp::lowest_bit(given);
		// NOTE(review): get_pos_mask is reused here to turn a bit index
		// into a one-bit mask although the index denotes an attribute
		// value, not a POS; the original author considered this fine.
		mask_t first_value = get_pos_mask(first_bit);
		singular.add_values(first_value);
	}
	return singular;
}
/**
 * Returns a copy of the tag in which every attribute of the tag's POS that
 * has no value set gets the whole attribute mask combined in, i.e. all of
 * its values set. Attributes that already carry a value are not touched.
 */
Tag Tagset::expand_unspec_attrs(const Tag& tag) const
{
	Tag expanded(tag);
	const std::vector<idx_t>& pos_attrs =
			get_pos_attributes(tag.get_pos_index());
	for (std::vector<idx_t>::const_iterator it = pos_attrs.begin();
			it != pos_attrs.end(); ++it) {
		const mask_t whole_attr = get_attribute_mask(*it);
		if (tag.get_values_for(whole_attr).any()) {
			continue; // some value is given, leave the attribute alone
		}
		// nothing given: mark every value of this attribute
		expanded.combine_with(Tag(0, whole_attr));
	}
	return expanded;
}
idx_t Tagset::get_pos_index(const string_range& pos) const
{
return pos_dict_.get_id(pos);
......
......@@ -365,6 +365,24 @@ public:
*/
std::vector<Tag> split_tag(const Tag& tag) const;
/**
* Creates a singular tag from this one, possibly being a "multi-tag".
* POS and required attribute sets are reduced to tagset-wise first
* values, while optional attributes are reduced to unspecified if
* multiple values given (left intact otherwise).
*/
Tag select_singular(const Tag& tag) const;
/**
* Creates a copy of the given tag where optional or required attributes
* with no value given are encoded as each possible value set.
* NOTE: this may result in tags technically invalid (multiple values set
* for one attribute), yet it is convenient for some tagging scenarios to
* explicitly distinguish between an irrelevant attribute and a relevant
* one but no value given.
*/
Tag expand_unspec_attrs(const Tag& tag) const;
/// POS name <-> index dictionary getter
const SymbolDictionary<idx_t>& pos_dictionary() const {
return pos_dict_;
......
......@@ -19,6 +19,14 @@ Tag get_attribute_mask(const Tagset& tagset,
Tag mask_token(const Token& token, const Tag& mask, bool disamb_only);
int mask_card(const Tag& mask);
bool select_preferred_disamb(const Tagset& tagset, Token* token);
void expand_unspec_attrs(const Tagset& tagset, Token* token);
void select_singular_tags(const Tagset& tagset, Token* token);
}
using namespace std;
......
......@@ -91,7 +91,8 @@ namespace Corpus2 {
/* --------------------------------------------------------------------- */
std::vector<Tag> split_tag(const Tag& tag) const;
Tag select_singular(const Tag& tag) const;
Tag expand_unspec_attrs(const Tag& tag) const;
/* --------------------------------------------------------------------- */
int pos_count() const;
int attribute_count() const;
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment