diff --git a/libcorpus2/tagging.cpp b/libcorpus2/tagging.cpp
index 6730461c98a04f2cd9673eedbf06d72e4d2c12b4..d009c95166e41b93a695a2d36845fcce23acc220 100644
--- a/libcorpus2/tagging.cpp
+++ b/libcorpus2/tagging.cpp
@@ -91,4 +91,38 @@ void select_singular_tags(const Tagset& tagset, Token* token)
 	}
 }
 
+bool disambiguate_equal(Token* token, const Tag& mask_where,
+	const Tag& mask_wanted)
+{
+	std::vector<Lexeme> wanted;
+	foreach (const Lexeme& lex, token->lexemes()) {
+		Tag mask_theirs = lex.tag().get_masked(mask_where);
+		if (mask_theirs == mask_wanted) {
+			wanted.push_back(lex);
+		}
+	}
+	if (wanted.empty()) {
+		return false; // nothing matched: token is left as it was
+	}
+	token->replace_lexemes(wanted);
+	return true;
+}
+
+bool disambiguate_subset(Token* token, const Tag& mask_where,
+	const Tag& mask_wanted)
+{
+	std::vector<Lexeme> wanted;
+	foreach (const Lexeme& lex, token->lexemes()) {
+		Tag mask_theirs = lex.tag().get_masked(mask_where);
+		if (mask_theirs.get_masked(mask_wanted) == mask_theirs) {
+			wanted.push_back(lex); // masking by mask_wanted changed nothing
+		}
+	}
+	if (wanted.empty()) {
+		return false;
+	}
+	token->replace_lexemes(wanted);
+	return true;
+}
+
 } /* end ns Corpus2 */
diff --git a/libcorpus2/tagging.h b/libcorpus2/tagging.h
index 16ed9437b916592393fa9d55b12ca04ae571ef06..0cbb4438a3b854d787d590f09dae29083344e56f 100644
--- a/libcorpus2/tagging.h
+++ b/libcorpus2/tagging.h
@@ -61,6 +61,24 @@ void expand_unspec_attrs(const Tagset& tagset, Token* token);
  */
 void select_singular_tags(const Tagset& tagset, Token* token);
 
+/** Tries to keep only those lexemes whose tags, projected onto mask_where,
+ * are exactly equal to mask_wanted (e.g. pass a whole attribute as mask_where
+ * and one particular desired value as mask_wanted). If no lexeme satisfies
+ * the constraint, the token is left intact.
+ * @return true if any lexeme matched (token now holds only those), else false
+ */
+bool disambiguate_equal(Token* token, const Tag& mask_where,
+	const Tag& mask_wanted);
+
+/** Tries to keep only those lexemes whose tags, projected onto mask_where,
+ * have a subset of the value given in mask_wanted. E.g. pass a noun + gerund
+ * mask to have lexemes of both kinds left. NOTE: this may be inconvenient
+ * for dealing with optional attributes. If no lexeme satisfies the
+ * constraint, the token is left intact.
+ * @return true if any lexeme matched (token now holds only those), else false
+ */
+bool disambiguate_subset(Token* token, const Tag& mask_where,
+	const Tag& mask_wanted);
 
 } /* end ns Corpus2 */
 
diff --git a/swig/tagging.i b/swig/tagging.i
index 5ae141def36404439243a2c4d963cbdc6bcd2fdf..96d1bb84c34a6ac0da2ff4560fcfb9754fa18902 100644
--- a/swig/tagging.i
+++ b/swig/tagging.i
@@ -27,6 +27,10 @@ void expand_unspec_attrs(const Tagset& tagset, Token* token);
 
 void select_singular_tags(const Tagset& tagset, Token* token);
 
+bool disambiguate_equal(Token* token, const Tag& mask_where, const Tag& mask_wanted);
+
+bool disambiguate_subset(Token* token, const Tag& mask_where, const Tag& mask_wanted);
+
 }
 
 using namespace std;