diff --git a/libcorpus2/tagging.cpp b/libcorpus2/tagging.cpp index 1572fa8355726d4411873be6c6484abb219e5a73..b4bc5c39291cd14eb436400e9ee6032e8e73dd2e 100644 --- a/libcorpus2/tagging.cpp +++ b/libcorpus2/tagging.cpp @@ -63,7 +63,7 @@ bool select_preferred_disamb(const Tagset& tagset, Token* token) { size_t lex_idx = token->get_preferred_lexeme_index(tagset); if(!token->lexemes()[lex_idx].is_disamb()) { - return false; + return false; // disamb would've taken precedence => no disamb at all } for (size_t other_idx = 0; @@ -88,6 +88,38 @@ void select_preferred_lexeme(const Tagset& tagset, Token* token) } } +bool select_preferred_disamb_tag(const Tagset& tagset, Token* token) +{ + const Corpus2::Lexeme &prototypical = token->get_preferred_lexeme(tagset); + if(!prototypical.is_disamb()) { + return false; // disamb would've taken precedence => no disamb at all + } + foreach (Lexeme& lex, token->lexemes()) { /* drop the disamb marker from every lexeme whose tag differs from the preferred one */ + if (lex.tag() != prototypical.tag()) { + lex.set_disamb(false); + } + } + return true; +} + +void select_preferred_tag(const Tagset& tagset, Token* token) +{ + foreach (Lexeme& lex, token->lexemes()) { /* mark every lexeme as disamb before any filtering */ + lex.set_disamb(true); + } + if (token->lexemes().size() > 1) { /* with at most one lexeme there is nothing to filter */ + const Corpus2::Tag tag_wanted = token->get_preferred_lexeme(tagset).tag(); + std::vector<Lexeme> wanted; + foreach (const Lexeme& lex, token->lexemes()) { /* keep only the lexemes carrying the preferred tag */ + if (lex.tag() == tag_wanted) { + wanted.push_back(lex); + } + } + assert(!wanted.empty()); /* the preferred lexeme is expected to be among the lexemes scanned above */ + token->replace_lexemes(wanted); + } +} + void expand_optional_attrs(const Tagset& tagset, Token* token) { foreach (Lexeme& lex, token->lexemes()) { diff --git a/libcorpus2/tagging.h b/libcorpus2/tagging.h index 59fbcd4bfe0c869156c237d6f64c2c2b5b795ee1..65c4b80a1fe59958cc5e0313bf8f28320635a0a7 100644 --- a/libcorpus2/tagging.h +++ b/libcorpus2/tagging.h @@ -56,6 +56,17 @@ bool select_preferred_disamb(const Tagset& tagset, Token* token); */ void select_preferred_lexeme(const Tagset& tagset, Token* token); +/** Forces one DISAMB TAG per token: lexemes whose tag differs from the tag of the preferred lexeme lose their disamb marker and true is returned; if the preferred lexeme is not disamb, the token is left unchanged and false is returned. 
Works as select_preferred_disamb, + * but multiple disamb lexemes may be left, as long as they differ only + * in base forms. Returns if any disamb found. + */ +bool select_preferred_disamb_tag(const Tagset& tagset, Token* token); + +/** Forces one TAG per token. Works as select_preferred_lexeme, but multiple + * lexemes may be left, as long as they differ only in base forms. All lexemes that are left end up marked as disamb. + */ +void select_preferred_tag(const Tagset& tagset, Token* token); + /** Encodes optional attributes with unspecified values as each value set. * This is to facilitate safe masking when the value in question is not to be * skipped. diff --git a/swig/tagging.i b/swig/tagging.i index c09e2a1eff46b65f23d3393f70d0df330f620d36..0d65ecf1ad8509507ea4569d8ad0af7e753e8230 100644 --- a/swig/tagging.i +++ b/swig/tagging.i @@ -25,6 +25,10 @@ bool select_preferred_disamb(const Tagset& tagset, Token* token); void select_preferred_lexeme(const Tagset& tagset, Token* token); +bool select_preferred_disamb_tag(const Tagset& tagset, Token* token); + +void select_preferred_tag(const Tagset& tagset, Token* token); + void expand_optional_attrs(const Tagset& tagset, Token* token); void select_singular_tags(const Tagset& tagset, Token* token);