From e1f1bf11afaf56b16610c24512ec06aea2f37f08 Mon Sep 17 00:00:00 2001
From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl>
Date: Wed, 17 Aug 2011 14:14:54 +0200
Subject: [PATCH] tagging functions that distinguish tags only

---
 libcorpus2/tagging.cpp | 34 +++++++++++++++++++++++++++++++++-
 libcorpus2/tagging.h   | 11 +++++++++++
 swig/tagging.i         |  4 ++++
 3 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/libcorpus2/tagging.cpp b/libcorpus2/tagging.cpp
index 1572fa8..b4bc5c3 100644
--- a/libcorpus2/tagging.cpp
+++ b/libcorpus2/tagging.cpp
@@ -63,7 +63,7 @@ bool select_preferred_disamb(const Tagset& tagset, Token* token)
 {
 	size_t lex_idx = token->get_preferred_lexeme_index(tagset);
 	if(!token->lexemes()[lex_idx].is_disamb()) {
-		return false;
+		return false; // disamb would've taken precedence => no disamb at all
 	}
 
 	for (size_t other_idx = 0;
@@ -88,6 +88,38 @@ void select_preferred_lexeme(const Tagset& tagset, Token* token)
 	}
 }
 
+bool select_preferred_disamb_tag(const Tagset& tagset, Token* token)
+{
+	const Corpus2::Lexeme &prototypical = token->get_preferred_lexeme(tagset);
+	if(!prototypical.is_disamb()) {
+		return false; // disamb would've taken precedence => no disamb at all
+	}
+	foreach (Lexeme& lex, token->lexemes()) {
+		if (lex.tag() != prototypical.tag()) {
+			lex.set_disamb(false);
+		}
+	}
+	return true;
+}
+
+void select_preferred_tag(const Tagset& tagset, Token* token)
+{
+	foreach (Lexeme& lex, token->lexemes()) {
+		lex.set_disamb(true);
+	}
+	if (token->lexemes().size() > 1) {
+		const Corpus2::Tag tag_wanted = token->get_preferred_lexeme(tagset).tag();
+		std::vector<Lexeme> wanted;
+		foreach (const Lexeme& lex, token->lexemes()) {
+			if (lex.tag() == tag_wanted) {
+				wanted.push_back(lex);
+			}
+		}
+		assert(!wanted.empty());
+		token->replace_lexemes(wanted);
+	}
+}
+
 void expand_optional_attrs(const Tagset& tagset, Token* token)
 {
 	foreach (Lexeme& lex, token->lexemes()) {
diff --git a/libcorpus2/tagging.h b/libcorpus2/tagging.h
index 59fbcd4..65c4b80 100644
--- a/libcorpus2/tagging.h
+++ b/libcorpus2/tagging.h
@@ -56,6 +56,17 @@ bool select_preferred_disamb(const Tagset& tagset, Token* token);
  */
 void select_preferred_lexeme(const Tagset& tagset, Token* token);
 
+/** Forces one DISAMB TAG per token. Works as select_preferred_disamb,
+ * but multiple disamb lexemes may be left, as long as they differ only
+ * in base forms. Returns if any disamb found.
+ */
+bool select_preferred_disamb_tag(const Tagset& tagset, Token* token);
+
+/** Forces one TAG per token. Works as select_preferred_lexeme, but multiple
+ * lexemes may be left, as long as they differ only in base forms.
+ */
+void select_preferred_tag(const Tagset& tagset, Token* token);
+
 /** Encodes optional attributes with unspecified values as each value set.
  * This is to facilitate safe masking when the value in question is not to be
  * skipped.
diff --git a/swig/tagging.i b/swig/tagging.i
index c09e2a1..0d65ecf 100644
--- a/swig/tagging.i
+++ b/swig/tagging.i
@@ -25,6 +25,10 @@ bool select_preferred_disamb(const Tagset& tagset, Token* token);
 
 void select_preferred_lexeme(const Tagset& tagset, Token* token);
 
+bool select_preferred_disamb_tag(const Tagset& tagset, Token* token);
+
+void select_preferred_tag(const Tagset& tagset, Token* token);
+
 void expand_optional_attrs(const Tagset& tagset, Token* token);
 
 void select_singular_tags(const Tagset& tagset, Token* token);
-- 
GitLab