From ec56059728c826b4ed3d597dc6e017c3691fad18 Mon Sep 17 00:00:00 2001 From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl> Date: Wed, 27 Jul 2011 18:52:21 +0200 Subject: [PATCH] tagging utils --- libcorpus2/tagging.cpp | 15 +++++++++++++-- libcorpus2/tagging.h | 6 ++++++ swig/tagging.i | 2 ++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/libcorpus2/tagging.cpp b/libcorpus2/tagging.cpp index 41b0733..1572fa8 100644 --- a/libcorpus2/tagging.cpp +++ b/libcorpus2/tagging.cpp @@ -59,8 +59,7 @@ int mask_card(const Tag& mask) + PwrNlp::count_bits_set(mask.get_values()); } -bool select_preferred_disamb(const Tagset& tagset, - Token* token) +bool select_preferred_disamb(const Tagset& tagset, Token* token) { size_t lex_idx = token->get_preferred_lexeme_index(tagset); if(!token->lexemes()[lex_idx].is_disamb()) { @@ -77,6 +76,18 @@ bool select_preferred_disamb(const Tagset& tagset, return true; } +void select_preferred_lexeme(const Tagset& tagset, Token* token) +{ + foreach (Lexeme& lex, token->lexemes()) { + lex.set_disamb(true); + } + if (token->lexemes().size() > 1) { + std::vector<Lexeme> one; + one.push_back(token->get_preferred_lexeme(tagset)); + token->replace_lexemes(one); + } +} + void expand_optional_attrs(const Tagset& tagset, Token* token) { foreach (Lexeme& lex, token->lexemes()) { diff --git a/libcorpus2/tagging.h b/libcorpus2/tagging.h index 7ae0a39..59fbcd4 100644 --- a/libcorpus2/tagging.h +++ b/libcorpus2/tagging.h @@ -50,6 +50,12 @@ int mask_card(const Tag& mask); */ bool select_preferred_disamb(const Tagset& tagset, Token* token); +/** Forces one lexeme per token: every lexeme is first marked disamb=true, so existing disamb markers are not respected, and + * if the token has more than one lexeme only token->get_preferred_lexeme(tagset) is kept (the selection is based on tagset definition order). + * A token with a single lexeme keeps it (now marked disamb=true); a token with no lexemes is left unchanged. + */ +void select_preferred_lexeme(const Tagset& tagset, Token* token); + /** Encodes optional attributes with unspecified values as each value set. * This is to facilitate safe masking when the value in question is not to be * skipped. 
diff --git a/swig/tagging.i b/swig/tagging.i index c9fdd9c..c09e2a1 100644 --- a/swig/tagging.i +++ b/swig/tagging.i @@ -23,6 +23,8 @@ int mask_card(const Tag& mask); bool select_preferred_disamb(const Tagset& tagset, Token* token); +void select_preferred_lexeme(const Tagset& tagset, Token* token); + void expand_optional_attrs(const Tagset& tagset, Token* token); void select_singular_tags(const Tagset& tagset, Token* token); -- GitLab