From 911db3406cbe81f380cb977bcf62e5eea918cb83 Mon Sep 17 00:00:00 2001 From: Adam Wardynski <award@.(B-4.4.46a)> Date: Fri, 10 Dec 2010 15:46:49 +0100 Subject: [PATCH] agrfltr implementation. --- libwccl/ops/functions/tset/agrfilter.cpp | 70 ++++++++++++++++++++---- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/libwccl/ops/functions/tset/agrfilter.cpp b/libwccl/ops/functions/tset/agrfilter.cpp index ce6911d..21b6661 100644 --- a/libwccl/ops/functions/tset/agrfilter.cpp +++ b/libwccl/ops/functions/tset/agrfilter.cpp @@ -50,17 +50,67 @@ AgrFilter::BaseRetValPtr AgrFilter::apply_internal(const FunExecContext& context return detail::DefaultFunction<TSet>()->apply(context); } - const boost::shared_ptr<const TSet>& attribs_tset = attribs_expr_->apply(context); - const boost::shared_ptr<const TSet>& mask_tset = mask_expr_->apply(context); - const Corpus2::Tag& attribs = attribs_tset->get_value(); - const Corpus2::Tag& mask = mask_tset->get_value(); + const boost::shared_ptr<const TSet>& attribs = attribs_expr_->apply(context); + const boost::shared_ptr<const TSet>& mask = mask_expr_->apply(context); - boost::shared_ptr<TSet> tset = boost::make_shared<TSet>(); - // - //@ todo: implement - // - tset->contents().mask_with(mask); - return tset; + int min_card = attribs->categories_count(tagset_); + const Corpus2::Token* t1 = sc.at(abs_left); + const Corpus2::Token* t2 = sc.at(abs_right); + // to optimize a bit, make sure t1 is the one with less lexemes + if (t1->lexemes().size() > t2->lexemes().size()) { + std::swap(t1, t2); + } + + + boost::shared_ptr<TSet> agreements = boost::make_shared<TSet>(); + + // Check strong agreement between range endpoints. + // For each possible agreement between the endpoints, + // check if remaining tokens meet that agreement too, + // but instead of looking for strong agreement i.e. + // matching on exact number of categories, look for + // weak agreement i.e. matching only on those categories + // that are present. + // Specifically, if there is a lexeme that does not + // match any of the categories, that means the token + // does meet the weak agreement. + // For each agreement we take symbols that define the + // agreement, sum them up, apply the filter mask, and return. + foreach (const Corpus2::Lexeme& t1_lex, t1->lexemes()) { + const Corpus2::Tag& t1_tag = t1_lex.tag(); + // don't bother checking t2 unless current t1_tag matches enough categories + if (attribs->matching_categories(t1_tag) >= min_card) { + foreach (const Corpus2::Lexeme& t2_lex, t2->lexemes()) { + Corpus2::Tag inter = t1_tag.get_masked(t2_lex.tag()); + // if the intersection matches enough categories we have agreement + if (attribs->matching_categories(inter) >= min_card) { + // Check if selected agreement is met by all remaining tokens + bool agreement_met = true; + for(int i = abs_left + 1; agreement_met && (i < abs_right); ++i) { + foreach(const Corpus2::Lexeme& i_lex, sc.at(i)->lexemes()) { + // Check if agreement is met, but taking into account + // only categories actually matched in current tag, + // without requirement to match all categories in the + // agreement. + Corpus2::Tag i_inter = i_lex.tag().get_masked(inter); + agreement_met = + (attribs->matching_categories(i_lex.tag()) + == attribs->matching_categories(i_inter)); + if(agreement_met) { + break; + } + } + } + if (agreement_met) { + agreements->combine_with(inter); + } + } + } + } + } + + agreements->contents().mask_with(mask->get_value()); + return agreements; } } /* end ns Wccl */ -- GitLab