diff --git a/libwccl/ops/actions/unify.cpp b/libwccl/ops/actions/unify.cpp index a4a53f0e36147e565f01d1bb76b4bc9a295e2bab..692aa5d338f518c619fdbbc0553ea0ecfb8a5ac6 100644 --- a/libwccl/ops/actions/unify.cpp +++ b/libwccl/ops/actions/unify.cpp @@ -25,33 +25,35 @@ Bool Unify::execute(const ActionExecContext& context) const const boost::shared_ptr<const TSet>& attribs = attribs_expr_->apply(context); std::vector<std::vector<Corpus2::Lexeme> > remainings; + std::vector<std::vector<Corpus2::Lexeme> > underspecifieds; // first pass - cut lexemes that don't match agreement attribs at all for (int i = abs_left; i <= abs_right; ++i) { Corpus2::Token& curr_tok = *sc.at(i); std::vector<Corpus2::Lexeme> remaining; + std::vector<Corpus2::Lexeme> underspecified; foreach(Corpus2::Lexeme& lexeme, curr_tok.lexemes()) { if (attribs->matching_categories(lexeme.tag()) > 0) { remaining.push_back(lexeme); + } else { + underspecified.push_back(lexeme); } } remainings.push_back(remaining); + underspecifieds.push_back(underspecified); } - // second pass - leave only lexemes that are in agreement // with at least one lexeme from previous non-empty-after-1st-pass token int previous = remainings.size() - 1; // at first, "previous" is the last token not emptied after 1st pass - while ((previous > 0) && (remainings[previous].size() == 0)) { + while ((previous > 0) && (remainings[previous].empty())) { previous--; } for (size_t i = 0; i < remainings.size(); ++i) { // proceed only for tokens that have lexemes that could go into agreement // (i.e. are not emptied after 1st pass) - if (remainings[i].size() > 0) { - std::vector<Corpus2::Lexeme>& prev_rem = remainings[previous]; - // next time around "previous" will be the current one - previous = i; + if (!remainings[i].empty()) { + const std::vector<Corpus2::Lexeme>& prev_rem = remainings[previous]; std::vector<Corpus2::Lexeme> curr_rem; // get only lexemes that agree with a lexeme from previous token foreach(const Corpus2::Lexeme& curr_lex, remainings[i]) { @@ -63,28 +65,45 @@ Bool Unify::execute(const ActionExecContext& context) const } } } - if (curr_rem.size() == 0) { - // no lexemes left, means no agreement possible, so can't unify - abort with no changes - changed.set_value(false); - return changed; - } - if (curr_rem.size() != remainings[i].size()) { - changed.set_value(true); - remainings[i] = curr_rem; + if (curr_rem.empty()) { + if (underspecifieds[i].empty()) { + // there is no agreement possible, abort with no changes + changed.set_value(false); + return changed; + } else { + // there were underspecified lexemes, they will be left in the end + // so agreement is met anyway, however we need to mark that + // changes were made because we removed some lexemes that did not + // meet agreement; + // "previous" index has to be left alone though, pointing at previous + // token that still had some agreedable lexemes + changed.set_value(true); + } + } else { + // some non-underspecified lexemes are left, so + // next time around, "previous" token is going to be the current one + previous = i; + // there were changes only if we actually removed something + if (curr_rem.size() != remainings[i].size()) { + changed.set_value(true); + remainings[i] = curr_rem; + } } } } - - // finally assign remaining tokens accordingly, - // leaving intact the tokens that did not have agreeable lexemes at all + // finally assign remaining lexemes to original tokens if (changed.get_value()) { for (size_t i = 0; i < remainings.size(); ++i) { if (remainings[i].size() > 0) { - sc.at(i + abs_left)->lexemes() = remainings[i]; + std::vector<Corpus2::Lexeme>& lexemes = sc.at(i + abs_left)->lexemes(); + lexemes = remainings[i]; + // underspecced lexemes meet agreement too, so leave them in the end, if any + foreach (const Corpus2::Lexeme& lex, underspecifieds[i]) { + lexemes.push_back(lex); + } } } } - return changed; }