Unify to treat underspecified lexemes as meeting agreement.

As a result, such lexemes are always retained in their tokens after unification.

Unify to treat underspecified lexemes as meeting agreement.
So such lexemes are always left in the end.
dd0777c6 · Adam Wardynski · eb6a353f · dd0777c6
Commit dd0777c6 authored 14 years ago by Adam Wardynski
--- a/libwccl/ops/actions/unify.cpp
+++ b/libwccl/ops/actions/unify.cpp
@@ -25,33 +25,35 @@ Bool Unify::execute(const ActionExecContext& context) const
 	const boost::shared_ptr<const TSet>& attribs = attribs_expr_->apply(context);

 	std::vector<std::vector<Corpus2::Lexeme> > remainings;
+	std::vector<std::vector<Corpus2::Lexeme> > underspecifieds;

 	// first pass - cut lexemes that don't match agreement attribs at all
 	for (int i = abs_left; i <= abs_right; ++i) {
 		Corpus2::Token& curr_tok = *sc.at(i);
 		std::vector<Corpus2::Lexeme> remaining;
+		std::vector<Corpus2::Lexeme> underspecified;
 		foreach(Corpus2::Lexeme& lexeme, curr_tok.lexemes()) {
 			if (attribs->matching_categories(lexeme.tag()) > 0) {
 				remaining.push_back(lexeme);
+			} else {
+				underspecified.push_back(lexeme);
 			}
 		}
 		remainings.push_back(remaining);
+		underspecifieds.push_back(underspecified);
 	}
-
 	// second pass - leave only lexemes that are in agreement
 	// with at least one lexeme from previous non-empty-after-1st-pass token
 	int previous = remainings.size() - 1;
 	// at first, "previous" is the last token not emptied after 1st pass
-	while ((previous > 0) && (remainings[previous].size() == 0)) {
+	while ((previous > 0) && (remainings[previous].empty())) {
 		previous--;
 	}
 	for (size_t i = 0; i < remainings.size(); ++i) {
 		// proceed only for tokens that have lexemes that could go into agreement
 		// (i.e. are not emptied after 1st pass)
-		if (remainings[i].size() > 0) {
-			std::vector<Corpus2::Lexeme>& prev_rem = remainings[previous];
-			// next time around "previous" will be the current one
-			previous = i;
+		if (!remainings[i].empty()) {
+			const std::vector<Corpus2::Lexeme>& prev_rem = remainings[previous];
 			std::vector<Corpus2::Lexeme> curr_rem;
 			// get only lexemes that agree with a lexeme from previous token
 			foreach(const Corpus2::Lexeme& curr_lex, remainings[i]) {
@@ -63,28 +65,45 @@ Bool Unify::execute(const ActionExecContext& context) const
 					}
 				}
 			}
-			if (curr_rem.size() == 0) {
-				// no lexemes left, means no agreement possible, so can't unify - abort with no changes
-				changed.set_value(false);
-				return changed;
-			}
-			if (curr_rem.size() != remainings[i].size()) {
-				changed.set_value(true);
-				remainings[i] = curr_rem;
+			if (curr_rem.empty()) {
+				if (underspecifieds[i].empty()) {
+					// there is no agreement possible, abort with no changes
+					changed.set_value(false);
+					return changed;
+				} else {
+					// there were underspecified lexemes, they will be left in the end
+					// so agreement is met anyway, however we need to mark that
+					// changes were made because we removed some lexemes that did not
+					// meet agreement;
+					// "previous" index has to be left alone though, pointing at previous
+					// token that still had some agreeable lexemes
+					changed.set_value(true);
+				}
+			} else {
+				// some non-underspecified lexemes are left, so
+				// next time around, "previous" token is going to be the current one
+				previous = i;
+				// there were changes only if we actually removed something
+				if (curr_rem.size() != remainings[i].size()) {
+					changed.set_value(true);
+					remainings[i] = curr_rem;
+				}
 			}
 		}
 	}
-
-	// finally assign remaining tokens accordingly,
-	// leaving intact the tokens that did not have agreeable lexemes at all
+	// finally assign remaining lexemes to original tokens
 	if (changed.get_value()) {
 		for (size_t i = 0; i < remainings.size(); ++i) {
 			if (remainings[i].size() > 0) {
-				sc.at(i + abs_left)->lexemes() = remainings[i];
+				std::vector<Corpus2::Lexeme>& lexemes = sc.at(i + abs_left)->lexemes();
+				lexemes = remainings[i];
+				// underspecified lexemes meet agreement too, so append them at the end, if any
+				foreach (const Corpus2::Lexeme& lex, underspecifieds[i]) {
+					lexemes.push_back(lex);
+				}
 			}
 		}
 	}
-
 	return changed;
 }