From dd0777c6689508be2a386f747fb0ebeb4992a3ff Mon Sep 17 00:00:00 2001
From: Adam Wardynski <award@.(B-4.4.46a)>
Date: Thu, 13 Jan 2011 19:34:49 +0100
Subject: [PATCH] Make Unify treat underspecified lexemes as meeting agreement,
 so that such lexemes are always left in the end.

---
 libwccl/ops/actions/unify.cpp | 57 +++++++++++++++++++++++------------
 1 file changed, 38 insertions(+), 19 deletions(-)

diff --git a/libwccl/ops/actions/unify.cpp b/libwccl/ops/actions/unify.cpp
index a4a53f0..692aa5d 100644
--- a/libwccl/ops/actions/unify.cpp
+++ b/libwccl/ops/actions/unify.cpp
@@ -25,33 +25,35 @@ Bool Unify::execute(const ActionExecContext& context) const
 	const boost::shared_ptr<const TSet>& attribs = attribs_expr_->apply(context);
 
 	std::vector<std::vector<Corpus2::Lexeme> > remainings;
+	std::vector<std::vector<Corpus2::Lexeme> > underspecifieds;
 
 	// first pass - cut lexemes that don't match agreement attribs at all
 	for (int i = abs_left; i <= abs_right; ++i) {
 		Corpus2::Token& curr_tok = *sc.at(i);
 		std::vector<Corpus2::Lexeme> remaining;
+		std::vector<Corpus2::Lexeme> underspecified;
 		foreach(Corpus2::Lexeme& lexeme, curr_tok.lexemes()) {
 			if (attribs->matching_categories(lexeme.tag()) > 0) {
 				remaining.push_back(lexeme);
+			} else {
+				underspecified.push_back(lexeme);
 			}
 		}
 		remainings.push_back(remaining);
+		underspecifieds.push_back(underspecified);
 	}
-
 	// second pass - leave only lexemes that are in agreement
 	// with at least one lexeme from previous non-empty-after-1st-pass token
 	int previous = remainings.size() - 1;
 	// at first, "previous" is the last token not emptied after 1st pass
-	while ((previous > 0) && (remainings[previous].size() == 0)) {
+	while ((previous > 0) && (remainings[previous].empty())) {
 		previous--;
 	}
 	for (size_t i = 0; i < remainings.size(); ++i) {
 		// proceed only for tokens that have lexemes that could go into agreement
 		// (i.e. are not emptied after 1st pass)
-		if (remainings[i].size() > 0) {
-			std::vector<Corpus2::Lexeme>& prev_rem = remainings[previous];
-			// next time around "previous" will be the current one
-			previous = i;
+		if (!remainings[i].empty()) {
+			const std::vector<Corpus2::Lexeme>& prev_rem = remainings[previous];
 			std::vector<Corpus2::Lexeme> curr_rem;
 			// get only lexemes that agree with a lexeme from previous token
 			foreach(const Corpus2::Lexeme& curr_lex, remainings[i]) {
@@ -63,28 +65,45 @@ Bool Unify::execute(const ActionExecContext& context) const
 					}
 				}
 			}
-			if (curr_rem.size() == 0) {
-				// no lexemes left, means no agreement possible, so can't unify - abort with no changes
-				changed.set_value(false);
-				return changed;
-			}
-			if (curr_rem.size() != remainings[i].size()) {
-				changed.set_value(true);
-				remainings[i] = curr_rem;
+			if (curr_rem.empty()) {
+				if (underspecifieds[i].empty()) {
+					// there is no agreement possible, abort with no changes
+					changed.set_value(false);
+					return changed;
+				} else {
+					// there were underspecified lexemes, they will be left in the end
+					// so agreement is met anyway, however we need to mark that
+					// changes were made because we removed some lexemes that did not
+					// meet agreement;
+					// "previous" index has to be left alone though, pointing at previous
+					// token that still had some agreeable lexemes
+					changed.set_value(true);
+				}
+			} else {
+				// some non-underspecified lexemes are left, so
+				// next time around, "previous" token is going to be the current one
+				previous = i;
+				// there were changes only if we actually removed something
+				if (curr_rem.size() != remainings[i].size()) {
+					changed.set_value(true);
+					remainings[i] = curr_rem;
+				}
 			}
 		}
 	}
-
-	// finally assign remaining tokens accordingly,
-	// leaving intact the tokens that did not have agreeable lexemes at all
+	// finally assign remaining lexemes to original tokens
 	if (changed.get_value()) {
 		for (size_t i = 0; i < remainings.size(); ++i) {
 			if (remainings[i].size() > 0) {
-				sc.at(i + abs_left)->lexemes() = remainings[i];
+				std::vector<Corpus2::Lexeme>& lexemes = sc.at(i + abs_left)->lexemes();
+				lexemes = remainings[i];
+				// underspecified lexemes meet agreement too, so leave them in the end, if any
+				foreach (const Corpus2::Lexeme& lex, underspecifieds[i]) {
+					lexemes.push_back(lex);
+				}
 			}
 		}
 	}
-
 	return changed;
 }
 
-- 
GitLab