Unify Action, leaving lexemes that meet an agreement.

eb6a353f · Adam Wardynski · 996823e3 · eb6a353f · eb6a353f · eb6a353f
Commit eb6a353f authored 14 years ago by Adam Wardynski
--- a/libwccl/CMakeLists.txt
+++ b/libwccl/CMakeLists.txt
@@ -31,6 +31,7 @@ SET(libwccl_STAT_SRC
 	ops/actions/delete.cpp
 	ops/actions/relabel.cpp
 	ops/actions/select.cpp
+	ops/actions/unify.cpp
 	ops/formatters.cpp
 	ops/functions/bool/iteration.cpp
 	ops/functions/bool/iterations/atleast.cpp

--- a/libwccl/ops/actions/unify.cpp
+++ b/libwccl/ops/actions/unify.cpp
+#include <libwccl/ops/actions/unify.h>
+#include <libpwrutils/foreach.h>
+#include <sstream>
+namespace Wccl {
+/**
+ * Narrows the lexemes of tokens in the range given by pos_begin_ and
+ * pos_end_ with respect to the attributes produced by attribs_expr_.
+ *
+ * Works in two passes over a private copy of the lexemes:
+ *  1. for every token keep only the lexemes whose tag matches at least one
+ *     of the attributes;
+ *  2. for every token that still has lexemes, keep only those that agree
+ *     with at least one lexeme of the previously considered token (the
+ *     first considered token is compared against the last one that has
+ *     lexemes after pass 1).
+ *
+ * Returns Bool(false) and leaves the sentence untouched when either
+ * position evaluates to Position::Nowhere, when validate_range() rejects
+ * the range, or when pass 2 would leave some token without any lexeme.
+ * Tokens with no lexemes left after pass 1 are never modified.
+ *
+ * @returns Bool(true) if the narrowed lexemes were written back to the
+ *          sentence, Bool(false) otherwise.
+ */
+Bool Unify::execute(const ActionExecContext& context) const
+{
+	SentenceContext& sc = context.sentence_context();
+	const boost::shared_ptr<const Position>& range_left = pos_begin_->apply(context);
+	if (range_left->get_value() == Position::Nowhere) {
+		return Bool(false);
+	}
+	const boost::shared_ptr<const Position>& range_right = pos_end_->apply(context);
+	if (range_right->get_value() == Position::Nowhere) {
+		return Bool(false);
+	}
+	int abs_left, abs_right;
+	if (!sc.validate_range(*range_left, *range_right, abs_left, abs_right)) {
+		return Bool(false);
+	}
+	const boost::shared_ptr<const TSet>& attribs = attribs_expr_->apply(context);
+
+	// remainings[k] holds the surviving lexemes of the token at abs_left + k
+	std::vector<std::vector<Corpus2::Lexeme> > remainings;
+
+	// first pass - cut lexemes that don't match agreement attribs at all
+	for (int i = abs_left; i <= abs_right; ++i) {
+		Corpus2::Token& curr_tok = *sc.at(i);
+		// fill the slot in place instead of building a temporary and copying it
+		remainings.push_back(std::vector<Corpus2::Lexeme>());
+		std::vector<Corpus2::Lexeme>& kept = remainings.back();
+		foreach(const Corpus2::Lexeme& lexeme, curr_tok.lexemes()) {
+			if (attribs->matching_categories(lexeme.tag()) > 0) {
+				kept.push_back(lexeme);
+			}
+		}
+	}
+	if (remainings.empty()) {
+		// nothing to unify; also keeps the unsigned index arithmetic below safe
+		return Bool(false);
+	}
+
+	// second pass - leave only lexemes that are in agreement
+	// with at least one lexeme from previous non-empty-after-1st-pass token.
+	// At first, "previous" is the last token not emptied after 1st pass
+	// (or index 0 if every token was emptied, in which case the loop below
+	// has nothing to process).
+	size_t previous = remainings.size() - 1;
+	while ((previous > 0) && remainings[previous].empty()) {
+		--previous;
+	}
+	// NOTE(review): lexemes dropped by the first pass alone are written back
+	// to the sentence only if this pass also removes a lexeme from some
+	// token; otherwise the tokens keep them. Confirm this is intended.
+	bool changed = false;
+	for (size_t i = 0; i < remainings.size(); ++i) {
+		// proceed only for tokens that have lexemes that could go into agreement
+		// (i.e. are not emptied after 1st pass)
+		if (remainings[i].empty()) {
+			continue;
+		}
+		const std::vector<Corpus2::Lexeme>& prev_rem = remainings[previous];
+		// next time around "previous" will be the current one
+		previous = i;
+		std::vector<Corpus2::Lexeme> curr_rem;
+		// get only lexemes that agree with a lexeme from previous token
+		foreach(const Corpus2::Lexeme& curr_lex, remainings[i]) {
+			foreach(const Corpus2::Lexeme& prev_lex, prev_rem) {
+				// agreement: masking with the previous tag loses none of the
+				// attribute categories matched by the current tag
+				Corpus2::Tag inter = curr_lex.tag().get_masked(prev_lex.tag());
+				if (attribs->matching_categories(inter) == attribs->matching_categories(curr_lex.tag())) {
+					curr_rem.push_back(curr_lex);
+					break;
+				}
+			}
+		}
+		if (curr_rem.empty()) {
+			// no lexemes left, means no agreement possible, so can't unify - abort with no changes
+			return Bool(false);
+		}
+		if (curr_rem.size() != remainings[i].size()) {
+			changed = true;
+			// prev_rem is not used past this point, so replacing the slot is safe
+			remainings[i].swap(curr_rem);
+		}
+	}
+
+	// finally assign remaining tokens accordingly,
+	// leaving intact the tokens that did not have agreeable lexemes at all
+	if (changed) {
+		for (size_t i = 0; i < remainings.size(); ++i) {
+			if (!remainings[i].empty()) {
+				sc.at(i + abs_left)->lexemes() = remainings[i];
+			}
+		}
+	}
+	return Bool(changed);
+}
+/**
+ * Builds "<name>(<begin>, <end>, <attribs>)", rendering each argument
+ * expression with the given tagset.
+ */
+std::string Unify::to_string(const Corpus2::Tagset& tagset) const
+{
+	std::ostringstream out;
+	out << name() << "(";
+	out << pos_begin_->to_string(tagset) << ", ";
+	out << pos_end_->to_string(tagset) << ", ";
+	out << attribs_expr_->to_string(tagset) << ")";
+	return out.str();
+}
+/**
+ * Streams "<name>(<begin>, <end>, <attribs>)" using the argument
+ * expressions' own stream output (no tagset involved).
+ * @returns The stream written to.
+ */
+std::ostream& Unify::write_to(std::ostream& os) const
+{
+	os << name() << "(";
+	os << *pos_begin_ << ", ";
+	os << *pos_end_ << ", ";
+	os << *attribs_expr_ << ")";
+	return os;
+}
+} /* end ns Wccl */
--- a/libwccl/ops/actions/unify.h
+++ b/libwccl/ops/actions/unify.h
+#ifndef LIBWCCL_OPS_ACTIONS_UNIFY_H
+#define LIBWCCL_OPS_ACTIONS_UNIFY_H
+#include <libwccl/ops/action.h>
+#include <libwccl/values/position.h>
+#include <libwccl/values/bool.h>
+#include <libwccl/ops/function.h>
+namespace Wccl {
+/**
+ * Action to unify tokens on an agreement, removing lexemes
+ * that violate the agreement.
+ */
+class Unify : public Action
+{
+public:
+	typedef boost::shared_ptr<Function<Position> > PosFunctionPtr;
+	typedef boost::shared_ptr<Function<TSet> > TSetFunctionPtr;
+	/**
+	 * @param pos_begin expression yielding the first position of the range
+	 * @param pos_end expression yielding the last position of the range
+	 * @param attribs_expr expression yielding the set of attributes
+	 *        the tokens are unified on
+	 * All three pointers are checked with BOOST_ASSERT to be non-null.
+	 */
+	Unify(
+		const PosFunctionPtr& pos_begin,
+		const PosFunctionPtr& pos_end,
+		const TSetFunctionPtr& attribs_expr)
+		: pos_begin_(pos_begin),
+		  pos_end_(pos_end),
+		  attribs_expr_(attribs_expr)
+	{
+		BOOST_ASSERT(pos_begin_);
+		BOOST_ASSERT(pos_end_);
+		BOOST_ASSERT(attribs_expr_);
+	}
+	/**
+	 * @returns Name of the action ("unify").
+	 */
+	std::string name() const {
+		return "unify";
+	}
+	/**
+	 * @returns String representation of the Action
+	 */
+	std::string to_string(const Corpus2::Tagset& tagset) const;
+protected:
+	/**
+	 * Writes string representation of the Action to
+	 * an output stream.
+	 * @returns Stream written to.
+	 * @note May be incomplete and/or contain internal info.
+	 */
+	std::ostream& write_to(std::ostream& ostream) const;
+	/**
+	 * Executes the Action on given context: remove lexemes
+	 * from tokens at given range that do not meet an agreement.
+	 * Range is trimmed to sentence boundaries.
+	 * No action is made in case of invalid/empty range.
+	 * @returns True if there were any changes made; False otherwise
+	 */
+	Bool execute(const ActionExecContext &context) const;
+private:
+	const PosFunctionPtr pos_begin_;
+	const PosFunctionPtr pos_end_;
+	const TSetFunctionPtr attribs_expr_;
+};
+} /* end ns Wccl */
+#endif // LIBWCCL_OPS_ACTIONS_UNIFY_H