diff --git a/libwccl/CMakeLists.txt b/libwccl/CMakeLists.txt
index 3d265a8d8f64bfecff990cd66c1f99d29a7d6ef9..7fa31d3812b9056e8b3081c9ed489f3f26c6ad70 100644
--- a/libwccl/CMakeLists.txt
+++ b/libwccl/CMakeLists.txt
@@ -31,6 +31,7 @@ SET(libwccl_STAT_SRC
 	ops/actions/delete.cpp
 	ops/actions/relabel.cpp
 	ops/actions/select.cpp
+	ops/actions/unify.cpp
 	ops/formatters.cpp
 	ops/functions/bool/iteration.cpp
 	ops/functions/bool/iterations/atleast.cpp
diff --git a/libwccl/ops/actions/unify.cpp b/libwccl/ops/actions/unify.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a4a53f0e36147e565f01d1bb76b4bc9a295e2bab
--- /dev/null
+++ b/libwccl/ops/actions/unify.cpp
@@ -0,0 +1,107 @@
+#include <libwccl/ops/actions/unify.h>
+#include <libpwrutils/foreach.h>
+#include <sstream>
+
+namespace Wccl {
+
+Bool Unify::execute(const ActionExecContext& context) const
+{
+	Bool changed(false);
+	SentenceContext& sc = context.sentence_context();
+
+	const boost::shared_ptr<const Position>& range_left = pos_begin_->apply(context);
+	if (range_left->get_value() == Position::Nowhere) {
+		return Bool(false);
+	}
+	const boost::shared_ptr<const Position>& range_right = pos_end_->apply(context);
+	if (range_right->get_value() == Position::Nowhere) {
+		return Bool(false);
+	}
+	int abs_left, abs_right;
+	if (!sc.validate_range(*range_left, *range_right, abs_left, abs_right)) {
+		return Bool(false);
+	}
+
+	const boost::shared_ptr<const TSet>& attribs = attribs_expr_->apply(context);
+
+	std::vector<std::vector<Corpus2::Lexeme> > remainings;
+
+	// first pass - keep only lexemes that match at least one agreement attribute
+	for (int i = abs_left; i <= abs_right; ++i) {
+		Corpus2::Token& curr_tok = *sc.at(i);
+		std::vector<Corpus2::Lexeme> remaining;
+		foreach(Corpus2::Lexeme& lexeme, curr_tok.lexemes()) {
+			if (attribs->matching_categories(lexeme.tag()) > 0) {
+				remaining.push_back(lexeme);
+			}
+		}
+		remainings.push_back(remaining);
+	}
+
+	// second pass - leave only lexemes that are in agreement
+	// with at least one lexeme from previous non-empty-after-1st-pass token
+	int previous = static_cast<int>(remainings.size()) - 1;
+	// at first, "previous" is the last token not emptied after 1st pass
+	while ((previous > 0) && remainings[previous].empty()) {
+		previous--;
+	}
+	for (size_t i = 0; i < remainings.size(); ++i) {
+		// proceed only for tokens that have lexemes that could go into agreement
+		// (i.e. are not emptied after 1st pass)
+		if (!remainings[i].empty()) {
+			std::vector<Corpus2::Lexeme>& prev_rem = remainings[previous];
+			// next time around "previous" will be the current one
+			previous = static_cast<int>(i);
+			std::vector<Corpus2::Lexeme> curr_rem;
+			// get only lexemes that agree with a lexeme from previous token
+			foreach(const Corpus2::Lexeme& curr_lex, remainings[i]) {
+				foreach(const Corpus2::Lexeme& prev_lex, prev_rem) {
+					Corpus2::Tag inter = curr_lex.tag().get_masked(prev_lex.tag());
+					if (attribs->matching_categories(inter) == attribs->matching_categories(curr_lex.tag())) {
+						curr_rem.push_back(curr_lex);
+						break;
+					}
+				}
+			}
+			if (curr_rem.empty()) {
+				// no lexemes left, means no agreement possible, so can't unify - abort with no changes
+				changed.set_value(false);
+				return changed;
+			}
+			if (curr_rem.size() != remainings[i].size()) {
+				changed.set_value(true);
+				remainings[i] = curr_rem;
+			}
+		}
+	}
+
+	// finally write the surviving lexemes back to the sentence tokens,
+	// leaving intact the tokens that did not have agreeable lexemes at all
+	if (changed.get_value()) {
+		for (size_t i = 0; i < remainings.size(); ++i) {
+			if (!remainings[i].empty()) {
+				sc.at(i + abs_left)->lexemes() = remainings[i];
+			}
+		}
+	}
+
+	return changed;
+}
+
+std::string Unify::to_string(const Corpus2::Tagset& tagset) const
+{
+	std::ostringstream os;
+	os << name() << "(" << pos_begin_->to_string(tagset) << ", "
+		<< pos_end_->to_string(tagset) << ", "
+		<< attribs_expr_->to_string(tagset) << ")";
+	return os.str();
+}
+
+std::ostream& Unify::write_to(std::ostream& os) const
+{
+	os << name() << "(" << *pos_begin_ << ", " << *pos_end_ << ", " << *attribs_expr_ << ")";
+	return os;
+}
+
+
+} /* end ns Wccl */
diff --git a/libwccl/ops/actions/unify.h b/libwccl/ops/actions/unify.h
new file mode 100644
index 0000000000000000000000000000000000000000..68efc1b02dfa2db0b8a09937b62c3e40c4e0f38e
--- /dev/null
+++ b/libwccl/ops/actions/unify.h
@@ -0,0 +1,78 @@
+#ifndef LIBWCCL_OPS_ACTIONS_UNIFY_H
+#define LIBWCCL_OPS_ACTIONS_UNIFY_H
+
+#include <libwccl/ops/action.h>
+#include <libwccl/values/position.h>
+#include <libwccl/values/bool.h>
+#include <libwccl/ops/function.h>
+
+namespace Wccl {
+
+/**
+ * Action to unify tokens on an agreement, removing lexemes
+ * that violate the agreement.
+ */
+class Unify : public Action
+{
+public:
+	typedef boost::shared_ptr<Function<Position> > PosFunctionPtr;
+	typedef boost::shared_ptr<Function<TSet> > TSetFunctionPtr;
+
+	/**
+	 * @param pos_begin Function yielding the left end of the range.
+	 * @param pos_end Function yielding the right end of the range.
+	 * @param attribs_expr Function yielding the set of agreement attributes.
+	 * @note All three pointers are asserted to be non-null.
+	 */
+	Unify(
+		const PosFunctionPtr& pos_begin,
+		const PosFunctionPtr& pos_end,
+		const TSetFunctionPtr& attribs_expr)
+		: pos_begin_(pos_begin),
+		  pos_end_(pos_end),
+		  attribs_expr_(attribs_expr)
+	{
+		BOOST_ASSERT(pos_begin_);
+		BOOST_ASSERT(pos_end_);
+		BOOST_ASSERT(attribs_expr_);
+	}
+
+	/**
+	 * @returns Name of the Action.
+	 */
+	std::string name() const {
+		return "unify";
+	}
+
+	/**
+	 * @returns String representation of the Action
+	 */
+	std::string to_string(const Corpus2::Tagset& tagset) const;
+
+protected:
+	/**
+	 * Writes string representation of the Action to
+	 * an output stream.
+	 * @returns Stream written to.
+	 * @note May be incomplete and/or contain internal info.
+	 */
+	std::ostream& write_to(std::ostream& ostream) const;
+
+	/**
+	 * Executes the Action on given context: remove lexemes
+	 * from tokens at given range that do not meet an agreement.
+	 * Range is trimmed to sentence boundaries.
+	 * No action is made in case of invalid/empty range, or when agreement is impossible.
+	 * @returns True if there were any changes made; False otherwise
+	 */
+	Bool execute(const ActionExecContext &context) const;
+
+private:
+	const PosFunctionPtr pos_begin_;
+	const PosFunctionPtr pos_end_;
+	const TSetFunctionPtr attribs_expr_;
+};
+
+} /* end ns Wccl */
+
+#endif // LIBWCCL_OPS_ACTIONS_UNIFY_H