Skip to content
Snippets Groups Projects
Commit eb6a353f authored by Adam Wardynski's avatar Adam Wardynski
Browse files

Unify Action, leaving lexemes that meet an agreement.

parent 996823e3
Branches
No related merge requests found
...@@ -31,6 +31,7 @@ SET(libwccl_STAT_SRC ...@@ -31,6 +31,7 @@ SET(libwccl_STAT_SRC
ops/actions/delete.cpp ops/actions/delete.cpp
ops/actions/relabel.cpp ops/actions/relabel.cpp
ops/actions/select.cpp ops/actions/select.cpp
ops/actions/unify.cpp
ops/formatters.cpp ops/formatters.cpp
ops/functions/bool/iteration.cpp ops/functions/bool/iteration.cpp
ops/functions/bool/iterations/atleast.cpp ops/functions/bool/iterations/atleast.cpp
......
#include <libwccl/ops/actions/unify.h>
#include <libpwrutils/foreach.h>
#include <sstream>
namespace Wccl {
/**
 * Executes the unify action on the given context.
 *
 * Works in three steps over the token range [pos_begin_, pos_end_]:
 *  1. For every token, collect the lexemes whose tag matches at least one
 *     of the agreement attributes ("agreeable" lexemes).
 *  2. For every token that has agreeable lexemes, keep only those that agree
 *     with at least one lexeme of the previous such token (the first token
 *     is compared against the last one in the range that has agreeable
 *     lexemes).
 *  3. If anything was removed, write the surviving lexemes back. Tokens with
 *     no agreeable lexemes at all are left intact.
 *
 * If any token ends up with no lexeme in agreement, unification is not
 * possible and the action aborts without modifying the sentence.
 *
 * @returns True if any token's lexemes were changed; False otherwise
 *          (including "nowhere" positions, an invalid range, and an abort).
 */
Bool Unify::execute(const ActionExecContext& context) const
{
	SentenceContext& sc = context.sentence_context();

	// Resolve the range ends one at a time; a "nowhere" end means no-op.
	const boost::shared_ptr<const Position>& range_left = pos_begin_->apply(context);
	if (range_left->get_value() == Position::Nowhere) {
		return Bool(false);
	}
	const boost::shared_ptr<const Position>& range_right = pos_end_->apply(context);
	if (range_right->get_value() == Position::Nowhere) {
		return Bool(false);
	}
	int abs_left = 0;
	int abs_right = 0;
	if (!sc.validate_range(*range_left, *range_right, abs_left, abs_right)) {
		return Bool(false);
	}

	const boost::shared_ptr<const TSet>& attribs = attribs_expr_->apply(context);

	// remainings[k] holds the lexemes kept so far for token (abs_left + k)
	std::vector<std::vector<Corpus2::Lexeme> > remainings;
	bool changed = false;

	// first pass - cut lexemes that don't match agreement attribs at all
	for (int i = abs_left; i <= abs_right; ++i) {
		Corpus2::Token& curr_tok = *sc.at(i);
		std::vector<Corpus2::Lexeme> remaining;
		foreach(Corpus2::Lexeme& lexeme, curr_tok.lexemes()) {
			if (attribs->matching_categories(lexeme.tag()) > 0) {
				remaining.push_back(lexeme);
			}
		}
		// A token that keeps some, but not all, of its lexemes is going to be
		// rewritten, so it counts as a change even if the second pass removes
		// nothing more. Tokens emptied here stay intact and do not count.
		if (!remaining.empty() && remaining.size() != curr_tok.lexemes().size()) {
			changed = true;
		}
		remainings.push_back(std::vector<Corpus2::Lexeme>());
		remainings.back().swap(remaining);
	}

	// second pass - leave only lexemes that are in agreement
	// with at least one lexeme from previous non-empty-after-1st-pass token

	// at first, "previous" is the last token not emptied after 1st pass
	size_t previous = remainings.empty() ? 0 : remainings.size() - 1;
	while ((previous > 0) && remainings[previous].empty()) {
		--previous;
	}

	for (size_t i = 0; i < remainings.size(); ++i) {
		// proceed only for tokens that have lexemes that could go into agreement
		// (i.e. are not emptied after 1st pass)
		if (remainings[i].empty()) {
			continue;
		}
		const std::vector<Corpus2::Lexeme>& prev_rem = remainings[previous];
		// next time around "previous" will be the current one
		previous = i;

		// get only lexemes that agree with a lexeme from previous token
		std::vector<Corpus2::Lexeme> curr_rem;
		foreach(const Corpus2::Lexeme& curr_lex, remainings[i]) {
			foreach(const Corpus2::Lexeme& prev_lex, prev_rem) {
				Corpus2::Tag inter = curr_lex.tag().get_masked(prev_lex.tag());
				// agreement: masking with the previous lexeme's tag loses none
				// of the attribute categories matched by the current lexeme
				if (attribs->matching_categories(inter) == attribs->matching_categories(curr_lex.tag())) {
					curr_rem.push_back(curr_lex);
					break;
				}
			}
		}
		if (curr_rem.empty()) {
			// no lexemes left, means no agreement possible, so can't unify -
			// abort with no changes (nothing has been written back yet)
			return Bool(false);
		}
		if (curr_rem.size() != remainings[i].size()) {
			changed = true;
			remainings[i].swap(curr_rem);
		}
	}

	// finally assign remaining tokens accordingly,
	// leaving intact the tokens that did not have agreeable lexemes at all
	if (changed) {
		for (size_t i = 0; i < remainings.size(); ++i) {
			if (!remainings[i].empty()) {
				sc.at(i + abs_left)->lexemes() = remainings[i];
			}
		}
	}
	return Bool(changed);
}
/**
 * Builds the textual form of this action,
 * "<name>(<begin position>, <end position>, <attributes>)",
 * rendering each argument expression with the given tagset.
 */
std::string Unify::to_string(const Corpus2::Tagset& tagset) const
{
	std::ostringstream repr;
	repr << name() << "(";
	repr << pos_begin_->to_string(tagset) << ", ";
	repr << pos_end_->to_string(tagset) << ", ";
	repr << attribs_expr_->to_string(tagset) << ")";
	return repr.str();
}
/**
 * Streams the action as "<name>(<begin>, <end>, <attributes>)" using the
 * stream operators of the argument expressions (no tagset available here).
 * @returns The stream written to.
 */
std::ostream& Unify::write_to(std::ostream& os) const
{
	os << name() << "(";
	os << *pos_begin_ << ", ";
	os << *pos_end_ << ", ";
	os << *attribs_expr_ << ")";
	return os;
}
} /* end ns Wccl */
// NOTE(review): the guard used to be LIBWCCL_OPS_RELABEL_DELETE_H, which looks
// like a copy-paste leftover; a guard shared with another header would make
// one of them silently vanish when both are included.
#ifndef LIBWCCL_OPS_ACTIONS_UNIFY_H
#define LIBWCCL_OPS_ACTIONS_UNIFY_H
#include <libwccl/ops/action.h>
#include <libwccl/values/position.h>
#include <libwccl/values/bool.h>
#include <libwccl/ops/function.h>
namespace Wccl {
/**
 * Action to unify tokens on an agreement, removing lexemes
 * that violate the agreement.
 */
class Unify : public Action
{
public:
	typedef boost::shared_ptr<Function<Position> > PosFunctionPtr;
	typedef boost::shared_ptr<Function<TSet> > TSetFunctionPtr;

	/**
	 * @param pos_begin expression giving the first position of the range
	 * @param pos_end expression giving the last position of the range
	 * @param attribs_expr expression giving the set of attributes
	 *        on which the tokens are to agree
	 * @note All three pointers must be non-null (checked with BOOST_ASSERT).
	 */
	Unify(
		const PosFunctionPtr& pos_begin,
		const PosFunctionPtr& pos_end,
		const TSetFunctionPtr& attribs_expr)
		: pos_begin_(pos_begin),
		  pos_end_(pos_end),
		  attribs_expr_(attribs_expr)
	{
		BOOST_ASSERT(pos_begin_);
		BOOST_ASSERT(pos_end_);
		BOOST_ASSERT(attribs_expr_);
	}

	/**
	 * @returns Name of the action ("unify").
	 */
	std::string name() const {
		return "unify";
	}

	/**
	 * @returns String representation of the Action
	 */
	std::string to_string(const Corpus2::Tagset& tagset) const;

protected:
	/**
	 * Writes string representation of the Action to
	 * an output stream.
	 * @returns Stream written to.
	 * @note May be incomplete and/or contain internal info.
	 */
	std::ostream& write_to(std::ostream& ostream) const;

	/**
	 * Executes the Action on given context: remove lexemes
	 * from tokens at given range that do not meet an agreement.
	 * Range is trimmed to sentence boundaries.
	 * No action is made in case of invalid/empty range.
	 * @returns True if there were any changes made; False otherwise
	 */
	Bool execute(const ActionExecContext &context) const;

private:
	const PosFunctionPtr pos_begin_;      // start of the token range
	const PosFunctionPtr pos_end_;        // end of the token range
	const TSetFunctionPtr attribs_expr_;  // attributes to agree on
};
} /* end ns Wccl */
#endif // LIBWCCL_OPS_ACTIONS_UNIFY_H
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment