Skip to content
Snippets Groups Projects
Commit dd0777c6 authored by Adam Wardynski's avatar Adam Wardynski
Browse files

Make Unify treat underspecified lexemes as meeting agreement.

As a result, such lexemes are always kept and appended at the end of the token's lexeme list.
parent eb6a353f
Branches
No related merge requests found
......@@ -25,33 +25,35 @@ Bool Unify::execute(const ActionExecContext& context) const
const boost::shared_ptr<const TSet>& attribs = attribs_expr_->apply(context);
std::vector<std::vector<Corpus2::Lexeme> > remainings;
std::vector<std::vector<Corpus2::Lexeme> > underspecifieds;
// first pass - cut lexemes that don't match agreement attribs at all
for (int i = abs_left; i <= abs_right; ++i) {
Corpus2::Token& curr_tok = *sc.at(i);
std::vector<Corpus2::Lexeme> remaining;
std::vector<Corpus2::Lexeme> underspecified;
foreach(Corpus2::Lexeme& lexeme, curr_tok.lexemes()) {
if (attribs->matching_categories(lexeme.tag()) > 0) {
remaining.push_back(lexeme);
} else {
underspecified.push_back(lexeme);
}
}
remainings.push_back(remaining);
underspecifieds.push_back(underspecified);
}
// second pass - leave only lexemes that are in agreement
// with at least one lexeme from previous non-empty-after-1st-pass token
int previous = remainings.size() - 1;
// at first, "previous" is the last token not emptied after 1st pass
while ((previous > 0) && (remainings[previous].size() == 0)) {
while ((previous > 0) && (remainings[previous].empty())) {
previous--;
}
for (size_t i = 0; i < remainings.size(); ++i) {
// proceed only for tokens that have lexemes that could go into agreement
// (i.e. are not emptied after 1st pass)
if (remainings[i].size() > 0) {
std::vector<Corpus2::Lexeme>& prev_rem = remainings[previous];
// next time around "previous" will be the current one
previous = i;
if (!remainings[i].empty()) {
const std::vector<Corpus2::Lexeme>& prev_rem = remainings[previous];
std::vector<Corpus2::Lexeme> curr_rem;
// get only lexemes that agree with a lexeme from previous token
foreach(const Corpus2::Lexeme& curr_lex, remainings[i]) {
......@@ -63,28 +65,45 @@ Bool Unify::execute(const ActionExecContext& context) const
}
}
}
if (curr_rem.size() == 0) {
// no lexemes left, means no agreement possible, so can't unify - abort with no changes
changed.set_value(false);
return changed;
}
if (curr_rem.size() != remainings[i].size()) {
changed.set_value(true);
remainings[i] = curr_rem;
if (curr_rem.empty()) {
if (underspecifieds[i].empty()) {
// there is no agreement possible, abort with no changes
changed.set_value(false);
return changed;
} else {
// there were underspecified lexemes, they will be left in the end
// so agreement is met anyway, however we need to mark that
// changes were made because we removed some lexemes that did not
// meet agreement;
// "previous" index has to be left alone though, pointing at previous
// token that still had some agreeable lexemes
changed.set_value(true);
}
} else {
// some non-underspecified lexemes are left, so
// next time around, "previous" token is going to be the current one
previous = i;
// there were changes only if we actually removed something
if (curr_rem.size() != remainings[i].size()) {
changed.set_value(true);
remainings[i] = curr_rem;
}
}
}
}
// finally assign remaining tokens accordingly,
// leaving intact the tokens that did not have agreeable lexemes at all
// finally assign remaining lexemes to original tokens
if (changed.get_value()) {
for (size_t i = 0; i < remainings.size(); ++i) {
if (remainings[i].size() > 0) {
sc.at(i + abs_left)->lexemes() = remainings[i];
std::vector<Corpus2::Lexeme>& lexemes = sc.at(i + abs_left)->lexemes();
lexemes = remainings[i];
// underspecced lexemes meet agreement too, so leave them in the end, if any
foreach (const Corpus2::Lexeme& lex, underspecifieds[i]) {
lexemes.push_back(lex);
}
}
}
}
return changed;
}
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment