Skip to content
Snippets Groups Projects
Commit bdda2bbd authored by ilor's avatar ilor
Browse files

Merge branch 'master' of nlp.pwr.wroc.pl:wccl

parents 01e28d80 b3201c10
Branches
No related merge requests found
......@@ -17,4 +17,24 @@ bool LeftLook::iterate(
return false;
}
/**
 * Builds the tagset-aware textual form of this operator:
 * name(right_pos_expr, left_pos_expr, iteration_variable, eval_expr)
 */
std::string LeftLook::to_string(const Corpus2::Tagset& tagset) const
{
	std::ostringstream out;
	// Operator name followed by the opening parenthesis.
	out << name(tagset) << "(";
	// The two position expressions, right one first.
	out << right_pos_expr_->to_string(tagset) << ", ";
	out << left_pos_expr_->to_string(tagset) << ", ";
	// The iteration variable in its variable-representation form.
	out << Position::var_repr(iter_var_acc_.get_name()) << ", ";
	// The expression evaluated during iteration closes the argument list.
	out << evaluating_expr_->to_string(tagset) << ")";
	return out.str();
}
/**
 * Streams the raw (tagset-free) form of this operator:
 * raw_name(right_pos_expr, left_pos_expr, iteration_variable, eval_expr)
 * and hands back the stream that was written to.
 */
std::ostream& LeftLook::write_to(std::ostream& os) const
{
	os << raw_name() << "(";
	// The two position expressions, right one first.
	os << *right_pos_expr_ << ", ";
	os << *left_pos_expr_ << ", ";
	// The iteration variable in its variable-representation form.
	os << Position::var_repr(iter_var_acc_.get_name()) << ", ";
	os << *evaluating_expr_ << ")";
	return os;
}
} /* end ns Wccl */
......@@ -29,6 +29,12 @@ public:
return "llook";
}
/**
 * @returns String representation of LeftLook operator:
 * llook(right_pos_expr, left_pos_expr, variable, eval_expr)
 */
std::string to_string(const Corpus2::Tagset& tagset) const;
protected:
/**
* @returns True if, when scanning right-to-left,
......@@ -46,6 +52,15 @@ protected:
int right,
Position &p,
const FunExecContext &context) const;
/**
 * Writes raw string representation of LeftLook operator, in form of
 * llook(raw_right_p_expr, raw_left_p_expr, var, raw_eval_expr)
 * @note This version doesn't require a Tagset, but may
 * be incomplete and/or contain internal info.
 * @returns Stream written to.
 */
std::ostream& write_to(std::ostream& ostream) const;
};
......
......@@ -33,12 +33,30 @@ PointAgreement::BaseRetValPtr PointAgreement::apply_internal(const FunExecContex
return Predicate::False(context);
}
const boost::shared_ptr<const TSet>& attribs_tset = attribs_expr_->apply(context);
const Corpus2::Tag& attribs = attribs_tset->get_value();
const boost::shared_ptr<const TSet>& attribs = attribs_expr_->apply(context);
//
// @todo: implement
//
int min_card = attribs->categories_count(tagset_);
const Corpus2::Token* t1 = sc.at(*pos1);
const Corpus2::Token* t2 = sc.at(*pos2);
// to optimize a bit, make sure t1 is the one with less lexemes
if (t1->lexemes().size() > t2->lexemes().size()) {
std::swap(t1, t2);
}
foreach (const Corpus2::Lexeme& t1_lex, t1->lexemes()) {
const Corpus2::Tag& t1_tag = t1_lex.tag();
// don't bother checking t2 unless current t1_tag matches enough categories
if (attribs->matching_categories(t1_tag) >= min_card) {
foreach (const Corpus2::Lexeme& t2_lex, t2->lexemes()) {
Corpus2::Tag intersection = t1_tag.get_masked(t2_lex.tag());
// if the intersection matches enough categories we have agreement
if (attribs->matching_categories(intersection) >= min_card) {
return Predicate::True(context);
}
}
}
}
return Predicate::False(context);
}
......
#include <libwccl/values/tset.h>
#include <libpwrutils/foreach.h>
#include <libpwrutils/bitset.h>
#include <sstream>
namespace Wccl {
......@@ -17,11 +19,28 @@ std::string TSet::to_string(const Corpus2::Tagset& tagset) const
/**
 * Builds the textual representation of a symbol-set variable:
 * the prefix "$t:" followed by the variable name.
 *
 * @param var_name Name of the variable.
 * @returns The string "$t:" + var_name.
 */
std::string TSet::var_repr(const std::string &var_name)
{
	// Only output is needed, so a single output string stream suffices;
	// declaring `ss` twice would be a redeclaration error.
	std::ostringstream ss;
	ss << "$t:" << var_name;
	return ss.str();
}
/**
 * Counts the categories present in this symbol set: one for the word
 * class if any of its bits is set, plus one for every attribute mask of
 * the supplied tagset for which this set holds at least one value.
 */
int TSet::categories_count(const Corpus2::Tagset& tagset) const
{
	int count = 0;
	// The word class contributes a single category, however many bits are set.
	if (tag_.get_pos().any()) {
		count = 1;
	}
	// Each attribute with at least one value present contributes one more.
	foreach (const Corpus2::mask_t& attr_mask, tagset.all_attribute_masks()) {
		count += tag_.get_values_for(attr_mask).any() ? 1 : 0;
	}
	return count;
}
/**
 * Masks this symbol set's tag with the supplied tag and returns the number
 * of bits left set in the word-class part plus the number left set in the
 * values part.
 */
int TSet::matching_categories(const Corpus2::Tag& tag) const
{
	// Result of masking this set's tag with the supplied tag.
	const Corpus2::Tag& common = tag_.get_masked(tag);
	int matched = PwrNlp::count_bits_set(common.get_pos());
	matched += PwrNlp::count_bits_set(common.get_values());
	return matched;
}
void TSet::insert_symbol(const Corpus2::Tagset& tagset, const std::string& s)
{
tag_.combine_with(tagset.parse_symbol(s));
......
......@@ -52,7 +52,7 @@ public:
}
/**
* Convenience function to add a symbol from a tagset by name.
*
* Note: slow. Avoid in code that gets repeatedly executed.
*/
......@@ -82,6 +82,23 @@ public:
return tag_ == other.tag_;
}
/**
 * @return Number of categories present in this symbol set according
 * to the supplied tagset.
 * @note A category is the word class or an attribute. The word class
 * counts once if any of its bits is set in this set; each attribute of
 * the tagset counts once if this set holds at least one of its values.
 */
int categories_count(const Corpus2::Tagset& tagset) const;
/**
 * @return How many categories present in the supplied tag match
 * this symbol set. Computed as the number of bits (word class and
 * attribute values) left set after masking this set with the tag.
 * @warning The underlying assumption is that the supplied tag has at most
 * 1 value per category. Otherwise the value will be incorrect.
 * @note The symbol set may have partially defined categories. Only values
 * present in this symbol set count when matching values in the tag.
 */
int matching_categories(const Corpus2::Tag& tag) const;
/**
 * Forwards the supplied tag to combine_with() of the underlying tag
 * held by this symbol set.
 */
void combine_with(const Corpus2::Tag& other)
{
	this->tag_.combine_with(other);
}
......@@ -90,7 +107,6 @@ public:
tag_.combine_with(other.get_value());
}
std::string to_string(const Corpus2::Tagset &) const;
std::string to_raw_string() const;
......
......@@ -68,30 +68,61 @@ BOOST_AUTO_TEST_CASE(tset_ops)
{
	// Exercises TSet set relations (equals / is_subset_of / intersects)
	// together with categories_count() and matching_categories() while
	// symbols are inserted one at a time. After every insertion the
	// category count and the match counts against both reference tags
	// are checked.
	TSet s1, s2;
	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");
	Corpus2::Tag subst_tag = tagset.parse_tag("subst:sg:nom:f", false)[0];
	Corpus2::Tag adj_tag = tagset.parse_tag("adj:pl:acc:m3:pos", false)[0];

	// Both sets empty.
	BOOST_CHECK(s1.equals(s2));
	BOOST_CHECK(s1.is_subset_of(s2));
	BOOST_CHECK(s2.is_subset_of(s1));
	BOOST_CHECK(!s1.intersects(s2));
	BOOST_CHECK_EQUAL(0, s1.categories_count(tagset));
	BOOST_CHECK_EQUAL(0, s1.matching_categories(subst_tag));
	BOOST_CHECK_EQUAL(0, s1.matching_categories(adj_tag));

	// s1 = {subst}, s2 = {}
	s1.insert_symbol(tagset, "subst");
	BOOST_CHECK_EQUAL(1, s1.categories_count(tagset));
	BOOST_CHECK_EQUAL(1, s1.matching_categories(subst_tag));
	BOOST_CHECK_EQUAL(0, s1.matching_categories(adj_tag));
	BOOST_CHECK(!s1.equals(s2));
	BOOST_CHECK(!s1.is_subset_of(s2));
	BOOST_CHECK(s2.is_subset_of(s1));
	BOOST_CHECK(!s1.intersects(s2));

	// s1 = {subst}, s2 = {pl}
	s2.insert_symbol(tagset, "pl");
	BOOST_CHECK_EQUAL(1, s2.categories_count(tagset));
	BOOST_CHECK_EQUAL(0, s2.matching_categories(subst_tag));
	BOOST_CHECK_EQUAL(1, s2.matching_categories(adj_tag));
	BOOST_CHECK(!s1.equals(s2));
	BOOST_CHECK(!s1.is_subset_of(s2));
	BOOST_CHECK(!s2.is_subset_of(s1));
	BOOST_CHECK(!s1.intersects(s2));

	// s1 = {subst}, s2 = {pl, subst}
	s2.insert_symbol(tagset, "subst");
	BOOST_CHECK_EQUAL(2, s2.categories_count(tagset));
	BOOST_CHECK_EQUAL(1, s2.matching_categories(subst_tag));
	BOOST_CHECK_EQUAL(1, s2.matching_categories(adj_tag));
	BOOST_CHECK(!s1.equals(s2));
	BOOST_CHECK(s1.is_subset_of(s2));
	BOOST_CHECK(!s2.is_subset_of(s1));
	BOOST_CHECK(s1.intersects(s2));

	// s1 = {subst, pl}, s2 = {pl, subst}
	s1.insert_symbol(tagset, "pl");
	BOOST_CHECK_EQUAL(2, s1.categories_count(tagset));
	BOOST_CHECK_EQUAL(1, s1.matching_categories(subst_tag));
	BOOST_CHECK_EQUAL(1, s1.matching_categories(adj_tag));
	BOOST_CHECK(s1.equals(s2));
	BOOST_CHECK(s1.is_subset_of(s2));
	BOOST_CHECK(s2.is_subset_of(s1));
	BOOST_CHECK(s1.intersects(s2));

	// A second value ("sg") is added alongside "pl": the category count
	// is expected to stay at 2, but subst_tag now matches on "sg" too.
	s1.insert_symbol(tagset, "sg");
	BOOST_CHECK_EQUAL(2, s1.categories_count(tagset));
	BOOST_CHECK_EQUAL(2, s1.matching_categories(subst_tag));
	BOOST_CHECK_EQUAL(1, s1.matching_categories(adj_tag));

	// Adding "f" raises the category count to 3.
	s1.insert_symbol(tagset, "f");
	BOOST_CHECK_EQUAL(3, s1.categories_count(tagset));
	BOOST_CHECK_EQUAL(3, s1.matching_categories(subst_tag));
	BOOST_CHECK_EQUAL(1, s1.matching_categories(adj_tag));

	// Adding "adj" next to "subst": the category count is expected to
	// stay at 3 and the subst_tag match count to stay at 3, while adj_tag
	// now also matches on "adj".
	s1.insert_symbol(tagset, "adj");
	BOOST_CHECK_EQUAL(3, s1.categories_count(tagset));
	BOOST_CHECK_EQUAL(3, s1.matching_categories(subst_tag));
	BOOST_CHECK_EQUAL(2, s1.matching_categories(adj_tag));
}
BOOST_AUTO_TEST_CASE(position_ops)
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment