Patch summary (libwccl):
  * LeftLook: add to_string(tagset) and write_to(ostream); both print
    llook(right_pos_expr, left_pos_expr, variable, eval_expr).
  * PointAgreement::apply_internal: replace the "@todo: implement" stub with
    a scan over lexeme pairs of the two tokens that returns True as soon as
    the intersection of one tag from each token matches at least
    categories_count() categories of the attribute set.
  * TSet: add categories_count() and matching_categories(); use
    std::ostringstream in var_repr(); fix a comment typo in tset.h.
  * tests/values.cpp: exercise the two new TSet functions in tset_ops.

NOTE(review): context-line indentation in this patch may not match the target
tree byte-for-byte; if a hunk is rejected, retry with
`git apply --ignore-whitespace`.

diff --git a/libwccl/ops/functions/bool/iterations/leftlook.cpp b/libwccl/ops/functions/bool/iterations/leftlook.cpp
index d1a80282735c953fa4b714e79914fa7e2b589546..349b40c2ae301ead0b495dccb2a72cc29923fc0c 100644
--- a/libwccl/ops/functions/bool/iterations/leftlook.cpp
+++ b/libwccl/ops/functions/bool/iterations/leftlook.cpp
@@ -17,4 +17,24 @@ bool LeftLook::iterate(
 	return false;
 }
 
+std::string LeftLook::to_string(const Corpus2::Tagset& tagset) const
+{
+	std::ostringstream ss;
+	ss << name(tagset) << "("
+		<< right_pos_expr_->to_string(tagset) << ", "
+		<< left_pos_expr_->to_string(tagset) << ", "
+		<< Position::var_repr(iter_var_acc_.get_name()) << ", "
+		<< evaluating_expr_->to_string(tagset) << ")";
+	return ss.str();
+}
+
+std::ostream& LeftLook::write_to(std::ostream& os) const
+{
+	return os << raw_name() << "("
+			<< *right_pos_expr_ << ", "
+			<< *left_pos_expr_ << ", "
+			<< Position::var_repr(iter_var_acc_.get_name()) << ", "
+			<< *evaluating_expr_ << ")";
+}
+
 } /* end ns Wccl */
diff --git a/libwccl/ops/functions/bool/iterations/leftlook.h b/libwccl/ops/functions/bool/iterations/leftlook.h
index 92e42dab29581b899b855639df708443495f9593..618aa3d11e63eefcfc9d3ed74abba08e5e9432a1 100644
--- a/libwccl/ops/functions/bool/iterations/leftlook.h
+++ b/libwccl/ops/functions/bool/iterations/leftlook.h
@@ -29,6 +29,12 @@ public:
 		return "llook";
 	}
 
+	/**
+	 * @returns String representation of LeftLook Operator:
+	 * llook(right_pos_expr, left_pos_expr, variable, eval_expr)
+	 */
+	std::string to_string(const Corpus2::Tagset& tagset) const;
+
 protected:
 	/**
 	 * @returns True if, when scanning right-to-left,
@@ -46,6 +52,15 @@ protected:
 		int right,
 		Position &p,
 		const FunExecContext &context) const;
+
+	/**
+	 * Writes raw string representation of LeftLook operator, in form of
+	 * llook(raw_right_p_expr, raw_left_p_expr, var, raw_eval_expr)
+	 * @note This version doesn't require a Tagset, but may
+	 * be incomplete and/or contain internal info.
+	 * @returns Stream written to.
+	 */
+	std::ostream& write_to(std::ostream& ostream) const;
 };
 
 
diff --git a/libwccl/ops/functions/bool/predicates/pointagreement.cpp b/libwccl/ops/functions/bool/predicates/pointagreement.cpp
index 1c7e8591d9622cd943a96b8b00f8b6ae5b39709d..60ef77be6236904f47f97a2e778cc29dceb3a980 100644
--- a/libwccl/ops/functions/bool/predicates/pointagreement.cpp
+++ b/libwccl/ops/functions/bool/predicates/pointagreement.cpp
@@ -33,12 +33,30 @@ PointAgreement::BaseRetValPtr PointAgreement::apply_internal(const FunExecContex
 		return Predicate::False(context);
 	}
 
-	const boost::shared_ptr<const TSet>& attribs_tset = attribs_expr_->apply(context);
-	const Corpus2::Tag& attribs = attribs_tset->get_value();
+	const boost::shared_ptr<const TSet>& attribs = attribs_expr_->apply(context);
 
-	//
-	// @todo: implement
-	//
+	int min_card = attribs->categories_count(tagset_);
+
+	const Corpus2::Token* t1 = sc.at(*pos1);
+	const Corpus2::Token* t2 = sc.at(*pos2);
+	// to optimize a bit, make sure t1 is the one with fewer lexemes
+	if (t1->lexemes().size() > t2->lexemes().size()) {
+		std::swap(t1, t2);
+	}
+
+	foreach (const Corpus2::Lexeme& t1_lex, t1->lexemes()) {
+		const Corpus2::Tag& t1_tag = t1_lex.tag();
+		// don't bother checking t2 unless current t1_tag matches enough categories
+		if (attribs->matching_categories(t1_tag) >= min_card) {
+			foreach (const Corpus2::Lexeme& t2_lex, t2->lexemes()) {
+				Corpus2::Tag intersection = t1_tag.get_masked(t2_lex.tag());
+				// if the intersection matches enough categories we have agreement
+				if (attribs->matching_categories(intersection) >= min_card) {
+					return Predicate::True(context);
+				}
+			}
+		}
+	}
 
 	return Predicate::False(context);
 }
diff --git a/libwccl/values/tset.cpp b/libwccl/values/tset.cpp
index 16a62b33271da69be7b89827e2f088aa665b7819..499fb5ae9148f2b248045bcc0549f15b291271b5 100644
--- a/libwccl/values/tset.cpp
+++ b/libwccl/values/tset.cpp
@@ -1,4 +1,6 @@
 #include <libwccl/values/tset.h>
+#include <libpwrutils/foreach.h>
+#include <libpwrutils/bitset.h>
 #include <sstream>
 
 namespace Wccl {
@@ -17,11 +19,28 @@ std::string TSet::to_string(const Corpus2::Tagset& tagset) const
 
 std::string TSet::var_repr(const std::string &var_name)
 {
-	std::stringstream ss;
+	std::ostringstream ss;
 	ss << "$t:" << var_name;
 	return ss.str();
 }
 
+int TSet::categories_count(const Corpus2::Tagset& tagset) const
+{
+	int cats = (tag_.get_pos().any()) ? 1 : 0;
+	foreach (const Corpus2::mask_t& mask, tagset.all_attribute_masks()) {
+		if (tag_.get_values_for(mask).any()) {
+			++cats;
+		}
+	}
+	return cats;
+}
+
+int TSet::matching_categories(const Corpus2::Tag& tag) const
+{
+	const Corpus2::Tag& masked = tag_.get_masked(tag);
+	return PwrNlp::count_bits_set(masked.get_pos()) + PwrNlp::count_bits_set(masked.get_values());
+}
+
 void TSet::insert_symbol(const Corpus2::Tagset& tagset, const std::string& s)
 {
 	tag_.combine_with(tagset.parse_symbol(s));
diff --git a/libwccl/values/tset.h b/libwccl/values/tset.h
index a89d0eb27d3449cd1f77f4d450f610800bdb769b..11f227932f65bee65b6286af079562f93fd202cd 100644
--- a/libwccl/values/tset.h
+++ b/libwccl/values/tset.h
@@ -52,7 +52,7 @@ public:
 	}
 
 	/**
-	 * Convenience function to add a symbol from a tagste by name.
+	 * Convenience function to add a symbol from a tagset by name.
 	 *
 	 * Note: slow. Avoid in code that gets repeatedly executed.
 	 */
@@ -82,6 +82,23 @@ public:
 		return tag_ == other.tag_;
 	}
 
+	/**
+	 * @return Number of categories present in this symbol set according
+	 * to supplied tagset.
+	 * @note A category is word class or an attribute.
+	 */
+	int categories_count(const Corpus2::Tagset& tagset) const;
+
+	/**
+	 * @return How many categories present in the supplied tag match with
+	 * this symbol set.
+	 * @warning The underlying assumption is that the supplied tag has at most
+	 * 1 value per category. Otherwise the value will be incorrect.
+	 * @note The symbol set may have partially defined categories. Only values
+	 * present in this symbol set count when matching values in the tag.
+	 */
+	int matching_categories(const Corpus2::Tag& tag) const;
+
 	void combine_with(const Corpus2::Tag& other) {
 		tag_.combine_with(other);
 	}
@@ -90,7 +107,6 @@ public:
 		tag_.combine_with(other.get_value());
 	}
 
-
 	std::string to_string(const Corpus2::Tagset &) const;
 
 	std::string to_raw_string() const;
diff --git a/tests/values.cpp b/tests/values.cpp
index d5f02c842f5fa3a531c7a2a39a254bbc01cb269c..f15247bb2a12808f1f7947bd8d292514100dcfa5 100644
--- a/tests/values.cpp
+++ b/tests/values.cpp
@@ -68,30 +68,61 @@ BOOST_AUTO_TEST_CASE(tset_ops)
 {
 	TSet s1, s2;
 	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");
+	Corpus2::Tag subst_tag = tagset.parse_tag("subst:sg:nom:f", false)[0];
+	Corpus2::Tag adj_tag = tagset.parse_tag("adj:pl:acc:m3:pos", false)[0];
+
 	BOOST_CHECK(s1.equals(s2));
 	BOOST_CHECK(s1.is_subset_of(s2));
 	BOOST_CHECK(s2.is_subset_of(s1));
 	BOOST_CHECK(!s1.intersects(s2));
+	BOOST_CHECK_EQUAL(0, s1.categories_count(tagset));
+	BOOST_CHECK_EQUAL(0, s1.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(0, s1.matching_categories(adj_tag));
 	s1.insert_symbol(tagset, "subst");
+	BOOST_CHECK_EQUAL(1, s1.categories_count(tagset));
+	BOOST_CHECK_EQUAL(1, s1.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(0, s1.matching_categories(adj_tag));
 	BOOST_CHECK(!s1.equals(s2));
 	BOOST_CHECK(!s1.is_subset_of(s2));
 	BOOST_CHECK(s2.is_subset_of(s1));
 	BOOST_CHECK(!s1.intersects(s2));
 	s2.insert_symbol(tagset, "pl");
+	BOOST_CHECK_EQUAL(1, s2.categories_count(tagset));
+	BOOST_CHECK_EQUAL(0, s2.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(1, s2.matching_categories(adj_tag));
 	BOOST_CHECK(!s1.equals(s2));
 	BOOST_CHECK(!s1.is_subset_of(s2));
 	BOOST_CHECK(!s2.is_subset_of(s1));
 	BOOST_CHECK(!s1.intersects(s2));
 	s2.insert_symbol(tagset, "subst");
+	BOOST_CHECK_EQUAL(2, s2.categories_count(tagset));
+	BOOST_CHECK_EQUAL(1, s2.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(1, s2.matching_categories(adj_tag));
 	BOOST_CHECK(!s1.equals(s2));
 	BOOST_CHECK(s1.is_subset_of(s2));
 	BOOST_CHECK(!s2.is_subset_of(s1));
 	BOOST_CHECK(s1.intersects(s2));
 	s1.insert_symbol(tagset, "pl");
+	BOOST_CHECK_EQUAL(2, s1.categories_count(tagset));
+	BOOST_CHECK_EQUAL(1, s1.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(1, s1.matching_categories(adj_tag));
 	BOOST_CHECK(s1.equals(s2));
 	BOOST_CHECK(s1.is_subset_of(s2));
 	BOOST_CHECK(s2.is_subset_of(s1));
 	BOOST_CHECK(s1.intersects(s2));
+	s1.insert_symbol(tagset, "sg");
+	BOOST_CHECK_EQUAL(2, s1.categories_count(tagset));
+	BOOST_CHECK_EQUAL(2, s1.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(1, s1.matching_categories(adj_tag));
+	s1.insert_symbol(tagset, "f");
+	BOOST_CHECK_EQUAL(3, s1.categories_count(tagset));
+	BOOST_CHECK_EQUAL(3, s1.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(1, s1.matching_categories(adj_tag));
+	s1.insert_symbol(tagset, "adj");
+	BOOST_CHECK_EQUAL(3, s1.categories_count(tagset));
+	BOOST_CHECK_EQUAL(3, s1.matching_categories(subst_tag));
+	BOOST_CHECK_EQUAL(2, s1.matching_categories(adj_tag));
+
 }
 
 BOOST_AUTO_TEST_CASE(position_ops)