From 0fad24216edef3cf0d177a464200b367df0a39ea Mon Sep 17 00:00:00 2001 From: Adam Wardynski <award@.(B-4.4.46a)> Date: Thu, 2 Dec 2010 17:29:28 +0100 Subject: [PATCH] GetSymbolsInRange - range(symbol, p1, p2) operator. --- libwccl/CMakeLists.txt | 1 + .../ops/functions/tset/getsymbolsinrange.cpp | 61 ++++++++ .../ops/functions/tset/getsymbolsinrange.h | 78 ++++++++++ tests/CMakeLists.txt | 1 + tests/getsymbolsinrange.cpp | 147 ++++++++++++++++++ 5 files changed, 288 insertions(+) create mode 100644 libwccl/ops/functions/tset/getsymbolsinrange.cpp create mode 100644 libwccl/ops/functions/tset/getsymbolsinrange.h create mode 100644 tests/getsymbolsinrange.cpp diff --git a/libwccl/CMakeLists.txt b/libwccl/CMakeLists.txt index c969bf7..a88b82f 100644 --- a/libwccl/CMakeLists.txt +++ b/libwccl/CMakeLists.txt @@ -41,6 +41,7 @@ SET(libwccl_STAT_SRC ops/functions/strset/tolower.cpp ops/functions/strset/toupper.cpp ops/functions/tset/getsymbols.cpp + ops/functions/tset/getsymbolsinrange.cpp parser/grammar.g parser/Parser.cpp parser/ParserException.cpp diff --git a/libwccl/ops/functions/tset/getsymbolsinrange.cpp b/libwccl/ops/functions/tset/getsymbolsinrange.cpp new file mode 100644 index 0000000..406700c --- /dev/null +++ b/libwccl/ops/functions/tset/getsymbolsinrange.cpp @@ -0,0 +1,61 @@ +#include <libwccl/ops/functions/tset/getsymbolsinrange.h> +#include <libwccl/ops/functions/constant.h> + +#include <sstream> + +namespace Wccl { + +std::string GetSymbolsInRange::to_string(const Corpus2::Tagset& tagset) const /* Renders the operator as name(attribute, begin, end); the attribute name for the mask and both operand expressions are resolved through the given tagset. */ +{ + std::stringstream ss; + ss << name(tagset) << "(" + << tagset.get_attribute_name(mask_.get_values()) << ", " + << rbegin_expr_->to_string(tagset) << ", " + << rend_expr_->to_string(tagset) << ")"; + return ss.str(); +} + +std::string GetSymbolsInRange::to_raw_string() const { /* Tagset-free variant with the same name(a, b, c) layout: raw_name(), then mask_.raw_dump() and the to_raw_string() of each operand expression. */ + std::stringstream ss; + ss << raw_name() << "(" + << mask_.raw_dump() << ", " + << rbegin_expr_->to_raw_string() << ", " + << rend_expr_->to_raw_string() << ")"; + return ss.str(); +} + 
+ +GetSymbolsInRange::BaseRetValPtr GetSymbolsInRange::apply_internal(const FunExecContext& context) const +{ + const boost::shared_ptr<const Position>& range_begin = rbegin_expr_->apply(context); + const boost::shared_ptr<const Position>& range_end = rend_expr_->apply(context); + const SentenceContext& sc = context.sentence_context(); + + int abs_begin = sc.get_abs_position(*range_begin); + int abs_end = sc.get_abs_position(*range_end); + // Clamp the range to the sentence boundaries; Nowhere is left untouched so the check below can detect it + if ((abs_begin != Position::Nowhere) && (abs_begin < 0)) { + abs_begin = 0; + } + if ((abs_end != Position::Nowhere) && (abs_end >= sc.size())) { + abs_end = sc.size() - 1; + } + // If the range is empty, return an empty set - note the check below also + // covers ranges that do not overlap with the actual sentence range + // (including an empty sentence, where the clamped end becomes -1). + if((abs_begin == Position::Nowhere) || (abs_end == Position::Nowhere) || (abs_begin > abs_end)) { + return detail::DefaultFunction<TSet>()->apply(context); + } + + boost::shared_ptr<TSet> tset = boost::make_shared<TSet>(); // accumulates the tags of every lexeme of every token in [abs_begin, abs_end] + for(int abs_pos = abs_begin; abs_pos <= abs_end; abs_pos++) { + const Corpus2::Token* token = sc.at(abs_pos); + foreach (const Corpus2::Lexeme& lexeme, token->lexemes()) { + tset->combine_with(lexeme.tag()); + } + } + tset->tag_ref().mask_with(mask_); // apply the mask once, after all tags have been combined + return tset; +} + +} /* end ns Wccl */ diff --git a/libwccl/ops/functions/tset/getsymbolsinrange.h b/libwccl/ops/functions/tset/getsymbolsinrange.h new file mode 100644 index 0000000..aed881c --- /dev/null +++ b/libwccl/ops/functions/tset/getsymbolsinrange.h @@ -0,0 +1,78 @@ +#ifndef LIBWCCL_OPS_FUNCTIONS_TSET_GETSYMBOLSINRANGE_H +#define LIBWCCL_OPS_FUNCTIONS_TSET_GETSYMBOLSINRANGE_H + +#include <libwccl/values/tset.h> +#include <libwccl/values/position.h> +#include <libwccl/ops/function.h> + +namespace Wccl { + +/** + * Operator that gets tagset symbols from the tokens in a given range of positions. 
+ */ +class GetSymbolsInRange : public Function<TSet> { +public: + typedef boost::shared_ptr<Function<Position> > PosFunctionPtr; + + GetSymbolsInRange( + const Corpus2::Tag& mask, + const PosFunctionPtr& range_begin_expr, + const PosFunctionPtr& range_end_expr) + : mask_(mask), + rbegin_expr_(range_begin_expr), + rend_expr_(range_end_expr) + { + BOOST_ASSERT(rbegin_expr_); /* both range expressions must be non-null */ + BOOST_ASSERT(rend_expr_); + } + + /** + * @returns String representation of the function in the form of: + * "range(tagset_symbol, range_begin_expr, range_end_expr)" + */ + std::string to_string(const Corpus2::Tagset& tagset) const; + + /** + * @returns String representation of the function in the form of: + * "range(raw_tagset_symbol, range_begin_expr_raw, range_end_expr_raw)" + * @note This version does not require a tagset, but will be incomplete + * and/or contain internal info. + */ + std::string to_raw_string() const; + + /** + * @returns The operator name, "range" + */ + std::string raw_name() const { + return "range"; + } + +protected: + Corpus2::Tag mask_; /* mask applied to the combined tags before they are returned */ + + const PosFunctionPtr rbegin_expr_; /* expression yielding the first position of the range */ + const PosFunctionPtr rend_expr_; /* expression yielding the last position of the range (inclusive) */ + + /** + * Gets positions for the beginning and end of the range we are + * interested in (from the corresponding argument expressions). + * The range is trimmed to boundaries of the sentence we are working on. + * An empty set is returned if either of the positions points + * to nowhere, or when the range doesn't overlap with the sentence, + * or when the supplied begin is actually after the supplied end. + * Otherwise we have a valid range, and a symbol set is returned, + * which is the union of tagset symbols for words within the range + * (inclusive). The selection of symbols is based on the mask. + * The main intention is to supply masks that correspond + * directly to a single selected attribute, but the code accepts any + * valid mask (i.e. any combination of attributes and their values, + * and even includes the part-of-speech part). 
+ * @returns A tagset symbol set for the words within range, given + * the mask, if the range is valid. An empty TSet otherwise. + */ + BaseRetValPtr apply_internal(const FunExecContext& context) const; +}; + +} /* end ns Wccl */ + +#endif // LIBWCCL_OPS_FUNCTIONS_TSET_GETSYMBOLSINRANGE_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2ab345c..8b0d92d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,6 +10,7 @@ add_executable(tests context.cpp getlemmas.cpp getsymbols.cpp + getsymbolsinrange.cpp getorth.cpp logicalpredicates.cpp main.cpp diff --git a/tests/getsymbolsinrange.cpp b/tests/getsymbolsinrange.cpp new file mode 100644 index 0000000..51374a4 --- /dev/null +++ b/tests/getsymbolsinrange.cpp @@ -0,0 +1,147 @@ +#include <boost/test/unit_test.hpp> +#include <boost/bind.hpp> +#include <boost/shared_ptr.hpp> +#include <libcorpus2/sentence.h> +#include <libcorpus2/tagsetmanager.h> + +#include <libwccl/ops/functions/constant.h> +#include <libwccl/ops/functions/tset/getsymbols.h> +#include <libwccl/ops/functions/tset/getsymbolsinrange.h> + +using namespace Wccl; + +BOOST_AUTO_TEST_SUITE(get_symbols_in_range_op) + +struct SymbolsInRangeFix +{ + SymbolsInRangeFix() + : s(boost::make_shared<Corpus2::Sentence>()), + sc(s), + tagset(Corpus2::get_named_tagset("kipi")), + cx(sc, boost::make_shared<Variables>()), + pos_zero(0), + pos_one(1), + pos_minus_one(-1), + nowhere(Position::Nowhere), + pos_zero_constant(new Constant<Position>(pos_zero)), + pos_one_constant(new Constant<Position>(pos_one)), + pos_minus_one_constant(new Constant<Position>(pos_minus_one)), + nowhere_constant(new Constant<Position>(nowhere)), + pos_plus_2_constant(new Constant<Position>(Position(2))), + pos_minus_2_constant(new Constant<Position>(Position(-2))) + { + Corpus2::Token* the_token = new Corpus2::Token( + "One", + PwrNlp::Whitespace::ManySpaces); + Corpus2::Lexeme l1("aaa", tagset.parse_simple_tag("subst:sg:nom:m1", false)); + Corpus2::Lexeme l2("aaa", 
tagset.parse_simple_tag("subst:sg:nom:m2", false)); + the_token->add_lexeme(l1); + the_token->add_lexeme(l2); + s->append(the_token); + Corpus2::Token* another_token = new Corpus2::Token( + "Two", + PwrNlp::Whitespace::ManySpaces); + Corpus2::Lexeme l3("aaa", tagset.parse_simple_tag("subst:pl:dat:f", false)); + Corpus2::Lexeme l4("aaa", tagset.parse_simple_tag("prep:nom:wok", false)); + Corpus2::Lexeme l5("aaa", tagset.parse_simple_tag("adja", false)); + another_token->add_lexeme(l3); + another_token->add_lexeme(l4); + another_token->add_lexeme(l5); + s->append(another_token); + gnd = tagset.parse_symbol("gnd"); + nmb = tagset.parse_symbol("nmb"); + vcl = tagset.parse_symbol("vcl"); + } + + boost::shared_ptr<Corpus2::Sentence> s; + SentenceContext sc; + const Corpus2::Tagset& tagset; + + FunExecContext cx; + Position pos_zero; + Position pos_one; + Position pos_minus_one; + Position nowhere; + boost::shared_ptr<Function<Position> > pos_zero_constant; + boost::shared_ptr<Function<Position> > pos_one_constant; + boost::shared_ptr<Function<Position> > pos_minus_one_constant; + boost::shared_ptr<Function<Position> > nowhere_constant; + boost::shared_ptr<Function<Position> > pos_plus_2_constant; + boost::shared_ptr<Function<Position> > pos_minus_2_constant; + TSet empty; + Corpus2::Tag gnd; + Corpus2::Tag nmb; + Corpus2::Tag vcl; + Corpus2::Tag pos; /* NOTE(review): never assigned in the constructor and not referenced by any test case in this file */ +}; + +BOOST_FIXTURE_TEST_CASE(range_nowhere, SymbolsInRangeFix) +{ + for(int i = 0; i < 3; i++) /* repeat the same checks at three successive sentence positions */ + { + GetSymbolsInRange range(gnd, nowhere_constant, pos_zero_constant); + BOOST_CHECK(range.apply(cx)->equals(empty)); + GetSymbolsInRange r2(gnd, pos_zero_constant, nowhere_constant); + BOOST_CHECK(r2.apply(cx)->equals(empty)); + GetSymbolsInRange r3(gnd, nowhere_constant, nowhere_constant); + BOOST_CHECK(r3.apply(cx)->equals(empty)); + sc.advance(); + } +} + +BOOST_FIXTURE_TEST_CASE(range_outside, SymbolsInRangeFix) +{ + GetSymbolsInRange range(gnd, pos_minus_2_constant, pos_minus_one_constant); + 
BOOST_CHECK(range.apply(cx)->equals(empty)); + sc.advance(); /* from the second token, +1..+2 lies entirely past the two-token sentence */ + GetSymbolsInRange r2(gnd, pos_one_constant, pos_plus_2_constant); + BOOST_CHECK(r2.apply(cx)->equals(empty)); +} + +BOOST_FIXTURE_TEST_CASE(range_valid_including_trimmed, SymbolsInRangeFix) +{ + GetSymbolsInRange range(gnd, pos_minus_2_constant, pos_zero_constant); + BOOST_CHECK_EQUAL("{m1,m2}", range.apply(cx)->to_string(tagset)); + GetSymbolsInRange r2(gnd, pos_minus_one_constant, pos_zero_constant); + BOOST_CHECK_EQUAL("{m1,m2}", r2.apply(cx)->to_string(tagset)); + GetSymbolsInRange r3(gnd, pos_one_constant, pos_plus_2_constant); + BOOST_CHECK_EQUAL("{f}", r3.apply(cx)->to_string(tagset)); + GetSymbolsInRange r4(gnd, pos_zero_constant, pos_plus_2_constant); + BOOST_CHECK_EQUAL("{m1,m2,f}", r4.apply(cx)->to_string(tagset)); + sc.advance(); + BOOST_CHECK_EQUAL("{m1,m2,f}", range.apply(cx)->to_string(tagset)); + BOOST_CHECK_EQUAL("{m1,m2,f}", r2.apply(cx)->to_string(tagset)); + BOOST_CHECK_EQUAL("{}", r3.apply(cx)->to_string(tagset)); + BOOST_CHECK_EQUAL("{f}", r4.apply(cx)->to_string(tagset)); + sc.advance(); /* the current position is now past the last token; only ranges reaching back into the sentence stay non-empty */ + BOOST_CHECK_EQUAL("{m1,m2,f}", range.apply(cx)->to_string(tagset)); + BOOST_CHECK_EQUAL("{f}", r2.apply(cx)->to_string(tagset)); + BOOST_CHECK_EQUAL("{}", r3.apply(cx)->to_string(tagset)); + BOOST_CHECK_EQUAL("{}", r4.apply(cx)->to_string(tagset)); +} + +BOOST_FIXTURE_TEST_CASE(range_same_token, SymbolsInRangeFix) +{ + GetSymbolsInRange range(gnd, pos_zero_constant, pos_zero_constant); + BOOST_CHECK_EQUAL("{m1,m2}", range.apply(cx)->to_string(tagset)); + sc.advance(); + BOOST_CHECK_EQUAL("{f}", range.apply(cx)->to_string(tagset)); + sc.advance(); + BOOST_CHECK_EQUAL("{}", range.apply(cx)->to_string(tagset)); +} + + +BOOST_FIXTURE_TEST_CASE(range_to_string, SymbolsInRangeFix) +{ + GetSymbolsInRange range(gnd, pos_zero_constant, pos_plus_2_constant); + BOOST_CHECK_EQUAL("range(gnd, 0, 2)", range.to_string(tagset)); +} + +BOOST_FIXTURE_TEST_CASE(range_to_raw_string, SymbolsInRangeFix) +{ + 
GetSymbolsInRange range(gnd, pos_zero_constant, pos_plus_2_constant); + std::string expected = "range(" + gnd.raw_dump() + ", 0, 2)"; + BOOST_CHECK_EQUAL(expected, range.to_raw_string()); +} + +BOOST_AUTO_TEST_SUITE_END() -- GitLab