diff --git a/libwccl/CMakeLists.txt b/libwccl/CMakeLists.txt
index 70729be5912b3fa4fa4c0aff6676e8c1c63240db..a1d869794ff87c66b71bab02fafa6fa4cc38dbb2 100644
--- a/libwccl/CMakeLists.txt
+++ b/libwccl/CMakeLists.txt
@@ -36,6 +36,7 @@ SET(libwccl_STAT_SRC
 	ops/functions/bool/predicates/regex.cpp
 	ops/functions/position/relativeposition.cpp
 	ops/functions/strset/affix.cpp
+	ops/functions/strset/getlemmas.cpp
 	ops/functions/strset/getorth.cpp
 	ops/functions/strset/tolower.cpp
 	ops/functions/strset/toupper.cpp
diff --git a/libwccl/ops/functions/strset/getlemmas.cpp b/libwccl/ops/functions/strset/getlemmas.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d95df36932363dec342002b4e7511b6e1410ea9
--- /dev/null
+++ b/libwccl/ops/functions/strset/getlemmas.cpp
@@ -0,0 +1,41 @@
+#include <libwccl/ops/functions/strset/getlemmas.h>
+#include <libwccl/ops/formatters.h>
+#include <libwccl/ops/functions/constant.h>
+
+namespace Wccl {
+
+// Formats the operator via UnaryFunctionFormatter, with the position
+// argument enclosed in square brackets.
+std::string GetLemmas::to_string(const Corpus2::Tagset& tagset) const
+{
+	return UnaryFunctionFormatter::to_string(
+		tagset, *this, *pos_expr_, "[", "]");
+}
+
+// Tagset-free variant of to_string(); same square-bracket layout.
+std::string GetLemmas::to_raw_string() const
+{
+	return UnaryFunctionFormatter::to_raw_string(
+		*this, *pos_expr_, "[", "]");
+}
+
+// Evaluates the position argument and gathers the lemma of every lexeme
+// of the token at that position. When the position, or the current
+// position of the sentence context, is outside the sentence, the result
+// of the default StrSet function is returned instead.
+GetLemmas::BaseRetValPtr GetLemmas::apply_internal(const FunExecContext& context) const
+{
+	const boost::shared_ptr<const Position>& where = pos_expr_->apply(context);
+	const SentenceContext& sentence = context.sentence_context();
+	const bool in_range = !where->is_outside(sentence) && sentence.is_current_inside();
+	if (!in_range) {
+		return detail::DefaultFunction<StrSet>()->apply(context);
+	}
+	boost::shared_ptr<StrSet> lemmas = boost::make_shared<StrSet>();
+	foreach(const Corpus2::Lexeme& lex, sentence.at(*where)->lexemes()) {
+		lemmas->insert(lex.lemma());
+	}
+	return lemmas;
+}
+
+} /* end ns Wccl */
diff --git a/libwccl/ops/functions/strset/getlemmas.h b/libwccl/ops/functions/strset/getlemmas.h
new file mode 100644
index 
0000000000000000000000000000000000000000..b4017e8a38771acb7910ca0e7c508284db7c8afe
--- /dev/null
+++ b/libwccl/ops/functions/strset/getlemmas.h
@@ -0,0 +1,65 @@
+#ifndef LIBWCCL_OPS_FUNCTIONS_STRSET_GETLEMMAS_H
+#define LIBWCCL_OPS_FUNCTIONS_STRSET_GETLEMMAS_H
+
+#include <libwccl/values/strset.h>
+#include <libwccl/values/position.h>
+#include <libwccl/ops/function.h>
+
+namespace Wccl {
+
+/**
+ * Operator that takes a position, gets the word pointed to by the
+ * position and returns the lemmas of that word.
+ * Returns an empty string set if the position points outside of
+ * the sentence boundaries.
+ */
+class GetLemmas : public Function<StrSet> {
+public:
+	typedef boost::shared_ptr<Function<Position> > PosFunctionPtr;
+
+	/**
+	 * @param pos_expr Expression yielding the position of the word
+	 * whose lemmas are requested; asserted to be non-null.
+	 */
+	GetLemmas(const PosFunctionPtr& pos_expr)
+		: pos_expr_(pos_expr)
+	{
+		BOOST_ASSERT(pos_expr_);
+	}
+
+	/**
+	 * @returns String representation of the function in the form of:
+	 * "base[pos_expr_string]"
+	 */
+	std::string to_string(const Corpus2::Tagset& tagset) const;
+
+	/**
+	 * @returns String representation of the function in the form of:
+	 * "base[pos_expr_raw_string]"
+	 * @note This version does not require tagset, but may be incomplete
+	 * and/or contain internal info.
+	 */
+	std::string to_raw_string() const;
+
+	/**
+	 * @returns Name of the function: "base"
+	 */
+	std::string raw_name() const {
+		return "base";
+	}
+protected:
+	const PosFunctionPtr pos_expr_;
+
+	/**
+	 * Gets a position from the argument expression, then gets
+	 * word at that position from Sentence in the SentenceContext,
+	 * then gets the lemmas of the word and returns them.
+	 * @returns Lemmas of the word pointed to, if position
+	 * lies within boundaries of the Sentence. Empty string set otherwise.
+	 */
+	BaseRetValPtr apply_internal(const FunExecContext& context) const;
+};
+
+} /* end ns Wccl */
+
+#endif // LIBWCCL_OPS_FUNCTIONS_STRSET_GETLEMMAS_H
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b2e582a82be446ccb304d1ef76c9e02b70d28823..68fb226ffc793f6e12f4f75f7f3c939910f10448 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -8,6 +8,7 @@ add_executable(tests
 	conditional.cpp
 	constant.cpp
 	context.cpp
+	getlemmas.cpp
 	getorth.cpp
 	logicalpredicates.cpp
 	main.cpp
diff --git a/tests/getlemmas.cpp b/tests/getlemmas.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fc3cb893cc3c6fefaad0bf41e2e0b2f053b1cc91
--- /dev/null
+++ b/tests/getlemmas.cpp
@@ -0,0 +1,170 @@
+#include <boost/test/unit_test.hpp>
+#include <boost/bind.hpp>
+#include <boost/make_shared.hpp>
+#include <boost/shared_ptr.hpp>
+#include <libcorpus2/sentence.h>
+
+#include <libwccl/ops/functions/constant.h>
+#include <libwccl/ops/functions/strset/getlemmas.h>
+
+using namespace Wccl;
+
+BOOST_AUTO_TEST_SUITE(get_lemmas)
+
+/**
+ * Fixture: a two-token sentence. The first token ("One") carries the
+ * lemmas "aaa" and "bbb", the second ("Two") carries "ccc" and "ddd".
+ * Also provides constant position expressions for the tests to use.
+ */
+struct LemmasPredFix
+{
+	LemmasPredFix()
+		: s(boost::make_shared<Corpus2::Sentence>()),
+		sc(s),
+		tagset(),
+		cx(sc, boost::make_shared<Variables>()),
+		pos_zero(0),
+		pos_one(1),
+		pos_minus_one(-1),
+		nowhere(Position::Nowhere),
+		begin(Position::Begin),
+		end(Position::End),
+		pos_zero_constant(new Constant<Position>(pos_zero)),
+		pos_one_constant(new Constant<Position>(pos_one)),
+		pos_minus_one_constant(new Constant<Position>(pos_minus_one)),
+		nowhere_constant(new Constant<Position>(nowhere)),
+		begin_constant(new Constant<Position>(begin)),
+		end_constant(new Constant<Position>(end)),
+		empty_set(),
+		first_lemmas(),
+		second_lemmas()
+	{
+		first_lemmas.insert("aaa");
+		first_lemmas.insert("bbb");
+		second_lemmas.insert("ccc");
+		second_lemmas.insert("ddd");
+		// NOTE(review): raw Token pointers are handed to Sentence::append; presumably the sentence owns them afterwards - confirm.
+		Corpus2::Token* the_token = new Corpus2::Token(
+				"One",
+				PwrNlp::Whitespace::ManySpaces);
+		Corpus2::Tag t1(Corpus2::mask_t(0));
+		Corpus2::Lexeme l1("aaa", t1);
+		Corpus2::Lexeme l2("bbb", t1);
+		the_token->add_lexeme(l1);
+		the_token->add_lexeme(l2);
+		s->append(the_token);
+		Corpus2::Token* another_token = new Corpus2::Token(
+				"Two",
+				PwrNlp::Whitespace::ManySpaces);
+		Corpus2::Tag t2(Corpus2::mask_t(0));
+		Corpus2::Lexeme l3("ccc", t2);
+		Corpus2::Lexeme l4("ddd", t2);
+		another_token->add_lexeme(l3);
+		another_token->add_lexeme(l4);
+		s->append(another_token);
+	}
+
+	boost::shared_ptr<Corpus2::Sentence> s;
+	SentenceContext sc;
+	Corpus2::Tagset tagset;
+
+	FunExecContext cx;
+	Position pos_zero;
+	Position pos_one;
+	Position pos_minus_one;
+	Position nowhere;
+	Position begin;
+	Position end;
+	boost::shared_ptr<Function<Position> > pos_zero_constant;
+	boost::shared_ptr<Function<Position> > pos_one_constant;
+	boost::shared_ptr<Function<Position> > pos_minus_one_constant;
+	boost::shared_ptr<Function<Position> > nowhere_constant;
+	boost::shared_ptr<Function<Position> > begin_constant;
+	boost::shared_ptr<Function<Position> > end_constant;
+	StrSet empty_set;
+	StrSet first_lemmas;
+	StrSet second_lemmas;
+
+};
+
+BOOST_FIXTURE_TEST_CASE(lemmas_nowhere, LemmasPredFix)
+{
+	GetLemmas lemmas(nowhere_constant);
+	BOOST_CHECK(lemmas.apply(cx)->equals(empty_set));
+	sc.advance();
+	BOOST_CHECK(lemmas.apply(cx)->equals(empty_set));
+	sc.advance();
+	BOOST_CHECK(lemmas.apply(cx)->equals(empty_set));
+	sc.goto_start();
+}
+
+BOOST_FIXTURE_TEST_CASE(lemmas_begin, LemmasPredFix)
+{
+	GetLemmas lemmas(begin_constant);
+	BOOST_CHECK(lemmas.apply(cx)->equals(first_lemmas));
+	sc.advance();
+	BOOST_CHECK(lemmas.apply(cx)->equals(first_lemmas));
+	sc.advance();
+	BOOST_CHECK(lemmas.apply(cx)->equals(empty_set));
+	sc.goto_start();
+}
+
+BOOST_FIXTURE_TEST_CASE(lemmas_end, LemmasPredFix)
+{
+	GetLemmas lemmas(end_constant);
+	BOOST_CHECK(lemmas.apply(cx)->equals(second_lemmas));
+	sc.advance();
+	BOOST_CHECK(lemmas.apply(cx)->equals(second_lemmas));
+	sc.advance();
+	BOOST_CHECK(lemmas.apply(cx)->equals(empty_set));
+	sc.goto_start();
+}
+
+BOOST_FIXTURE_TEST_CASE(lemmas_zero, LemmasPredFix)
+{
+	GetLemmas lemmas(pos_zero_constant); // offset 0: the token at the current position
+	BOOST_CHECK(lemmas.apply(cx)->equals(first_lemmas));
+	sc.advance();
+	BOOST_CHECK(lemmas.apply(cx)->equals(second_lemmas));
+	sc.advance(); // second advance on the fixture sentence: expect the empty set
+	BOOST_CHECK(lemmas.apply(cx)->equals(empty_set));
+	sc.goto_start();
+}
+
+BOOST_FIXTURE_TEST_CASE(lemmas_one, LemmasPredFix)
+{
+	GetLemmas lemmas(pos_one_constant); // offset +1: the token after the current one
+	BOOST_CHECK(lemmas.apply(cx)->equals(second_lemmas));
+	sc.advance();
+	BOOST_CHECK(lemmas.apply(cx)->equals(empty_set));
+	sc.advance();
+	BOOST_CHECK(lemmas.apply(cx)->equals(empty_set));
+	sc.goto_start();
+}
+
+BOOST_FIXTURE_TEST_CASE(lemmas_minus_one, LemmasPredFix)
+{
+	GetLemmas lemmas(pos_minus_one_constant); // offset -1: the token before the current one
+	BOOST_CHECK(lemmas.apply(cx)->equals(empty_set));
+	sc.advance();
+	BOOST_CHECK(lemmas.apply(cx)->equals(first_lemmas));
+	sc.advance(); // current position is no longer inside: expect the empty set
+	BOOST_CHECK(lemmas.apply(cx)->equals(empty_set));
+	sc.goto_start();
+}
+//------ to_string / to_raw_string test cases -------
+
+BOOST_FIXTURE_TEST_CASE(lemmas_to_string, LemmasPredFix)
+{
+	GetLemmas lemmas(begin_constant);
+	BOOST_CHECK_EQUAL("base[begin]", lemmas.to_string(tagset));
+}
+
+BOOST_FIXTURE_TEST_CASE(lemmas_to_raw_string, LemmasPredFix)
+{
+	GetLemmas lemmas(end_constant); // NOTE(review): expected raw text assumed equal to the tagset form - confirm
+	BOOST_CHECK_EQUAL("base[end]", lemmas.to_raw_string());
+	GetLemmas lemmas2(pos_minus_one_constant);
+	BOOST_CHECK_EQUAL("base[-1]", lemmas2.to_raw_string());
+}
+BOOST_AUTO_TEST_SUITE_END()