From f6a79879005ff80b7aaa814b6c2b2f271f6b1169 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Wardy=C5=84ski?= <no@email> Date: Thu, 11 Nov 2010 22:44:10 +0100 Subject: [PATCH] Affix operator (returning prefixes or suffixes of given length) --- libwccl/CMakeLists.txt | 3 +- libwccl/ops/affix.cpp | 44 ++++++++++++++++++++++++++++ libwccl/ops/affix.h | 59 ++++++++++++++++++++++++++++++++++++++ tests/strsetfunctions.cpp | 60 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 libwccl/ops/affix.cpp create mode 100644 libwccl/ops/affix.h diff --git a/libwccl/CMakeLists.txt b/libwccl/CMakeLists.txt index c5fb97c..1a86bd6 100644 --- a/libwccl/CMakeLists.txt +++ b/libwccl/CMakeLists.txt @@ -16,9 +16,10 @@ set(LIBS ${LIBS} ${Boost_LIBRARIES}) SET(libwccl_STAT_SRC exception.cpp ops/and.cpp + ops/affix.cpp ops/formatters.cpp ops/logicalpredicate.cpp - ops/nor.cpp + ops/nor.cpp ops/or.cpp ops/predicate.cpp ops/tolower.cpp diff --git a/libwccl/ops/affix.cpp b/libwccl/ops/affix.cpp new file mode 100644 index 0000000..892a3c0 --- /dev/null +++ b/libwccl/ops/affix.cpp @@ -0,0 +1,44 @@ +#include <libwccl/ops/affix.h> +#include <sstream> +#include <boost/foreach.hpp> +#define foreach BOOST_FOREACH + +namespace Wccl { + +std::string Affix::to_string(const Corpus2::Tagset& tagset) const +{ + std::stringstream str; + str << operator_name(tagset) << "(" << strset_expr_->to_string(tagset) + << ", " << affix_length_ << ")"; + return str.str(); +} + +std::string Affix::to_raw_string() const { + std::stringstream str; + str << raw_operator_name() << "(" << strset_expr_->to_raw_string() + << ", " << affix_length_ << ")"; + return str.str(); +} + +Affix::BaseRetValPtr Affix::apply_internal(const SentenceContext& context) const +{ + if(affix_length_ == 0) { /* zero length: hand back the argument's result as-is */ + return strset_expr_->apply(context); + } + const boost::shared_ptr<StrSet>& set = strset_expr_->apply(context); + boost::shared_ptr<StrSet> a_set = boost::shared_ptr<StrSet>(new 
StrSet()); + if(affix_length_ < 0) { /* negative length selects suffixes */ + foreach(const UnicodeString& s, set->contents()) { + a_set->insert(UnicodeString(s).remove(0, s.length() + affix_length_)); /* drop the head so only the last -affix_length_ units of s remain */ + } + } else { /* positive length selects prefixes */ + foreach(const UnicodeString& s, set->contents()) { + UnicodeString prefixed(s); + prefixed.truncate(affix_length_); + a_set->insert(prefixed); + } + } + return a_set; +} + +} /* end ns Wccl */ diff --git a/libwccl/ops/affix.h b/libwccl/ops/affix.h new file mode 100644 index 0000000..fc38e59 --- /dev/null +++ b/libwccl/ops/affix.h @@ -0,0 +1,59 @@ +#ifndef LIBWCCL_OPS_AFFIX_H +#define LIBWCCL_OPS_AFFIX_H + +#include <boost/shared_ptr.hpp> +#include <libwccl/values/strset.h> +#include <libwccl/ops/functions.h> + +namespace Wccl { + +/** + * Operator that takes a set of strings and returns a new set whose + * values are prefixes (affix_length > 0) or suffixes (affix_length < 0, + * |affix_length| long) of the originals; affix_length 0 leaves them as is. + */ +class Affix : public Function<StrSet> { +public: + typedef boost::shared_ptr<Function<StrSet> > StrSetFunctionPtr; + + Affix(const StrSetFunctionPtr& strset_expr, int affix_length) + : strset_expr_(strset_expr), + affix_length_(affix_length) + { + BOOST_ASSERT(strset_expr_); + } + + /** + * String representation of the operator in form of: + * "affix(strset_expr_string, affix_length)" + */ + virtual std::string to_string(const Corpus2::Tagset& tagset) const; + + /** + * String representation of the operator in form of: + * "affix(strset_expr_raw_string, affix_length)" + * This version does not require tagset, but may be incomplete + * and/or contain internal info. 
+ */ + virtual std::string to_raw_string() const; + + virtual const std::string raw_operator_name() const { + return "affix"; + } + +protected: + const StrSetFunctionPtr strset_expr_; + const int affix_length_; + + typedef FunctionBase::BaseRetValPtr BaseRetValPtr; + + /** + * Get a string set from the argument expression and return a set with all strings + * converted into prefixes or suffixes of given length (length 0: the argument's result itself) + */ + virtual BaseRetValPtr apply_internal(const SentenceContext& context) const; +}; + +} /* end ns Wccl */ + +#endif // LIBWCCL_OPS_AFFIX_H diff --git a/tests/strsetfunctions.cpp b/tests/strsetfunctions.cpp index 337f36c..bf0ecc9 100644 --- a/tests/strsetfunctions.cpp +++ b/tests/strsetfunctions.cpp @@ -8,6 +8,7 @@ #include <libwccl/sentencecontext.h> #include <libwccl/ops/tolower.h> #include <libwccl/ops/toupper.h> +#include <libwccl/ops/affix.h> #include <libwccl/ops/constant.h> using namespace Wccl; @@ -29,6 +30,7 @@ struct StrSetFix strset.insert("some1325numbers"); strset.insert("ALLUPPER"); strset.insert("kIdSpEeChLoL"); + strset.insert("short"); strset_expr.reset(new Constant<StrSet>(strset)); } @@ -49,12 +51,70 @@ BOOST_FIXTURE_TEST_CASE(lower, StrSetFix) lowerset.insert("some1325numbers"); lowerset.insert("allupper"); lowerset.insert("kidspeechlol"); + lowerset.insert("short"); ToLower to_lower(strset_expr); BOOST_CHECK(lowerset.equals(*to_lower.apply(sc))); } +BOOST_FIXTURE_TEST_CASE(upper, StrSetFix) +{ + StrSet upperset; + upperset.insert("ALLLOWER"); + upperset.insert("FIRSTCAPITAL"); + upperset.insert("PASCALCASE"); + upperset.insert("CAMELCASE"); + upperset.insert("SOME1325NUMBERS"); + upperset.insert("ALLUPPER"); + upperset.insert("KIDSPEECHLOL"); + upperset.insert("SHORT"); + + ToUpper to_upper(strset_expr); + + BOOST_CHECK(upperset.equals(*to_upper.apply(sc))); +} + +BOOST_FIXTURE_TEST_CASE(prefix, StrSetFix) +{ + StrSet prefixset; + prefixset.insert("alllowe"); + prefixset.insert("Firstca"); + prefixset.insert("PascalC"); + 
prefixset.insert("camelCa"); + prefixset.insert("some132"); + prefixset.insert("ALLUPPE"); + prefixset.insert("kIdSpEe"); + prefixset.insert("short"); /* shorter than 7: expected to come back whole */ + + Affix prefix(strset_expr, 7); + + BOOST_CHECK(prefixset.equals(*prefix.apply(sc))); +} + +BOOST_FIXTURE_TEST_CASE(suffix, StrSetFix) +{ + StrSet suffixset; + suffixset.insert("lllower"); + suffixset.insert("capital"); + suffixset.insert("calCase"); + suffixset.insert("melCase"); + suffixset.insert("numbers"); + suffixset.insert("LLUPPER"); + suffixset.insert("EeChLoL"); + suffixset.insert("short"); /* shorter than 7: expected to come back whole */ + + Affix suffix(strset_expr, -7); + + BOOST_CHECK(suffixset.equals(*suffix.apply(sc))); +} + +BOOST_FIXTURE_TEST_CASE(affix_0, StrSetFix) +{ + Affix affix_0(strset_expr, 0); /* length 0: result must equal the input set */ + BOOST_CHECK(strset.equals(*affix_0.apply(sc))); +} + BOOST_FIXTURE_TEST_CASE(lower_locale, StrSetFix) { //I'm not sure if I can guarantee this test will pass -- GitLab