From 60c8f519ae6f20fd16550c772bb0952e5b760de3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Wardy=C5=84ski?= <no@email> Date: Wed, 10 Nov 2010 20:41:49 +0100 Subject: [PATCH] Adding "lower" operator --- libwccl/CMakeLists.txt | 1 + libwccl/ops/tolower.cpp | 26 +++++++++++ libwccl/ops/tolower.h | 56 +++++++++++++++++++++++ tests/CMakeLists.txt | 1 + tests/strsetfunctions.cpp | 93 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 177 insertions(+) create mode 100644 libwccl/ops/tolower.cpp create mode 100644 libwccl/ops/tolower.h create mode 100644 tests/strsetfunctions.cpp diff --git a/libwccl/CMakeLists.txt b/libwccl/CMakeLists.txt index b3bb20e..b142613 100644 --- a/libwccl/CMakeLists.txt +++ b/libwccl/CMakeLists.txt @@ -21,6 +21,7 @@ SET(libwccl_STAT_SRC ops/nor.cpp ops/or.cpp ops/predicate.cpp + ops/tolower.cpp parser/Parser.cpp parser/ParserException.cpp sentencecontext.cpp diff --git a/libwccl/ops/tolower.cpp b/libwccl/ops/tolower.cpp new file mode 100644 index 0000000..d553cc7 --- /dev/null +++ b/libwccl/ops/tolower.cpp @@ -0,0 +1,26 @@ +#include <libwccl/ops/tolower.h> +#include <libwccl/ops/formatters.h> + +namespace Wccl { + +std::string ToLower::to_string(const Corpus2::Tagset& tagset) const +{ + return UnaryFunctionFormatter::to_string(tagset, *this, *strset_expr_); +} + +std::string ToLower::to_raw_string() const { + return UnaryFunctionFormatter::to_raw_string(*this, *strset_expr_); +} + +ToLower::BaseRetValPtr ToLower::apply_internal(const SentenceContext& context) const { + const boost::shared_ptr<StrSet >& set = strset_expr_->apply(context); + boost::shared_ptr<StrSet > l_set = boost::make_shared<StrSet>(); + //TODO: should tolower be a method of StrSet as well? + foreach(const UnicodeString& s, set->contents()) { + //TODO: what about locale? is default ok? should the context hold it? 
+ l_set->insert(UnicodeString(s).toLower()); + } + return l_set; +} + +} /* end ns Wccl */ diff --git a/libwccl/ops/tolower.h b/libwccl/ops/tolower.h new file mode 100644 index 0000000..7aeb4c0 --- /dev/null +++ b/libwccl/ops/tolower.h @@ -0,0 +1,56 @@ +#ifndef LIBWCCL_OPS_TOLOWER_H +#define LIBWCCL_OPS_TOLOWER_H + +#include <boost/shared_ptr.hpp> +#include <libwccl/values/strset.h> +#include <libwccl/ops/functions.h> + +namespace Wccl { + +/** + * Operator that takes a set of strings and returns a new + * set with corresponding values in lower case form (UnicodeString::toLower(), no explicit locale) + */ +class ToLower : public Function<StrSet> { +public: + typedef boost::shared_ptr<Function<StrSet> > StrSetFunctionPtr; + + ToLower(const StrSetFunctionPtr& strset_expr) + : strset_expr_(strset_expr) + { + BOOST_ASSERT(strset_expr_); + } + + /** + * String representation of the operator in form of: + * "lower(strset_expr_string)" + */ + virtual std::string to_string(const Corpus2::Tagset& tagset) const; + + /** + * Raw string representation of the operator, presumably in form of: + * "lower(strset_expr_raw_string)" (formatting is delegated to UnaryFunctionFormatter). + * This version does not require tagset, but may be incomplete + * and/or contain internal info. 
+ */ + virtual std::string to_raw_string() const; + + virtual const std::string raw_operator_name() const { + return "lower"; + } + +protected: + const StrSetFunctionPtr strset_expr_; + + typedef FunctionBase::BaseRetValPtr BaseRetValPtr; + + /** + * Evaluate the argument expression in the given context and return a new set + * holding lower-cased copies of all its strings + */ + virtual BaseRetValPtr apply_internal(const SentenceContext& context) const; +}; + +} /* end ns Wccl */ + +#endif // LIBWCCL_OPS_TOLOWER_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index abc70b5..a7eeb50 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,6 +11,7 @@ add_executable(tests logicalpredicates.cpp main.cpp position.cpp + strsetfunctions.cpp values.cpp varaccess.cpp variables.cpp diff --git a/tests/strsetfunctions.cpp b/tests/strsetfunctions.cpp new file mode 100644 index 0000000..4822967 --- /dev/null +++ b/tests/strsetfunctions.cpp @@ -0,0 +1,93 @@ +#include <boost/test/unit_test.hpp> +#include <boost/bind.hpp> +#include <boost/shared_ptr.hpp> +#include <libcorpus2/sentence.h> + + +#include <libwccl/values/strset.h> +#include <libwccl/sentencecontext.h> +#include <libwccl/ops/tolower.h> +#include <libwccl/ops/constant.h> + +using namespace Wccl; + +BOOST_AUTO_TEST_SUITE(strset_functions) + +struct StrSetFix +{ + StrSetFix() + : sc(boost::make_shared<Corpus2::Sentence>()), + tagset(), + strset(), + strset_expr() + { + strset.insert("alllower"); + strset.insert("Firstcapital"); + strset.insert("PascalCase"); + strset.insert("camelCase"); + strset.insert("some1325numbers"); + strset.insert("ALLUPPER"); + strset.insert("kIdSpEeChLoL"); + + strset_expr.reset(new Constant<StrSet>(strset)); + } + SentenceContext sc; + Corpus2::Tagset tagset; + + StrSet strset; + boost::shared_ptr<Function<StrSet> > strset_expr; +}; + +BOOST_FIXTURE_TEST_CASE(lower, StrSetFix) +{ + StrSet lowerset; + lowerset.insert("alllower"); + lowerset.insert("firstcapital"); + 
+ lowerset.insert("pascalcase"); + lowerset.insert("camelcase"); + lowerset.insert("some1325numbers"); + lowerset.insert("allupper"); + lowerset.insert("kidspeechlol"); + + ToLower to_lower(strset_expr); + + BOOST_CHECK(lowerset.equals(*to_lower.apply(sc))); +} + +BOOST_FIXTURE_TEST_CASE(lower_locale, StrSetFix) +{ + //I'm not sure if I can guarantee this test will pass + //on all locales? - ToLower uses default locale at the moment + + //I wanted to make sure switching around encoding of source file + //won't affect the test, so I explicitly provide escaped UTF8 sequence + + StrSet upperset; + upperset.insert(UnicodeString::fromUTF8( + "za\xC5\xBB\xC3\x93\xC5\x81\xC4\x86g\xC4\x98\xC5\x9AL\xC4\x84ja\xC5\xB9\xC5\x83" + "zA\xC5\xBC\xC3\xB3\xC5\x82\xC4\x87g\xC4\x99\xC5\x9Bl\xC4\x85ja\xC5\xBA\xC5\x84")); + StrSet lowerset; + lowerset.insert(UnicodeString::fromUTF8( + "za\xC5\xBC\xC3\xB3\xC5\x82\xC4\x87g\xC4\x99\xC5\x9Bl\xC4\x85ja\xC5\xBA\xC5\x84" + "za\xC5\xBC\xC3\xB3\xC5\x82\xC4\x87g\xC4\x99\xC5\x9Bl\xC4\x85ja\xC5\xBA\xC5\x84")); + + ToLower to_lower(boost::shared_ptr<Function<StrSet> >( + new Constant<StrSet>(upperset))); + + BOOST_CHECK(lowerset.equals(*to_lower.apply(sc))); +} + + +//------ to_string test cases ------- + +BOOST_FIXTURE_TEST_CASE(lower_to_string, StrSetFix) +{ + StrSet one_elem_set; + one_elem_set.insert("YayaAy"); + ToLower to_lower(boost::shared_ptr<Function<StrSet> >( + new Constant<StrSet>(one_elem_set))); + std::string expected = "lower([\"YayaAy\"])"; + BOOST_CHECK_EQUAL(expected, to_lower.to_string(tagset)); +} + +BOOST_AUTO_TEST_SUITE_END() -- GitLab