From f6a79879005ff80b7aaa814b6c2b2f271f6b1169 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Wardy=C5=84ski?= <no@email>
Date: Thu, 11 Nov 2010 22:44:10 +0100
Subject: [PATCH] Affix operator (returning prefixes or suffixes of given
 length)

---
 libwccl/CMakeLists.txt    |  3 +-
 libwccl/ops/affix.cpp     | 44 ++++++++++++++++++++++++++++
 libwccl/ops/affix.h       | 59 ++++++++++++++++++++++++++++++++++++++
 tests/strsetfunctions.cpp | 60 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 libwccl/ops/affix.cpp
 create mode 100644 libwccl/ops/affix.h

diff --git a/libwccl/CMakeLists.txt b/libwccl/CMakeLists.txt
index c5fb97c..1a86bd6 100644
--- a/libwccl/CMakeLists.txt
+++ b/libwccl/CMakeLists.txt
@@ -16,9 +16,10 @@ set(LIBS ${LIBS} ${Boost_LIBRARIES})
 SET(libwccl_STAT_SRC
 	exception.cpp
 	ops/and.cpp
+	ops/affix.cpp
 	ops/formatters.cpp
 	ops/logicalpredicate.cpp
-    ops/nor.cpp
+	ops/nor.cpp
 	ops/or.cpp
 	ops/predicate.cpp
 	ops/tolower.cpp
diff --git a/libwccl/ops/affix.cpp b/libwccl/ops/affix.cpp
new file mode 100644
index 0000000..892a3c0
--- /dev/null
+++ b/libwccl/ops/affix.cpp
@@ -0,0 +1,44 @@
+#include <libwccl/ops/affix.h>
+#include <sstream>
+#include <boost/foreach.hpp>
+#define foreach BOOST_FOREACH
+
+namespace Wccl {
+
+// Builds "<operator name>(<argument>, <length>)" from the tagset-aware forms.
+std::string Affix::to_string(const Corpus2::Tagset& tagset) const {
+	std::stringstream out;
+	out << operator_name(tagset) << "(";
+	out << strset_expr_->to_string(tagset) << ", " << affix_length_ << ")";
+	return out.str();
+}
+
+std::string Affix::to_raw_string() const {
+	std::stringstream out;  // filled as "<raw name>(<argument>, <length>)"
+	out << raw_operator_name() << "(";
+	out << strset_expr_->to_raw_string() << ", " << affix_length_ << ")";
+	return out.str();
+}
+
+Affix::BaseRetValPtr Affix::apply_internal(const SentenceContext& context) const
+{
+	if(affix_length_ == 0) {
+		return strset_expr_->apply(context);
+	}
+	const boost::shared_ptr<StrSet> source = strset_expr_->apply(context);
+	boost::shared_ptr<StrSet> result(new StrSet());
+	foreach(const UnicodeString& s, source->contents()) {
+		UnicodeString affix(s);
+		if(affix_length_ > 0) {
+			// Positive length: cut the copy down to its leading characters.
+			affix.truncate(affix_length_);
+		} else {
+			// Negative length: strip everything ahead of the trailing characters.
+			affix.remove(0, s.length() + affix_length_);
+		}
+		result->insert(affix);
+	}
+	return result;
+}
+
+} /* end ns Wccl */
diff --git a/libwccl/ops/affix.h b/libwccl/ops/affix.h
new file mode 100644
index 0000000..fc38e59
--- /dev/null
+++ b/libwccl/ops/affix.h
@@ -0,0 +1,59 @@
+#ifndef LIBWCCL_OPS_AFFIX_H
+#define LIBWCCL_OPS_AFFIX_H
+
+#include <boost/shared_ptr.hpp>
+#include <libwccl/values/strset.h>
+#include <libwccl/ops/functions.h>
+
+namespace Wccl {
+
+/**
+ * Operator that takes a set of strings and returns a set of their
+ * affixes: prefixes for a positive length, suffixes for a negative
+ * one; a length of 0 yields the argument's value unchanged.
+ */
+class Affix : public Function<StrSet> {
+public:
+	typedef boost::shared_ptr<Function<StrSet> > StrSetFunctionPtr;
+	
+	Affix(const StrSetFunctionPtr& strset_expr, int affix_length)
+		: strset_expr_(strset_expr),
+		  affix_length_(affix_length)
+	{
+		BOOST_ASSERT(strset_expr_);  // the argument expression must not be null
+	}
+
+	/**
+	 * String representation of the operator in form of:
+	 * "<operator name>(strset_expr_string, affix_length)"
+	 */
+	virtual std::string to_string(const Corpus2::Tagset& tagset) const;
+
+	/**
+	 * String representation of the affix operator in form of:
+	 * "affix(strset_expr_raw_string, affix_length)"
+	 * This version does not require tagset, but may be incomplete
+	 * and/or contain internal info.
+	 */
+	virtual std::string to_raw_string() const;
+
+	virtual const std::string raw_operator_name() const {
+		return "affix";
+	}
+
+protected:
+	const StrSetFunctionPtr strset_expr_;  // expression producing the input string set
+	const int affix_length_;  // > 0: prefix length, < 0: suffix length (negated), 0: no change
+
+	typedef FunctionBase::BaseRetValPtr BaseRetValPtr;
+
+	/**
+	 * Evaluate the argument expression and return a new set holding the prefix
+	 * or suffix of each string; for length 0 the argument's result is returned as is
+	 */
+	virtual BaseRetValPtr apply_internal(const SentenceContext& context) const;
+};
+
+} /* end ns Wccl */
+
+#endif // LIBWCCL_OPS_AFFIX_H
diff --git a/tests/strsetfunctions.cpp b/tests/strsetfunctions.cpp
index 337f36c..bf0ecc9 100644
--- a/tests/strsetfunctions.cpp
+++ b/tests/strsetfunctions.cpp
@@ -8,6 +8,7 @@
 #include <libwccl/sentencecontext.h>
 #include <libwccl/ops/tolower.h>
 #include <libwccl/ops/toupper.h>
+#include <libwccl/ops/affix.h>
 #include <libwccl/ops/constant.h>
 
 using namespace Wccl;
@@ -29,6 +30,7 @@ struct StrSetFix
 		strset.insert("some1325numbers");
 		strset.insert("ALLUPPER");
 		strset.insert("kIdSpEeChLoL");
+		strset.insert("short");
 
 		strset_expr.reset(new Constant<StrSet>(strset));
 	}
@@ -49,12 +51,70 @@ BOOST_FIXTURE_TEST_CASE(lower, StrSetFix)
 	lowerset.insert("some1325numbers");
 	lowerset.insert("allupper");
 	lowerset.insert("kidspeechlol");
+	lowerset.insert("short");
 
 	ToLower to_lower(strset_expr);
 
 	BOOST_CHECK(lowerset.equals(*to_lower.apply(sc)));
 }
 
+BOOST_FIXTURE_TEST_CASE(upper, StrSetFix)
+{
+	// Expected result: upper-cased counterparts of the fixture strings.
+	StrSet expected;
+	expected.insert("ALLLOWER");
+	expected.insert("FIRSTCAPITAL");
+	expected.insert("PASCALCASE");
+	expected.insert("CAMELCASE");
+	expected.insert("SOME1325NUMBERS");
+	expected.insert("ALLUPPER");
+	expected.insert("KIDSPEECHLOL");
+	expected.insert("SHORT");
+
+	ToUpper op(strset_expr);
+	BOOST_CHECK(expected.equals(*op.apply(sc)));
+}
+
+BOOST_FIXTURE_TEST_CASE(prefix, StrSetFix)
+{
+	StrSet prefixset;
+	prefixset.insert("alllowe");
+	prefixset.insert("Firstca");
+	prefixset.insert("PascalC");
+	prefixset.insert("camelCa");
+	prefixset.insert("some132");
+	prefixset.insert("ALLUPPE");
+	prefixset.insert("kIdSpEe");
+	prefixset.insert("short");
+	// "short" has fewer than 7 characters, so it is expected back whole.
+	Affix prefix(strset_expr, 7);
+	// A positive length asks for prefixes.
+	BOOST_CHECK(prefixset.equals(*prefix.apply(sc)));
+}
+
+BOOST_FIXTURE_TEST_CASE(suffix, StrSetFix)
+{
+	StrSet suffixset;
+	suffixset.insert("lllower");
+	suffixset.insert("capital");
+	suffixset.insert("calCase");
+	suffixset.insert("melCase");
+	suffixset.insert("numbers");
+	suffixset.insert("LLUPPER");
+	suffixset.insert("EeChLoL");
+	suffixset.insert("short");
+	// "short" has fewer than 7 characters, so it is expected back whole.
+	Affix suffix(strset_expr, -7);
+	// A negative length asks for suffixes of the absolute length.
+	BOOST_CHECK(suffixset.equals(*suffix.apply(sc)));
+}
+
+BOOST_FIXTURE_TEST_CASE(affix_0, StrSetFix)
+{
+	Affix identity(strset_expr, 0);  // length 0: neither prefix nor suffix
+	BOOST_CHECK(strset.equals(*identity.apply(sc)));  // fixture set comes back unchanged
+}
+
 BOOST_FIXTURE_TEST_CASE(lower_locale, StrSetFix)
 {
 	//I'm not sure if I can guarantee this test will pass
-- 
GitLab