diff --git a/libwccl/CMakeLists.txt b/libwccl/CMakeLists.txt
index 16afd16e6326a43448827099a85bf51419eabf4b..a34ff5994e38af9436aac889d30ceda77f80794e 100644
--- a/libwccl/CMakeLists.txt
+++ b/libwccl/CMakeLists.txt
@@ -30,6 +30,7 @@ SET(libwccl_STAT_SRC
 	ops/nor.cpp
 	ops/or.cpp
 	ops/predicate.cpp
+	ops/regex.cpp
 	ops/tolower.cpp
 	ops/toupper.cpp
 	parser/Parser.cpp
diff --git a/libwccl/ops/regex.cpp b/libwccl/ops/regex.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..818352918af9d15cd5b3a4cecc60a08bdf18f9f1
--- /dev/null
+++ b/libwccl/ops/regex.cpp
@@ -0,0 +1,103 @@
+#include <libwccl/ops/regex.h>
+#include <sstream>
+#include <libpwrutils/util.h>
+
+namespace Wccl {
+
+/*
+ * The WcclError message is built from the constructor arguments rather than
+ * from the data members: base classes are initialized before members, so
+ * the members are not yet constructed when the message is assembled.
+ */
+RegexParseError::RegexParseError(
+	const UnicodeString& pattern,
+	const UErrorCode& status_code,
+	const UParseError& parse_error)
+	: WcclError(std::string("Could not parse regular expression: ")
+		+ u_errorName(status_code) + ":"
+		+ PwrNlp::to_utf8(UnicodeString(parse_error.postContext))),
+	status(u_errorName(status_code)),
+	pattern_line(parse_error.line),
+	offset(parse_error.offset),
+	pre_context(PwrNlp::to_utf8(UnicodeString(parse_error.preContext))),
+	error(PwrNlp::to_utf8(UnicodeString(parse_error.postContext))),
+	expression(PwrNlp::to_utf8(pattern))
+{
+}
+
+std::string RegexParseError::info() const
+{
+	std::stringstream ss;
+	ss << "Could not parse regular expression at line " << pattern_line
+		<< " offset " << offset << ". Status: " << status
+		<< ". Error: " << error << ". Expression was: " << expression;
+	return ss.str();
+}
+
+/**
+ * Compile pat_str into an ICU regular expression pattern.
+ * @throws RegexParseError if ICU reports a failure for the pattern
+ */
+boost::shared_ptr<const RegexPattern> compile_regex(const UnicodeString &pat_str)
+{
+	// Value-initialization zeroes the struct
+	UParseError error = UParseError();
+	UErrorCode status = U_ZERO_ERROR;
+	boost::shared_ptr<const RegexPattern> pattern(RegexPattern::compile(pat_str, error, status));
+	// ICU also uses non-zero codes for warnings; only real failures throw
+	if (U_FAILURE(status)) {
+		throw RegexParseError(pat_str, status, error);
+	}
+	return pattern;
+}
+
+Regex::Regex(const Regex::StrSetFunctionPtr &strset_expr, const UnicodeString &patstr)
+	: strset_expr_(strset_expr),
+	patstr_(patstr),
+	pattern_(compile_regex(patstr))
+{
+	BOOST_ASSERT(strset_expr_);
+	BOOST_ASSERT(pattern_);
+}
+
+std::string Regex::to_string(const Corpus2::Tagset& tagset) const
+{
+	std::stringstream ss;
+	ss << operator_name(tagset) << "(" << strset_expr_->to_string(tagset)
+		<< ", \"" << PwrNlp::to_utf8(patstr_) << "\")"; //TODO: utf escaping?
+	return ss.str();
+}
+
+std::string Regex::to_raw_string() const {
+	std::stringstream ss;
+	ss << raw_operator_name() << "(" << strset_expr_->to_raw_string()
+		<< ", \"" << PwrNlp::to_utf8(patstr_) << "\")"; //TODO: utf escaping?
+	return ss.str();
+}
+
+Regex::BaseRetValPtr Regex::apply_internal(const SentenceContext& context) const {
+	const boost::shared_ptr<StrSet >& set = strset_expr_->apply(context);
+	if(set->empty()) {
+		return Predicate::False->apply(context);
+	}
+	foreach(const UnicodeString& s, set->contents()) {
+		UErrorCode status = U_ZERO_ERROR;
+		boost::scoped_ptr<RegexMatcher> matcher(pattern_->matcher(s, status));
+		if(U_FAILURE(status)) {
+			// Fail loudly in debug builds, report no match otherwise
+			BOOST_ASSERT(U_SUCCESS(status));
+			return Predicate::False->apply(context);
+		}
+		bool matched = matcher->matches(status);
+		if(U_FAILURE(status)) {
+			BOOST_ASSERT(U_SUCCESS(status));
+			return Predicate::False->apply(context);
+		}
+		if(!matched) {
+			return Predicate::False->apply(context);
+		}
+	}
+	return Predicate::True->apply(context);
+}
+
+} /* end ns Wccl */
diff --git a/libwccl/ops/regex.h b/libwccl/ops/regex.h
new file mode 100644
index 0000000000000000000000000000000000000000..93689aec586d45d02b145053aad7dd9f849aee03
--- /dev/null
+++ b/libwccl/ops/regex.h
@@ -0,0 +1,97 @@
+#ifndef LIBWCCL_OPS_REGEX_H
+#define LIBWCCL_OPS_REGEX_H
+
+#include <boost/shared_ptr.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <unicode/regex.h>
+
+#include <libwccl/exception.h>
+#include <libwccl/values/bool.h>
+#include <libwccl/values/strset.h>
+#include <libwccl/ops/predicate.h>
+
+namespace Wccl {
+
+/**
+ * Operator that applies a regular expression to a set of
+ * strings and returns true if all of the strings match the
+ * expression. An empty set of strings yields false.
+ */
+class Regex : public Predicate {
+public:
+	typedef boost::shared_ptr<Function<StrSet> > StrSetFunctionPtr;
+
+	/**
+	 * Compiles patstr once, at construction time.
+	 * @throws RegexParseError if patstr is not a valid regular expression
+	 */
+	Regex(const StrSetFunctionPtr& strset_expr, const UnicodeString& patstr);
+
+	/**
+	 * String representation of the operator in form of:
+	 * "regex(strset_expr_string, regex_string)"
+	 */
+	virtual std::string to_string(const Corpus2::Tagset& tagset) const;
+
+	/**
+	 * String representation of the operator in form of:
+	 * "regex(strset_expr_raw_string, regex_string)"
+	 * This version does not require tagset, but may be incomplete
+	 * and/or contain internal info.
+	 */
+	virtual std::string to_raw_string() const;
+
+	virtual const std::string raw_operator_name() const {
+		return "regex";
+	}
+
+protected:
+	typedef FunctionBase::BaseRetValPtr BaseRetValPtr;
+
+	/**
+	 * Get a string set from the argument expression,
+	 * apply regular expression to each string one by one,
+	 * return false if a string not matching expression is found.
+	 * Return true if all strings matched the regular expression.
+	 * Return false if the set is empty.
+	 */
+	virtual BaseRetValPtr apply_internal(const SentenceContext& context) const;
+
+private:
+	const StrSetFunctionPtr strset_expr_;
+	const UnicodeString patstr_;
+	const boost::shared_ptr<const RegexPattern> pattern_;
+};
+
+/**
+ * Error thrown when a regular expression pattern fails to compile.
+ * Carries the diagnostics reported by ICU for the offending pattern.
+ */
+class RegexParseError : public WcclError
+{
+public:
+	RegexParseError(
+		const UnicodeString& pattern,
+		const UErrorCode& status_code,
+		const UParseError& parse_error);
+
+	std::string info() const;
+
+	/// Name of the ICU status code, as given by u_errorName
+	const std::string status;
+	/// Line of the pattern reported in ICU's UParseError
+	const int pattern_line;
+	/// Offset reported in ICU's UParseError
+	const int offset;
+	/// UParseError::preContext, converted to UTF-8
+	const std::string pre_context;
+	/// UParseError::postContext, converted to UTF-8
+	const std::string error;
+	/// The pattern that failed to compile, converted to UTF-8
+	const std::string expression;
+};
+
+} /* end ns Wccl */
+
+
+#endif // LIBWCCL_OPS_REGEX_H
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index a7eeb50488475fa673ca6ce9554f14c5882b2ed5..a3bf95457d27e4c88b606e69bd3bb29d2dddf138 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -11,6 +11,7 @@ add_executable(tests
 	logicalpredicates.cpp
 	main.cpp
 	position.cpp
+	regex.cpp
 	strsetfunctions.cpp
 	values.cpp
 	varaccess.cpp
diff --git a/tests/regex.cpp b/tests/regex.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f0b87e1b551c361d9bc938bda07ea0638a76fbc1
--- /dev/null
+++ b/tests/regex.cpp
@@ -0,0 +1,77 @@
+#include <boost/test/unit_test.hpp>
+#include <boost/bind.hpp>
+#include <boost/make_shared.hpp>
+#include <boost/shared_ptr.hpp>
+#include <libcorpus2/sentence.h>
+
+#include <libwccl/ops/constant.h>
+#include <libwccl/ops/regex.h>
+#include <libwccl/values/bool.h>
+#include <libwccl/values/strset.h>
+#include <libwccl/sentencecontext.h>
+#include <unicode/unistr.h>
+
+using namespace Wccl;
+
+BOOST_AUTO_TEST_SUITE(regexing)
+
+struct RegexFix
+{
+	RegexFix()
+		: sc(boost::make_shared<Corpus2::Sentence>()),
+		  tagset()
+	{
+	}
+	SentenceContext sc;
+	Corpus2::Tagset tagset;
+};
+
+BOOST_FIXTURE_TEST_CASE(positive_sanity_check, RegexFix)
+{
+	StrSet sanity;
+	sanity.insert("word");
+	boost::shared_ptr<Function<StrSet> > sanity_expr(new Constant<StrSet>(sanity));
+	Regex r(sanity_expr, "word");
+	BOOST_CHECK(r.apply(sc)->get_value());
+}
+
+BOOST_FIXTURE_TEST_CASE(negative_sanity_check, RegexFix)
+{
+	StrSet sanity;
+	sanity.insert("word");
+	boost::shared_ptr<Function<StrSet> > sanity_expr(new Constant<StrSet>(sanity));
+	Regex r(sanity_expr, "Word");
+	BOOST_CHECK(!r.apply(sc)->get_value());
+}
+
+BOOST_AUTO_TEST_CASE(invalid_pattern_throws)
+{
+	StrSet sanity;
+	sanity.insert("word");
+	boost::shared_ptr<Function<StrSet> > sanity_expr(new Constant<StrSet>(sanity));
+	BOOST_CHECK_THROW(Regex(sanity_expr, "("), RegexParseError);
+}
+
+//TODO: need more regex tests...
+
+//------------ To string ----------
+
+BOOST_FIXTURE_TEST_CASE(regex_tostring, RegexFix)
+{
+	StrSet sanity;
+	sanity.insert("word");
+	boost::shared_ptr<Function<StrSet> > sanity_expr(new Constant<StrSet>(sanity));
+	Regex r(sanity_expr, "Word");
+	BOOST_CHECK_EQUAL("regex([\"word\"], \"Word\")", r.to_string(tagset));
+}
+
+BOOST_AUTO_TEST_CASE(regex_to_raw_string)
+{
+	StrSet sanity;
+	sanity.insert("word");
+	boost::shared_ptr<Function<StrSet> > sanity_expr(new Constant<StrSet>(sanity));
+	Regex r(sanity_expr, "Word");
+	BOOST_CHECK_EQUAL("regex([\"word\"], \"Word\")", r.to_raw_string());
+}
+
+BOOST_AUTO_TEST_SUITE_END()