Skip to content
Snippets Groups Projects
Commit b78e6454 authored by Adam Wardynski's avatar Adam Wardynski
Browse files

Adding Regex operator

parent 924a4b64
Branches
No related merge requests found
......@@ -30,6 +30,7 @@ SET(libwccl_STAT_SRC
ops/nor.cpp
ops/or.cpp
ops/predicate.cpp
ops/regex.cpp
ops/tolower.cpp
ops/toupper.cpp
parser/Parser.cpp
......
#include <libwccl/ops/regex.h>
#include <sstream>
#include <libpwrutils/util.h>
namespace Wccl {
RegexParseError::RegexParseError(
const UnicodeString& pattern,
const UErrorCode& status_code,
const UParseError& parse_error)
: status(u_errorName(status_code)),
pattern_line(parse_error.line),
offset(parse_error.offset),
pre_context(PwrNlp::to_utf8(UnicodeString(parse_error.preContext))),
error(PwrNlp::to_utf8(UnicodeString(parse_error.postContext))),
expression(pattern.toUTF8String(std::string())),
WcclError("Could not parse regular expression: " + status + ":" + error)
{
}
std::string RegexParseError::info() const
{
std::stringstream ss;
ss << "Could not parse regular expression at line " << pattern_line
<< " offset " << offset << ". Status: " << status
<< ". Error: " << error << ". Expression was: " << expression;
return ss.str();
}
/**
 * Compile a regular expression pattern with ICU.
 *
 * @param pat_str text of the regular expression
 * @return shared pointer owning the compiled pattern
 * @throws RegexParseError when ICU reports a failure status
 */
boost::shared_ptr<const RegexPattern> compile_regex(const UnicodeString &pat_str)
{
	// Value-initialization zeroes the whole struct without needing memset.
	UParseError parse_error = UParseError();
	UErrorCode status = U_ZERO_ERROR;
	// The shared_ptr takes ownership immediately, so the pattern object is
	// released even when the failure branch below throws.
	boost::shared_ptr<const RegexPattern> pattern(
		RegexPattern::compile(pat_str, parse_error, status));
	// ICU warning codes are non-zero but do not indicate failure, so the
	// status is tested with U_FAILURE rather than compared to U_ZERO_ERROR.
	if (U_FAILURE(status)) {
		throw RegexParseError(pat_str, status, parse_error);
	}
	return pattern;
}
/**
 * Create the operator from an expression producing a set of strings and
 * the text of a regular expression. The pattern is compiled here, once,
 * through compile_regex(), so a malformed pattern is reported at
 * construction time.
 */
Regex::Regex(
	const Regex::StrSetFunctionPtr &strset_expr,
	const UnicodeString &patstr)
	: strset_expr_(strset_expr)
	, patstr_(patstr)
	, pattern_(compile_regex(patstr))
{
	// Both the argument expression and the compiled pattern are required.
	BOOST_ASSERT(strset_expr_);
	BOOST_ASSERT(pattern_);
}
/**
 * Render the operator as: name(strset_expr_string, "pattern"),
 * using the given tagset for the name and the argument expression.
 */
std::string Regex::to_string(const Corpus2::Tagset& tagset) const
{
	std::ostringstream repr;
	repr << operator_name(tagset);
	repr << "(";
	repr << strset_expr_->to_string(tagset);
	//TODO: utf escaping?
	repr << ", \"" << PwrNlp::to_utf8(patstr_) << "\")";
	return repr.str();
}
/**
 * Render the operator as: name(strset_expr_raw_string, "pattern"),
 * without requiring a tagset.
 */
std::string Regex::to_raw_string() const
{
	std::ostringstream repr;
	repr << raw_operator_name();
	repr << "(";
	repr << strset_expr_->to_raw_string();
	//TODO: utf escaping?
	repr << ", \"" << PwrNlp::to_utf8(patstr_) << "\")";
	return repr.str();
}
/**
 * Evaluate the argument expression and test every resulting string
 * against the compiled pattern.
 *
 * @param context sentence context the argument expression is applied to
 * @return result of Predicate::True if the set is non-empty and every
 *         string matches; result of Predicate::False if the set is
 *         empty, if any string does not match, or if ICU reports a
 *         failure while matching
 */
Regex::BaseRetValPtr Regex::apply_internal(const SentenceContext& context) const
{
	const boost::shared_ptr<StrSet >& set = strset_expr_->apply(context);
	// An empty set is treated as "no match" rather than vacuously true.
	if(set->empty()) {
		return Predicate::False->apply(context);
	}
	foreach(const UnicodeString& s, set->contents()) {
		UErrorCode status = U_ZERO_ERROR;
		boost::scoped_ptr<RegexMatcher> matcher(pattern_->matcher(s, status));
		// Only genuine failures abort the evaluation; ICU warning codes are
		// non-zero as well but still denote success, hence U_FAILURE.
		if(U_FAILURE(status)) {
			// Not expected for an already compiled pattern: trips the
			// assertion in debug builds, yields false otherwise.
			BOOST_ASSERT(U_SUCCESS(status));
			return Predicate::False->apply(context);
		}
		const bool matched = matcher->matches(status);
		if(U_FAILURE(status)) {
			BOOST_ASSERT(U_SUCCESS(status));
			return Predicate::False->apply(context);
		}
		// The first string that does not match decides the result.
		if(!matched) {
			return Predicate::False->apply(context);
		}
	}
	return Predicate::True->apply(context);
}
} /* end ns Wccl */
#ifndef LIBWCCL_OPS_REGEX_H
#define LIBWCCL_OPS_REGEX_H
#include <boost/shared_ptr.hpp>
#include <boost/scoped_ptr.hpp>
#include <unicode/regex.h>
#include <libwccl/exception.h>
#include <libwccl/values/bool.h>
#include <libwccl/values/strset.h>
#include <libwccl/ops/predicate.h>
namespace Wccl {
/**
 * Operator that applies a regular expression to a set of
 * strings and returns true if all of the strings match the
 * expression. An empty set of strings yields false.
 */
class Regex : public Predicate {
public:
/** Shared pointer to an expression that evaluates to a set of strings. */
typedef boost::shared_ptr<Function<StrSet> > StrSetFunctionPtr;
/**
 * Create the operator.
 * @param strset_expr expression producing the strings to be tested
 * @param patstr text of the regular expression; it is compiled during
 * construction and a RegexParseError is thrown if compilation fails
 */
Regex(const StrSetFunctionPtr& strset_expr, const UnicodeString& patstr);
/**
 * String representation of the operator in form of:
 * "regex(strset_expr_string, "regex_string")"
 * The regular expression text is emitted between double quotes,
 * without any escaping.
 */
virtual std::string to_string(const Corpus2::Tagset& tagset) const;
/**
 * String representation of the operator in form of:
 * "regex(strset_expr_raw_string, "regex_string")"
 * This version does not require a tagset, but may be incomplete
 * and/or contain internal info.
 */
virtual std::string to_raw_string() const;
/**
 * Name of the operator that does not depend on a tagset: "regex".
 */
virtual const std::string raw_operator_name() const {
return "regex";
}
protected:
typedef FunctionBase::BaseRetValPtr BaseRetValPtr;
/**
 * Get a string set from the argument expression,
 * apply regular expression to each string one by one,
 * return false if a string not matching expression is found.
 * Return true if all strings matched the regular expression.
 * Return false if the string set is empty or if matching
 * reports an error.
 */
virtual BaseRetValPtr apply_internal(const SentenceContext& context) const;
private:
// Expression that yields the set of strings to test.
const StrSetFunctionPtr strset_expr_;
// Original pattern text, kept for the string representations.
const UnicodeString patstr_;
// Pattern compiled from patstr_ in the constructor.
const boost::shared_ptr<const RegexPattern> pattern_;
};
/**
 * Error thrown when the text of a regular expression cannot be compiled.
 *
 * Derives publicly from WcclError so that handlers written for WcclError
 * also catch this error; with the default (private) inheritance of a
 * class the conversion to the base would be inaccessible to such handlers.
 */
class RegexParseError : public WcclError
{
public:
	/**
	 * @param pattern     the pattern text that failed to compile
	 * @param status_code ICU status code returned by the compilation
	 * @param parse_error ICU parse error structure filled in by the compilation
	 */
	RegexParseError(
		const UnicodeString& pattern,
		const UErrorCode& status_code,
		const UParseError& parse_error);

	/**
	 * One-line description of the failure: pattern line, offset,
	 * status name, error context and the offending expression.
	 */
	std::string info() const;

	/** Name of the ICU status code, as given by u_errorName(). */
	const std::string status;
	/** Line reported in the ICU parse error. */
	const int pattern_line;
	/** Offset reported in the ICU parse error. */
	const int offset;
	/** UTF-8 copy of the parse error's preContext field. */
	const std::string pre_context;
	/** UTF-8 copy of the parse error's postContext field. */
	const std::string error;
	/** UTF-8 copy of the pattern that failed to compile. */
	const std::string expression;
};
} /* end ns Wccl */
#endif // LIBWCCL_OPS_REGEX_H
......@@ -11,6 +11,7 @@ add_executable(tests
logicalpredicates.cpp
main.cpp
position.cpp
regex.cpp
strsetfunctions.cpp
values.cpp
varaccess.cpp
......
#include <boost/test/unit_test.hpp>
#include <boost/bind.hpp>
#include <boost/shared_ptr.hpp>
#include <libcorpus2/sentence.h>
#include <libwccl/ops/constant.h>
#include <libwccl/ops/regex.h>
#include <libwccl/values/bool.h>
#include <libwccl/values/strset.h>
#include <libwccl/sentencecontext.h>
#include <unicode/unistr.h>
using namespace Wccl;
BOOST_AUTO_TEST_SUITE(regexing)
/**
 * Test fixture providing a sentence context built over an empty
 * sentence and a default-constructed tagset.
 */
struct RegexFix
{
	SentenceContext sc;
	Corpus2::Tagset tagset;

	RegexFix()
		: sc(boost::make_shared<Corpus2::Sentence>())
		, tagset()
	{
	}
};
// A single string identical to the pattern has to be reported as a match.
BOOST_FIXTURE_TEST_CASE(positive_sanity_check, RegexFix)
{
	StrSet words;
	words.insert("word");
	Regex::StrSetFunctionPtr words_expr(new Constant<StrSet>(words));
	Regex regex_op(words_expr, "word");
	BOOST_CHECK(regex_op.apply(sc)->get_value());
}
// A pattern differing from the only string by letter case must not match.
BOOST_FIXTURE_TEST_CASE(negative_sanity_check, RegexFix)
{
	StrSet words;
	words.insert("word");
	Regex::StrSetFunctionPtr words_expr(new Constant<StrSet>(words));
	Regex regex_op(words_expr, "Word");
	BOOST_CHECK(!regex_op.apply(sc)->get_value());
}
//TODO: need more regex tests...
//------------ To string ----------
// The tagset-aware representation shows the argument set and the quoted pattern.
BOOST_FIXTURE_TEST_CASE(regex_tostring, RegexFix)
{
	StrSet words;
	words.insert("word");
	Regex::StrSetFunctionPtr words_expr(new Constant<StrSet>(words));
	Regex regex_op(words_expr, "Word");
	BOOST_CHECK_EQUAL(
		"regex([\"word\"], \"Word\")",
		regex_op.to_string(tagset));
}
// The raw representation needs no tagset and has the same shape.
BOOST_AUTO_TEST_CASE(regex_to_raw_string)
{
	StrSet words;
	words.insert("word");
	Regex::StrSetFunctionPtr words_expr(new Constant<StrSet>(words));
	Regex regex_op(words_expr, "Word");
	BOOST_CHECK_EQUAL(
		"regex([\"word\"], \"Word\")",
		regex_op.to_raw_string());
}
BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment