Skip to content
Snippets Groups Projects
Commit b78e6454 authored by Adam Wardynski's avatar Adam Wardynski
Browse files

Adding Regex operator

parent 924a4b64
No related branches found
No related tags found
No related merge requests found
...@@ -30,6 +30,7 @@ SET(libwccl_STAT_SRC ...@@ -30,6 +30,7 @@ SET(libwccl_STAT_SRC
ops/nor.cpp ops/nor.cpp
ops/or.cpp ops/or.cpp
ops/predicate.cpp ops/predicate.cpp
ops/regex.cpp
ops/tolower.cpp ops/tolower.cpp
ops/toupper.cpp ops/toupper.cpp
parser/Parser.cpp parser/Parser.cpp
......
#include <libwccl/ops/regex.h>
#include <sstream>
#include <libpwrutils/util.h>
namespace Wccl {
/**
 * Build a parse error out of the diagnostics ICU reported for a pattern.
 *
 * @param pattern     the regular expression text that failed to compile
 * @param status_code ICU status code reported for the failure
 * @param parse_error ICU parse diagnostics (line, offset, pre/post context)
 */
RegexParseError::RegexParseError(
	const UnicodeString& pattern,
	const UErrorCode& status_code,
	const UParseError& parse_error)
	// The base class is constructed before any data member, regardless of
	// the order written here, so the message has to be computed from the
	// constructor arguments and not from the status/error members.
	: WcclError("Could not parse regular expression: "
			+ std::string(u_errorName(status_code)) + ":"
			+ std::string(PwrNlp::to_utf8(UnicodeString(parse_error.postContext)))),
	  status(u_errorName(status_code)),
	  pattern_line(parse_error.line),
	  offset(parse_error.offset),
	  pre_context(PwrNlp::to_utf8(UnicodeString(parse_error.preContext))),
	  error(PwrNlp::to_utf8(UnicodeString(parse_error.postContext))),
	  // Same conversion helper as the rest of this file; avoids binding a
	  // temporary std::string to toUTF8String's non-const reference parameter.
	  expression(PwrNlp::to_utf8(pattern))
{
}
std::string RegexParseError::info() const
{
std::stringstream ss;
ss << "Could not parse regular expression at line " << pattern_line
<< " offset " << offset << ". Status: " << status
<< ". Error: " << error << ". Expression was: " << expression;
return ss.str();
}
/**
 * Compile a regular expression with ICU.
 *
 * @param pat_str text of the regular expression
 * @return shared pointer owning the compiled pattern; never null
 * @throws RegexParseError if ICU reports a failure status or yields no pattern
 */
boost::shared_ptr<const RegexPattern> compile_regex(const UnicodeString &pat_str)
{
	// Value-initialization zeroes the plain ICU struct, no memset needed.
	UParseError error = UParseError();
	UErrorCode status = U_ZERO_ERROR;
	boost::shared_ptr<const RegexPattern> pattern(RegexPattern::compile(pat_str, error, status));
	// ICU warning codes are not failures, so test with U_FAILURE rather than
	// comparing against U_ZERO_ERROR. The null check guards against handing
	// out an empty pointer should ICU ever return none without a failure code.
	if (U_FAILURE(status) || !pattern) {
		throw RegexParseError(pat_str, status, error);
	}
	return pattern;
}
/**
 * Store the string set expression and the pattern text, and compile the
 * pattern once up front via compile_regex (which throws RegexParseError
 * for a pattern that cannot be compiled).
 */
Regex::Regex(const StrSetFunctionPtr& strset_expr, const UnicodeString& patstr)
	: strset_expr_(strset_expr), patstr_(patstr), pattern_(compile_regex(patstr))
{
	// Both the argument expression and the compiled pattern must be present.
	BOOST_ASSERT(strset_expr_);
	BOOST_ASSERT(pattern_);
}
/**
 * Render the operator as: name(strset_expr_string, "pattern"),
 * using the tagset-aware operator name and argument representation.
 */
std::string Regex::to_string(const Corpus2::Tagset& tagset) const
{
	std::ostringstream out;
	out << operator_name(tagset) << "(";
	out << strset_expr_->to_string(tagset);
	out << ", \"" << PwrNlp::to_utf8(patstr_) << "\")"; //TODO: utf escaping?
	return out.str();
}
/**
 * Render the operator as: name(strset_expr_raw_string, "pattern"),
 * using the raw operator name and the argument's raw representation.
 */
std::string Regex::to_raw_string() const
{
	std::ostringstream out;
	out << raw_operator_name() << "(";
	out << strset_expr_->to_raw_string();
	out << ", \"" << PwrNlp::to_utf8(patstr_) << "\")"; //TODO: utf escaping?
	return out.str();
}
/**
 * Evaluate the string set expression and match every string in the result
 * against the compiled pattern.
 *
 * @param context sentence context the argument expression is evaluated in
 * @return result of Predicate::True if the set is non-empty and every string
 *         matches the whole pattern; result of Predicate::False if the set is
 *         empty, any string does not match, or ICU reports a failure.
 */
Regex::BaseRetValPtr Regex::apply_internal(const SentenceContext& context) const {
	const boost::shared_ptr<StrSet >& set = strset_expr_->apply(context);
	if(set->empty()) {
		return Predicate::False->apply(context);
	}
	foreach(const UnicodeString& s, set->contents()) {
		UErrorCode status = U_ZERO_ERROR;
		boost::scoped_ptr<RegexMatcher> matcher(pattern_->matcher(s, status));
		// ICU warning codes are not failures: test with U_SUCCESS/U_FAILURE
		// instead of comparing against U_ZERO_ERROR. A failure is unexpected,
		// so it trips the assertion in debug builds and yields False otherwise.
		BOOST_ASSERT(U_SUCCESS(status));
		if(U_FAILURE(status) || !matcher) {
			return Predicate::False->apply(context);
		}
		const bool matched = matcher->matches(status);
		BOOST_ASSERT(U_SUCCESS(status));
		if(U_FAILURE(status) || !matched) {
			return Predicate::False->apply(context);
		}
	}
	return Predicate::True->apply(context);
}
} /* end ns Wccl */
#ifndef LIBWCCL_OPS_REGEX_H
#define LIBWCCL_OPS_REGEX_H
#include <boost/shared_ptr.hpp>
#include <boost/scoped_ptr.hpp>
#include <unicode/regex.h>
#include <libwccl/exception.h>
#include <libwccl/values/bool.h>
#include <libwccl/values/strset.h>
#include <libwccl/ops/predicate.h>
namespace Wccl {
/**
 * Operator that applies a regular expression to a set of
 * strings and returns true if all of the strings match the
 * expression. An empty set of strings yields false.
 */
class Regex : public Predicate {
public:
typedef boost::shared_ptr<Function<StrSet> > StrSetFunctionPtr;
/**
 * Create the operator from an expression producing a set of strings
 * and the text of a regular expression. The expression is compiled
 * during construction; RegexParseError is thrown if that fails.
 */
Regex(const StrSetFunctionPtr& strset_expr, const UnicodeString& patstr);
/**
 * String representation of the operator in form of:
 * regex(strset_expr_string, "regex_string")
 */
virtual std::string to_string(const Corpus2::Tagset& tagset) const;
/**
 * String representation of the regex operator in form of:
 * regex(strset_expr_raw_string, "regex_string")
 * This version does not require a tagset, but may be incomplete
 * and/or contain internal info.
 */
virtual std::string to_raw_string() const;
/**
 * Name of the operator used in the raw string representation.
 */
virtual const std::string raw_operator_name() const {
return "regex";
}
protected:
typedef FunctionBase::BaseRetValPtr BaseRetValPtr;
/**
 * Get a string set from the argument expression,
 * apply the regular expression to each string one by one,
 * return false if a string not matching the expression is found
 * or if the set is empty.
 * Return true if all strings matched the regular expression.
 */
virtual BaseRetValPtr apply_internal(const SentenceContext& context) const;
private:
// Expression that yields the set of strings to be matched.
const StrSetFunctionPtr strset_expr_;
// Original pattern text, kept for the string representations.
const UnicodeString patstr_;
// Pattern compiled once at construction from patstr_.
const boost::shared_ptr<const RegexPattern> pattern_;
};
/**
 * Exception thrown when a regular expression cannot be compiled.
 * Carries the diagnostics reported by ICU for the failed pattern.
 *
 * Derives publicly from WcclError: with the default (private) inheritance
 * of a class, handlers catching WcclError would not catch this exception.
 */
class RegexParseError : public WcclError
{
public:
	/**
	 * @param pattern     the regular expression text that failed to compile
	 * @param status_code ICU status code reported for the failure
	 * @param parse_error ICU parse diagnostics (line, offset, pre/post context)
	 */
	RegexParseError(
		const UnicodeString& pattern,
		const UErrorCode& status_code,
		const UParseError& parse_error);

	/**
	 * One-line description with the position, status, error context
	 * and the offending expression.
	 */
	std::string info() const;

	/** Name of the ICU status code. */
	const std::string status;
	/** Line of the pattern reported by ICU. */
	const int pattern_line;
	/** Offset within the pattern reported by ICU. */
	const int offset;
	/** UTF-8 text of the ICU parse error's preContext field. */
	const std::string pre_context;
	/** UTF-8 text of the ICU parse error's postContext field. */
	const std::string error;
	/** The pattern text, as UTF-8. */
	const std::string expression;
};
} /* end ns Wccl */
#endif // LIBWCCL_OPS_REGEX_H
...@@ -11,6 +11,7 @@ add_executable(tests ...@@ -11,6 +11,7 @@ add_executable(tests
logicalpredicates.cpp logicalpredicates.cpp
main.cpp main.cpp
position.cpp position.cpp
regex.cpp
strsetfunctions.cpp strsetfunctions.cpp
values.cpp values.cpp
varaccess.cpp varaccess.cpp
......
#include <boost/test/unit_test.hpp>
#include <boost/bind.hpp>
#include <boost/shared_ptr.hpp>
#include <libcorpus2/sentence.h>
#include <libwccl/ops/constant.h>
#include <libwccl/ops/regex.h>
#include <libwccl/values/bool.h>
#include <libwccl/values/strset.h>
#include <libwccl/sentencecontext.h>
#include <unicode/unistr.h>
using namespace Wccl;
BOOST_AUTO_TEST_SUITE(regexing)
/**
 * Test fixture providing a sentence context built on a
 * default-constructed sentence, and a default-constructed tagset.
 */
struct RegexFix
{
	SentenceContext sc;
	Corpus2::Tagset tagset;

	RegexFix()
		: sc(boost::make_shared<Corpus2::Sentence>()),
		  tagset()
	{
	}
};
// A pattern identical to the only string in the set is expected to match.
BOOST_FIXTURE_TEST_CASE(positive_sanity_check, RegexFix)
{
	StrSet words;
	words.insert("word");
	boost::shared_ptr<Function<StrSet> > words_expr(new Constant<StrSet>(words));

	Regex r(words_expr, "word");

	BOOST_CHECK(r.apply(sc)->get_value());
}
// A pattern differing from the only string by letter case is expected not to match.
BOOST_FIXTURE_TEST_CASE(negative_sanity_check, RegexFix)
{
	StrSet words;
	words.insert("word");
	boost::shared_ptr<Function<StrSet> > words_expr(new Constant<StrSet>(words));

	Regex r(words_expr, "Word");

	BOOST_CHECK(!r.apply(sc)->get_value());
}
//TODO: need more regex tests...
//------------ To string ----------
// The tagset-aware representation lists the argument set and the quoted pattern.
BOOST_FIXTURE_TEST_CASE(regex_tostring, RegexFix)
{
	StrSet words;
	words.insert("word");
	boost::shared_ptr<Function<StrSet> > words_expr(new Constant<StrSet>(words));

	Regex r(words_expr, "Word");

	BOOST_CHECK_EQUAL("regex([\"word\"], \"Word\")", r.to_string(tagset));
}
// The raw representation needs no tagset and has the same shape.
BOOST_AUTO_TEST_CASE(regex_to_raw_string)
{
	StrSet words;
	words.insert("word");
	boost::shared_ptr<Function<StrSet> > words_expr(new Constant<StrSet>(words));

	Regex r(words_expr, "Word");

	BOOST_CHECK_EQUAL("regex([\"word\"], \"Word\")", r.to_raw_string());
}
BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment