#include <libwccl/ops/regex.h>
#include <boost/foreach.hpp>
#define foreach BOOST_FOREACH
#include <sstream>
#include <libpwrutils/util.h>

namespace Wccl {

/**
 * Construct an error describing a failed regular expression compilation.
 *
 * @param pattern      the pattern text that was being compiled
 * @param status_code  ICU status reported by the compilation; stored as its
 *                     symbolic name obtained from u_errorName()
 * @param parse_error  ICU parse error details (line, offset, surrounding
 *                     context) filled in by the compilation
 *
 * All textual fields are converted to UTF-8 and stored. Note that the
 * `error` member holds the parse error's *post*-context, while the
 * pre-context is kept separately in `pre_context`.
 */
RegexParseError::RegexParseError(
        const UnicodeString& pattern,
        const UErrorCode& status_code,
        const UParseError& parse_error)
    : WcclError("Could not parse regular expression."),
      status(u_errorName(status_code)),
      pattern_line(parse_error.line),
      offset(parse_error.offset),
      pre_context(PwrNlp::to_utf8(UnicodeString(parse_error.preContext))),
      error(PwrNlp::to_utf8(UnicodeString(parse_error.postContext))),
      expression(PwrNlp::to_utf8(UnicodeString(pattern)))
{
}

/**
 * Human-readable description of the parse failure.
 *
 * @return a single-line message with the line and offset of the error, the
 *         ICU status name, the stored error context and the whole expression
 */
std::string RegexParseError::info() const
{
    std::stringstream ss;
    ss << "Could not parse regular expression at line " << pattern_line
       << " offset " << offset
       << ". Status: " << status
       << ". Error: " << error
       << ". Expression was: " << expression;
    return ss.str();
}

/** Destructor; declared non-throwing. */
RegexParseError::~RegexParseError() throw()
{
}

/**
 * Compile a pattern string into a shared, immutable ICU RegexPattern.
 *
 * @param pat_str  the regular expression source
 * @return shared pointer owning the compiled pattern
 * @throws RegexParseError if ICU reports a failure status
 */
boost::shared_ptr<const RegexPattern> compile_regex(const UnicodeString &pat_str)
{
    // Value-initialisation zeroes the whole parse error struct, so no
    // memset (and no <cstring>) is needed.
    UParseError error = UParseError();
    UErrorCode status = U_ZERO_ERROR;
    boost::shared_ptr<const RegexPattern> pattern(
        RegexPattern::compile(pat_str, error, status));
    // ICU warning codes are non-zero yet do not indicate failure; only
    // U_FAILURE() tells real errors apart from warnings.
    if (U_FAILURE(status)) {
        throw RegexParseError(pat_str, status, error);
    }
    return pattern;
}

/**
 * Create a regex predicate over the strings produced by an expression.
 *
 * @param strset_expr  expression yielding the set of strings to be matched;
 *                     asserted to be non-null
 * @param patstr       regular expression source; compiled immediately
 * @throws RegexParseError if the pattern does not compile (raised by
 *         compile_regex() during member initialisation)
 */
Regex::Regex(const Regex::StrSetFunctionPtr &strset_expr, const UnicodeString &patstr)
    : strset_expr_(strset_expr),
      patstr_(patstr),
      pattern_(compile_regex(patstr))
{
    BOOST_ASSERT(strset_expr_);
    BOOST_ASSERT(pattern_);
}

/**
 * String representation of this operator using names from the given tagset.
 *
 * @param tagset  tagset passed on to name() and to the argument's to_string()
 * @return text of the form  name(argument, "pattern")
 */
std::string Regex::to_string(const Corpus2::Tagset& tagset) const
{
    std::stringstream ss;
    ss << name(tagset) << "("
       << strset_expr_->to_string(tagset)
       << ", \"" << PwrNlp::to_utf8(patstr_) << "\")";
    //TODO: utf escaping? (the pattern is written verbatim between the quotes)
    return ss.str();
}

/**
 * Tagset-independent string representation of this operator.
 *
 * @return text of the form  raw_name(argument, "pattern")
 */
std::string Regex::to_raw_string() const
{
    std::stringstream ss;
    ss << raw_name() << "("
       << strset_expr_->to_raw_string()
       << ", \"" << PwrNlp::to_utf8(patstr_) << "\")";
    //TODO: utf escaping? (the pattern is written verbatim between the quotes)
    return ss.str();
}

/**
 * Evaluate the predicate in the given execution context.
 *
 * The argument expression is applied to obtain a set of strings. The result
 * is True only if the set is non-empty and every string in it matches the
 * compiled pattern in its entirety; otherwise the result is False.
 *
 * If ICU reports a failure while creating the matcher or while matching,
 * an assertion fires (when assertions are enabled) and False is returned.
 *
 * @param context  execution context passed to the argument expression and
 *                 used to produce the boolean result
 * @return Predicate::True(context) or Predicate::False(context)
 */
Regex::BaseRetValPtr Regex::apply_internal(const FunExecContext& context) const
{
    const boost::shared_ptr<const StrSet>& set = strset_expr_->apply(context);
    if (set->empty()) {
        return Predicate::False(context);
    }
    foreach (const UnicodeString& s, set->contents()) {
        UErrorCode status = U_ZERO_ERROR;
        boost::scoped_ptr<RegexMatcher> matcher(pattern_->matcher(s, status));
        // Only genuine failures abort the evaluation; ICU warnings
        // (non-zero, but not failures) are tolerated.
        if (U_FAILURE(status)) {
            BOOST_ASSERT(U_SUCCESS(status)); // deliberately trips in debug builds
            return Predicate::False(context);
        }
        const bool matched = matcher->matches(status);
        if (U_FAILURE(status)) {
            BOOST_ASSERT(U_SUCCESS(status)); // deliberately trips in debug builds
            return Predicate::False(context);
        }
        if (!matched) {
            // A single non-matching string decides the outcome.
            return Predicate::False(context);
        }
    }
    return Predicate::True(context);
}

} /* end ns Wccl */