// regex.cpp -- implementation of Wccl::RegexParseError and the Wccl::Regex operator.
#include <libwccl/ops/regex.h>
#include <boost/foreach.hpp>
#define foreach         BOOST_FOREACH

// Additional headers used by the implementation below.
#include <sstream>
#include <libpwrutils/util.h>

namespace Wccl {

/**
 * Builds an error object describing a regular expression that could not
 * be compiled.
 *
 * @param pattern      the pattern text that was being compiled; stored
 *                     as UTF-8 in `expression`
 * @param status_code  ICU status code reported for the failure; stored
 *                     as its symbolic name (u_errorName) in `status`
 * @param parse_error  ICU parse-error details; its line, offset,
 *                     preContext and postContext are copied into
 *                     `pattern_line`, `offset`, `pre_context` and `error`
 */
RegexParseError::RegexParseError(
	const UnicodeString& pattern,
	const UErrorCode& status_code,
	const UParseError& parse_error)
	: WcclError("Could not parse regular expression."),
	  status(u_errorName(status_code)),
	  pattern_line(parse_error.line),
	  offset(parse_error.offset),
	  pre_context(PwrNlp::to_utf8(UnicodeString(parse_error.preContext))),
	  // Note: `error` holds the text following the failure point (postContext).
	  error(PwrNlp::to_utf8(UnicodeString(parse_error.postContext))),
	  expression(PwrNlp::to_utf8(UnicodeString(pattern)))
{
}

std::string RegexParseError::info() const
{
	std::stringstream ss;
	ss << "Could not parse regular expression at line " << pattern_line
		<< " offset " << offset << ". Status: " << status
		<< ". Error: " << error << ". Expression was: " << expression;
	return ss.str();
}

/** Destructor; has no work of its own to do and is declared non-throwing. */
RegexParseError::~RegexParseError() throw()
{
	// Intentionally empty.
}

Adam Wardynski's avatar
Adam Wardynski committed
boost::shared_ptr<const RegexPattern> compile_regex(const UnicodeString &pat_str)
{
	UParseError error;
	memset(&error, 0, sizeof(error));
	UErrorCode status;
	memset(&status, 0, sizeof(status));
	boost::shared_ptr<const RegexPattern> pattern(RegexPattern::compile(pat_str, error, status));
	if(status != U_ZERO_ERROR)
	{
		throw RegexParseError(pat_str, status, error);
	}
	return pattern;
}

/**
 * Creates a regex operator over a string-set expression.
 *
 * @param strset_expr  expression that yields the set of strings to test
 * @param patstr       regular expression text; it is compiled here via
 *                     compile_regex, so any exception thrown by that call
 *                     propagates out of this constructor
 */
Regex::Regex(
	const Regex::StrSetFunctionPtr &strset_expr,
	const UnicodeString &patstr)
	: strset_expr_(strset_expr)
	, patstr_(patstr)
	, pattern_(compile_regex(patstr))
{
	// Both the operand expression and the compiled pattern must be set.
	BOOST_ASSERT(strset_expr_);
	BOOST_ASSERT(pattern_);
}

/**
 * Renders this operator as text using the given tagset.
 *
 * @param tagset  tagset passed on to name() and to the operand's to_string()
 * @return text of the form NAME(OPERAND, "PATTERN"), with the pattern
 *         converted to UTF-8; the pattern is inserted verbatim (no escaping)
 */
std::string Regex::to_string(const Corpus2::Tagset& tagset) const
{
	std::stringstream ss;
	ss << name(tagset) << "(" << strset_expr_->to_string(tagset)
		<< ", \"" << PwrNlp::to_utf8(patstr_) << "\")"; //TODO: utf escaping?
	return ss.str();
}

std::string Regex::to_raw_string() const {
	std::stringstream ss;
	ss << raw_name() << "(" << strset_expr_->to_raw_string()
Adam Wardynski's avatar
Adam Wardynski committed
		<< ", \"" << PwrNlp::to_utf8(patstr_) << "\")"; //TODO: utf escaping?
	return ss.str();
}

/**
 * Evaluates the operator: True when the operand's string set is non-empty
 * and every string in it matches the compiled pattern in full.
 *
 * @param context  execution context forwarded to the operand expression
 *                 and to the Predicate::True / Predicate::False factories
 * @return Predicate::False for an empty set, for any string that does not
 *         match, or when ICU reports a failure (which also trips an
 *         assertion in debug builds); Predicate::True otherwise
 */
Regex::BaseRetValPtr Regex::apply_internal(const FunExecContext& context) const {
	const boost::shared_ptr<const StrSet>& set = strset_expr_->apply(context);
	// An empty set never satisfies the predicate.
	if(set->empty()) {
		return Predicate::False(context);
	}
	foreach(const UnicodeString& s, set->contents()) {
		UErrorCode status = U_ZERO_ERROR;
		boost::scoped_ptr<RegexMatcher> matcher(pattern_->matcher(s, status));
		// Only genuine ICU failures abort; warning codes leave a usable matcher.
		if(U_FAILURE(status)) {
			BOOST_ASSERT(U_SUCCESS(status));
			return Predicate::False(context);
		}
		// matches() tests the pattern against the whole string.
		bool matched = matcher->matches(status);
		if(U_FAILURE(status)) {
			BOOST_ASSERT(U_SUCCESS(status));
			return Predicate::False(context);
		}
		// One non-matching string is enough to fail the whole set.
		if(!matched) {
			return Predicate::False(context);
		}
	}
	return Predicate::True(context);
}

} /* end ns Wccl */