diff --git a/libwccl/CMakeLists.txt b/libwccl/CMakeLists.txt
index 3ef77731233a37d237f00bafe6aca4d3cd28ab19..05affc139f1d602eebdd265cc90cfbc3564dd6c6 100644
--- a/libwccl/CMakeLists.txt
+++ b/libwccl/CMakeLists.txt
@@ -45,6 +45,7 @@ SET(libwccl_STAT_SRC
 	ops/functions/bool/predicates/regex.cpp
 	ops/functions/bool/predicates/strongagreement.cpp
 	ops/functions/bool/predicates/weakagreement.cpp
+	ops/functions/match/submatch.cpp
 	ops/functions/position/firsttoken.cpp
 	ops/functions/position/lasttoken.cpp
 	ops/functions/position/relativeposition.cpp
diff --git a/libwccl/ops/functions/match/submatch.cpp b/libwccl/ops/functions/match/submatch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3b25a0ea3f74bf908ee25ca5c5c8913d45ed1996
--- /dev/null
+++ b/libwccl/ops/functions/match/submatch.cpp
@@ -0,0 +1,51 @@
+#include <libwccl/ops/functions/match/submatch.h>
+#include <libwccl/ops/functions/vargetter.h>
+
+namespace Wccl {
+
+Submatch::BaseRetValPtr Submatch::apply_internal(
+	const FunExecContext &context) const
+{
+	const RetValPtr& match = match_expr_->apply(context); // evaluate the argument expression
+	const MatchData& md = match->get_value();
+	return RetValPtr(md.submatch(index_ )); // index_ is 1-based (checked in the constructor)
+}
+
+std::string Submatch::to_string(const Corpus2::Tagset &tagset) const
+{
+	std::ostringstream ss;
+	boost::shared_ptr<VarGetter<Match> > getvar =
+		boost::dynamic_pointer_cast<VarGetter<Match> >(match_expr_);
+	if (!getvar) { // not a variable getter: print the expression unchanged
+		ss << match_expr_->to_string(tagset);
+	} else {
+		std::string s = match_expr_->to_string(tagset);
+		if (s == Match::var_repr("_M")) { // the "_M" variable is written back as "M"
+			ss << "M";
+		} else {
+			ss << s;
+		}
+	}
+	ss << " " << name(tagset) << " " << index_;
+	return ss.str();
+}
+
+std::ostream& Submatch::write_to(std::ostream& ostream) const
+{
+	boost::shared_ptr<VarGetter<Match> > getvar =
+		boost::dynamic_pointer_cast<VarGetter<Match> >(match_expr_);
+	if (!getvar) { // not a variable getter: stream the expression unchanged
+		ostream << *match_expr_;
+	} else {
+		std::string s = match_expr_->to_raw_string();
+		if (s == Match::var_repr("_M")) { // the "_M" variable is written back as "M"
+			ostream << "M";
+		} else {
+			ostream << s;
+		}
+	}
+	ostream << " " << raw_name() << " " << index_;
+	return ostream;
+}
+
+} /* end of namespace Wccl */
diff --git a/libwccl/ops/functions/match/submatch.h b/libwccl/ops/functions/match/submatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2eb03137225dd179131fd0d31c241211774f022
--- /dev/null
+++ b/libwccl/ops/functions/match/submatch.h
@@ -0,0 +1,65 @@
+#ifndef LIBWCCL_OPS_FUNCTIONS_MATCH_SUBMATCH_H
+#define LIBWCCL_OPS_FUNCTIONS_MATCH_SUBMATCH_H
+
+#include <libwccl/ops/function.h>
+#include <libwccl/values/match.h>
+
+
+namespace Wccl {
+
+/**
+ * Operator written as "<match> -> <index>": takes a Match and returns
+ * its submatch at the given index. Indices start at 1, and the operator
+ * works only on Matches that hold a MatchVector.
+ */
+class Submatch : public Function<Match>
+{
+public:
+	typedef boost::shared_ptr<Function<Match> > MatchFunctionPtr;
+
+	Submatch(const MatchFunctionPtr& match_expr, size_t index) // throws InvalidArgument for index 0
+		: match_expr_(match_expr),
+		  index_(index)
+	{
+		BOOST_ASSERT(match_expr_); // the match expression must not be null
+		if (index < 1) {
+			throw InvalidArgument("index", "Submatch indices start from 1.");
+		}
+	}
+
+	/**
+	 * @returns String representation: match expression, operator name, index.
+	 */
+	std::string to_string(const Corpus2::Tagset& tagset) const;
+
+	/**
+	 * @returns Name of the function, which is the operator symbol "->".
+	 */
+	std::string raw_name() const {
+		return "->";
+	}
+
+protected:
+	const MatchFunctionPtr match_expr_;
+	const size_t index_;
+
+	/**
+	 * Writes the raw string representation (match expression, "->", index).
+	 * @returns Stream written to.
+	 */
+	std::ostream& write_to(std::ostream& ostream) const;
+
+	/**
+	 * Takes the value of a Match from the argument expression and returns
+	 * the submatch at the given index. Works only if Match has a MatchVector.
+	 * @throws WcclError if the given Match does not hold a MatchVector, or
+	 * if the index is outside the boundaries of the MatchVector.
+	 * @returns Match that is in the underlying MatchVector at the given index
+	 * (note: indexing starts from 1, not from 0).
+	 */
+	BaseRetValPtr apply_internal(const FunExecContext& context) const;
+};
+
+} /* end of namespace Wccl */
+
+#endif // LIBWCCL_OPS_FUNCTIONS_MATCH_SUBMATCH_H
diff --git a/libwccl/parser/grammar.g b/libwccl/parser/grammar.g
index 2756b82a8b5f203380daa0a444e9dcbf1f3c3249..2cdb3bc90598039bd71e3bc20073a3945041d141 100644
--- a/libwccl/parser/grammar.g
+++ b/libwccl/parser/grammar.g
@@ -82,6 +82,7 @@ header {
 	#include <libwccl/ops/match/conditions/tokencondition.h>
 	#include <libwccl/ops/match/actions/markmatch.h>
 	#include <libwccl/ops/match/actions/unmarkmatch.h>
+	#include <libwccl/ops/functions/match/submatch.h>
 
 	// Unicode String
 	#include <unicode/uniset.h>
@@ -2050,6 +2051,19 @@ match_fit
 {
 	//
 }
+	:
+	( ret = match_var_val [tagset, vars]
+	| "M" { ret.reset(new VarGetter<Match>(vars.create_accessor<Match>("_M"))); }
+	| LPAREN ret = match_fit [tagset, vars] RPAREN
+	)
+	( // an arrow after the match turns it into a submatch reference: <match> -> <index>
+		ARROW i: UNSIGNED_INT { ret.reset(new Submatch(ret, token_ref_to_int(i))); }
+	)?
+;
+
+match_var_val
+	[const Corpus2::Tagset& tagset, Variables& vars]
+	returns [boost::shared_ptr<Function<Match> > ret]
 	: ret = match_vector_variable [vars]
 	| ret = match_value_const
 ;
@@ -2211,6 +2225,13 @@ options {
 	: ','
 ;
 
+ARROW
+options {
+	paraphrase = "->";
+}
+	: "->"
+;
+
 SYMBOL
 options {
 	paraphrase = "Symbol";
diff --git a/libwccl/values/matchdata.h b/libwccl/values/matchdata.h
index 1062c42792ba92e2fb82c0e63aa279db0bea2d6b..56f094bdb892d436905c250376aaf3d657f43543 100644
--- a/libwccl/values/matchdata.h
+++ b/libwccl/values/matchdata.h
@@ -2,10 +2,12 @@
 #define LIBWCCL_VALUES_MATCHDATA_H
 
 #include <libwccl/values/position.h>
+#include <libwccl/exception.h>
 #include <libcorpus2/ann/annotatedsentence.h>
 
 namespace Wccl {
 
+class Match;
 /**
  * Base abstract class for data held by a Match Value
  * - VectorMatch, TokenMatch or AnnotationMatch.
@@ -32,6 +34,26 @@ public:
 	 */
 	virtual Position last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const = 0;
 
+	/**
+	 * Getter for a submatch at given index (indexing starts from 1).
+	 * Const version; returned by value because implementations convert
+	 * their stored pointer to a pointer-to-const, which yields a temporary.
+	 * @throws WcclError always in this base implementation - getting
+	 * a submatch is possible only for a MatchVector.
+	 */
+	virtual boost::shared_ptr<const Match> submatch(size_t idx) const {
+		throw WcclError("Getting a submatch is possible only for a MatchVector.");
+	}
+
+	/**
+	 * Getter for a submatch at given index (indexing starts from 1).
+	 * @throws WcclError always in this base implementation - getting
+	 * a submatch is possible only for a MatchVector.
+	 */
+	virtual const boost::shared_ptr<Match>& submatch(size_t idx) {
+		throw WcclError("Getting a submatch is possible only for a MatchVector.");
+	}
+
 	boost::shared_ptr<MatchData> clone() const {
 		return boost::shared_ptr<MatchData>(clone_internal());
 	}
diff --git a/libwccl/values/matchvector.cpp b/libwccl/values/matchvector.cpp
index 6ad60968f7016690cbd30aeeab5d341b6fe2407f..d53c3120324a5fad020d385ded734d98e8c87483 100644
--- a/libwccl/values/matchvector.cpp
+++ b/libwccl/values/matchvector.cpp
@@ -89,10 +89,24 @@ void MatchVector::append(const boost::shared_ptr<MatchData> &m)
 	matches_.push_back(boost::shared_ptr<Match>(new Match(m)));
 }
 
+// Const overload. Returns by value: converting the stored
+// shared_ptr<Match> to shared_ptr<const Match> creates a temporary,
+// so a reference to it would dangle as soon as the function returns.
+boost::shared_ptr<const Match> MatchVector::submatch(size_t idx) const {
+	// Indices are 1-based: the valid range is [1, matches_.size()].
+	if (idx >= 1 && idx <= matches_.size()) {
+		return matches_[idx - 1];
+	} else {
+		throw Wccl::WcclError("Match vector index out of range");
+	}
+}
+
 const boost::shared_ptr<Match>& MatchVector::submatch(size_t idx)
 {
-	if (idx < matches_.size()) {
-		return matches_[idx];
+	// Indices are 1-based: the valid range is [1, matches_.size()].
+	// Rejecting 0 up front also keeps idx - 1 from wrapping around.
+	if (idx >= 1 && idx <= matches_.size()) {
+		return matches_[idx - 1];
 	} else {
 		throw Wccl::WcclError("Match vector index out of range");
 	}
diff --git a/libwccl/values/matchvector.h b/libwccl/values/matchvector.h
index 795ef86fa595d737941187d8441e1b69201b11e7..b465cd120302939ffea7478555b0db2ba4a46b47 100644
--- a/libwccl/values/matchvector.h
+++ b/libwccl/values/matchvector.h
@@ -47,17 +47,19 @@ public:
 	}
 
 	/**
-	 * Submatch accesor with bounds checking, throws if out of bounds
+	 * Submatch accessor with bounds checking, throws if out of bounds.
+	 * @note Indexing is assumed from 1.
 	 */
 	const boost::shared_ptr<Match>& submatch(size_t idx);
 
 	/**
-	 * Submatch indexing operator. Per C++ container tradition, no bounds
-	 * checking is done.
+	 * Submatch accessor with bounds checking, throws if out of bounds.
+	 * Const version; returns the pointer by value, since the conversion
+	 * to a pointer-to-const produces a temporary that cannot be
+	 * returned by reference.
+	 * @note Indexing is assumed from 1.
 	 */
-	const boost::shared_ptr<Match>& operator[](size_t idx) const {
-		return matches_[idx];
-	}
+	boost::shared_ptr<const Match> submatch(size_t idx) const;
 
 	void clear() {
 		matches_.clear();