header {
	//don't try to add all the headers inside our namespace
	ANTLR_END_NAMESPACE

	#include <libwccl/parser/ParserException.h>

	#include <cstdio>
	#include <antlr/Token.hpp>
	#include <boost/lexical_cast.hpp>

	// values/variables
	#include <libwccl/variables.h>
	#include <libwccl/values/bool.h>
	#include <libwccl/values/tset.h>
	#include <libwccl/values/strset.h>
	#include <libwccl/values/position.h>

	// sentence context
	#include <libwccl/sentencecontext.h>

	// operators
	#include <libwccl/ops/operator.h>
	#include <libwccl/ops/functions/constant.h>
	#include <libwccl/ops/functions/vargetter.h>
	#include <libwccl/ops/functions/conditional.h>

	#include <libwccl/ops/functions/bool/varsetter.h>
	#include <libwccl/ops/functions/bool/predicates/debug.h>
	#include <libwccl/ops/functions/bool/predicates/or.h>
	#include <libwccl/ops/functions/bool/predicates/nor.h>
	#include <libwccl/ops/functions/bool/predicates/and.h>
	#include <libwccl/ops/functions/bool/predicates/regex.h>
	#include <libwccl/ops/functions/bool/predicates/intersects.h>
	#include <libwccl/ops/functions/bool/predicates/issubsetof.h>
	#include <libwccl/ops/functions/bool/predicates/isinside.h>
	#include <libwccl/ops/functions/bool/predicates/isoutside.h>
	#include <libwccl/ops/functions/bool/predicates/equals.h>
	#include <libwccl/ops/functions/bool/predicates/weakagreement.h>
	#include <libwccl/ops/functions/bool/predicates/pointagreement.h>
	#include <libwccl/ops/functions/bool/predicates/strongagreement.h>
	#include <libwccl/ops/functions/bool/predicates/annsub.h>

	#include <libwccl/ops/functions/strset/affix.h>
	#include <libwccl/ops/functions/strset/getorth.h>
	#include <libwccl/ops/functions/strset/toupper.h>
	#include <libwccl/ops/functions/strset/tolower.h>
	#include <libwccl/ops/functions/strset/getlemmas.h>

	#include <libwccl/ops/functions/tset/agrfilter.h>
	#include <libwccl/ops/functions/tset/catfilter.h>
	#include <libwccl/ops/functions/tset/getsymbols.h>
	#include <libwccl/ops/functions/tset/getwordclass.h>
	#include <libwccl/ops/functions/tset/getsymbolsinrange.h>

	#include <libwccl/ops/functions/position/relativeposition.h>

	#include <libwccl/ops/functions/bool/iterations/only.h>
	#include <libwccl/ops/functions/bool/iterations/atleast.h>
	#include <libwccl/ops/functions/bool/iterations/leftlook.h>
	#include <libwccl/ops/functions/bool/iterations/rightlook.h>

	// Rules, actions
	#include <libwccl/ops/tagrule.h>
	#include <libwccl/ops/matchrule.h>
	#include <libwccl/ops/rulesequence.h>
	//
	#include <libwccl/ops/tagactions/unify.h>
	#include <libwccl/ops/tagactions/delete.h>
	#include <libwccl/ops/tagactions/select.h>
	#include <libwccl/ops/tagactions/relabel.h>
	#include <libwccl/ops/tagactions/mark.h>
	#include <libwccl/ops/tagactions/unmark.h>

	// Match operators
	#include <libwccl/values/tokenmatch.h>
	#include <libwccl/values/annotationmatch.h>
	#include <libwccl/values/matchvector.h>
	#include <libwccl/ops/match/applyoperator.h>
	#include <libwccl/ops/match/conditions/optionalmatch.h>
	#include <libwccl/ops/match/conditions/repeatedmatch.h>
	#include <libwccl/ops/match/conditions/conjconditions.h>
	#include <libwccl/ops/match/conditions/tokencondition.h>
	#include <libwccl/ops/match/actions/markmatch.h>
	#include <libwccl/ops/match/actions/unmarkmatch.h>
	#include <libwccl/ops/functions/match/submatch.h>

	// Unicode String
	#include <unicode/uniset.h>
	#include <unicode/unistr.h>

	// start our namespace again
	ANTLR_BEGIN_NAMESPACE(Wccl)
}

options {
	language = "Cpp";
	genHashLines = false;
	namespace = "Wccl";
//	genHashLines = true;
}

// ----------------------------------------------------------------------------
// ANTLR PARSER
// ----------------------------------------------------------------------------
class ANTLRParser extends Parser;
options {
	k = 1;
	buildAST = false;
	exportVocab = ANTLRExpr;
	defaultErrorHandler = false;
}
{
private:
	// Returns the token text decoded from UTF-8, with escape sequences
	// processed by UnicodeString::unescape().
	const UnicodeString token_ref_to_ustring(antlr::RefToken& rstr) const
	{
		return UnicodeString::fromUTF8(rstr->getText().c_str()).unescape();
	}
	/*
	const UnicodeString str_token_ref_to_ustring(antlr::RefToken& rstr) const
	{
		UnicodeString ret_ustr, ustr = token_ref_to_ustring(rstr);

		if (ustr.length() < 3) {
			return "";
		}

		ustr.extract(1, ustr.length() - 2, ret_ustr);

		return ret_ustr;
	}
	*/

	// Returns the token text with one enclosing pair of grave accents removed.
	// Text that is shorter than two characters, or that is not both prefixed
	// and suffixed with a grave accent, is returned unchanged.
	const std::string str_token_rem_grav(antlr::RefToken& rstr) const
	{
		const std::string text = token_ref_to_std_string(rstr);
		const size_t len = text.length();

		if (len >= 2 && text[0] == '`' && text[len - 1] == '`') {
			return text.substr(1, len - 2);
		}

		return text;
	}

	// Returns the raw token text.
	const std::string token_ref_to_std_string(antlr::RefToken& rstr) const
	{
		return rstr->getText();
	}

	// Converts the token text to int.
	// Throws ParserException when the text is not a well-formed integer or
	// does not fit in an int; the previous atoi-based version silently
	// returned 0 for garbage and had undefined behaviour on overflow.
	int token_ref_to_int(antlr::RefToken& rstr)
	{
		const std::string text = token_ref_to_std_string(rstr);

		try {
			return boost::lexical_cast<int>(text);
		} catch (const boost::bad_lexical_cast&) {
			throw ParserException("Invalid integer literal: " + text);
		}
	}
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// "GLOBAL" RULES
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

// ----------------------------------------------------------------------------
// Rule for parsing string set operator with scope.
// Parses one strset expression followed by EOF and wraps it, together with
// the variables gathered while parsing, in an Operator<StrSet>.
// Returns boost::shared_ptr<Operator<StrSet> >
parse_strset_operator
	[const Corpus2::Tagset &tagset]
	returns [boost::shared_ptr<Operator<StrSet> > res]
{
	Variables vars;
	boost::shared_ptr<Function<StrSet> > body;
}
	: body = strset_operator [tagset, vars] {
		res.reset(new Operator<StrSet>(body, vars));
	}
	EOF
;

// ----------------------------------------------------------------------------
// Rule for parsing bool operator with scope.
// Returns boost::shared_ptr<Operator<Bool> >
parse_bool_operator
	[const Corpus2::Tagset &tagset]
	returns [boost::shared_ptr<Operator<Bool> > res]
{
	// Variables referenced by the parsed expression end up in this scope.
	Variables scope;
	boost::shared_ptr<Function<Bool> > fun;
}
	: fun = bool_operator [tagset, scope] {
		res.reset(new Operator<Bool>(fun, scope));
	}
	EOF
;

// ----------------------------------------------------------------------------
// Entry rule: a complete symbol set expression followed by EOF.
// The parsed function and its variables are wrapped in an Operator<TSet>.
// Returns boost::shared_ptr<Operator<TSet> >
parse_symset_operator
	[const Corpus2::Tagset &tagset]
	returns [boost::shared_ptr<Operator<TSet> > res]
{
	Variables scope;
	boost::shared_ptr<Function<TSet> > fun;
}
	: fun = symset_operator [tagset, scope] {
		res.reset(new Operator<TSet>(fun, scope));
	}
	EOF
;

// ----------------------------------------------------------------------------
// Entry rule: a complete position expression followed by EOF.
// The parsed function and its variables are wrapped in an Operator<Position>.
// Returns boost::shared_ptr<Operator<Position> >
parse_position_operator
	[const Corpus2::Tagset &tagset]
	returns [boost::shared_ptr<Operator<Position> > res]
{
	Variables scope;
	boost::shared_ptr<Function<Position> > fun;
}
	: fun = position_operator [tagset, scope] {
		res.reset(new Operator<Position>(fun, scope));
	}
	EOF
;

// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// Entry rule for a single WCCL rule; delegates to "rule" with a fresh
// Variables object.
// Returns boost::shared_ptr<TagRule>
parse_single_rule
	[const Corpus2::Tagset &tagset]
	returns [boost::shared_ptr<TagRule> rle]
{
	Variables scope;
}
	: rle = rule [tagset, scope]
;

// Entry rule for the rules section of a wccl file; delegates to "rules"
// with a fresh Variables object.
// Returns boost::shared_ptr<RuleSequence>
parse_rule_sequence
	[const Corpus2::Tagset& tagset]
	returns [boost::shared_ptr<RuleSequence> rule_seq]
{
	Variables scope;
}
	: rule_seq = rules[tagset, scope]
;

// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// Rule for parsing the
// match rules
// The variable "_M" of type Match is registered in the scope before the
// match rule body is parsed.
// Returns boost::shared_ptr<MatchRule>
parse_match_rule
	[const Corpus2::Tagset& tagset]
	returns [boost::shared_ptr<MatchRule> ret_match]
{
	Variables scope;
	scope.get_put<Match>("_M");
}
	: ret_match = match_rule_operator[tagset, scope]
;

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// VALUES
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

// ----------------------------------------------------------------------------
// Single or multiple (comma separated) elements in string set, may be:
//  'a' "a" [] ['a'] ['a', 'b'] ["a"] ["a", "b"] ['a', "b"]
// Parses a strset literal and returns the plain strset value; every STRING
// token is inserted after conversion with token_ref_to_ustring.
// Returns boost::shared_ptr<StrSet>
strset_literal
	returns [boost::shared_ptr<StrSet> s_set]
{
	s_set.reset(new StrSet());
}
	: single_str: STRING {
		s_set->insert(token_ref_to_ustring(single_str));
	}
	| LBRACKET
		(
			first_str: STRING {
				s_set->insert(token_ref_to_ustring(first_str));
			}
			(
				COMMA next_str: STRING {
					s_set->insert(token_ref_to_ustring(next_str));
				}
			)*
		)?
	  RBRACKET
;

// String set value as constant string set:
// Returns boost::shared_ptr<Constant<StrSet> >
strset_value
	returns [boost::shared_ptr<Constant<StrSet> > val]
{
	boost::shared_ptr<StrSet> literal;
}
	: literal = strset_literal {
		val.reset(new Constant<StrSet>(*literal));
	}
;

// ----------------------------------------------------------------------------
// Element of sym set. This rule inserts the element into the given symbol set
// using the corresponding tagset.
// WARNING! This rule can throw ParserException (a Corpus2::TagParseError
// raised during insertion is rethrown as ParserException). Be careful!
symset_elem
	[const Corpus2::Tagset& tagset, boost::shared_ptr<TSet>& t_set]
	: sym: SYMBOL {
		const std::string symbol_name = str_token_rem_grav(sym);

		try {
			t_set->insert_symbol(tagset, symbol_name);
		}
		catch(Corpus2::TagParseError &e) {
			throw(ParserException(e.info()));
		}
	}
;

// Symset literal.
// Symset element may be:
//  a, `a ` (this is guaranteed by lexer rule - SYMBOL) or {a} {`a`} {a, b}
//  {`a`, `b`} {a, `b`} {`a`, b}
// Parses a symset literal and returns the plain symset value.
// Returns boost::shared_ptr<TSet>
symset_literal
	[const Corpus2::Tagset& tagset]
	returns [boost::shared_ptr<TSet> t_set]
{
	t_set.reset(new TSet());
}
	: symset_elem [tagset, t_set]
	| LCURLY
		(
			symset_elem [tagset, t_set]
			(COMMA symset_elem [tagset, t_set])*
		)?
	  RCURLY
;

// Symset value, as constant symbol set
// Returns boost::shared_ptr<Constant<TSet> >
symset_value
	[const Corpus2::Tagset& tagset]
	returns [boost::shared_ptr<Constant<TSet> > val]
{
	boost::shared_ptr<TSet> literal;
}
	: literal = symset_literal [tagset] {
		val.reset(new Constant<TSet>(*literal));
	}
;

// ----------------------------------------------------------------------------
// Bool literal: the keyword True or False. Parses the literal and returns
// the plain bool value.
// Returns boost::shared_ptr<Bool>
bool_literal
	returns [boost::shared_ptr<Bool> val]
	: "True"  { val.reset(new Bool(true));  }
	| "False" { val.reset(new Bool(false)); }
;

// Bool value, as constant bool Value
// Returns boost::shared_ptr<Constant<Bool> >
bool_value
	returns [boost::shared_ptr<Constant<Bool> > val]
{
	boost::shared_ptr<Bool> literal;
}
	: literal = bool_literal {
		val.reset(new Constant<Bool>(*literal));
	}
;

// ----------------------------------------------------------------------------
// Position literal may be:
//  (+|-)?(0-9)+ or begin or end or nowhere
// Parsing position literal and returning plain position value.
// returns boost::shared_ptr<Position>
position_literal
	returns [boost::shared_ptr<Position> val]
{
	int offset = 0;
}
	: offset = number { val.reset(new Position(offset)); }
	| "begin"         { val.reset(new Position(Position::Begin)); }
	| "end"           { val.reset(new Position(Position::End)); }
	| "nowhere"       { val.reset(new Position(Position::Nowhere)); }
;

// Position as constant position value
// Returns boost::shared_ptr<Constant<Position> >
position_value
	returns [boost::shared_ptr<Constant<Position> > val]
{
	boost::shared_ptr<Position> literal;
}
	: literal = position_literal {
		val.reset(new Constant<Position>(*literal));
	}
;

// ----------------------------------------------------------------------------
// Value used in match operators, such as TOK[position] and
// ANN[position, name]; the parsed match data is wrapped in a Match.
// Returns boost::shared_ptr<Match>
match_literal
	returns [boost::shared_ptr<Match> val]
{
	boost::shared_ptr<MatchData> data;
}
	: data = match_data_literal {
		val.reset(new Match(data));
	}
;

// Constant match value
// Returns boost::shared_ptr<Constant<Match> >
match_value_const
	returns [boost::shared_ptr<Constant<Match> > val]
{
	boost::shared_ptr<Match> literal;
}
	: literal = match_literal {
		val.reset(new Constant<Match>(*literal));
	}
;

// ----------------------------------------------------------------------------
// Match data: a token match, an annotation match or a match vector.
// Returns boost::shared_ptr<MatchData>
match_data_literal
	returns [boost::shared_ptr<MatchData> val]
	: val = token_match_literal
	| val = ann_match_literal
	| val = match_vector_literal
;

// token match literal - TOK[position]
// Returns boost::shared_ptr<TokenMatch>
token_match_literal
	returns [boost::shared_ptr<TokenMatch> val]
{
	boost::shared_ptr<Position> pos;
}
	: "TOK" LBRACKET pos = position_literal RBRACKET {
		val.reset(new TokenMatch(*pos));
	}
;

// annotation match literal - ANN[position, name]
// The name is taken verbatim from the STRING token text.
// Returns boost::shared_ptr<AnnotationMatch>
ann_match_literal
	returns [boost::shared_ptr<AnnotationMatch> val]
{
	boost::shared_ptr<Position> pos;
}
	: "ANN" LBRACKET pos = position_literal COMMA channel : STRING RBRACKET {
		val.reset(new AnnotationMatch(*pos, token_ref_to_std_string(channel)));
	}
;

// annotation match vector literal: MATCH() or MATCH(token, ann, MATCH())
// Returns boost::shared_ptr<MatchVector>
match_vector_literal
	returns [boost::shared_ptr<MatchVector> val]
{
	val.reset(new MatchVector());
}
	: "MATCH" LPAREN (match_vector_literal_item[val])? RPAREN
;

// Body of the MATCH value. It only appends items to the given MatchVector.
// One item or several comma separated items are accepted.
match_vector_literal_item
	[boost::shared_ptr<MatchVector>& mvector]
{
	boost::shared_ptr<Match> item;
}
	: item = match_literal {
		mvector->append(item);
	}
	(
		COMMA item = match_literal {
			mvector->append(item);
		}
	)*
;

// ----------------------------------------------------------------------------
// Number may be unsigned or signed: 1, +1, -1
number
	returns [int ret]
{
	ret = 0;
}
	: signed_tok:   SIGNED_INT   { ret = token_ref_to_int(signed_tok); }
	| unsigned_tok: UNSIGNED_INT { ret = token_ref_to_int(unsigned_tok); }
;

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// VARIABLES
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

// ----------------------------------------------------------------------------
// Position: $Name
// Calls get_put on the variables for the given name first, then creates an
// accessor for that Position variable.
// Returns boost::shared_ptr<VariableAccessor<Position> >
position_variable_acc
	[Variables& vars]
	returns [boost::shared_ptr<VariableAccessor<Position> > pos_acc]
	: POS_PREFIX n: SYMBOL {
		const std::string var_name = str_token_rem_grav(n);

		vars.get_put<Position>(var_name);
		pos_acc.reset(
			new VariableAccessor<Position>(
				vars.create_accessor<Position>(var_name)));
	}
;

// VarGetter for Position variable. This rule wraps position_variable_acc.
// Returns boost::shared_ptr<VarGetter<Position> >
position_variable
	[Variables& vars]
	returns [boost::shared_ptr<VarGetter<Position> > op]
{
	boost::shared_ptr<VariableAccessor<Position> > accessor;
}
	: accessor = position_variable_acc [vars] {
		op.reset(new VarGetter<Position>(*accessor));
	}
;

// ----------------------------------------------------------------------------
// String set, $s:name
// Calls get_put on the variables for the given name first, then creates an
// accessor for that StrSet variable.
// Returns boost::shared_ptr<VariableAccessor<StrSet> >
strset_variable_acc
	[Variables& vars]
	returns [boost::shared_ptr<VariableAccessor<StrSet> > strset_acc]
	: STR_PREFIX n: SYMBOL {
		const std::string var_name = str_token_rem_grav(n);

		vars.get_put<StrSet>(var_name);
		strset_acc.reset(
			new VariableAccessor<StrSet>(
				vars.create_accessor<StrSet>(var_name)));
	}
;

// VarGetter for StrSet variable. This rule wraps strset_variable_acc.
// Returns boost::shared_ptr<VarGetter<StrSet> >
strset_variable
	[Variables& vars]
	returns [boost::shared_ptr<VarGetter<StrSet> > op]
{
	boost::shared_ptr<VariableAccessor<StrSet> > accessor;
}
	: accessor = strset_variable_acc [vars] {
		op.reset(new VarGetter<StrSet>(*accessor));
	}
;

// ----------------------------------------------------------------------------
// Symbol set: $t:name
// Calls get_put on the variables for the given name first, then creates an
// accessor for that TSet variable.
// Returns boost::shared_ptr<VariableAccessor<TSet> >
symset_variable_acc
	[Variables& vars]
	returns [boost::shared_ptr<VariableAccessor<TSet> > symset_acc]
	: TST_PREFIX n: SYMBOL {
		const std::string var_name = str_token_rem_grav(n);

		vars.get_put<TSet>(var_name);
		symset_acc.reset(
			new VariableAccessor<TSet>(
				vars.create_accessor<TSet>(var_name)));
	}
;

// VarGetter for symbol set variable.
// This rule wraps symset_variable_acc
// Returns boost::shared_ptr<VarGetter<TSet> >
symset_variable
	[Variables& vars]
	returns [boost::shared_ptr<VarGetter<TSet> > op]
{
	boost::shared_ptr<VariableAccessor<TSet> > accessor;
}
	: accessor = symset_variable_acc [vars] {
		op.reset(new VarGetter<TSet>(*accessor));
	}
;

// ----------------------------------------------------------------------------
// Bool: $b:name
// Calls get_put on the variables for the given name first, then creates an
// accessor for that Bool variable.
// Returns boost::shared_ptr<VariableAccessor<Bool> >
bool_variable_acc
	[Variables& vars]
	returns [boost::shared_ptr<VariableAccessor<Bool> > bool_acc]
	: BOOL_PREFIX n: SYMBOL {
		const std::string var_name = str_token_rem_grav(n);

		vars.get_put<Bool>(var_name);
		bool_acc.reset(
			new VariableAccessor<Bool>(
				vars.create_accessor<Bool>(var_name)));
	}
;

// VarGetter for bool variable. It is only a wrapper for bool_variable_acc
// Returns boost::shared_ptr<VarGetter<Bool> >
bool_variable
	[Variables& vars]
	returns [boost::shared_ptr<VarGetter<Bool> > op]
{
	boost::shared_ptr<VariableAccessor<Bool> > accessor;
}
	: accessor = bool_variable_acc [vars] {
		op.reset(new VarGetter<Bool>(*accessor));
	}
;

// ----------------------------------------------------------------------------
// Match: $m:name
// Calls get_put on the variables for the given name first, then creates an
// accessor for that Match variable.
// Returns boost::shared_ptr<VariableAccessor<Match> >
match_vector_variable_acc
	[Variables& vars]
	returns [boost::shared_ptr<VariableAccessor<Match> > mvv_acc]
	: MATCH_VECTOR_PREFIX n: SYMBOL {
		const std::string var_name = str_token_rem_grav(n);

		vars.get_put<Match>(var_name);
		mvv_acc.reset(
			new VariableAccessor<Match>(
				vars.create_accessor<Match>(var_name)));
	}
;

// VarGetter for the match vector variable.
// Wrapper for match_vector_variable_acc
// Returns boost::shared_ptr<VarGetter<Match> >
match_vector_variable
	[Variables& vars]
	returns [boost::shared_ptr<VarGetter<Match> > mvv]
{
	boost::shared_ptr<VariableAccessor<Match> > accessor;
}
	: accessor = match_vector_variable_acc [vars] {
		mvv.reset(new VarGetter<Match>(*accessor));
	}
;

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// OPERATORS
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
// Symbol set (tagset) operators
// Returns boost::shared_ptr<Function<TSet> >
///////////////////////////////////////////////////////////////////////////////
symset_operator
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<TSet> > ret]
	: ret = symset_condition [tagset, vars]
	// A SYMBOL directly followed by LBRACKET is a get-symbol operator; the
	// semantic predicate separates it from a plain SYMBOL literal.
	| {LA(1)==SYMBOL && LA(2)==LBRACKET}? (ret = symset_getsymbol [tagset, vars])
	| ret = symset_var_val [tagset, vars]
	| ret = symset_class   [tagset, vars]
	| ret = symset_range   [tagset, vars]
	| ret = symset_catflt  [tagset, vars]
	| ret = symset_agrflt  [tagset, vars]
	//
	| LPAREN ret = symset_operator [tagset, vars] RPAREN
;

// ----------------------------------------------------------------------------
// Wrapper for a symset variable or a symset value.
symset_var_val
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<TSet> > op]
	: op = symset_variable [vars]
	| op = symset_value    [tagset]
;

// ----------------------------------------------------------------------------
// Condition of the symset value:
//  if (Bool, TSet, TSet)
//  ? TSet ?
//  Bool : {}
// The third argument of the "if" form is optional; without it (and in the
// question-mark form) the two-argument Conditional<TSet> is built.
symset_condition
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<TSet> > op]
{
	boost::shared_ptr<Function<Bool> > test;
	boost::shared_ptr<Function<TSet> > p_true, p_false;
}
	: "if" LPAREN
		test = bool_operator [tagset, vars] COMMA
		p_true = symset_operator [tagset, vars]
		(COMMA p_false = symset_operator [tagset, vars])?
	  RPAREN {
		op.reset(p_false
			? new Conditional<TSet>(test, p_true, p_false)
			: new Conditional<TSet>(test, p_true));
	}
	| Q_MARK
		(p_true = symset_operator [tagset, vars])
	  Q_MARK
		(test = bool_operator [tagset, vars]) {
		op.reset(new Conditional<TSet>(test, p_true));
	}
;

// ----------------------------------------------------------------------------
// GetSymbol operator may be cas, m1, f, sg...
// WARNING! This rule can throw ParserException (a Corpus2::TagParseError
// raised while parsing the symbol is rethrown as ParserException).
symset_getsymbol
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<TSet> > op]
{
	Corpus2::Tag tag;
	boost::shared_ptr<Wccl::Function<Position> > position;
}
	: t: SYMBOL LBRACKET position = position_operator [tagset, vars] RBRACKET {
		const std::string symbol_name = str_token_rem_grav(t);

		try {
			tag = tagset.parse_symbol(symbol_name);
		}
		catch(Corpus2::TagParseError &e) {
			throw(ParserException(e.info()));
		}

		op.reset(new Wccl::GetSymbols(tag, position));
	}
;

// ----------------------------------------------------------------------------
// Class operator.
symset_class
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<TSet> > ret]
{
	boost::shared_ptr<Function<Position> > position;
}
	: "class" LBRACKET position = position_operator [tagset, vars] RBRACKET {
		ret.reset(new GetWordClass(position));
	}
;

// ----------------------------------------------------------------------------
// Range operator: range(class, begin, end) or range({...}, begin, end)
// With the "class" keyword the operator is built with Corpus2::Tag(-1);
// otherwise with the value of the parsed symset literal.
symset_range
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<TSet> > ret]
{
	boost::shared_ptr<TSet> literal;
	boost::shared_ptr<Function<Position> > range_begin, range_end;
}
	: "range" LPAREN
		(literal = symset_literal [tagset] | tag_class: "class") COMMA
		range_begin = position_operator [tagset, vars] COMMA
		range_end   = position_operator [tagset, vars]
	  RPAREN {
		if (tag_class) {
			ret.reset(new GetSymbolsInRange(Corpus2::Tag(-1), range_begin, range_end));
		}
		else {
			ret.reset(new GetSymbolsInRange(literal->get_value(), range_begin, range_end));
		}
	}
;

// ----------------------------------------------------------------------------
// Catflt operator: catflt(position, selector, mask)
symset_catflt
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<TSet> > ret]
{
	boost::shared_ptr<Function<TSet> > selector, mask;
	boost::shared_ptr<Function<Position> > position;
}
	: "catflt" LPAREN
		position = position_operator [tagset, vars] COMMA
		selector = symset_operator   [tagset, vars] COMMA
		mask     = symset_operator   [tagset, vars]
	  RPAREN {
		ret.reset(new CatFilter(position, selector, mask));
	}
;

// ----------------------------------------------------------------------------
// Agrflt operator: agrflt(left position, right position, attributes, mask)
symset_agrflt
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<TSet> > ret]
{
	boost::shared_ptr<Function<TSet> > attr, mask;
	boost::shared_ptr<Function<Position> > lpos, rpos;
}
	: "agrflt" LPAREN
		lpos = position_operator [tagset, vars] COMMA
		rpos = position_operator [tagset, vars] COMMA
		attr = symset_operator   [tagset, vars] COMMA
		mask = symset_operator   [tagset, vars]
	  RPAREN {
		ret.reset(new AgrFilter(lpos, rpos, attr, mask, tagset));
	}
;

///////////////////////////////////////////////////////////////////////////////
// Position operator
// Returns boost::shared_ptr<Function<Position> >
///////////////////////////////////////////////////////////////////////////////
position_operator
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Position> > ret]
	:
	(
		  ret = position_var_val   [vars]
		| ret = position_condition [tagset, vars]
		| LPAREN ret = position_operator [tagset, vars] RPAREN
	)
	( // if there is SIGNED_INT after the position, it is actually a relative position
		shift: SIGNED_INT {
			ret.reset(new RelativePosition(ret, token_ref_to_int(shift)));
		}
	)?
;

// ----------------------------------------------------------------------------
// Wrapper for position variable and position value
position_var_val
	[Variables& vars]
	returns [boost::shared_ptr<Function<Position> > ret]
	: ret = position_value
	| ret = position_variable [vars]
;

// ----------------------------------------------------------------------------
// Condition of the position value
//  if (Bool, Position, Position)
//  ? Position ? Bool : 0
// The third argument of the "if" form is optional; without it (and in the
// question-mark form) the two-argument Conditional<Position> is built.
position_condition
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Position> > op]
{
	boost::shared_ptr<Function<Bool> > test;
	boost::shared_ptr<Function<Position> > p_true, p_false;
}
	: "if" LPAREN
		test = bool_operator [tagset, vars] COMMA
		p_true = position_operator [tagset, vars]
		(COMMA p_false = position_operator [tagset, vars])?
	  RPAREN {
		op.reset(p_false
			? new Conditional<Position>(test, p_true, p_false)
			: new Conditional<Position>(test, p_true));
	}
	| Q_MARK
		p_true = position_operator [tagset, vars]
	  Q_MARK
		test = bool_operator [tagset, vars] {
		op.reset(new Conditional<Position>(test, p_true));
	}
;

///////////////////////////////////////////////////////////////////////////////
// String operator
// Returns boost::shared_ptr<Function<StrSet> >
///////////////////////////////////////////////////////////////////////////////
strset_operator
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<StrSet> > ret]
	: ret = strset_orth      [tagset, vars]
	| ret = strset_base      [tagset, vars]
	| ret = strset_lower     [tagset, vars]
	| ret = strset_upper     [tagset, vars]
	| ret = strset_affix     [tagset, vars]
	| ret = strset_var_val   [tagset, vars]
	| ret = strset_condition [tagset, vars]
	//
	| LPAREN ret = strset_operator [tagset, vars] RPAREN
;

// ----------------------------------------------------------------------------
// Orth operator: orth[position]
strset_orth
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<StrSet> > ret]
{
	boost::shared_ptr<Function<Position> > position;
}
	: "orth" LBRACKET position = position_operator [tagset, vars] RBRACKET {
		ret.reset(new GetOrth(position));
	}
;

// ----------------------------------------------------------------------------
// Base operator: base[position], built as GetLemmas.
strset_base
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<StrSet> > ret]
{
	boost::shared_ptr<Function<Position> > position;
}
	: "base" LBRACKET position = position_operator [tagset, vars] RBRACKET {
		ret.reset(new GetLemmas(position));
	}
;

// ----------------------------------------------------------------------------
// Lower operator.
strset_lower
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<StrSet> > ret]
{
	boost::shared_ptr<Function<StrSet> > inner;
}
	: "lower" LPAREN inner = strset_operator [tagset, vars] RPAREN {
		ret.reset(new ToLower(inner));
	}
;

// ----------------------------------------------------------------------------
// Upper operator: upper(strset)
strset_upper
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<StrSet> > ret]
{
	boost::shared_ptr<Function<StrSet> > inner;
}
	: "upper" LPAREN inner = strset_operator [tagset, vars] RPAREN {
		ret.reset(new ToUpper(inner));
	}
;

// ----------------------------------------------------------------------------
// Affix operator: affix(strset, number)
strset_affix
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<StrSet> > ret]
{
	int affix_length = 0;
	boost::shared_ptr<Function<StrSet> > inner;
}
	: "affix" LPAREN
		inner = strset_operator [tagset, vars] COMMA affix_length = number
	  RPAREN {
		ret.reset(new Affix(inner, affix_length));
	}
;

// ----------------------------------------------------------------------------
// Wrapper for strset value and strset variable
strset_var_val
	[const Corpus2::Tagset& /*tagset*/, Variables& vars]
	returns [boost::shared_ptr<Function<StrSet> > op]
	: op = strset_value
	| op = strset_variable [vars]
;

// ----------------------------------------------------------------------------
// Condition of the strset value
//  if (Bool, StrSet, StrSet)
//  ? StrSet ? Bool : []
// The third argument of the "if" form is optional; without it (and in the
// question-mark form) the two-argument Conditional<StrSet> is built.
strset_condition
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<StrSet> > op]
{
	boost::shared_ptr<Function<Bool> > test;
	boost::shared_ptr<Function<StrSet> > p_true, p_false;
}
	: "if" LPAREN
		test = bool_operator [tagset, vars] COMMA
		p_true = strset_operator [tagset, vars]
		(COMMA p_false = strset_operator [tagset, vars])?
	  RPAREN {
		op.reset(p_false
			? new Conditional<StrSet>(test, p_true, p_false)
			: new Conditional<StrSet>(test, p_true));
	}
	| Q_MARK
		p_true = strset_operator [tagset, vars]
	  Q_MARK
		test = bool_operator [tagset, vars] {
		op.reset(new Conditional<StrSet>(test, p_true));
	}
;

///////////////////////////////////////////////////////////////////////////////
// Bool operator
// Returns boost::shared_ptr<Function<Bool> >
///////////////////////////////////////////////////////////////////////////////
bool_operator
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > ret]
	: ret = bool_and        [tagset, vars]
	| ret = bool_or         [tagset, vars]
	| ret = bool_nor        [tagset, vars]
	| ret = bool_var_val    [tagset, vars]
	| ret = bool_regex      [tagset, vars]
	| ret = bool_inout      [tagset, vars]
	| ret = bool_condition  [tagset, vars]
	// setvar:
	| ret = setvar_operator [tagset, vars]
	// equal/in/inter:
	| ret = equal_operator  [tagset, vars]
	| ret = in_operator     [tagset, vars]
	| ret = inter_operator  [tagset, vars]
	// iterations
	| ret = bool_iteration  [tagset, vars]
	// agreement
	| ret = bool_agreement  [tagset, vars]
	//
	| ret = bool_phrase     [tagset, vars]
	//
	| ret = bool_annsub     [tagset, vars]
	// debug operators
	| ret = debug_print_operator [tagset, vars]
	//
	| LPAREN ret = bool_operator [tagset, vars] RPAREN
;

// ----------------------------------------------------------------------------
// comma-separated predicates (bool operators); at least one is required and
// they are collected into the returned vector in source order.
bool_operator_comma_sep
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns
		[boost::shared_ptr<std::vector<boost::shared_ptr<Function<Bool> > > > ret_v]
{
	boost::shared_ptr<Function<Bool> > predicate;
	ret_v.reset(
		new std::vector<boost::shared_ptr<Function<Bool> > >
	);
}
	: predicate = bool_operator [tagset, vars] {
		ret_v->push_back(predicate);
	}
	(
		COMMA predicate = bool_operator [tagset, vars] {
			ret_v->push_back(predicate);
		}
	)*
;

// ----------------------------------------------------------------------------
// And operator.
bool_and
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr<std::vector<boost::shared_ptr<Function<Bool> > > > args;
}
	: "and" LPAREN args = bool_operator_comma_sep [tagset, vars] RPAREN {
		op.reset(new And(args));
	}
;

// ----------------------------------------------------------------------------
// Or operator: or(pred, pred, ...)
bool_or
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr<std::vector<boost::shared_ptr<Function<Bool> > > > args;
}
	: "or" LPAREN args = bool_operator_comma_sep [tagset, vars] RPAREN {
		op.reset(new Or(args));
	}
;

// ----------------------------------------------------------------------------
// Nor/Not operator: the keyword is "not", the built predicate is Nor.
bool_nor
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr<std::vector<boost::shared_ptr<Function<Bool> > > > args;
}
	: "not" LPAREN args = bool_operator_comma_sep [tagset, vars] RPAREN {
		op.reset(new Nor(args));
	}
;

// ----------------------------------------------------------------------------
// Wrapper for bool value and bool variable
bool_var_val
	[const Corpus2::Tagset& /*tagset*/, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
	: op = bool_value
	| op = bool_variable [vars]
;

// ----------------------------------------------------------------------------
// Regex operator: regex(strset, "pattern")
// The pattern is the STRING token converted with token_ref_to_ustring.
bool_regex
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr<Function<StrSet> > subject;
}
	: "regex" LPAREN
		subject = strset_operator [tagset, vars] COMMA pattern: STRING
	  RPAREN {
		op.reset(new Regex(subject, token_ref_to_ustring(pattern)));
	}
;

// ----------------------------------------------------------------------------
// Inside/outside operators: inside(position), outside(position)
bool_inout
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr<Function<Position> > position;
}
	: "inside" LPAREN position = position_operator [tagset, vars] RPAREN {
		op.reset(new IsInside(position));
	}
	| "outside" LPAREN position = position_operator [tagset, vars] RPAREN {
		op.reset(new IsOutside(position));
	}
;

// ----------------------------------------------------------------------------
//  if (Bool, Bool, Bool)
//  ? Bool ? Bool : False
// The third argument of the "if" form is optional; without it (and in the
// question-mark form) the two-argument Conditional<Bool> is built.
bool_condition
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr<Function<Bool> > test, p_true, p_false;
}
	: "if" LPAREN
		test = bool_operator [tagset, vars] COMMA
		p_true = bool_operator [tagset, vars]
		(COMMA p_false = bool_operator [tagset, vars])?
	  RPAREN {
		op.reset(p_false
			? new Conditional<Bool>(test, p_true, p_false)
			: new Conditional<Bool>(test, p_true));
	}
	| Q_MARK
		p_true = bool_operator [tagset, vars]
	  Q_MARK
		test = bool_operator [tagset, vars] {
		op.reset(new Conditional<Bool>(test, p_true));
	}
;

// ----------------------------------------------------------------------------
// Equal operator: equal(a, b)
// Syntactic predicates try the argument types in the order position,
// symset, strset; when none of them matches, bool arguments are parsed.
equal_operator
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr<Function<TSet> > tset_lhs, tset_rhs;
	boost::shared_ptr<Function<Bool> > bool_lhs, bool_rhs;
	boost::shared_ptr<Function<StrSet> > str_lhs, str_rhs;
	boost::shared_ptr<Function<Position> > pos_lhs, pos_rhs;
}
	: "equal" LPAREN
	(
		(position_operator [tagset, vars]) =>
		(
			pos_lhs = position_operator [tagset, vars] COMMA
			pos_rhs = position_operator [tagset, vars] {
				op.reset(new Equals<Position>(pos_lhs, pos_rhs));
			}
		)
	|
		(symset_operator [tagset, vars]) =>
		(
			tset_lhs = symset_operator [tagset, vars] COMMA
			tset_rhs = symset_operator [tagset, vars] {
				op.reset(new Equals<TSet>(tset_lhs, tset_rhs));
			}
		)
	|
		(strset_operator [tagset, vars]) =>
		(
			str_lhs = strset_operator [tagset, vars] COMMA
			str_rhs = strset_operator [tagset, vars] {
				op.reset(new Equals<StrSet>(str_lhs, str_rhs));
			}
		)
	|
		(
			bool_lhs = bool_operator [tagset, vars] COMMA
			bool_rhs = bool_operator [tagset, vars] {
				op.reset(new Equals<Bool>(bool_lhs, bool_rhs));
			}
		)
	)
	RPAREN
;

//
// ----------------------------------------------------------------------------
// In operator: in(a, b) builds IsSubsetOf over two symset operators or, when
// the symset predicate does not match, over two strset operators.
in_operator
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr<Function<TSet> > t1, t2;
	boost::shared_ptr<Function<StrSet> > s1, s2;
}
	: "in" LPAREN
		(
			(symset_operator [tagset, vars]) =>
			(
				t1 = symset_operator [tagset, vars] COMMA
				t2 = symset_operator [tagset, vars]
				{
					op.reset(new IsSubsetOf<TSet>(t1, t2));
				}
			)
		|
			(
				s1 = strset_operator [tagset, vars] COMMA
				s2 = strset_operator [tagset, vars]
				{
					op.reset(new IsSubsetOf<StrSet>(s1, s2));
				}
			)
		)
		RPAREN
;

// ----------------------------------------------------------------------------
// Inter operator: inter(a, b) builds Intersects over two symset operators or,
// when the symset predicate does not match, over two strset operators.
inter_operator
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr<Function<TSet> > t1, t2;
	boost::shared_ptr<Function<StrSet> > s1, s2;
}
	: "inter" LPAREN
		(
			(symset_operator [tagset, vars]) =>
			(
				t1 = symset_operator [tagset, vars] COMMA
				t2 = symset_operator [tagset, vars]
				{
					op.reset(new Intersects<TSet>(t1, t2));
				}
			)
		|
			(
				s1 = strset_operator [tagset, vars] COMMA
				s2 = strset_operator [tagset, vars]
				{
					op.reset(new Intersects<StrSet>(s1, s2));
				}
			)
		)
		RPAREN
;

// ----------------------------------------------------------------------------
// Annotation-sub operator:
//   annsub(match_from, "channel") or annsub(match_from, match_to, "channel")
bool_annsub
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr< Function<Match> > match_from;
	boost::shared_ptr< Function<Match> > match_to;
	std::string chan_name;
}
	: "annsub" LPAREN
			match_from = match_fit [tagset, vars] COMMA
			(match_to = match_fit [tagset, vars] COMMA)?
			name : STRING
		RPAREN
		{
			// Read the channel name from the matched STRING token. Previously
			// chan_name was never assigned, so AnnSub always received an empty
			// string. The token text is taken the same way the mark/unmark
			// actions in this grammar read their channel names.
			chan_name = ((antlr::Token*)name)->getText();
			// match_to stays null when the optional second match is absent.
			if (match_to) {
				op.reset(new AnnSub(match_from, match_to, chan_name));
			} else {
				op.reset(new AnnSub(match_from, chan_name));
			}
		}
;

// ----------------------------------------------------------------------------
// Debug printing: debug(<operator>) wraps any position, symset, strset or bool
// operator in a DebugPrint. Predicates are tried in that order; bool is the
// unguarded fallback.
debug_print_operator
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > ret]
{
	boost::shared_ptr<FunctionBase> v;
}
	: "debug" LPAREN
		(
			(position_operator [tagset, vars]) =>
			(
				v = position_operator [tagset, vars]
				{
					ret.reset(new DebugPrint(v));
				}
			)
		|
			(symset_operator [tagset, vars]) =>
			(
				v = symset_operator [tagset, vars]
				{
					ret.reset(new DebugPrint(v));
				}
			)
		|
			(strset_operator [tagset, vars]) =>
			(
				v = strset_operator [tagset, vars]
				{
					ret.reset(new DebugPrint(v));
				}
			)
		|
			(
				v = bool_operator [tagset, vars]
				{
					ret.reset(new DebugPrint(v));
				}
			)
		)
		RPAREN
;

// ----------------------------------------------------------------------------
// Iterations: only, atleast, llook, rlook.
// Each takes two positions, a position variable accessor and a bool operator;
// atleast additionally takes a number (min_match).
bool_iteration
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > ret]
{
	int min_match = 0;
	boost::shared_ptr<Function<Bool> > expr;
	boost::shared_ptr<Function<Position> > lpos, rpos;
	boost::shared_ptr<VariableAccessor<Position> > pacc;
}
	: "only" LPAREN
			lpos = position_operator [tagset, vars] COMMA
			rpos = position_operator [tagset, vars] COMMA
			pacc = position_variable_acc [vars] COMMA
			expr = bool_operator [tagset, vars]
		RPAREN
		{
			// The accessor is passed by value/reference, hence the dereference.
			ret.reset(new Only(lpos, rpos, *pacc, expr));
		}
	| "atleast" LPAREN
			lpos = position_operator [tagset, vars] COMMA
			rpos = position_operator [tagset, vars] COMMA
			pacc = position_variable_acc [vars] COMMA
			expr = bool_operator [tagset, vars] COMMA
			min_match = number
		RPAREN
		{
			ret.reset(new AtLeast(lpos, rpos, *pacc, expr, min_match));
		}
	| "llook" LPAREN
			// Note the inverted order: the first argument is rpos, the second lpos.
			rpos = position_operator [tagset, vars] COMMA
			lpos = position_operator [tagset, vars] COMMA
			pacc = position_variable_acc [vars] COMMA
			expr = bool_operator [tagset, vars]
		RPAREN
		{
			ret.reset(new LeftLook(lpos, rpos, *pacc, expr));
		}
	| "rlook" LPAREN
			lpos = position_operator [tagset, vars] COMMA
			rpos = position_operator [tagset, vars] COMMA
			pacc = position_variable_acc [vars] COMMA
			expr = bool_operator [tagset, vars]
		RPAREN
		{
			ret.reset(new RightLook(lpos, rpos, *pacc, expr));
		}
;

// ----------------------------------------------------------------------------
// Agreement operators: agr (StrongAgreement), agrpp (PointAgreement),
// wagr (WeakAgreement). All take two positions and a symset operator; the
// tagset is forwarded to the constructed operator.
bool_agreement
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > ret]
{
	boost::shared_ptr<Function<TSet> > expr;
	boost::shared_ptr<Function<Position> > lpos, rpos;
}
	: "agr" LPAREN
			lpos = position_operator [tagset, vars] COMMA
			rpos = position_operator [tagset, vars] COMMA
			expr = symset_operator [tagset, vars]
		RPAREN
		{
			ret.reset(new StrongAgreement(lpos, rpos, expr, tagset));
		}
	| "agrpp" LPAREN
			lpos = position_operator [tagset, vars] COMMA
			rpos = position_operator [tagset, vars] COMMA
			expr = symset_operator [tagset, vars]
		RPAREN
		{
			ret.reset(new PointAgreement(lpos, rpos, expr, tagset));
		}
	| "wagr" LPAREN
			lpos = position_operator [tagset, vars] COMMA
			rpos = position_operator [tagset, vars] COMMA
			expr = symset_operator [tagset, vars]
		RPAREN
		{
			ret.reset(new WeakAgreement(lpos, rpos, expr, tagset));
		}
;

// ----------------------------------------------------------------------------
// Parse operator on L1 level: phrase annotation or phrase iteration.
bool_phrase
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > ret]
	: ret = bool_phrase_annotation [tagset, vars]
	| ret = bool_phrase_iteration [tagset, vars]
;

// ----------------------------------------------------------------------------
// Annotation operator: phrase, phrase_beg, phrase_end, phrase_whole, phrase_pp
// NOTE: every action below is still a TODO, so the syntax is accepted but
// `ret` is never assigned by this rule.
bool_phrase_annotation
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > ret]
{
	boost::shared_ptr<Function<Position> > lpos, rpos;
}
	: "phrase" LPAREN
			lpos = position_operator [tagset, vars] COMMA
			n1: STRING
		RPAREN
		{
			// TODO
		}
	| "phrase_beg" LPAREN
			lpos = position_operator [tagset, vars] COMMA
			n2: STRING
		RPAREN
		{
			// TODO
		}
	| "phrase_end" LPAREN
			lpos = position_operator [tagset, vars] COMMA
			n3: STRING
		RPAREN
		{
			// TODO
		}
	| "phrase_whole" LPAREN
			lpos = position_operator [tagset, vars] COMMA
			rpos = position_operator [tagset, vars] COMMA
			n4: STRING
		RPAREN
		{
			// TODO
		}
	| "phrase_pp" LPAREN
			lpos = position_operator [tagset, vars] COMMA
			rpos = position_operator [tagset, vars] COMMA
			n5: STRING
		RPAREN
		{
			// TODO
		}
;

// ----------------------------------------------------------------------------
// Phrase iteration operator: lphrase, rphrase
// NOTE: both actions are still a TODO, so `ret` is never assigned by this rule.
bool_phrase_iteration
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > ret]
{
	boost::shared_ptr<Function<Position> > position;
	boost::shared_ptr<VarGetter<Position> > var_position;
}
	: "lphrase" LPAREN
			position = position_operator [tagset, vars] COMMA
			var_position = position_variable [vars] COMMA
			n1: STRING
		RPAREN
		{
			// TODO
		}
	| "rphrase" LPAREN
			position = position_operator [tagset, vars] COMMA
			var_position = position_variable [vars] COMMA
			n2: STRING
		RPAREN
		{
			// TODO
		}
;

// ----------------------------------------------------------------------------
// Setvar operator: setvar(<variable>, <operator of the same type>)
// Returns boost::shared_ptr<Function<Bool> >
// ----------------------------------------------------------------------------
setvar_operator
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > ret]
	: "setvar" LPAREN
		(
			ret = position_setvar [tagset, vars]
		| ret = bool_setvar [tagset, vars]
		| ret = strset_setvar [tagset, vars]
		| ret = symset_setvar [tagset, vars]
		)
		RPAREN
;

// ----------------------------------------------------------------------------
// Setvar for position: the "<variable>, <operator>" part inside setvar(...)
position_setvar
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr<Function<Position> > ret_op;
	boost::shared_ptr<VariableAccessor<Position> > ret_acc;
}
	: ret_acc = position_variable_acc [vars]
		COMMA
		ret_op = position_operator [tagset, vars]
		{
			op.reset(new VarSetter<Position>(*ret_acc, ret_op));
		}
;

// ----------------------------------------------------------------------------
// Setvar for bool: the "<variable>, <operator>" part inside setvar(...)
bool_setvar
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr<Function<Bool> > ret_op;
	boost::shared_ptr<VariableAccessor<Bool> > ret_acc;
}
	: ret_acc = bool_variable_acc [vars]
		COMMA
		ret_op = bool_operator [tagset, vars]
		{
			op.reset(new VarSetter<Bool>(*ret_acc, ret_op));
		}
;

// ----------------------------------------------------------------------------
// Setvar for strset: the "<variable>, <operator>" part inside setvar(...)
strset_setvar
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr<Function<StrSet> > ret_op;
	boost::shared_ptr<VariableAccessor<StrSet> > ret_acc;
}
	: ret_acc = strset_variable_acc [vars]
		COMMA
		ret_op = strset_operator [tagset, vars]
		{
			op.reset(new VarSetter<StrSet>(*ret_acc, ret_op));
		}
;

// ----------------------------------------------------------------------------
// Setvar for symset: the "<variable>, <operator>" part inside setvar(...)
symset_setvar
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Bool> > op]
{
	boost::shared_ptr<Function<TSet> > ret_op;
	boost::shared_ptr<VariableAccessor<TSet> > ret_acc;
}
	: ret_acc = symset_variable_acc [vars]
		COMMA
		ret_op = symset_operator [tagset, vars]
		{
			op.reset(new VarSetter<TSet>(*ret_acc, ret_op));
		}
;

// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// Tagging actions and rules:
// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// Single action such as select, delete, relabel or unify
action
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<TagAction> act]
	: act = action_select [tagset, vars]
	| act = action_delete [tagset, vars]
	| act = action_relabel [tagset, vars]
	//
	| act = action_unify [tagset, vars]
	//
	| act = action_mark [tagset, vars]
	| act = action_unmark [tagset, vars]
;

// Action sequence - the actions are separated with commas:
// 	select(...), select(...), delete(...)
// At least one action is required.
action_sequence
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<std::vector<boost::shared_ptr<TagAction> > > v_act]
{
	boost::shared_ptr<TagAction> act;
	// The result vector is allocated up front, before any action is parsed.
	v_act.reset(new std::vector<boost::shared_ptr<TagAction> >);
}
	: act = action[tagset, vars]
		{
			v_act->push_back(act);
		}
	(
		COMMA act = action[tagset, vars]
		{
			v_act->push_back(act);
		}
	)*
;

// ----------------------------------------------------------------------------
// Single rule:
// 	rule(NAME, ACTIONS) or rule(NAME, COND, ACTIONS)
rule
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<TagRule> rle]
{
	boost::shared_ptr<Function<Bool> > condition;
	boost::shared_ptr<std::vector<boost::shared_ptr<TagAction> > > actions;
}
	: "rule" LPAREN
			name: STRING COMMA
			(condition = bool_operator [tagset, vars] COMMA)?
			actions = action_sequence [tagset, vars]
		RPAREN
		{
			// condition stays null when the optional COND part is absent.
			if (condition) {
				rle.reset(
					new TagRule(token_ref_to_std_string(name), vars, actions, condition));
			}
			else {
				rle.reset(
					new TagRule(token_ref_to_std_string(name), vars, actions));
			}
		}
/*
	Earlier variant using a syntactic predicate, kept for reference:

	: "rule" LPAREN name: STRING COMMA
	(
		(bool_operator[tagset, vars]) =>
		(
			condition = bool_operator [tagset, vars] COMMA
			actions = action_sequence [tagset, vars] {
				// rule(NAME, COND, ACTIONS)
				rle.reset(
					new TagRule(token_ref_to_std_string(name), vars, actions, condition));
			}
		)
	|
		(
			actions = action_sequence [tagset, vars] {
				// rule(NAME, ACTIONS)
				rle.reset(new TagRule(token_ref_to_std_string(name), vars, actions));
			}
		)
	)
	RPAREN
*/
;

// Rule sequence: one or more rules separated with commas.
rule_sequence
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<RuleSequence> rule_seq]
{
	// FIXME: shouldn't this be a shared_ptr by any chance?
	// (the rules are pushed into the sequence as dereferenced TagRule objects)
	boost::shared_ptr<TagRule> rle;

	rule_seq.reset(new RuleSequence());
}
	: rle = rule [tagset, vars]
		{
			rule_seq->push_back(*rle);
		}
	(
		COMMA rle = rule [tagset, vars]
		{
			rule_seq->push_back(*rle);
		}
	)*
;

// Temporary name.
// This is a wrapper for rule_sequence, used for the rules section in the
// wccl file: rules(rule(...), rule(...), ...)
rules
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<RuleSequence> rule_seq]
	: "rules" LPAREN rule_seq = rule_sequence [tagset, vars] RPAREN
		{
			// nothing to do, rule_sequence already built the result
		}
;

// ----------------------------------------------------------------------------
// Select action:
// 	select(position, predicate) or select(predicate);
action_select
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Select> action]
{
	boost::shared_ptr<Function<Bool> > predicate;
	boost::shared_ptr<Function<Position> > where;
}
	: "select" LPAREN
		(
			(position_operator [tagset, vars]) =>
			(
				where = position_operator [tagset, vars] COMMA
				predicate = bool_operator [tagset, vars]
				{
					// select(position, predicate);
					action.reset(new Select(predicate, where));
				}
			)
		|
			(
				predicate = bool_operator [tagset, vars]
				{
					// select(predicate);
					action.reset(new Select(predicate));
				}
			)
		)
		RPAREN
;

// ----------------------------------------------------------------------------
// Delete action:
// 	delete(position, predicate) or delete(predicate);
action_delete
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Delete> action]
{
	boost::shared_ptr<Function<Bool> > predicate;
	boost::shared_ptr<Function<Position> > where;
}
	: "delete" LPAREN
		(
			(position_operator [tagset, vars]) =>
			(
				where = position_operator [tagset, vars] COMMA
				predicate = bool_operator [tagset, vars]
				{
					// delete(position, predicate);
					action.reset(new Delete(predicate, where));
				}
			)
		|
			(
				predicate = bool_operator [tagset, vars]
				{
					// delete(predicate);
					action.reset(new Delete(predicate));
				}
			)
		)
		RPAREN
;

// ----------------------------------------------------------------------------
// Relabel action:
// 	relabel(pos, symset, predicate) or relabel(symset, predicate)
action_relabel
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Relabel> action]
{
	boost::shared_ptr<Function<TSet> > new_label;
	boost::shared_ptr<Function<Bool> > predicate;
	boost::shared_ptr<Function<Position> > where;
}
	: "relabel" LPAREN
		(
			(position_operator [tagset, vars]) =>
			(
				where = position_operator [tagset, vars] COMMA
				new_label = symset_operator [tagset, vars] COMMA
				predicate = bool_operator [tagset, vars]
				{
					// relabel(pos, symset, predicate)
					action.reset(new Relabel(new_label, predicate, where));
				}
			)
		|
			(
				new_label = symset_operator [tagset, vars] COMMA
				predicate = bool_operator [tagset, vars]
				{
					// relabel(symset, predicate)
					action.reset(new Relabel(new_label, predicate));
				}
			)
		)
		RPAREN
;

// ----------------------------------------------------------------------------
// Unify action: unify(pos_begin, pos_end, symset)
action_unify
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Unify> action]
{
	boost::shared_ptr<Function<Position> > range_begin, range_end;
	boost::shared_ptr<Function<TSet> > attribs;
}
	: "unify" LPAREN
			range_begin = position_operator [tagset, vars] COMMA
			range_end = position_operator [tagset, vars] COMMA
			attribs = symset_operator [tagset, vars]
		RPAREN
		{
			action.reset(new Unify(range_begin, range_end, attribs));
		}
;

// ----------------------------------------------------------------------------
// Mark action:
// 	mark(pos_begin, pos_end, "channel") or
// 	mark(pos_begin, pos_end, pos_head, "channel")
action_mark
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Mark> action]
{
	boost::shared_ptr<Function<Position> > range_begin, range_end, head;
}
	: "mark" LPAREN
			range_begin = position_operator [tagset, vars] COMMA
			range_end = position_operator [tagset, vars] COMMA
			(head = position_operator [tagset, vars] COMMA)?
			chan_name: STRING
		RPAREN
		{
			// head is passed on as-is; it is null when the optional third
			// position was not given.
			action.reset(
				new Mark(
					range_begin,
					range_end,
					head,
					((antlr::Token*)chan_name)->getText()));
		}
;

// ----------------------------------------------------------------------------
// Unmark action: unmark(pos, "channel")
action_unmark
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Unmark> action]
{
	boost::shared_ptr<Function<Position> > where;
}
	: "unmark" LPAREN
			where = position_operator [tagset, vars] COMMA
			chan_name: STRING
		RPAREN
		{
			action.reset(
				new Unmark(where, ((antlr::Token*)chan_name)->getText()));
		}
;

// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// Match rules
// Returns boost::shared_ptr<MatchRule>
match_rule_operator
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<MatchRule> ret_op]
{
	boost::shared_ptr<ApplyOperator> apply_op;
}
	: apply_op = match_apply_operator [tagset, vars]
		{
			ret_op = boost::make_shared<MatchRule>(vars, apply_op);
		}
;

// Match apply operator:
// 	apply(match(), cond(conditions), actions(actions))
// 	apply(match(), actions(actions))
// Returns boost::shared_ptr<ApplyOperator>
match_apply_operator
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<ApplyOperator> ret_op]
{
	// Accessor for the "_M" match variable, created before parsing starts.
	VariableAccessor<Match> matches = vars.create_accessor<Match>("_M");
	boost::shared_ptr<const MatchOperator> match_op;
	boost::shared_ptr<std::vector<boost::shared_ptr<Function<Bool> > > > conditions;
	boost::shared_ptr<std::vector<boost::shared_ptr<MatchAction> > > actions;
}
	: "apply" LPAREN
			match_op = match_operator[tagset, vars] COMMA
			(
				"cond" LPAREN
					conditions = bool_operator_comma_sep [tagset, vars]
				RPAREN COMMA
			)?
			"actions" LPAREN
				actions = match_action_comma_sep [tagset, vars]
			RPAREN
		RPAREN
		{
			// conditions stays null when the optional cond(...) part is absent.
			if (conditions) {
				ret_op.reset(
					new ApplyOperator(matches, match_op, actions, conditions)
				);
			}
			else {
				ret_op.reset(
					new ApplyOperator(matches, match_op, actions)
				);
			}
		}
;

// Match operator: match(match_conditions)
// Returns boost::shared_ptr<MatchOperator>
match_operator
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<MatchOperator> op]
{
	boost::shared_ptr<ConjConditions> conj;
}
	: "match" LPAREN conj = match_condition [tagset,vars] RPAREN
		{
			op.reset(new MatchOperator(conj));
		}
;

// Match conditions. Wraps the vector of match conditions in ConjConditions.
// Returns boost::shared_ptr<ConjConditions>
match_condition
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<ConjConditions> condition]
{
	std::vector<boost::shared_ptr<const MatchCondition> > cond_list;
}
	: cond_list = match_condition_in [tagset, vars]
		{
			condition.reset(new ConjConditions(cond_list));
		}
;

// Match conditions: one or more conditions separated with commas.
// Returns std::vector< boost::shared_ptr<const MatchCondition> >
match_condition_in
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [std::vector< boost::shared_ptr<const MatchCondition> > ret]
{
	boost::shared_ptr<const MatchCondition> single;
}
	: single = match_cond_all[tagset, vars]
		{
			ret.push_back(single);
		}
	(
		COMMA single = match_cond_all[tagset, vars]
		{
			ret.push_back(single);
		}
	)*
;

// One of the match conditions: optional(...), repeat(...) or a token condition
// Returns boost::shared_ptr<const MatchCondition>
match_cond_all
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<const MatchCondition> ret]
	: ret = match_cond_optional [tagset, vars]
	| ret = match_cond_repeate [tagset, vars]
	| ret = match_cond_token [tagset, vars]
;

// Match condition - token (wraps a L0 predicate)
// Returns boost::shared_ptr<const TokenCondition>
match_cond_token
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<const TokenCondition> ret]
{
	boost::shared_ptr<Function<Bool> > predicate;
}
	: predicate = bool_operator [tagset, vars]
		{
			ret = boost::make_shared<TokenCondition>(predicate);
		}
;

// Match condition - optional: optional(match_conditions)
// Returns boost::shared_ptr<OptionalMatch>
match_cond_optional
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<OptionalMatch> mtch]
{
	boost::shared_ptr<ConjConditions> inner;
}
	: "optional" LPAREN inner = match_condition [tagset, vars] RPAREN
		{
			mtch.reset(new OptionalMatch(inner));
		}
;

// Match condition - repeat: repeat(match_conditions)
// Returns boost::shared_ptr<RepeatedMatch>
match_cond_repeate
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<RepeatedMatch> mtch]
{
	boost::shared_ptr<ConjConditions> inner;
}
	: "repeat" LPAREN inner = match_condition [tagset, vars] RPAREN
		{
			mtch.reset(new RepeatedMatch(inner));
		}
;

// ----------------------------------------------------------------------------
// Match actions. Match action can be mark or unmark
// Returns boost::shared_ptr<MatchAction>
match_action
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<MatchAction> m_act]
	: m_act = match_mark_action [tagset, vars]
	| m_act = match_unmark_action [tagset, vars]
;

// Match mark action:
// 	mark(from, "name"), mark(from, to, "name") or mark(from, to, head, "name")
// Returns boost::shared_ptr<MarkMatch>
match_mark_action
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<MarkMatch> m_act]
{
	boost::shared_ptr<Function<Match> > match_from;
	boost::shared_ptr<Function<Match> > match_to;
	boost::shared_ptr<Function<Match> > head_match;
}
	: "mark" LPAREN
			match_from = match_fit[tagset, vars] COMMA
			(
				match_to = match_fit[tagset, vars] COMMA
				(
					head_match = match_fit[tagset, vars] COMMA
				)?
			)?
			annotation_name : STRING
		RPAREN
		{
			// Pick the MarkMatch constructor by which optional matches
			// were actually parsed (unparsed ones are still null).
			if (!match_to) {
				m_act.reset(
					new MarkMatch(
						match_from,
						((antlr::Token*)annotation_name)->getText()));
			}
			else if (!head_match) {
				m_act.reset(
					new MarkMatch(
						match_from,
						match_to,
						((antlr::Token*)annotation_name)->getText()));
			}
			else {
				m_act.reset(
					new MarkMatch(
						match_from,
						match_to,
						head_match,
						((antlr::Token*)annotation_name)->getText()));
			}
		}
;

// Match unmark action: unmark(match, "name")
// Returns boost::shared_ptr<UnmarkMatch>
match_unmark_action
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<UnmarkMatch> m_act]
{
	boost::shared_ptr<Function<Match> > target;
}
	: "unmark" LPAREN
			target = match_fit[tagset, vars] COMMA
			annotation_name : STRING
		RPAREN
		{
			m_act.reset(
				new UnmarkMatch(
					target,
					((antlr::Token*)annotation_name)->getText()));
		}
;

// Match actions separated by commas (at least one is required)
// Returns boost::shared_ptr<std::vector<boost::shared_ptr<MatchAction> > >
match_action_comma_sep
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<std::vector<boost::shared_ptr<MatchAction> > > r_vec]
{
	boost::shared_ptr<MatchAction> single;

	// The result vector is allocated up front, before any action is parsed.
	r_vec.reset(
		new std::vector<boost::shared_ptr<MatchAction> >
	);
}
	: single = match_action [tagset, vars]
		{
			r_vec->push_back(single);
		}
	(
		COMMA single = match_action [tagset, vars]
		{
			r_vec->push_back(single);
		}
	)*
;

// Function<Match> is a wrapper for Constant<Match> and Function<Match>.
// Accepts a match variable/value, the keyword M (getter for the "_M"
// variable) or a parenthesised match_fit, optionally followed by
// "-> <unsigned int>" which wraps the result in a Submatch.
// Returns boost::shared_ptr<Function<Match> >
match_fit
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Match> > ret]
{
	//
}
	:
	(
		ret = match_var_val [tagset, vars]
	|
		"M"
		{
			ret.reset(new VarGetter<Match>(vars.create_accessor<Match>("_M")));
		}
	|
		LPAREN ret = match_fit [tagset, vars] RPAREN
	)
	(
		// if there's an arrow after the match, we have a submatch reference
		ARROW i: UNSIGNED_INT
		{
			ret.reset(new Submatch(ret, token_ref_to_int(i)));
		}
	)?
;

// Match vector variable or match constant.
// The tagset parameter is not used by this rule.
match_var_val
	[const Corpus2::Tagset& tagset, Variables& vars]
	returns [boost::shared_ptr<Function<Match> > ret]
	: ret = match_vector_variable [vars]
	| ret = match_value_const
;

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// ANTLR LEXER
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
class ANTLRLexer extends Lexer;
options {
	exportVocab = ANTLRExpr;
	charVocabulary = '\3'..'\377';
	testLiterals = false;
	k = 2;
}

// TODO
// String literal in double or single quotes; the delimiters are dropped from
// the token text (the `!` suffix) and line breaks are not allowed inside.
STRING
options {
	paraphrase = "a string";
}
	: '"'!
		(~('"' | '\n' | '\r'))*
		'"'!
	| '\''!
		(~('\'' | '\n' | '\r'))*
		'\''!
;

// Sign, optional spaces/tabs (dropped from the token text), then digits.
SIGNED_INT
options {
	paraphrase = "Signed integer";
}
	: ('-'|'+')
		(' '!|'\t'!)*
		('0'..'9')+
;

UNSIGNED_INT
options {
	paraphrase = "Unsigned integer";
}
	: ('0'..'9')+
;

// NOTE(review): the paraphrases of QUOT_MARK and APOS_MARK look swapped
// relative to the characters they match - confirm before changing.
QUOT_MARK
options {
	paraphrase = "Quote";
}
	: '\''
;

APOS_MARK
options {
	paraphrase = "Apostrophe";
}
	: '"'
;

Q_MARK
options {
	paraphrase = "Question mark";
}
	: '?'
;

E_MARK
options {
	paraphrase = "Exclamation mark";
}
	: '!'
;

STR_PREFIX
options {
	paraphrase = "String prefix";
}
	: "$s:"
;

TST_PREFIX
options {
	paraphrase = "Symset prefix";
}
	: "$t:"
;

BOOL_PREFIX
options {
	paraphrase = "Bool prefix";
}
	: "$b:"
;

POS_PREFIX
options {
	paraphrase = "Position prefix";
}
	: '$'
;

MATCH_VECTOR_PREFIX
options {
	paraphrase = "Match vector prefix";
}
	: "$m:"
;

LBRACKET
options {
	paraphrase = "'['";
}
	: '['
;

RBRACKET
options {
	paraphrase = "']'";
}
	: ']'
;

LPAREN
options {
	paraphrase = "'('";
}
	: '('
;

RPAREN
options {
	paraphrase = "')'";
}
	: ')'
;

LCURLY
options {
	paraphrase = "'{'";
}
	: '{'
;

RCURLY
options {
	paraphrase = "'}'";
}
	: '}'
;

AT_MARK
options {
	paraphrase = "'@'";
}
	: '@'
;

COMMA
options {
	paraphrase = "','";
}
	: ','
;

ARROW
options {
	paraphrase = "->";
}
	: "->"
;

// Identifier, either bare or enclosed in backticks. testLiterals is enabled
// for this token only (it is off at lexer level).
SYMBOL
options {
	paraphrase = "Symbol";
	testLiterals = true;
}
	: ('a'..'z' | 'A'..'Z' | '_')
		('a'..'z' | 'A'..'Z' | '_' | '0'..'9')*
	| '`'
		('a'..'z' | 'A'..'Z' | '_')
		('a'..'z' | 'A'..'Z' | '_' | '0'..'9')*
		'`'
;

// Whitespace is skipped; line breaks additionally bump the line counter.
WS
	:
	(
		' '
	| '\t'
	| '\f'
	|
		(
			"\r\n"
		| '\r'
		| '\n'
		)
		{ newline(); }
	)
	{ $setType(antlr::Token::SKIP); }
;

// Skipped: everything from "//" up to (not including) the line break.
COMMENT
options {
	paraphrase = "Single line comment";
}
	: "//"
		(~('\n'|'\r'))*
		{ $setType(antlr::Token::SKIP); }
;

// Skipped: C-style block comment; line breaks inside bump the line counter.
ML_COMMENT
options {
	paraphrase = "Multi line comment";
}
	: "/*"
	(
		// TODO: test it and add reference to the site it's taken from!
		/* This actually works OK despite the ambiguity that
		   '\r' '\n' can be matched in one alternative or by matching
		   '\r' in one iteration and '\n' in another.. But
		   this is really matched just by one rule per (...)*
		   loop iteration, so it's OK.
		   This is exactly how they do it all over the web - just
		   turn off the warning for this particular token.*/
		options {
			generateAmbigWarnings = false;
		}
		:
			// a '*' belongs to the comment body unless it starts the closing "*/"
			{ LA(2)!='/' }? '*'
		| '\r' '\n' { newline(); }
		| '\r' { newline(); }
		| '\n' { newline(); }
		| ~('*'|'\n'|'\r')
	)*
	"*/"
	{ $setType(antlr::Token::SKIP); }
;

HASH
options {
	paraphrase = "'#'";
}
	: '#'
;

//DSEPARATOR
//options {
//	paraphrase = "':-'";
//}
//	: ":-"
//;