header {
    #include <libwccl/parser/ParserException.h>

    #include <cstdio>
    #include <antlr/Token.hpp>
    #include <boost/lexical_cast.hpp>

    // values/variables
    #include <libwccl/variables.h>
    #include <libwccl/values/bool.h>
    #include <libwccl/values/tset.h>
    #include <libwccl/values/strset.h>
    #include <libwccl/values/position.h>
    #include <libwccl/values/positionref.h>

    // sentence context
    #include <libwccl/sentencecontext.h>

    // operators
    #include <libwccl/ops/and.h>
    #include <libwccl/ops/affix.h>
    #include <libwccl/ops/toupper.h>
    #include <libwccl/ops/tolower.h>
    #include <libwccl/ops/constant.h>
    #include <libwccl/ops/functions.h>
    #include <libwccl/ops/logicalpredicate.h>
}

options {
    language = "Cpp";
}

// ----------------------------------------------------------------------------
// ANTLR PARSER
// ----------------------------------------------------------------------------
// Parser for WCCL expressions. The rules build libwccl values and operator
// objects directly inside their actions; no AST is built (buildAST = false).
class ANTLRParser extends Parser;
options {
    k = 6;
    exportVocab = ANTLRExpr;
    buildAST = false;
    defaultErrorHandler = false;
}
{
private:
    // Returns a copy of the text stored in the given token.
    const std::string token_ref_to_std_string(antlr::RefToken& rstr)
    {
        return rstr->getText();
    }

    // Converts the text of the given token to an int using atoi.
    // NOTE(review): atoi reports neither malformed nor out-of-range input;
    // boost/lexical_cast.hpp is included above but unused here - confirm
    // whether a checked conversion was intended.
    int token_ref_to_int(antlr::RefToken& rstr)
    {
        const std::string digits = token_ref_to_std_string(rstr);
        return atoi(digits.c_str());
    }

    // Helper: wraps a copy of the pointed-to string set in a constant
    // function object. The pointer is dereferenced unconditionally, so it
    // must not be null.
    boost::shared_ptr<Wccl::Function<Wccl::StrSet> > get_str_set_expr(
        boost::shared_ptr<Wccl::StrSet> ret_str_set)
    {
        const Wccl::StrSet& contents = *ret_str_set.get();
        return boost::shared_ptr<Wccl::Function<Wccl::StrSet> >(
            new Wccl::Constant<Wccl::StrSet>(contents));
    }

    // Builds a throw-away sentence context around a default-constructed
    // (empty) sentence pointer. Used to evaluate operators at parse time.
    Wccl::SentenceContext get_tmp_context()
    {
        boost::shared_ptr<Corpus2::Sentence> no_sentence;
        return Wccl::SentenceContext(no_sentence);
    }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------------
// Start all rules
/*
start_rules
{
    std::string name = "";
}
    : values_ref  [name] { fprintf(stderr, "%s\n", name.c_str()); }
    | position_op [name] { fprintf(stderr, "%s\n", name.c_str()); }
    | filters_op  [name] { fprintf(stderr, "%s\n", name.c_str()); }
    | setvar_op   [name] { fprintf(stderr, "%s\n", name.c_str()); }
    | boolean_op  [name] { fprintf(stderr, "%s\n", name.c_str()); }
;
*/

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// "GLOBAL" RULES
// ----------------------------------------------------------------------------
// Entry rule for string operators. Parsing happens against a fresh, rule-local
// Wccl::Variables object; the value produced by string_operators is discarded
// and only the operator object is handed back.
// Returns boost::shared_ptr<Wccl::Function<Wccl::StrSet> >
parse_string_operator
    returns [boost::shared_ptr<Wccl::Function<Wccl::StrSet> > res]
{
    Wccl::Variables vars;
    boost::shared_ptr<Wccl::StrSet> evaluated;
}
    : evaluated = string_operators [vars, res]
;

// ----------------------------------------------------------------------------
// Entry rule for predicates. Parsing happens against a fresh, rule-local
// Wccl::Variables object; the value produced by predicates is discarded and
// only the predicate object is handed back.
// Returns boost::shared_ptr<Wccl::Function<Wccl::Bool> >
parse_predicates
    returns [boost::shared_ptr<Wccl::Function<Wccl::Bool> > res]
{
    Wccl::Variables vars;
    boost::shared_ptr<Wccl::Bool> evaluated;
}
    : evaluated = predicates [vars, res]
;

// ----------------------------------------------------------------------------
// Entry rule for values, parsed against a fresh, rule-local Wccl::Variables
// object.
// Returns boost::shared_ptr<Wccl::Value>
parse_values
    returns [boost::shared_ptr<Wccl::Value> ret]
{
    Wccl::Variables vars;
}
    : ret = values [vars]
;

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// All values:
// Values can be used for setvar(...,..)
// ----------------------------------------------------------------------------
// Any variable reference usable as a value: position, string set, symbol set
// or boolean. The matched variable is looked up (or created) in vars.
values [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::Value> res]
    : res = position [vars]
    | res = str_set  [vars]
    | res = sym_set  [vars]
    | res = boolean  [vars]
;

// ----------------------------------------------------------------------------
// Values reference => values + position_ref
// !! Cannot use for setvar(...,...) !!
/*
values_ref [std::string& name]
    : values [name]
    | position_ref [name]
    | boolean_ref [name]
;
*/

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Position: $0name
// ----------------------------------------------------------------------------
// TODO: The implementation does not match this correctly! Maybe an
// TODO: additional lexer rule is needed?
// NOTE(review): the lexer only tests literals in SYMBOL, so a "0" in the input
// presumably arrives as INT and never as the "0" literal - confirm.
position [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::Position> val]
    : DOLLAR "0" name_tok: SYMBOL
    {
        const std::string var_name = token_ref_to_std_string(name_tok);
        val = vars.get_put<Wccl::Position>(var_name);
    }
;

// ----------------------------------------------------------------------------
// Position reference: $(0-9)+name
// Looks up (or creates) the named Position variable and pairs it with the
// integer that precedes the name.
// !! Cannot use for setvar(...,...) !!
position_ref [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::PositionRef> val]
    : DOLLAR offset_tok: INT name_tok: SYMBOL
    {
        val.reset(
            new Wccl::PositionRef(
                vars.get_put<Wccl::Position>(token_ref_to_std_string(name_tok)),
                token_ref_to_int(offset_tok)
            )
        );
    }
;

// ----------------------------------------------------------------------------
// String set, call examples: $name, $Name, $_name, $_Name etc.
// This expression gets a variable of type StrSet by its name.
// Returns variable<StrSet> from Set-variables
str_set [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::StrSet> val]
    : DOLLAR name_tok: SYMBOL
    {
        const std::string var_name = token_ref_to_std_string(name_tok);
        val = vars.get_put<Wccl::StrSet>(var_name);
    }
;

// ----------------------------------------------------------------------------
// Symbol set: $$name
// Looks up (or creates) the TSet variable with the given name.
sym_set [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::TSet> val]
    : DOLLAR DOLLAR name_tok: SYMBOL
    {
        const std::string var_name = token_ref_to_std_string(name_tok);
        val = vars.get_put<Wccl::TSet>(var_name);
    }
;

// ----------------------------------------------------------------------------
// Bool: $?name
// Looks up (or creates) the Bool variable with the given name.
boolean [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::Bool> val]
    : DOLLAR Q_MARK name_tok: SYMBOL
    {
        const std::string var_name = token_ref_to_std_string(name_tok);
        val = vars.get_put<Wccl::Bool>(var_name);
    }
;

// Boolean $!name
/*
boolean_ref [std::string& name]
    : DOLLAR E_MARK n1: SYMBOL { name = token_ref_to_std_string(n1); }
;
*/

/////////////////////////////////////////////////////////////////////////////////////
// OPERATORS
/////////////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------------
// Positions operator
// TODO: range takes a position_ref. ?? Change it to position ??
/*
position_op [std::string& name]
{
    std::string r1, r2;
}
    : "flex" LBRACKET position_ref [name] RBRACKET
    | "range" LPAREN s: SYMBOL COMMA position_ref [r1] COMMA position_ref [r2] RPAREN {
        name = ("Range opertator from " + token_ref_to_std_string(s) + " [" + r1 + ":" + r2 + "]!");
    }
;
*/

// ----------------------------------------------------------------------------------
// Filtering operator
/*
filters_op [std::string& name]
{
    std::string p, p2, e1, e2;
}
    : "catflt" LPAREN position_ref [p] COMMA es_any [e1] COMMA es_any [e2] RPAREN {
        name = ( "Catflt operator in position " + p + " for sets " + e1 + " " + e2);
    }
    | "agrflt" LPAREN position_ref [p] COMMA position_ref [p2] COMMA es_any [e1] COMMA i: INT RPAREN {
        name = ( "Agrflt operator p1 " + p + " p2 " + p2 + " for set " + e1 + " aggr_attrs " + token_ref_to_std_string(i));
    }
;
*/

// ----------------------------------------------------------------------------------
// Setvar operator
/*
setvar_op [std::string& value]
    : setvar_pos [value]
    | setvar_bool [value]
    | setvar_sset [value]
    | setvar_tset [value]
;
*/

// setvar for a position takes a position_ref -> TODO: check why the
// grammar does not cover "setvar" LPAREN position COMMA position_v RPAREN
/*
setvar_pos [std::string& value]
    : "setvar" LPAREN position_ref [value] COMMA position_v [value] RPAREN
//  : "setvar" LPAREN position [value] COMMA position_v [value] RPAREN
;
*/

/*
setvar_bool [std::string& value]
    : "setvar" LPAREN boolean [value] COMMA boolean_v [value] RPAREN
;
*/

/*
setvar_sset [std::string& value]
    : "setvar" LPAREN str_set [value] COMMA str_set_v [value] RPAREN
;
*/

/*
setvar_tset [std::string& value]
    : "setvar" LPAREN sym_set [value] COMMA sym_set_v [value] RPAREN
;
*/

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// VALUES
///////////////////////////////////////////////////////////////////////////////
//
// ----------------------------------------------------------------------------
// Single or multiple comma-separated values of a string set.
// A fresh StrSet is allocated on entry. In the recursive alternative it is
// replaced by the set built from the tail of the list, and the leading
// element is inserted after the tail has been parsed.
str_set_v_in [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::StrSet> var]
{
    var.reset(new Wccl::StrSet);
}
    : lone_tok: STRING
    {
        const std::string lone_text = token_ref_to_std_string(lone_tok);
        var->insert(lone_text.c_str());
    }
    | head_tok: STRING COMMA var = str_set_v_in [vars]
    {
        const std::string head_text = token_ref_to_std_string(head_tok);
        var->insert(head_text.c_str());
    }
;

// String set literal, used as an unnamed (temporary) StrSet:
// calls: [] ['a'] ['a', 'b'] ["a"] ["a", "b"] ['a', "b"]
// Actually, doing nothing with vars (they are only passed through).
str_set_v [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::StrSet> val]
    : LBRACKET RBRACKET
    {
        // initialize as unnamed empty variable
        val.reset(new Wccl::StrSet);
    }
    | LBRACKET val = str_set_v_in [vars] RBRACKET
;

// ----------------------------------------------------------------------------
// element of sym set
/*
sym_set_elem_s [std::string& value]
    : s1: SYMBOL { value += token_ref_to_std_string(s1); }
    | s2: SYMBOL COMMA sym_set_elem_s [value] { value += token_ref_to_std_string(s2); }
    | s3: SYMBOL COMMA sym_set_elem_g [value] { value += token_ref_to_std_string(s3); }
;
*/

// element of sym set
/*
sym_set_elem_g [std::string& value]
    : G_MARK s1: SYMBOL G_MARK { value += token_ref_to_std_string(s1); }
    | G_MARK s2: SYMBOL G_MARK COMMA sym_set_elem_g [value] { value += token_ref_to_std_string(s2); }
    | G_MARK s3: SYMBOL G_MARK COMMA sym_set_elem_s [value] { value += token_ref_to_std_string(s3); }
;
*/

// sym set in
/*
sym_set_in [std::string& value]
    : sym_set_elem_s [value]
    | sym_set_elem_g [value]
;
*/

// sym set {} {a} {a, b}
/*
sym_set_v [std::string& value]
    : LCURLY RCURLY
    | LCURLY sym_set_in [value] RCURLY
;
*/

// ----------------------------------------------------------------------------
// boolean: the literals True / False, or a named boolean variable ($?name).
boolean_v [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::Bool> val]
    : "True"
    {
        val.reset(new Wccl::Bool(true));
    }
    | "False"
    {
        val.reset(new Wccl::Bool(false));
    }
    | val = boolean [vars]
;

//
---------------------------------------------------------------------------- // position value: /* position_v [std::string& value] : i: INT { value = token_ref_to_std_string(i); } | "begin" { value = "begin"; } | "end" { value = "end"; } | "nowhere" { value = "nowhere"; } | position [value] ; */ // ---------------------------------------------------------------------------- // internal values: /* v_literal [std::string& value] : s1: STRING { value = token_ref_to_std_string(s1); } | s2: SYMBOL { value = token_ref_to_std_string(s2); } ; */ ///////////////////////////////////////////////////////////////////////////////////// // constants // set of values /* st::shared_ptr<Wccl::StrSet> ret]s_literal [std::string& v] : LBRACKET ((v_literal[v]) (COMMA v_literal [v])*)? RBRACKET ; */ // comma-separated predicates /* seq_et [std::string& v]: et_any [v] (COMMA et_any [v])* ; */ /* es_any [Wccl::Variables& vars] returns [boost::shared_ptr<Wccl::StrSet> ret] s_literal [v] | es_op [v] ; */ /* et_bool [std::string& v]: boolean [v] | boolean_op [v] ; */ // set relations /* et_set [std::string& v] { std::string v1, v2; } : "in" LPAREN es_any [v1] COMMA es_any [v2] RPAREN { v = ("name " + v1 + " " + v2); } | "inter" LPAREN es_any [v1] COMMA es_any [v2] RPAREN { v = ("inter " + v1 + " " + v2); } | "equal" LPAREN es_any [v1] COMMA es_any [v2] RPAREN { v = ("equal " + v1 + " " + v2); } ; */ /* et_string [std::string& v] : "isbig" LPAREN es_any [v] RPAREN | "hasnum" LPAREN es_any [v] RPAREN ; */ /* et_action [std::string& v] { std::string v1, v2; } : "delete" LPAREN et_any [v] RPAREN | "select" LPAREN et_any [v] RPAREN | "relabel" LPAREN es_any [v1] COMMA et_any [v2] RPAREN { v = ("relabel " + v1 + " " + v2); } | "unify" LPAREN es_any [v1] COMMA i: INT RPAREN { v = ("relabel " + v1 + " on position " + token_ref_to_std_string(i)); } | "mark" LPAREN s1: SYMBOL RPAREN { v = ("mark " + token_ref_to_std_string(s1)); } | "unmark" LPAREN s2: SYMBOL RPAREN { v = ("unmark " + 
token_ref_to_std_string(s2)); } | "startnew" LPAREN s3: SYMBOL RPAREN { v = ("startnew " + token_ref_to_std_string(s3)); } | "lextend" LPAREN s4: SYMBOL RPAREN { v = ("lextend " + token_ref_to_std_string(s4)); } | "rextend" LPAREN s5: SYMBOL RPAREN { v = ("rextend " + token_ref_to_std_string(s5)); } ; */ /* et_iter [std::string& v] { std::string v1, v2, v3, v4; } : "only" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN | "atleast" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] COMMA i:INT RPAREN | "llook" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN | "rlook" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN | "setvar" LPAREN position_ref [v1] COMMA position_ref [v2] RPAREN | "lskip" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] COMMA et_any [v3] RPAREN | "lphrase" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] RPAREN | "rphrase" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] RPAREN | "accept" LPAREN seq_et [v1] RPAREN ; */ // predicates checking agreement /* et_agr [std::string& name] { std::string p1, p2, v; } : "agr" LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA i1: INT RPAREN | "agrpp" LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA i2: INT RPAREN | "wagr" LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA i3: INT RPAREN ; */ /* // annotation checking predicates et_annot [std::string& v] : "phrase" LPAREN position_ref [v] COMMA s: SYMBOL RPAREN ; */ /* // constraints et_any [std::string& v] : et_bool [v] | et_set [v] | et_string [v] | et_action [v] | et_iter [v] | et_agr [v] | et_annot [v] ; */ /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// // 
// String operators: return boost::shared_ptr<Wccl::Function<Wccl::StrSet> >
///////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------
// Dispatches to one concrete string operator.
//   vars - variables the operands are looked up in
//   op   - out-parameter: receives the operator object built for the match;
//          left untouched for orth/base (not implemented yet) and whenever
//          the matched rule produced no value
//   ret  - the string set produced by the matched operator rule (may be null)
string_operators
    [Wccl::Variables& vars,
     boost::shared_ptr<Wccl::Function<Wccl::StrSet> >& op]
    returns [boost::shared_ptr<Wccl::StrSet> ret]
    : ret = op_orth [vars]
    {
        // TODO: enable once op_orth produces a value:
        // op.reset(new Wccl::Orth(get_str_set_expr(ret)));
    }
    | ret = op_base [vars]
    {
        // TODO: enable once op_base produces a value:
        // op.reset(new Wccl::Base(get_str_set_expr(ret)));
    }
    | ret = op_lower [vars]
    {
        // ret is null when the operand came from a not yet implemented
        // operator (orth/base); dereferencing it would be undefined.
        if (ret) {
            op.reset(new Wccl::ToLower(get_str_set_expr(ret)));
        }
    }
    | ret = op_upper [vars]
    {
        if (ret) {
            op.reset(new Wccl::ToUpper(get_str_set_expr(ret)));
        }
    }
    | ret = op_affix [vars]
    {
        // op_affix has already applied the affix (the length is only known
        // inside that rule), so expose the result as a constant. Wrapping
        // it in ToUpper, as was done before, upper-cased every affix.
        if (ret) {
            op = get_str_set_expr(ret);
        }
    }
;

// Implementations of string operators:
// ----------------------------------------------------------------------------
// orth[position_ref] - only parsed so far; ret is never assigned.
op_orth [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::StrSet> ret]
{
    boost::shared_ptr<Wccl::PositionRef> tmpPosRef;
}
    : "orth" LBRACKET tmpPosRef = position_ref [vars] RBRACKET
    {
        // TODO
    }
;

// ----------------------------------------------------------------------------
// base[position_ref] - only parsed so far; ret is never assigned.
op_base [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::StrSet> ret]
{
    boost::shared_ptr<Wccl::PositionRef> tmpPosRef;
}
    : "base" LBRACKET tmpPosRef = position_ref [vars] RBRACKET
    {
        // TODO
    }
;

// ----------------------------------------------------------------------------
// lower(<string set variable | string set literal | string operator>)
// For a variable or literal operand the ToLower operator is applied right
// away against a temporary context. For a nested operator the inner value
// is returned unchanged and the operator object built by the nested call
// (tmp_op) is dropped.
op_lower [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::StrSet> ret]
{
    boost::shared_ptr<Wccl::StrSet> ret_str_set;
    boost::shared_ptr<Wccl::Function<Wccl::StrSet> > tmp_op;
}
    : "lower" LPAREN
        (
              ret_str_set = str_set   [vars]
            | ret_str_set = str_set_v [vars]
        )
      RPAREN
    {
        Wccl::ToLower to_lower(get_str_set_expr(ret_str_set));
        ret = to_lower.apply(get_tmp_context());
    }
    | "lower" LPAREN ret = string_operators [vars, tmp_op] RPAREN
;

// ----------------------------------------------------------------------------
// upper(<string set variable | string set literal | string operator>)
// Mirrors op_lower, using ToUpper.
op_upper [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::StrSet> ret]
{
    boost::shared_ptr<Wccl::StrSet> ret_str_set;
    boost::shared_ptr<Wccl::Function<Wccl::StrSet> > tmp_op;
}
    : "upper" LPAREN
        (
              ret_str_set = str_set   [vars]
            | ret_str_set = str_set_v [vars]
        )
      RPAREN
    {
        Wccl::ToUpper to_upper(get_str_set_expr(ret_str_set));
        ret = to_upper.apply(get_tmp_context());
    }
    | "upper" LPAREN ret = string_operators [vars, tmp_op] RPAREN
;

// ----------------------------------------------------------------------------
// affix(<string set variable | string set literal | string operator>, INT)
// Applies Wccl::Affix with the given integer to the operand, against a
// temporary context. ret stays null when the operand produced no value
// (nested orth/base, which are not implemented yet).
op_affix [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::StrSet> ret]
{
    boost::shared_ptr<Wccl::StrSet> ret_str_set;
    boost::shared_ptr<Wccl::Function<Wccl::StrSet> > tmp_op;
}
    : "affix" LPAREN
        (
              ret_str_set = str_set   [vars]
            | ret_str_set = str_set_v [vars]
            | ret_str_set = string_operators [vars, tmp_op]
        )
      COMMA p_af: INT RPAREN
    {
        // get_str_set_expr dereferences its argument, so skip evaluation
        // when there is nothing to evaluate.
        if (ret_str_set) {
            Wccl::Affix affix(get_str_set_expr(ret_str_set), token_ref_to_int(p_af));
            ret = affix.apply(get_tmp_context());
        }
    }
;

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Predicates: return boost::shared_ptr<Wccl::Function<Wccl::Bool> >
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Dispatches to the available predicate families (logical predicates only,
// for now).
//   vars - variables the operands are looked up in
//   pr   - out-parameter forwarded to logical_predicates
predicates
    [Wccl::Variables& vars,
     boost::shared_ptr<Wccl::Function<Wccl::Bool> >& pr]
    returns [boost::shared_ptr<Wccl::Bool> ret]
    : ret = logical_predicates [vars, pr]
;

// Implementations of predicates:
//
// ----------------------------------------------------------------------------
// Logical predicates (only "and" so far).
//   vars - variables the operands are looked up in
//   pr   - out-parameter meant to receive the predicate object; it is not
//          assigned yet (see the commented-out line in the action)
logical_predicates
    [Wccl::Variables& vars,
     boost::shared_ptr<Wccl::Function<Wccl::Bool> >& pr]
    returns [boost::shared_ptr<Wccl::Bool> ret]
{
    boost::shared_ptr<Wccl::LogicalPredicate::BoolFunctionPtr> v;
}
    : ret = lpred_and [vars]
    {
        // pr.reset(new Wccl::And(v));
    }
;

// ----------------------------------------------------------------------------
// and(<logical predicate> [, <logical predicate>]*)
// ret holds the value of the last operand parsed; the operator objects of
// the operands (tmpPr) are dropped.
// NOTE(review): the operands are logical_predicates, whose only alternative
// is this rule again, so there is no non-recursive operand yet - confirm
// which base predicates are planned.
lpred_and [Wccl::Variables& vars]
    returns [boost::shared_ptr<Wccl::Bool> ret]
{
    boost::shared_ptr<Wccl::Function<Wccl::Bool> > tmpPr;
}
    : "and" LPAREN
        ret = logical_predicates [vars, tmpPr]
        (COMMA ret = logical_predicates [vars, tmpPr])*
      RPAREN
;

/*
lpred_not
lpred_or
*/

/*
boolean_op [std::string& name]
    : "and" LPAREN seq_et [name] RPAREN
    | "not" LPAREN seq_et [name] RPAREN
    | "or"  LPAREN seq_et [name] RPAREN
;
*/

// ----------------------------------------------------------------------------------
// ANTLR LEXER
// ----------------------------------------------------------------------------------
// Literals are only tested inside SYMBOL (testLiterals is off globally and
// switched on for that rule alone).
class ANTLRLexer extends Lexer;
options {
    k = 2;
    exportVocab = ANTLRExpr;
    charVocabulary = '\3'..'\377';
    testLiterals = false;
}

// Single- or double-quoted string without escape sequences.
// The quote characters are discarded (the ! suffix), so the token text is
// the string content only and 'a' and "a" yield the same text. Previously
// the delimiters stayed in the text and ended up inside string set elements.
STRING
options {
    paraphrase = "a string";
}
    : '"'!  (~'"')*  '"'!
    | '\''! (~'\'')* '\''!
;

// Optionally signed run of decimal digits.
INT
options {
    paraphrase = "Integer";
}
    : ('-'|'+')? ('0'..'9')+
;

Q_MARK
options {
    paraphrase = "Query mark";
}
    : '?'
;

E_MARK
options {
    paraphrase = "Exclamanation mark";
}
    : '!'
;

G_MARK
options {
    paraphrase = "Gravis mark";
}
    : '`'
;

LBRACKET
options {
    paraphrase = "'['";
}
    : '['
;

RBRACKET
options {
    paraphrase = "']'";
}
    : ']'
;

LPAREN
options {
    paraphrase = "'('";
}
    : '('
;

RPAREN
options {
    paraphrase = "')'";
}
    : ')'
;

LCURLY
options {
    paraphrase = "'{'";
}
    : '{'
;

RCURLY
options {
    paraphrase = "'}'";
}
    : '}'
;

DOLLAR
options {
    paraphrase = "'$'";
}
    : '$'
;

AT_MARK
options {
    paraphrase = "'@'";
}
    : '@'
;

COMMA
options {
    paraphrase = "','";
}
    : ','
;

// Identifier: a letter or underscore followed by letters, digits or
// underscores. The only rule that is checked against the literals table.
SYMBOL
options {
    paraphrase = "symbol";
    testLiterals = true;
}
    : ( 'a'..'z' | 'A'..'Z' | '_' )
      ( 'a'..'z' | 'A'..'Z' | '_' | '0'..'9' )*
;

// Whitespace is skipped; line ends ("\r\n" or "\n") bump the line counter.
WS
    : ( ' '
      | '\t'
      | '\r' '\n' { newline(); }
      | '\n'      { newline(); }
      )
      { $setType(antlr::Token::SKIP); }
;

// Line comment: from "//" up to and including the terminating newline.
COMMENT
options {
    paraphrase = "Comment";
}
    : "//" (~'\n')* '\n'
      { $setType(antlr::Token::SKIP); newline(); }
;

HASH
options {
    paraphrase = "'#'";
}
    : '#'
;

DSEPARATOR
options {
    paraphrase = "':-'";
}
    : ":-"
;