header {
    #include "ParserException.h" // libwccl
    #include <cstdio>
    #include <cstdlib>
    #include <string>
    #include <antlr/Token.hpp>
    #include <boost/lexical_cast.hpp>
}

options {
    language = "Cpp";
}

// ----------------------------------------------------------------------------------
// ANTLR PARSER
// ----------------------------------------------------------------------------------
class ANTLRParser extends Parser;
options {
    k = 6;
    exportVocab = ANTLRExpr;
    buildAST = false;
    defaultErrorHandler = false;
}
{
private:
    // Returns a copy of the text stored in the given token.
    std::string token_ref_to_std_string(antlr::RefToken& rstr) {
        return rstr->getText();
    }

    // Converts the text of the given token with std::atoi, so text that
    // does not start with a number yields 0 rather than an error.
    int token_ref_to_int(antlr::RefToken& rstr) {
        return std::atoi(this->token_ref_to_std_string(rstr).c_str());
    }
}

/////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////

// ----------------------------------------------------------------------------------
// Start all rules
// Tries each top-level construct in turn and prints the string that the matched
// sub-rule stored in `name` to stderr, followed by a newline.
start_rules
{
    std::string name = "";
}
    : ( values_ref       [name]
      | position_op      [name]
      | filters_op       [name]
      | setvar_op        [name]
      | string_operators [name]
      | boolean_op       [name]
      )
      {
        fprintf(stderr, "%s\n", name.c_str());
      }
;

// ---------------------------------------------------------------------------------
// Values:
// Values can be used in setvar(..., ...).
// Each alternative stores the matched variable name in `name`.
values
    [std::string& name]
    : position [name]
    | str_set  [name]
    | sym_set  [name]
    | boolean  [name]
;

//
// Values reference => values + position_ref
// !! Cannot be used in setvar(...,...) !!
values_ref
    [std::string& name]
    : values       [name]
    | position_ref [name]
    | boolean_ref  [name]
;

/////////////////////////////////////////////////////////////////////////////////////
// Position: $0name
// Stores the text of the trailing SYMBOL in `name`.
// NOTE(review): the lexer enables testLiterals only for SYMBOL, so a lone `0`
// presumably arrives as INT rather than as the "0" literal -- confirm that this
// rule is reachable.
position
    [std::string& name]
    : DOLLAR "0" sym: SYMBOL
      {
        name = token_ref_to_std_string(sym);
      }
;

//
// Position reference: $(0-9)+name
// !! Cannot be used in setvar(...,...) !!
position_ref
    [std::string& name]
    : DOLLAR INT sym: SYMBOL
      {
        name = token_ref_to_std_string(sym);
      }
;

/////////////////////////////////////////////////////////////////////////////////////
// String set: $name
str_set
    [std::string& name]
    : DOLLAR sym: SYMBOL
      {
        name = token_ref_to_std_string(sym);
      }
;

/////////////////////////////////////////////////////////////////////////////////////
// Sym set: $$name
sym_set
    [std::string& name]
    : DOLLAR DOLLAR sym: SYMBOL
      {
        name = token_ref_to_std_string(sym);
      }
;

/////////////////////////////////////////////////////////////////////////////////////
// Bool: $?name
boolean
    [std::string& name]
    : DOLLAR Q_MARK sym: SYMBOL
      {
        name = token_ref_to_std_string(sym);
      }
;

// Boolean reference: $!name
boolean_ref
    [std::string& name]
    : DOLLAR E_MARK sym: SYMBOL
      {
        name = token_ref_to_std_string(sym);
      }
;

/////////////////////////////////////////////////////////////////////////////////////
// OPERATORS
/////////////////////////////////////////////////////////////////////////////////////

// ----------------------------------------------------------------------------------
// Positions operator
// TODO: range accepts position_ref. ?? Change it to position ??
// Matches flex[<position_ref>] or range(<symbol>, <position_ref>, <position_ref>).
// For "flex", `name` is whatever position_ref stored; for "range" it is a
// descriptive message built from the symbol and both positions.
position_op
    [std::string& name]
{
    std::string r1, r2;
}
    : "flex" LBRACKET position_ref [name] RBRACKET
    | "range" LPAREN s: SYMBOL COMMA position_ref [r1] COMMA position_ref [r2] RPAREN
      {
        name = ("Range operator from " + token_ref_to_std_string(s) + " [" + r1 + ":" + r2 + "]!");
      }
;

// ----------------------------------------------------------------------------------
// Filtering operator
// catflt(<position_ref>, <set>, <set>) and
// agrflt(<position_ref>, <position_ref>, <set>, <int>).
// `name` receives a descriptive message listing the parsed arguments.
filters_op
    [std::string& name]
{
    std::string p, p2, e1, e2;
}
    : "catflt" LPAREN position_ref [p] COMMA es_any [e1] COMMA es_any [e2] RPAREN
      {
        name = ( "Catflt operator in position " + p + " for sets " + e1 + " " + e2);
      }
    | "agrflt" LPAREN position_ref [p] COMMA position_ref [p2] COMMA es_any [e1] COMMA i: INT RPAREN
      {
        name = ( "Agrflt operator p1 " + p + " p2 " + p2 + " for set " + e1 + " aggr_attrs " + token_ref_to_std_string(i));
      }
;

// ----------------------------------------------------------------------------------
// Setvar operator
// Dispatches on the kind of variable being assigned.
setvar_op
    [std::string& value]
    : setvar_pos  [value]
    | setvar_bool [value]
    | setvar_sset [value]
    | setvar_tset [value]
;

// setvar for a position accepts position_ref -> TODO check why the grammar
// does not cover "setvar" LPAREN position COMMA position_v RPAREN
// Both sub-rules write into the same `value`; position_v assigns it last, so
// after a successful parse it holds the assigned value, not the variable name.
setvar_pos
    [std::string& value]
    : "setvar" LPAREN position_ref [value] COMMA position_v [value] RPAREN
//  : "setvar" LPAREN position [value] COMMA position_v [value] RPAREN
;

// setvar($?name, <boolean value>); boolean_v overwrites what boolean stored.
setvar_bool
    [std::string& value]
    : "setvar" LPAREN boolean [value] COMMA boolean_v [value] RPAREN
;

// setvar($name, [ ... ]); str_set assigns `value`, str_set_v appends to it.
setvar_sset
    [std::string& value]
    : "setvar" LPAREN str_set [value] COMMA str_set_v [value] RPAREN
;

// setvar($$name, { ... }); sym_set assigns `value`, sym_set_v appends to it.
setvar_tset
    [std::string& value]
    : "setvar" LPAREN sym_set [value] COMMA sym_set_v [value] RPAREN
;

// ----------------------------------------------------------------------------------
// Values
/////////////////////////////////////////////////////////////////////////////////////
// boolean: the literal True / False, or a boolean variable ($?name).
boolean_v
    [std::string& value]
    : "True"  { value = "True"; }
    | "False" { value = "False"; }
    | boolean [value]
;
/////////////////////////////////////////////////////////////////////////////////////
// position value: an integer, one of begin / end / nowhere, or a position
// variable. `value` is assigned the matched text (or the variable name).
position_v
    [std::string& value]
    : i: INT    { value = token_ref_to_std_string(i); }
    | "begin"   { value = "begin"; }
    | "end"     { value = "end"; }
    | "nowhere" { value = "nowhere"; }
    | position [value]
;

/////////////////////////////////////////////////////////////////////////////////////
// string set in
// Non-empty, comma-separated list of STRING tokens. Each token's text is
// appended to `value` in source order, separated by ", ".
str_set_in
    [std::string& value]
    : first: STRING
      {
        value += token_ref_to_std_string(first);
      }
      ( COMMA next: STRING
        {
          value += (", " + token_ref_to_std_string(next));
        }
      )*
;

// string set [] ['a'] ['a', 'b'] ["a"] ["a", "b"] ['a', "b"]
// An empty set leaves `value` untouched.
str_set_v
    [std::string& value]
    : LBRACKET ( str_set_in [value] )? RBRACKET
;

/////////////////////////////////////////////////////////////////////////////////////
// element of sym set
// A list that starts with a plain SYMBOL and may continue, after a comma, with
// either a plain or a gravis-quoted element. Symbol texts are appended to
// `value` in source order, separated by ", ".
sym_set_elem_s
    [std::string& value]
    : s: SYMBOL
      {
        value += token_ref_to_std_string(s);
      }
      ( COMMA { value += ", "; }
        ( sym_set_elem_s [value]
        | sym_set_elem_g [value]
        )
      )?
;

// element of sym set
// Same as sym_set_elem_s, but the list starts with a gravis-quoted symbol
// (`name`). Only the symbol text is appended; the gravis marks are dropped.
sym_set_elem_g
    [std::string& value]
    : G_MARK s: SYMBOL G_MARK
      {
        value += token_ref_to_std_string(s);
      }
      ( COMMA { value += ", "; }
        ( sym_set_elem_g [value]
        | sym_set_elem_s [value]
        )
      )?
;

// sym set in
sym_set_in
    [std::string& value]
    : sym_set_elem_s [value]
    | sym_set_elem_g [value]
;

// sym set {} {a} {a, b}
// An empty set leaves `value` untouched.
sym_set_v
    [std::string& value]
    : LCURLY ( sym_set_in [value] )? RCURLY
;

/////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////
// internal values: a single STRING or SYMBOL; `value` is assigned its text.
v_literal
    [std::string& value]
    : s1: STRING { value = token_ref_to_std_string(s1); }
    | s2: SYMBOL { value = token_ref_to_std_string(s2); }
;
/////////////////////////////////////////////////////////////////////////////////////
// constants
// set of values
// v_literal assigns (does not append), so after the rule `v` holds the text of
// the last literal in the set, or is untouched for an empty set.
s_literal
    [std::string& v]
    : LBRACKET ( v_literal [v] ( COMMA v_literal [v] )* )? RBRACKET
;

// comma-separated predicates
seq_et
    [std::string& v]
    : et_any [v] ( COMMA et_any [v] )*
;

// any set-valued expression: a literal set or a set operator
es_any
    [std::string& v]
    : s_literal [v]
    | es_op     [v]
;

es_op
    [std::string& v]
    : position_ref [v]
    | filters_op   [v]
;

et_bool
    [std::string& v]
    : boolean    [v]
    | boolean_op [v]
;

// set relations
// `v` receives "<relation> <lhs> <rhs>".
et_set
    [std::string& v]
{
    std::string v1, v2;
}
    : "in" LPAREN es_any [v1] COMMA es_any [v2] RPAREN
      {
        v = ("in " + v1 + " " + v2);
      }
    | "inter" LPAREN es_any [v1] COMMA es_any [v2] RPAREN
      {
        v = ("inter " + v1 + " " + v2);
      }
    | "equal" LPAREN es_any [v1] COMMA es_any [v2] RPAREN
      {
        v = ("equal " + v1 + " " + v2);
      }
;

// string predicates; `v` is whatever the nested es_any stored
et_string
    [std::string& v]
    : "isbig"  LPAREN es_any [v] RPAREN
    | "hasnum" LPAREN es_any [v] RPAREN
;

// actions
// "delete" and "select" pass `v` straight to the nested predicate; every other
// alternative stores "<action name> <arguments>" in `v`.
et_action
    [std::string& v]
{
    std::string v1, v2;
}
    : "delete" LPAREN et_any [v] RPAREN
    | "select" LPAREN et_any [v] RPAREN
    | "relabel" LPAREN es_any [v1] COMMA et_any [v2] RPAREN
      {
        v = ("relabel " + v1 + " " + v2);
      }
    | "unify" LPAREN es_any [v1] COMMA i: INT RPAREN
      {
        v = ("unify " + v1 + " on position " + token_ref_to_std_string(i));
      }
    | "mark" LPAREN s1: SYMBOL RPAREN
      {
        v = ("mark " + token_ref_to_std_string(s1));
      }
    | "unmark" LPAREN s2: SYMBOL RPAREN
      {
        v = ("unmark " + token_ref_to_std_string(s2));
      }
    | "startnew" LPAREN s3: SYMBOL RPAREN
      {
        v = ("startnew " + token_ref_to_std_string(s3));
      }
    | "lextend" LPAREN s4: SYMBOL RPAREN
      {
        v = ("lextend " + token_ref_to_std_string(s4));
      }
    | "rextend" LPAREN s5: SYMBOL RPAREN
      {
        v = ("rextend " + token_ref_to_std_string(s5));
      }
;

// iteration / lookup predicates
// The arguments are parsed into locals only; `v` is not modified by this rule.
et_iter
    [std::string& v]
{
    std::string v1, v2, v3, v4;
}
    : "only" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN
    | "atleast" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] COMMA INT RPAREN
    | "llook" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN
    | "rlook" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN
    | "setvar" LPAREN position_ref [v1] COMMA position_ref [v2] RPAREN
    | "lskip" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] COMMA et_any [v3] RPAREN
    | "lphrase" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] RPAREN
    | "rphrase" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] RPAREN
    | "accept" LPAREN seq_et [v1] RPAREN
;

// predicates checking agreement
// The arguments are parsed into locals only; `name` is not modified by this rule.
et_agr
    [std::string& name]
{
    std::string p1, p2, v;
}
    : "agr"   LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA INT RPAREN
    | "agrpp" LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA INT RPAREN
    | "wagr"  LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA INT RPAREN
;

// annotation checking predicates
et_annot
    [std::string& v]
    : "phrase" LPAREN position_ref [v] COMMA SYMBOL RPAREN
;

// constraints
et_any
    [std::string& v]
    : et_bool   [v]
    | et_set    [v]
    | et_string [v]
    | et_action [v]
    | et_iter   [v]
    | et_agr    [v]
    | et_annot  [v]
;

/////////////////////////////////////////////////////////////////////////////////////
// OPERATORS
/////////////////////////////////////////////////////////////////////////////////////

// Operators returning str_set, e.g. orth[$-2P]
string_operators
    [std::string& name]
    : op_orth  [name]
    | op_base  [name]
    | op_lower [name]
    | op_upper [name]
    | op_affix [name]
;

// orth[<position_ref>]; `name` is overwritten with a fixed message.
op_orth
    [std::string& name]
    : "orth" LBRACKET position_ref [name] RBRACKET
      {
        name = "Orth operator!";
      }
;

// base[<position_ref>]; `name` is overwritten with a fixed message.
op_base
    [std::string& name]
    : "base" LBRACKET position_ref [name] RBRACKET
      {
        name = "Base operator!";
      }
;

// lower(<string set variable or literal>)
op_lower
    [std::string& name]
    : "lower" LPAREN ( str_set [name] | str_set_v [name] ) RPAREN
      {
        name = "Lower operator!";
      }
;

// upper(<string set variable or literal>)
op_upper
    [std::string& name]
    : "upper" LPAREN ( str_set [name] | str_set_v [name] ) RPAREN
      {
        name = "Upper operator!";
      }
;

// affix(<string set variable or literal>, <int>)
op_affix
    [std::string& name]
    : "affix" LPAREN ( str_set [name] | str_set_v [name] ) COMMA n: INT RPAREN
      {
        name = "Affix operator " + token_ref_to_std_string(n) + "!";
      }
;

// and(...), not(...), or(...) over a comma-separated predicate list
boolean_op
    [std::string& name]
    : "and" LPAREN seq_et [name] RPAREN
    | "not" LPAREN seq_et [name] RPAREN
    | "or"  LPAREN seq_et [name] RPAREN
;

// ----------------------------------------------------------------------------------
// ANTLR LEXER
// ----------------------------------------------------------------------------------
class ANTLRLexer extends Lexer;
options {
    k = 2;
    exportVocab = ANTLRExpr;
    charVocabulary = '\3'..'\377';
    testLiterals = false;
}

// Double- or single-quoted string; no escape sequences are recognised.
STRING
options {
    paraphrase = "a string";
}
    : '"' (~'"')* '"'
    | '\'' (~'\'')* '\''
;

// Optionally signed run of decimal digits.
INT
options {
    paraphrase = "Integer";
}
    : ('-'|'+')? ('0'..'9')+
;

Q_MARK
options {
    paraphrase = "Query mark";
}
    : '?'
;

E_MARK
options {
    paraphrase = "Exclamation mark";
}
    : '!'
;

G_MARK
options {
    paraphrase = "Gravis mark";
}
    : '`'
;

LBRACKET
options {
    paraphrase = "'['";
}
    : '['
;

RBRACKET
options {
    paraphrase = "']'";
}
    : ']'
;

LPAREN
options {
    paraphrase = "'('";
}
    : '('
;

RPAREN
options {
    paraphrase = "')'";
}
    : ')'
;

LCURLY
options {
    paraphrase = "'{'";
}
    : '{'
;

RCURLY
options {
    paraphrase = "'}'";
}
    : '}'
;

DOLLAR
options {
    paraphrase = "'$'";
}
    : '$'
;

AT_MARK
options {
    paraphrase = "'@'";
}
    : '@'
;

COMMA
options {
    paraphrase = "','";
}
    : ','
;

// Identifier. This is the only rule that checks the literals table, so the
// quoted keywords used by the parser are recognised here.
SYMBOL
options {
    paraphrase = "symbol";
    testLiterals = true;
}
    : ( 'a'..'z' | 'A'..'Z' | '_' )
      ( 'a'..'z' | 'A'..'Z' | '_' | '0'..'9' )*
;

// Whitespace is skipped; line breaks advance the lexer's line counter.
WS
    : ( ' '
      | '\t'
      | '\r' '\n' { newline(); }
      | '\n'      { newline(); }
      )
      {
        $setType(antlr::Token::SKIP);
      }
;

// Line comment, skipped. The terminating '\n' is optional so that a comment
// on the last line of input (no trailing newline) is still accepted.
COMMENT
options {
    paraphrase = "Comment";
}
    : "//" (~'\n')* ( '\n' { newline(); } )?
      {
        $setType(antlr::Token::SKIP);
      }
;

HASH
options {
    paraphrase = "'#'";
}
    : '#'
;

DSEPARATOR
options {
    paraphrase = "':-'";
}
    : ":-"
;