header {
	#include <libwccl/parser/ParserException.h>

	#include <cstdio>
	#include <antlr/Token.hpp>
	#include <boost/lexical_cast.hpp>

	// values/variables
	#include <libwccl/variables.h>
	#include <libwccl/values/bool.h>
	#include <libwccl/values/tset.h>
	#include <libwccl/values/strset.h>
	#include <libwccl/values/position.h>
	#include <libwccl/values/positionref.h>

	// sentence context
	#include <libwccl/sentencecontext.h>

	// operators
	#include <libwccl/ops/or.h>
	#include <libwccl/ops/nor.h>
	#include <libwccl/ops/and.h>
	#include <libwccl/ops/affix.h>
	#include <libwccl/ops/toupper.h>
	#include <libwccl/ops/tolower.h>
	#include <libwccl/ops/constant.h>
	#include <libwccl/ops/functions.h>
	#include <libwccl/ops/logicalpredicate.h>

	// Unicode String
	#include <unicode/uniset.h>
	#include <unicode/unistr.h>
}

options {
	language = "Cpp";
}

// ----------------------------------------------------------------------------
// ANTLR PARSER
// ----------------------------------------------------------------------------
class ANTLRParser extends Parser;
options {
	k = 6;
	exportVocab = ANTLRExpr;
	buildAST = false;
	defaultErrorHandler = false;
}
{
private:
	// Text of the token, read as UTF-8 into a UnicodeString and then passed
	// through UnicodeString::unescape().
	const UnicodeString token_ref_to_ustring(antlr::RefToken& rstr) const
	{
		const UnicodeString raw_text = UnicodeString::fromUTF8(rstr->getText());
		return raw_text.unescape();
	}

	// Text of the token, unchanged, as a std::string.
	const std::string token_ref_to_std_string(antlr::RefToken& rstr) const
	{
		return rstr->getText();
	}

	// Text of the token converted to int with atoi().
	int token_ref_to_int(antlr::RefToken& rstr)
	{
		const std::string digits = rstr->getText();
		return atoi(digits.c_str());
	}

	// Helper function: wraps a copy of the pointed-to string set in a
	// Wccl::Constant and returns it through the Function<StrSet> interface.
	boost::shared_ptr<Wccl::Function<Wccl::StrSet> > get_str_set_expr(
		boost::shared_ptr<Wccl::StrSet> ret_str_set)
	{
		return boost::shared_ptr<Wccl::Function<Wccl::StrSet> >(
			new Wccl::Constant<Wccl::StrSet>(*ret_str_set.get()));
	}

	// Builds a SentenceContext from a default-constructed (empty) sentence
	// pointer.
	Wccl::SentenceContext get_tmp_context()
	{
		boost::shared_ptr<Corpus2::Sentence> no_sentence;
		return Wccl::SentenceContext(no_sentence);
	}
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------------
// Start all rules
/*
start_rules
{
	std::string name = "";
}
	: values_ref  [name] { fprintf(stderr, "%s\n", name.c_str()); }
	| position_op [name] { fprintf(stderr, "%s\n", name.c_str()); }
	| filters_op  [name] { fprintf(stderr, "%s\n", name.c_str()); }
	| setvar_op   [name] { fprintf(stderr, "%s\n", name.c_str()); }
	| boolean_op  [name] { fprintf(stderr, "%s\n", name.c_str()); }
;
*/

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// "GLOBAL" RULES
// ----------------------------------------------------------------------------
// Entry rule for string operators. Creates a fresh Wccl::Variables object
// that serves as the variable scope for the whole parsed expression.
// Returns boost::shared_ptr<Wccl::Function<Wccl::StrSet> >
parse_string_operator
	returns [boost::shared_ptr<Wccl::Function<Wccl::StrSet> > res]
{
	Wccl::Variables scope;
}
	: res = string_operators [scope]
;

// ----------------------------------------------------------------------------
// Entry rule for predicates. Creates a fresh Wccl::Variables object that
// serves as the variable scope for the whole parsed expression.
// Returns boost::shared_ptr<Wccl::Function<Wccl::Bool> >
parse_predicates
	returns [boost::shared_ptr<Wccl::Function<Wccl::Bool> > res]
{
	Wccl::Variables scope;
}
	: res = predicates [scope]
;

// ----------------------------------------------------------------------------
// Rules for parsing values in scope (variables).
// Returns boost::shared_ptr<Wccl::Value>
/*
parse_values
	returns [boost::shared_ptr<Wccl::Value> ret]
{
	Wccl::Variables vars;
}
	: ret = values [vars]
;
*/

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// All values:
// Values can be used for setvar(...,..)
// ----------------------------------------------------------------------------
/*
values
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Constant<Wccl::Value> > res]
	: res = position [vars]
	| res = str_set  [vars]
	| res = sym_set  [vars]
	| res = boolean  [vars]
;
*/

// ----------------------------------------------------------------------------
// Values reference => values + position_ref
// !! Cannot use for setvar(...,...) !!
/*
values_ref [std::string& name]:
	  values       [name]
	| position_ref [name]
	| boolean_ref  [name]
;
*/

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Position: $0name
// ----------------------------------------------------------------------------
// Looks up (or creates) the Position variable called `name` in `vars` and
// returns a Constant holding a copy of its current value.
//
// The zero is matched as an INT token whose text is checked by a validating
// semantic predicate. A parser keyword "0" cannot be used here: the lexer
// tests literals only in the SYMBOL rule (testLiterals is false everywhere
// else), so a digit is always delivered as INT and a "0" keyword token would
// never be produced. If the INT text is anything other than "0" the
// predicate fails and the generated parser throws antlr::SemanticException.
position
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Constant<Wccl::Position> > op]
{
	boost::shared_ptr<Wccl::Position> val;
}
	: DOLLAR zero: INT { token_ref_to_std_string(zero) == "0" }? n: SYMBOL {
		val = vars.get_put<Wccl::Position>(token_ref_to_std_string(n));
		op.reset(new Wccl::Constant<Wccl::Position>(*val.get()));
	}
;

// ----------------------------------------------------------------------------
// Position reference: $(0-9)+name
// !! Cannot use for setvar(...,...) !!
/*
position_ref
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::PositionRef> val]
	: DOLLAR p_ref: INT n: SYMBOL {
		val.reset(
			new Wccl::PositionRef(
				vars.get_put<Wccl::Position>(token_ref_to_std_string(n)),
				token_ref_to_int(p_ref)
			)
		);
	}
;
*/

// ----------------------------------------------------------------------------
// String set, call examples: $name, $Name, $_name, $_Name etc.
// This expression gets a variable of type StrSet from a string-named variable.
// The variable is looked up (or created) in `vars` via get_put and a copy of
// its value is wrapped in a Constant.
// Returns Wccl::Function<Wccl::StrSet> from Set-variables
str_set
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Constant<Wccl::StrSet> > op]
{
	boost::shared_ptr<Wccl::StrSet> var;
}
	: DOLLAR id: SYMBOL {
		const std::string var_name = token_ref_to_std_string(id);
		var = vars.get_put<Wccl::StrSet>(var_name);
		op.reset(new Wccl::Constant<Wccl::StrSet>(*var.get()));
	}
;

// ----------------------------------------------------------------------------
// Symbol set: $$name
// Same scheme as str_set, for variables of type TSet.
sym_set
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Constant<Wccl::TSet> > op]
{
	boost::shared_ptr<Wccl::TSet> var;
}
	: DOLLAR DOLLAR id: SYMBOL {
		const std::string var_name = token_ref_to_std_string(id);
		var = vars.get_put<Wccl::TSet>(var_name);
		op.reset(new Wccl::Constant<Wccl::TSet>(*var.get()));
	}
;

// ----------------------------------------------------------------------------
// Bool: $?name
// Same scheme as str_set, for variables of type Bool.
boolean
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Constant<Wccl::Bool> > op]
{
	boost::shared_ptr<Wccl::Bool> var;
}
	: DOLLAR Q_MARK id: SYMBOL {
		const std::string var_name = token_ref_to_std_string(id);
		var = vars.get_put<Wccl::Bool>(var_name);
		op.reset(new Wccl::Constant<Wccl::Bool>(*var.get()));
	}
;

// Boolean $!name
/*
boolean_ref [std::string& name]:
	DOLLAR E_MARK n1: SYMBOL { name = token_ref_to_std_string(n1); }
;
*/

/////////////////////////////////////////////////////////////////////////////////////
// OPERATORS
/////////////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------------
// Positions operator
// TODO: range takes position_ref. Should it be changed to position?
/*
position_op [std::string& name]
{
	std::string r1, r2;
}
	: "flex"  LBRACKET position_ref [name] RBRACKET
	| "range" LPAREN s: SYMBOL COMMA position_ref [r1] COMMA position_ref [r2] RPAREN {
		name = ("Range opertator from " + token_ref_to_std_string(s) + " [" + r1 + ":" + r2 + "]!");
	}
;
*/

// ----------------------------------------------------------------------------------
// Filtering operator
/*
filters_op [std::string& name]
{
	std::string p, p2, e1, e2;
}
	: "catflt" LPAREN position_ref [p] COMMA es_any [e1] COMMA es_any [e2] RPAREN {
		name = ( "Catflt operator in position " + p + " for sets " + e1 + " " + e2);
	}
	| "agrflt" LPAREN position_ref [p] COMMA position_ref [p2] COMMA es_any [e1] COMMA i: INT RPAREN {
		name = ( "Agrflt operator p1 " + p + " p2 " + p2 + " for set " + e1 + " aggr_attrs " + token_ref_to_std_string(i));
	}
;
*/

// ----------------------------------------------------------------------------------
// Setvar operator
/*
setvar_op [std::string& value]
	: setvar_pos  [value]
	| setvar_bool [value]
	| setvar_sset [value]
	| setvar_tset [value]
;
*/

// setvar for position takes position_ref -> TODO check why the grammar
// does not cover "setvar" LPAREN position COMMA position_v RPAREN
/*
setvar_pos [std::string& value]
	: "setvar" LPAREN position_ref [value] COMMA position_v [value] RPAREN
//	: "setvar" LPAREN position [value] COMMA position_v [value] RPAREN
;
*/

/*
setvar_bool [std::string& value]
	: "setvar" LPAREN boolean [value] COMMA boolean_v [value] RPAREN
;
*/

/*
setvar_sset [std::string& value]
	: "setvar" LPAREN str_set [value] COMMA str_set_v [value] RPAREN
;
*/

/*
setvar_tset [std::string& value]
	: "setvar" LPAREN sym_set [value] COMMA sym_set_v [value] RPAREN
;
*/

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// VALUES
///////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------
// Single or multiple values in a string set.
// Matches one or more STRING tokens separated by COMMA and inserts the text
// of each one (converted with token_ref_to_ustring) into s_set.
// Written as an EBNF closure, like logical_predicates_comma_sep, so the
// parser iterates over the elements instead of recursing once per element.
str_set_v_in
	[boost::shared_ptr<Wccl::StrSet>& s_set]
	: head: STRING {
		s_set->insert(token_ref_to_ustring(head));
	}
	(
		COMMA tail: STRING {
			s_set->insert(token_ref_to_ustring(tail));
		}
	)*
;

// string set, called as unnamed (temporary) StrSet:
// calls: [] ['a'] ['a', 'b'] ["a"] ["a", "b"] ['a', "b"] or variable $A
// For a bracketed literal the collected elements (possibly none) are copied
// into a new Constant; for a variable the result of str_set is returned.
str_set_v
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Constant<Wccl::StrSet> > val]
{
	boost::shared_ptr<Wccl::StrSet> set(new Wccl::StrSet);
}
	: LBRACKET ( str_set_v_in [set] )? RBRACKET {
		val.reset(new Wccl::Constant<Wccl::StrSet>(*set.get()));
	}
	| val = str_set [vars]
;

// ----------------------------------------------------------------------------
// element of sym set
/*
sym_set_elem_s [std::string& value]
	: s1: SYMBOL { value += token_ref_to_std_string(s1); }
	| s2: SYMBOL COMMA sym_set_elem_s [value] { value += token_ref_to_std_string(s2); }
	| s3: SYMBOL COMMA sym_set_elem_g [value] { value += token_ref_to_std_string(s3); }
;
*/

// element of sym set
/*
sym_set_elem_g [std::string& value]
	: G_MARK s1: SYMBOL G_MARK { value += token_ref_to_std_string(s1); }
	| G_MARK s2: SYMBOL G_MARK COMMA sym_set_elem_g [value] { value += token_ref_to_std_string(s2); }
	| G_MARK s3: SYMBOL G_MARK COMMA sym_set_elem_s [value] { value += token_ref_to_std_string(s3); }
;
*/

// sym set in
/*
sym_set_in [std::string& value]
	: sym_set_elem_s [value]
	| sym_set_elem_g [value]
;
*/

// sym set {} {a} {a, b}
/*
sym_set_v [std::string& value]
	: LCURLY RCURLY
	| LCURLY sym_set_in [value] RCURLY
;
*/

// ----------------------------------------------------------------------------
// boolean:
// Either one of the keywords True / False, wrapped in a Constant, or a
// boolean variable ($?name) resolved by the `boolean` rule.
boolean_v
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Constant<Wccl::Bool> > val]
	: "True" {
		val.reset(new Wccl::Constant<Wccl::Bool>(Wccl::Bool(true )));
	}
	| "False" {
		val.reset(new Wccl::Constant<Wccl::Bool>(Wccl::Bool(false)));
	}
	| val = boolean [vars]
;

// ----------------------------------------------------------------------------
// position value:
/*
position_v [std::string& value]
	: i: INT    { value = token_ref_to_std_string(i); }
	| "begin"   { value = "begin"; }
	| "end"     { value = "end"; }
	| "nowhere" { value = "nowhere"; }
	| position [value]
;
*/

// ----------------------------------------------------------------------------
// internal values:
/*
v_literal [std::string& value]
	: s1: STRING { value = token_ref_to_std_string(s1); }
	| s2: SYMBOL { value = token_ref_to_std_string(s2); }
;
*/

/////////////////////////////////////////////////////////////////////////////////////
// constants
// set of values
/*
st::shared_ptr<Wccl::StrSet> ret]s_literal [std::string& v]
	: LBRACKET ((v_literal[v]) (COMMA v_literal [v])*)? RBRACKET
;
*/

// comma-separated predicates
/*
seq_et [std::string& v]:
	et_any [v] (COMMA et_any [v])*
;
*/

/*
es_any
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::StrSet> ret]
	  s_literal [v]
	| es_op [v]
;
*/

/*
et_bool [std::string& v]:
	  boolean [v]
	| boolean_op [v]
;
*/

// set relations
/*
et_set [std::string& v]
{
	std::string v1, v2;
}
	: "in"    LPAREN es_any [v1] COMMA es_any [v2] RPAREN { v = ("name " + v1 + " " + v2); }
	| "inter" LPAREN es_any [v1] COMMA es_any [v2] RPAREN { v = ("inter " + v1 + " " + v2); }
	| "equal" LPAREN es_any [v1] COMMA es_any [v2] RPAREN { v = ("equal " + v1 + " " + v2); }
;
*/

/*
et_string [std::string& v]
	: "isbig"  LPAREN es_any [v] RPAREN
	| "hasnum" LPAREN es_any [v] RPAREN
;
*/

/*
et_action [std::string& v]
{
	std::string v1, v2;
}
	: "delete"   LPAREN et_any [v] RPAREN
	| "select"   LPAREN et_any [v] RPAREN
	| "relabel"  LPAREN es_any [v1] COMMA et_any [v2] RPAREN { v = ("relabel " + v1 + " " + v2); }
	| "unify"    LPAREN es_any [v1] COMMA i: INT RPAREN { v = ("relabel " + v1 + " on position " + token_ref_to_std_string(i)); }
	| "mark"     LPAREN s1: SYMBOL RPAREN { v = ("mark " + token_ref_to_std_string(s1)); }
	| "unmark"   LPAREN s2: SYMBOL RPAREN { v = ("unmark " + token_ref_to_std_string(s2)); }
	| "startnew" LPAREN s3: SYMBOL RPAREN { v = ("startnew " + token_ref_to_std_string(s3)); }
	| "lextend"  LPAREN s4: SYMBOL RPAREN { v = ("lextend " + token_ref_to_std_string(s4)); }
	| "rextend"  LPAREN s5: SYMBOL RPAREN { v = ("rextend " + token_ref_to_std_string(s5)); }
;
*/

/*
et_iter [std::string& v]
{
	std::string v1, v2, v3, v4;
}
	: "only"    LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN
	| "atleast" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] COMMA i:INT RPAREN
	| "llook"   LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN
	| "rlook"   LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN
	| "setvar"  LPAREN position_ref [v1] COMMA position_ref [v2] RPAREN
	| "lskip"   LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] COMMA et_any [v3] RPAREN
	| "lphrase" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] RPAREN
	| "rphrase" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] RPAREN
	| "accept"  LPAREN seq_et [v1] RPAREN
;
*/

// predicates checking agreement
/*
et_agr [std::string& name]
{
	std::string p1, p2, v;
}
	: "agr"   LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA i1: INT RPAREN
	| "agrpp" LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA i2: INT RPAREN
	| "wagr"  LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA i3: INT RPAREN
;
*/

/*
// annotation checking predicates
et_annot [std::string& v]
	: "phrase" LPAREN position_ref [v] COMMA s: SYMBOL RPAREN
;
*/

/*
// constraints
et_any [std::string& v]
	: et_bool   [v]
	| et_set    [v]
	| et_string [v]
	| et_action [v]
	| et_iter   [v]
	| et_agr    [v]
	| et_annot  [v]
;
*/
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// String operators, returns boost::shared_ptr<Wccl::Function<Wccl::StrSet> >
///////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------
// Dispatches to one of the string operators or to a plain string-set value.
string_operators
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Function<Wccl::StrSet> > ret]
/*
	: ret = op_orth  [vars]
	| ret = op_base  [vars]
*/
	: ret = op_lower  [vars]
	| ret = op_upper  [vars]
	| ret = op_affix  [vars]
	| ret = str_set_v [vars]
;

// Implementations of string operators:
// ----------------------------------------------------------------------------
/*
op_orth
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::StrSet> ret]
{
	boost::shared_ptr<Wccl::PositionRef> tmpPosRef;
}
	: "orth" LBRACKET tmpPosRef = position_ref [vars] RBRACKET {
		// TODO
	}
;
*/

// ----------------------------------------------------------------------------
/*
op_base
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::StrSet> ret]
{
	boost::shared_ptr<Wccl::PositionRef> tmpPosRef;
}
	: "base" LBRACKET tmpPosRef = position_ref [vars] RBRACKET {
		// TODO
	}
;
*/

// ----------------------------------------------------------------------------
// lower(<string operator>)
// Wraps the parsed argument expression in a Wccl::ToLower.
// returns boost::shared_ptr<Wccl::Function<Wccl::StrSet> >
op_lower
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Function<Wccl::StrSet> > ret]
{
	boost::shared_ptr<Wccl::Function<Wccl::StrSet> > arg;
}
	: "lower" LPAREN arg = string_operators [vars] RPAREN {
		ret.reset(new Wccl::ToLower(arg));
	}
;

// ----------------------------------------------------------------------------
// upper(<string operator>)
// Wraps the parsed argument expression in a Wccl::ToUpper.
op_upper
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Function<Wccl::StrSet> > ret]
{
	boost::shared_ptr<Wccl::Function<Wccl::StrSet> > arg;
}
	: "upper" LPAREN arg = string_operators [vars] RPAREN {
		ret.reset(new Wccl::ToUpper(arg));
	}
;
// ----------------------------------------------------------------------------
// affix(<string operator>, <int>)
// Wraps the parsed argument expression and the integer offset in a
// Wccl::Affix.
op_affix
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Function<Wccl::StrSet> > ret]
{
	boost::shared_ptr<Wccl::Function<Wccl::StrSet> > o_ret;
}
	: "affix" LPAREN o_ret = string_operators [vars] COMMA offset: INT RPAREN {
		ret.reset(new Wccl::Affix(o_ret, token_ref_to_int(offset)));
	}
;

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Predicates, returns boost::shared_ptr<Wccl::Function<Wccl::Bool> >
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
predicates
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Function<Wccl::Bool> > ret]
	: ret = logical_predicates [vars]
;

// Implementations of predicates:
// ----------------------------------------------------------------------------
// Dispatches to and / or / nor, or to a plain boolean value.
logical_predicates
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Function<Wccl::Bool> > ret]
	: ret = lpred_and [vars]
	| ret = lpred_or  [vars]
	| ret = lpred_nor [vars]
	| ret = boolean_v [vars]
;

// comma-separated predicates
// Collects one or more logical predicates, in source order, into a newly
// allocated vector.
logical_predicates_comma_sep
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<std::vector<boost::shared_ptr<Wccl::Function<Wccl::Bool> > > > ret_v]
{
	boost::shared_ptr<Wccl::Function<Wccl::Bool> > pred;
	ret_v.reset(new std::vector<boost::shared_ptr<Wccl::Function<Wccl::Bool> > >);
}
	: pred = logical_predicates [vars] {
		ret_v->push_back(pred);
	}
	(
		COMMA pred = logical_predicates [vars] {
			ret_v->push_back(pred);
		}
	)*
;

// ----------------------------------------------------------------------------
// and(<predicate>, ...)
lpred_and
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Function<Wccl::Bool> > op]
{
	boost::shared_ptr<
		std::vector<boost::shared_ptr<Wccl::Function<Wccl::Bool> > >
	> ret_v;
}
	: "and" LPAREN ret_v = logical_predicates_comma_sep [vars] RPAREN {
		op.reset(new Wccl::And(ret_v));
	}
;

// or(<predicate>, ...)
lpred_or
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Function<Wccl::Bool> > op]
{
	boost::shared_ptr<
		std::vector<boost::shared_ptr<Wccl::Function<Wccl::Bool> > >
	> ret_v;
}
	: "or" LPAREN ret_v = logical_predicates_comma_sep [vars] RPAREN {
		op.reset(new Wccl::Or(ret_v));
	}
;

// nor(<predicate>, ...)
lpred_nor
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Function<Wccl::Bool> > op]
{
	boost::shared_ptr<
		std::vector<boost::shared_ptr<Wccl::Function<Wccl::Bool> > >
	> ret_v;
}
	: "nor" LPAREN ret_v = logical_predicates_comma_sep [vars] RPAREN {
		op.reset(new Wccl::Nor(ret_v));
	}
;

// ----------------------------------------------------------------------------------
// ANTLR LEXER
// ----------------------------------------------------------------------------------
class ANTLRLexer extends Lexer;
options {
	k = 2;
	exportVocab = ANTLRExpr;
	charVocabulary = '\3'..'\377';
	testLiterals = false;
}

// Quoted string, delimited either by quotation marks or by apostrophes.
// The `!` suffix on the delimiters keeps them out of the token text, so the
// parser actions see only the string contents ("a" yields the text a, not
// "a" with its quotes).
STRING
options {
	paraphrase = "a string";
}
	: '"'!  (~'"')*  '"'!
	| '\''! (~'\'')* '\''!
;

// STRING_APOS
// options {
//	paraphrase = "a string without apostrophe";
// }
//	: (~'"')*
// ;

// STRING_QUOT
// options {
//	paraphrase = "a string without quotation";
// }
//	: (~'\'')*
// ;

// Optionally signed sequence of decimal digits.
INT
options {
	paraphrase = "Integer";
}
	: ('-'|'+')?('0'..'9')+
;

QUOT_MARK
options {
	paraphrase = "Quota mark";
}
	: '\''
;

APOS_MARK
options {
	paraphrase = "Aposptrophe mark";
}
	: '"'
;

Q_MARK
options {
	paraphrase = "Query mark";
}
	: '?'
;

E_MARK
options {
	paraphrase = "Exclamanation mark";
}
	: '!'
;

G_MARK
options {
	paraphrase = "Gravis mark";
}
	: '`'
;

LBRACKET
options {
	paraphrase = "'['";
}
	: '['
;

RBRACKET
options {
	paraphrase = "']'";
}
	: ']'
;

LPAREN
options {
	paraphrase = "'('";
}
	: '('
;

RPAREN
options {
	paraphrase = "')'";
}
	: ')'
;

LCURLY
options {
	paraphrase = "'{'";
}
	: '{'
;

RCURLY
options {
	paraphrase = "'}'";
}
	: '}'
;

DOLLAR
options {
	paraphrase = "'$'";
}
	: '$'
;

AT_MARK
options {
	paraphrase = "'@'";
}
	: '@'
;

COMMA
options {
	paraphrase = "','";
}
	: ','
;

// Identifier. This is the only rule that tests its text against the
// literals table, so parser keywords ("lower", "and", "True", ...) are
// recognised here.
SYMBOL
options {
	paraphrase = "symbol";
	testLiterals = true;
}
	: ( 'a'..'z' | 'A'..'Z' | '_' )
	  ( 'a'..'z' | 'A'..'Z' | '_' | '0'..'9' )*
;

// Whitespace is skipped; line ends advance the lexer's line counter.
WS
	: ( ' '
	  | '\t'
	  | '\r' '\n' { newline(); }
	  | '\n'      { newline(); }
	  ) { $setType(antlr::Token::SKIP); }
;

// Line comment, skipped. The terminating newline is optional so that a
// comment on the last line of the input (no trailing '\n') is accepted
// instead of raising a lexer error. greedy = true states the intended
// resolution explicitly: keep consuming comment characters, and take the
// newline whenever one is present.
COMMENT
options {
	paraphrase = "Comment";
}
	: "//"
	  ( options { greedy = true; } : ~'\n' )*
	  ( options { greedy = true; } : '\n' { newline(); } )?
	  { $setType(antlr::Token::SKIP); }
;

HASH
options {
	paraphrase = "'#'";
}
	: '#'
;

DSEPARATOR
options {
	paraphrase = "':-'";
}
	: ":-"
;