diff --git a/libwccl/parser/Parser.cpp b/libwccl/parser/Parser.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3fedb78ef0aa9c606e5c9ed67d1668b1673efdea
--- /dev/null
+++ b/libwccl/parser/Parser.cpp
@@ -0,0 +1,84 @@
+#include "Parser.h"
+#include <iostream>
+/**
+ * @desc Parser constructor. Keeps a reference to the given tagset (no copy is made)
+ */
+Parser::Parser(const Corpus2::Tagset& t) : tagset(t)
+{
+}
+
+/**
+ * @desc Destructor. Nothing to release
+ */
+Parser::~Parser()
+{
+    //
+}
+
+// ----------------------------------------------------------------------------
+
+/**
+ * @desc Overloaded parsing of an operator written as a string. Calls base method
+ * @arg str operator as string
+ * @return result of the base method @see parseOperator(std::istream&)
+ */
+/*
+std::string Parser::parseOperator(const std::string& str) const
+{
+    std::stringstream ss (std::stringstream::in | std::stringstream::out);
+    ss << str;
+
+    return this->parseOperator(ss);
+}
+*/
+/**
+ * @desc Base method for parsing operator in stream
+ * @arg istr input stream with operator
+ * @return Operator
+ */
+/*
+std::string Parser::parseOperator(std::istream& istr) const
+{
+    ANTLRLexer lexer(istr);
+    ANTLRParser parser(lexer);
+
+    return "Ala ma kota";
+}
+*/
+
+// -----------------------------------------------------------------------------
+
+/**
+ * @desc Overloaded parsing of an expression written as a string. Calls base method
+ * @arg str expression as string
+ */
+void Parser::parseExpression(const std::string& str) const
+{
+    std::stringstream ss (std::stringstream::in | std::stringstream::out);
+    ss << str;
+
+    this->parseExpression(ss);
+}
+
+/**
+ * @desc Base method for parsing expression in stream
+ * @arg istr input stream with the expression
+ * Returns nothing: "Syntax ok!" or the error description is written to std::cerr
+ */
+void Parser::parseExpression(std::istream& istr) const
+{
+    ANTLRLexer lexer(istr);
+    ANTLRParser parser(lexer);
+
+    try {
+        parser.start_rules();
+
+        std::cerr << "Syntax ok!" << std::endl;
+    }
+    catch (const ParserException& e) {
+        std::cerr << e.info() << std::endl;
+    }
+    catch (...) {
+        std::cerr << "Syntax error!" << std::endl;
+    }
+}
diff --git a/libwccl/parser/Parser.h b/libwccl/parser/Parser.h
new file mode 100644
index 0000000000000000000000000000000000000000..08b06c3abc3c4a9ace1a74a5e9600b23b916b3c4
--- /dev/null
+++ b/libwccl/parser/Parser.h
@@ -0,0 +1,28 @@
+#ifndef PARSER_H
+#define PARSER_H
+
+#include <sstream>
+#include <libcorpus2/tagset.h>
+
+#include "ANTLRLexer.hpp"
+#include "ANTLRParser.hpp"
+
+#include "ParserException.h"
+
+// <libwccl>
+// Syntax checker: feeds an expression to ANTLRLexer/ANTLRParser and reports on std::cerr.
+class Parser {
+public:
+    explicit Parser(const Corpus2::Tagset&);
+    ~Parser();
+
+    // Parse an expression given as a string or as an input stream (no value is returned)
+    // FIXME
+    void parseExpression(const std::string&) const;
+    void parseExpression(std::istream& ) const;
+
+private:
+    const Corpus2::Tagset &tagset;
+};
+
+#endif // PARSER_H
diff --git a/libwccl/parser/ParserException.cpp b/libwccl/parser/ParserException.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..48aca90a0f3005eb7f38d16b078aaa22a8a93109
--- /dev/null
+++ b/libwccl/parser/ParserException.cpp
@@ -0,0 +1,6 @@
+#include "ParserException.h"
+
+ParserException::ParserException(const std::string& message)
+    : msg(message)
+{
+}
diff --git a/libwccl/parser/ParserException.h b/libwccl/parser/ParserException.h
new file mode 100644
index 0000000000000000000000000000000000000000..b74923295ed48349a08f343a7734e5e173694bc8
--- /dev/null
+++ b/libwccl/parser/ParserException.h
@@ -0,0 +1,18 @@
+#ifndef PARSEREXCEPTION_H
+#define PARSEREXCEPTION_H
+
+#include <string>
+
+class ParserException
+{
+public:
+    explicit ParserException(const std::string& msg);
+
+    // Returns a copy of the message given at construction
+    std::string info() const { return this->msg; }
+
+private:
+    std::string msg;
+};
+
+#endif // PARSEREXCEPTION_H
diff --git a/libwccl/parser/grammar.g b/libwccl/parser/grammar.g
new file mode 100644
index 0000000000000000000000000000000000000000..b768eff1de0589b4e78d3d9554d01cae33d24636
--- /dev/null
+++ b/libwccl/parser/grammar.g
@@ -0,0 +1,498 @@
+header {
+    #include "ParserException.h"
+
+    // libwccl
+
+    #include <cstdio>
+    #include <antlr/Token.hpp>
+    #include <boost/lexical_cast.hpp>
+}
+
+options {
+    language = "Cpp";
+}
+
+// ----------------------------------------------------------------------------------
+// ANTLR PARSER
+// ----------------------------------------------------------------------------------
+class ANTLRParser extends Parser;
+options {
+    k = 6;
+    exportVocab = ANTLRExpr;
+    buildAST = false;
+    defaultErrorHandler = false;
+}
+{
+private:
+    // Helpers turning a token reference into its text / the atoi() value of its text
+    std::string token_ref_to_std_string(const antlr::RefToken& rstr) {
+        return rstr->getText();
+    }
+    int token_ref_to_int(const antlr::RefToken& rstr) {
+        return atoi(this->token_ref_to_std_string(rstr).c_str());
+    }
+}
+/////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////
+// ----------------------------------------------------------------------------------
+// Start rule: tries every top-level construct and prints its description to stderr
+start_rules
+{
+    std::string name = "";
+}
+    : values_ref [name] { fprintf(stderr, "%s\n", name.c_str()); }
+    | position_op [name] { fprintf(stderr, "%s\n", name.c_str()); }
+    | filters_op [name] { fprintf(stderr, "%s\n", name.c_str()); }
+    | setvar_op [name] { fprintf(stderr, "%s\n", name.c_str()); }
+    | string_operators [name] { fprintf(stderr, "%s\n", name.c_str()); }
+    | boolean_op [name] { fprintf(stderr, "%s\n", name.c_str()); }
+    ;
+
+// ---------------------------------------------------------------------------------
+// values:
+// Values can be used for setvar(...,..)
+values [std::string& name]:
+    position [name]
+    | str_set [name]
+    | sym_set [name]
+    | boolean [name]
+    ;
+//
+// Values reference => values + position_ref
+// !! Cannot use for setvar(...,...) !!
+values_ref [std::string& name]:
+    values [name]
+    | position_ref [name]
+    | boolean_ref [name]
+    ;
+/////////////////////////////////////////////////////////////////////////////////////
+// Position: $0name; name receives the text of the SYMBOL
+position [std::string& name]:
+    DOLLAR "0" n: SYMBOL { name = token_ref_to_std_string(n); }
+    ;
+// NOTE(review): only SYMBOL sets testLiterals in the lexer, so 0 is presumably lexed as INT and never as the "0" literal - confirm
+// Position reference: $(0-9)+name; name receives the text of the SYMBOL
+// !! Cannot use for setvar(...,...) !!
+position_ref [std::string& name]:
+    DOLLAR INT n: SYMBOL { name = token_ref_to_std_string(n); }
+    ;
+/////////////////////////////////////////////////////////////////////////////////////
+// String set: $name; name receives the text of the SYMBOL
+str_set [std::string& name]:
+    DOLLAR n: SYMBOL { name = token_ref_to_std_string(n); }
+    ;
+/////////////////////////////////////////////////////////////////////////////////////
+// Sym set: $$name; name receives the text of the SYMBOL
+sym_set [std::string& name]:
+    DOLLAR DOLLAR n: SYMBOL { name = token_ref_to_std_string(n); }
+    ;
+/////////////////////////////////////////////////////////////////////////////////////
+// Bool: $?name; name receives the text of the SYMBOL
+boolean [std::string& name]:
+    DOLLAR Q_MARK n1: SYMBOL { name = token_ref_to_std_string(n1); }
+    ;
+// Boolean reference: $!name; name receives the text of the SYMBOL
+boolean_ref [std::string& name]:
+    DOLLAR E_MARK n1: SYMBOL { name = token_ref_to_std_string(n1); }
+    ;
+
+/////////////////////////////////////////////////////////////////////////////////////
+// OPERATORS
+/////////////////////////////////////////////////////////////////////////////////////
+// ----------------------------------------------------------------------------------
+// Positions operator
+// TODO: range takes position_ref. Should it be changed to position?
+position_op [std::string& name]
+{
+    std::string r1, r2;
+}
+    : "flex" LBRACKET position_ref [name] RBRACKET
+    | "range" LPAREN s: SYMBOL COMMA position_ref [r1] COMMA position_ref [r2] RPAREN
+        { name = ("Range operator from " + token_ref_to_std_string(s) + " [" + r1 + ":" + r2 + "]!"); }
+    ;
+// ----------------------------------------------------------------------------------
+// Filtering operator: builds a textual description of catflt(...) / agrflt(...) in name
+filters_op [std::string& name]
+{
+    std::string p, p2, e1, e2;
+}
+    : "catflt" LPAREN position_ref [p] COMMA es_any [e1] COMMA es_any [e2] RPAREN
+        { name = ( "Catflt operator in position " + p + " for sets " + e1 + " " + e2); }
+    | "agrflt" LPAREN position_ref [p] COMMA position_ref [p2] COMMA es_any [e1] COMMA i: INT RPAREN
+        { name = ( "Agrflt operator p1 " + p + " p2 " + p2 + " for set " + e1 + " aggr_attrs " + token_ref_to_std_string(i)); }
+    ;
+
+// ----------------------------------------------------------------------------------
+// Setvar operator: one alternative per variable kind (position, boolean, string set, sym set)
+setvar_op [std::string& value]
+    : setvar_pos [value]
+    | setvar_bool [value]
+    | setvar_sset [value]
+    | setvar_tset [value]
+    ;
+
+// setvar for a position takes position_ref -> TODO check why the grammar
+// does not cover "setvar" LPAREN position COMMA position_v RPAREN
+setvar_pos [std::string& value]
+    : "setvar" LPAREN position_ref [value] COMMA position_v [value] RPAREN
+//  : "setvar" LPAREN position [value] COMMA position_v [value] RPAREN
+    ;
+
+setvar_bool [std::string& value]
+    : "setvar" LPAREN boolean [value] COMMA boolean_v [value] RPAREN
+    ;
+
+setvar_sset [std::string& value]
+    : "setvar" LPAREN str_set [value] COMMA str_set_v [value] RPAREN
+    ;
+
+setvar_tset [std::string& value]
+    : "setvar" LPAREN sym_set [value] COMMA sym_set_v [value] RPAREN
+    ;
+
+// ----------------------------------------------------------------------------------
+// Values
+/////////////////////////////////////////////////////////////////////////////////////
+// boolean value: True, False or a boolean variable ($?name)
+boolean_v [std::string& value]
+    : "True" { value = "True"; }
+    | "False" { value = "False"; }
+    | boolean[value]
+    ;
+/////////////////////////////////////////////////////////////////////////////////////
+// position value: an INT, begin, end, nowhere or a position variable
+position_v [std::string& value]
+    : i: INT { value = token_ref_to_std_string(i); }
+    | "begin" { value = "begin"; }
+    | "end" { value = "end"; }
+    | "nowhere" { value = "nowhere"; }
+    | position [value]
+    ;
+/////////////////////////////////////////////////////////////////////////////////////
+// contents of a string set: each STRING is appended before recursing, so value keeps source order
+str_set_in [std::string& value]
+    : v1: STRING { value += token_ref_to_std_string(v1); }
+    | v2: STRING { value += (token_ref_to_std_string(v2) + ", "); } COMMA str_set_in [value]
+    ;
+// string set [] ['a'] ['a', 'b'] ["a"] ["a", "b"] ['a', "b"]
+str_set_v [std::string& value]
+    : LBRACKET RBRACKET
+    | LBRACKET str_set_in[value] RBRACKET
+    ;
+/////////////////////////////////////////////////////////////////////////////////////
+// elements of a sym set starting with a plain symbol; appended to value in source order
+sym_set_elem_s [std::string& value]
+    : s1: SYMBOL { value += token_ref_to_std_string(s1); }
+    | s2: SYMBOL { value += (token_ref_to_std_string(s2) + ", "); } COMMA sym_set_elem_s [value]
+    | s3: SYMBOL { value += (token_ref_to_std_string(s3) + ", "); } COMMA sym_set_elem_g [value]
+    ;
+// elements of a sym set starting with a symbol in gravis marks; appended to value in source order
+sym_set_elem_g [std::string& value]
+    : G_MARK s1: SYMBOL G_MARK { value += token_ref_to_std_string(s1); }
+    | G_MARK s2: SYMBOL G_MARK { value += (token_ref_to_std_string(s2) + ", "); } COMMA sym_set_elem_g [value]
+    | G_MARK s3: SYMBOL G_MARK { value += (token_ref_to_std_string(s3) + ", "); } COMMA sym_set_elem_s [value]
+    ;
+// contents of a sym set: starts with either kind of element
+sym_set_in [std::string& value]
+    : sym_set_elem_s [value]
+    | sym_set_elem_g [value]
+    ;
+// sym set {} {a} {a, b}
+sym_set_v [std::string& value]
+    : LCURLY RCURLY
+    | LCURLY sym_set_in [value] RCURLY
+    ;
+
+/////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////
+// internal values: a literal is a STRING or a SYMBOL; value receives its text
+v_literal [std::string& value]
+    : s1: STRING { value = token_ref_to_std_string(s1); }
+    | s2: SYMBOL { value = token_ref_to_std_string(s2); }
+    ;
+
+/////////////////////////////////////////////////////////////////////////////////////
+// constants
+// set of values: [] or [lit, lit, ...]; v ends up holding the last literal only
+s_literal [std::string& v]
+    : LBRACKET ((v_literal[v]) (COMMA v_literal [v])*)? RBRACKET
+    ;
+
+// comma-separated predicates
+seq_et [std::string& v]:
+    et_any [v] (COMMA et_any [v])*
+    ;
+
+es_any [std::string& v]:
+    s_literal [v]
+    | es_op [v]
+    ;
+
+es_op [std::string& v]:
+    position_ref [v]
+    | filters_op [v]
+    ;
+
+et_bool [std::string& v]:
+    boolean [v]
+    | boolean_op [v]
+    ;
+
+// set relations: v becomes "<relation> <left> <right>"
+et_set [std::string& v]
+{
+    std::string v1, v2;
+}
+    : "in" LPAREN es_any [v1] COMMA es_any [v2] RPAREN { v = ("in " + v1 + " " + v2); }
+    | "inter" LPAREN es_any [v1] COMMA es_any [v2] RPAREN { v = ("inter " + v1 + " " + v2); }
+    | "equal" LPAREN es_any [v1] COMMA es_any [v2] RPAREN { v = ("equal " + v1 + " " + v2); }
+    ;
+
+et_string [std::string& v]
+    : "isbig" LPAREN es_any [v] RPAREN
+    | "hasnum" LPAREN es_any [v] RPAREN
+    ;
+
+et_action [std::string& v]
+{
+    std::string v1, v2;
+}
+    : "delete" LPAREN et_any [v] RPAREN
+    | "select" LPAREN et_any [v] RPAREN
+    | "relabel" LPAREN es_any [v1] COMMA et_any [v2] RPAREN { v = ("relabel " + v1 + " " + v2); }
+    | "unify" LPAREN es_any [v1] COMMA i: INT RPAREN { v = ("unify " + v1 + " on position " + token_ref_to_std_string(i)); }
+    | "mark" LPAREN s1: SYMBOL RPAREN { v = ("mark " + token_ref_to_std_string(s1)); }
+    | "unmark" LPAREN s2: SYMBOL RPAREN { v = ("unmark " + token_ref_to_std_string(s2)); }
+    | "startnew" LPAREN s3: SYMBOL RPAREN { v = ("startnew " + token_ref_to_std_string(s3)); }
+    | "lextend" LPAREN s4: SYMBOL RPAREN { v = ("lextend " + token_ref_to_std_string(s4)); }
+    | "rextend" LPAREN s5: SYMBOL RPAREN { v = ("rextend " + token_ref_to_std_string(s5)); }
+    ;
+
+et_iter [std::string& v]
+{
+    std::string v1, v2, v3, v4;
+}
+    : "only" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN
+    | "atleast" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] COMMA i:INT RPAREN
+    | "llook" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN
+    | "rlook" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN
+    | "setvar" LPAREN position_ref [v1] COMMA position_ref [v2] RPAREN
+    | "lskip" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] COMMA et_any [v3] RPAREN
+    | "lphrase" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] RPAREN
+    | "rphrase" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] RPAREN
+    | "accept" LPAREN seq_et [v1] RPAREN
+    ;
+
+// predicates checking agreement (name is not modified by this rule)
+et_agr [std::string& name]
+{
+    std::string p1, p2, v;
+}
+    : "agr" LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA i1: INT RPAREN
+    | "agrpp" LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA i2: INT RPAREN
+    | "wagr" LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA i3: INT RPAREN
+    ;
+
+// annotation checking predicates
+et_annot [std::string& v]
+    : "phrase" LPAREN position_ref [v] COMMA s: SYMBOL RPAREN
+    ;
+
+// constraints: any single predicate
+et_any [std::string& v]
+    : et_bool [v]
+    | et_set [v]
+    | et_string [v]
+    | et_action [v]
+    | et_iter [v]
+    | et_agr [v]
+    | et_annot [v]
+    ;
+
+/////////////////////////////////////////////////////////////////////////////////////
+// OPERATORS
+/////////////////////////////////////////////////////////////////////////////////////
+// Operators returning str_set: orth[$-2P]
+string_operators [std::string& name]
+    : op_orth [name]
+    | op_base [name]
+    | op_lower [name]
+    | op_upper [name]
+    | op_affix [name]
+;
+
+op_orth [std::string& name]
+    : "orth" LBRACKET position_ref [name] RBRACKET { name = "Orth operator!"; }
+    ;
+
+op_base [std::string& name]
+    : "base" LBRACKET position_ref [name] RBRACKET { name = "Base operator!"; }
+    ;
+
+op_lower [std::string& name]
+    : "lower" LPAREN str_set [name] RPAREN { name = "Lower operator!"; }
+    | "lower" LPAREN str_set_v [name] RPAREN { name = "Lower operator!"; }
+    ;
+
+op_upper [std::string& name]
+    : "upper" LPAREN str_set [name] RPAREN { name = "Upper operator!"; }
+    | "upper" LPAREN str_set_v [name] RPAREN { name = "Upper operator!"; }
+    ;
+
+op_affix [std::string& name]
+    : "affix" LPAREN str_set [name] COMMA n1: INT RPAREN { name = "Affix operator " + token_ref_to_std_string(n1) + "!"; }
+    | "affix" LPAREN str_set_v [name] COMMA n2: INT RPAREN { name = "Affix operator " + token_ref_to_std_string(n2) + "!"; }
+    ;
+
+boolean_op [std::string& name]
+    : "and" LPAREN seq_et [name] RPAREN
+    | "not" LPAREN seq_et [name] RPAREN
+    | "or" LPAREN seq_et [name] RPAREN
+    ;
+
+// ----------------------------------------------------------------------------------
+// ANTLR LEXER
+// ----------------------------------------------------------------------------------
+class ANTLRLexer extends Lexer;
+options {
+    k = 2;
+    exportVocab = ANTLRExpr;
+    charVocabulary = '\3'..'\377';
+    testLiterals = false;
+}
+
+STRING
+options {
+    paraphrase = "a string";
+}
+    : '"' (~'"')* '"'
+    | '\'' (~'\'')* '\''
+    ;
+
+INT
+options {
+    paraphrase = "Integer";
+}
+    : ('-'|'+')?('0'..'9')+
+    ;
+
+Q_MARK
+options {
+    paraphrase = "Query mark";
+}
+    : '?'
+    ;
+
+E_MARK
+options {
+    paraphrase = "Exclamation mark";
+}
+    : '!'
+    ;
+
+G_MARK
+options {
+    paraphrase = "Gravis mark";
+}
+    : '`'
+    ;
+
+LBRACKET
+options {
+    paraphrase = "'['";
+}
+    : '['
+    ;
+
+RBRACKET
+options {
+    paraphrase = "']'";
+}
+    : ']'
+    ;
+
+LPAREN
+options {
+    paraphrase = "'('";
+}
+    : '('
+    ;
+
+RPAREN
+options {
+    paraphrase = "')'";
+}
+    : ')'
+    ;
+
+LCURLY
+options {
+    paraphrase = "'{'";
+}
+    : '{'
+    ;
+
+RCURLY
+options {
+    paraphrase = "'}'";
+}
+    : '}'
+    ;
+
+DOLLAR
+options {
+    paraphrase = "'$'";
+}
+    : '$'
+    ;
+
+AT_MARK
+options {
+    paraphrase = "'@'";
+}
+    : '@'
+    ;
+
+COMMA
+options {
+    paraphrase = "','";
+}
+    : ','
+    ;
+
+SYMBOL
+options {
+    paraphrase = "symbol";
+    testLiterals = true;
+}
+    : ( 'a'..'z' | 'A'..'Z' | '_' ) ( 'a'..'z' | 'A'..'Z' | '_' | '0'..'9' )*
+    ;
+
+WS
+    :
+    ( ' '
+    | '\t'
+    | '\r' '\n' {newline(); }
+    | '\n' {newline(); } ) { $setType(antlr::Token::SKIP); }
+    ;
+
+COMMENT
+options {
+    paraphrase = "Comment";
+}
+    : "//" (~'\n')* '\n'{ $setType(antlr::Token::SKIP); newline(); }
+    ;
+
+HASH
+options {
+    paraphrase = "'#'";
+}
+    : '#'
+    ;
+
+DSEPARATOR
+options {
+    paraphrase = "':-'";
+}
+    : ":-"
+    ;