// ANTLR 2 grammar (C++ actions) defining ANTLRParser and ANTLRLexer for WCCL.
#include <libwccl/parser/ParserException.h>
#include <cstdio>
#include <antlr/Token.hpp>
#include <boost/lexical_cast.hpp>
// values/variables
#include <libwccl/variables.h>
#include <libwccl/values/bool.h>
#include <libwccl/values/tset.h>
#include <libwccl/values/strset.h>
#include <libwccl/values/position.h>
#include <libwccl/values/positionref.h>
// sentence context
#include <libwccl/sentencecontext.h>
// operators
#include <libwccl/ops/and.h>
#include <libwccl/ops/affix.h>
#include <libwccl/ops/toupper.h>
#include <libwccl/ops/tolower.h>
#include <libwccl/ops/constant.h>
#include <libwccl/ops/functions.h>
#include <libwccl/ops/logicalpredicate.h>
// Unicode String
#include <unicode/uniset.h>
#include <unicode/unistr.h>
// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// Parser definition. The options below configure ANTLR code generation.
class ANTLRParser extends Parser;
options {
// Use up to six tokens of lookahead when choosing between alternatives.
k = 6;
// Publish the token vocabulary under the name ANTLRExpr.
exportVocab = ANTLRExpr;
// No AST is built; rules hand their results back through return values.
buildAST = false;
// Do not generate default try/catch error handling inside the rules.
defaultErrorHandler = false;
}
{
private:
	// Converts the text of a token to a UnicodeString: the text is read as
	// UTF-8 and then passed through UnicodeString::unescape().
	const UnicodeString token_ref_to_ustring(antlr::RefToken& rstr) const {
		return UnicodeString::fromUTF8(rstr->getText()).unescape();
	}

	// Returns the text of a token as a std::string.
	const std::string token_ref_to_std_string(antlr::RefToken& rstr) const {
		return rstr->getText();
	}

	// Converts the text of a token to an int with atoi().
	int token_ref_to_int(antlr::RefToken& rstr) {
		return atoi(rstr->getText().c_str());
	}

	// Helper function for processing: builds a Wccl::Constant from the string
	// set that ret_str_set points to and returns it as a generic StrSet
	// function pointer.
	boost::shared_ptr<Wccl::Function<Wccl::StrSet> > get_str_set_expr(
		boost::shared_ptr<Wccl::StrSet> ret_str_set)
	{
		boost::shared_ptr<Wccl::Function<Wccl::StrSet> > strset_expr(
			new Wccl::Constant<Wccl::StrSet>(*ret_str_set)
		);
		return strset_expr;
	}

	// Builds a sentence context from a default-constructed (empty) sentence
	// pointer.
	Wccl::SentenceContext get_tmp_context() {
		boost::shared_ptr<Corpus2::Sentence> sentence;
		Wccl::SentenceContext sc(sentence);
		return sc;
	}
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------------
// Start all rules.
// Legacy prototype that passes std::string names around; kept for reference
// only and disabled by the surrounding block comment.
/*
start_rules
{
	std::string name = "";
}
	: values_ref [name] { fprintf(stderr, "%s\n", name.c_str()); }
	| position_op [name] { fprintf(stderr, "%s\n", name.c_str()); }
	| filters_op [name] { fprintf(stderr, "%s\n", name.c_str()); }
	| setvar_op [name] { fprintf(stderr, "%s\n", name.c_str()); }
	| boolean_op [name] { fprintf(stderr, "%s\n", name.c_str()); }
;
*/
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// "GLOBAL" RULES
// ----------------------------------------------------------------------------
// Entry rule for parsing a string operator. A fresh Wccl::Variables object is
// created for the parse and handed to string_operators, whose result becomes
// the result of this rule.
// Returns boost::shared_ptr<Wccl::Function<Wccl::StrSet> >
parse_string_operator
	returns [boost::shared_ptr<Wccl::Function<Wccl::StrSet> > res]
{
	Wccl::Variables vars;
}
	: res = string_operators [vars]
;
// ----------------------------------------------------------------------------
// Entry rule for parsing a predicate. A fresh Wccl::Variables object is
// created for the parse. The function object is delivered through res, which
// is passed by reference into the predicates rule; the value that rule
// returns is stored locally and not used further here.
// Returns boost::shared_ptr<Wccl::Function<Wccl::Bool> >
parse_predicates
	returns [boost::shared_ptr<Wccl::Function<Wccl::Bool> > res]
{
	Wccl::Variables vars;
	boost::shared_ptr<Wccl::Bool> pred_value;
}
	: pred_value = predicates [vars, res]
;
// ----------------------------------------------------------------------------
// Entry rule for parsing a value. A fresh Wccl::Variables object is created
// for the parse and handed to the values rule.
// Returns boost::shared_ptr<Wccl::Value>
// NOTE(review): the values rule is declared to return a pointer to
// Wccl::Constant<Wccl::Value>; confirm that it converts to a Wccl::Value
// pointer.
parse_values
	returns [boost::shared_ptr<Wccl::Value> ret]
{
	Wccl::Variables vars;
}
	: ret = values [vars]
;
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// All values: a position, a string set, a symbol set or a boolean variable.
// Values can be used for setvar(...,...)
// ----------------------------------------------------------------------------
values
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Constant<Wccl::Value> > res]
	: res = position [vars]
	| res = str_set  [vars]
	| res = sym_set  [vars]
	| res = boolean  [vars]
;
// ----------------------------------------------------------------------------
// Values reference => values + position_ref
// !! Cannot use for setvar(...,...) !!
// Legacy prototype that passes a std::string to rules which now take
// Wccl::Variables; kept for reference only and disabled by the block comment.
/*
values_ref [std::string& name]:
	values [name]
	| position_ref [name]
	| boolean_ref [name]
;
*/
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------
// TODO The implementation does not match this properly! Maybe an additional
// TODO lexer rule is needed?
// Position variable: a dollar sign, the literal "0" and a variable name.
// The variable is obtained from vars with get_put and a Wccl::Constant is
// built from the Position it points to.
position
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Constant<Wccl::Position> > op]
	: DOLLAR "0" var_name: SYMBOL {
		boost::shared_ptr<Wccl::Position> pos =
			vars.get_put<Wccl::Position>(token_ref_to_std_string(var_name));
		op.reset(new Wccl::Constant<Wccl::Position>(*pos.get()));
	}
;
// ----------------------------------------------------------------------------
// Position reference: a dollar sign, an INT token and a variable name.
// !! Cannot use for setvar(...,...) !!
// Builds a Wccl::PositionRef from the Position variable obtained with get_put
// and the integer value of the INT token.
position_ref
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::PositionRef> val]
	: DOLLAR int_tok: INT var_name: SYMBOL {
		val.reset(
			new Wccl::PositionRef(
				vars.get_put<Wccl::Position>(
					token_ref_to_std_string(var_name)
				),
				token_ref_to_int(int_tok)
			)
		);
	}
;
// ----------------------------------------------------------------------------
// String set variable, call examples: $name, $Name, $_name, $_Name etc.
// The StrSet variable with the given name is obtained from vars with get_put
// and a Wccl::Constant is built from the StrSet it points to.
// Returns boost::shared_ptr<Wccl::Constant<Wccl::StrSet> >
str_set
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Constant<Wccl::StrSet> > op]
	: DOLLAR var_name: SYMBOL {
		boost::shared_ptr<Wccl::StrSet> strings =
			vars.get_put<Wccl::StrSet>(token_ref_to_std_string(var_name));
		op.reset(new Wccl::Constant<Wccl::StrSet>(*strings.get()));
	}
;
// ----------------------------------------------------------------------------
// Symbol set variable: $$name
// The TSet variable with the given name is obtained from vars with get_put
// and a Wccl::Constant is built from the TSet it points to.
sym_set
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Constant<Wccl::TSet> > op]
	: DOLLAR DOLLAR var_name: SYMBOL {
		boost::shared_ptr<Wccl::TSet> symbols =
			vars.get_put<Wccl::TSet>(token_ref_to_std_string(var_name));
		op.reset(new Wccl::Constant<Wccl::TSet>(*symbols.get()));
	}
;
// ----------------------------------------------------------------------------
// Boolean variable: $?name
// The Bool variable with the given name is obtained from vars with get_put
// and a Wccl::Constant is built from the Bool it points to.
boolean
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Constant<Wccl::Bool> > op]
{
	boost::shared_ptr<Wccl::Bool> val;
}
	: DOLLAR Q_MARK n: SYMBOL {
		val = vars.get_put<Wccl::Bool>(token_ref_to_std_string(n));
		op.reset(new Wccl::Constant<Wccl::Bool>(*val.get()));
	}
;
// The legacy std::string based rules that follow (boolean_ref through
// setvar_tset) are disabled; the block comment opened here is closed by the
// comment terminator that follows setvar_tset.
/*
boolean_ref [std::string& name]:
DOLLAR E_MARK n1: SYMBOL { name = token_ref_to_std_string(n1); }
;
/////////////////////////////////////////////////////////////////////////////////////
// OPERATORS
/////////////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------------
// Positions operator
// TODO range takes position_ref. ?? Change it to position ??
position_op [std::string& name]
{
std::string r1, r2;
}
: "flex" LBRACKET position_ref [name] RBRACKET
| "range" LPAREN s: SYMBOL COMMA position_ref [r1] COMMA position_ref [r2] RPAREN
{ name = ("Range opertator from " + token_ref_to_std_string(s) + " [" + r1 + ":" + r2 + "]!"); }
;
// ----------------------------------------------------------------------------------
// Filtering operator
filters_op [std::string& name]
{
std::string p, p2, e1, e2;
}
: "catflt" LPAREN position_ref [p] COMMA es_any [e1] COMMA es_any [e2] RPAREN
{ name = ( "Catflt operator in position " + p + " for sets " + e1 + " " + e2); }
| "agrflt" LPAREN position_ref [p] COMMA position_ref [p2] COMMA es_any [e1] COMMA i: INT RPAREN
{ name = ( "Agrflt operator p1 " + p + " p2 " + p2 + " for set " + e1 + " aggr_attrs " + token_ref_to_std_string(i)); }
;
// ----------------------------------------------------------------------------------
// Setvar operator
setvar_op [std::string& value]
: setvar_pos [value]
| setvar_bool [value]
| setvar_sset [value]
| setvar_tset [value]
;
// setvar for a position takes position_ref -> TODO check why
// the grammar does not cover "setvar" LPAREN position COMMA position_v RPAREN
setvar_pos [std::string& value]
: "setvar" LPAREN position_ref [value] COMMA position_v [value] RPAREN
// : "setvar" LPAREN position [value] COMMA position_v [value] RPAREN
;
setvar_bool [std::string& value]
: "setvar" LPAREN boolean [value] COMMA boolean_v [value] RPAREN
;
setvar_sset [std::string& value]
: "setvar" LPAREN str_set [value] COMMA str_set_v [value] RPAREN
;
setvar_tset [std::string& value]
: "setvar" LPAREN sym_set [value] COMMA sym_set_v [value] RPAREN
;
*/
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// VALUES
///////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------
// One or more comma-separated STRING tokens inside a string set literal.
// Each string is converted with token_ref_to_ustring and inserted into s_set;
// in the recursive alternative the leading string is inserted after the rest
// of the list has been processed.
str_set_v_in
	[boost::shared_ptr<Wccl::StrSet>& s_set]
	: last_str: STRING {
		s_set->insert(token_ref_to_ustring(last_str));
	}
	| head_str: STRING COMMA str_set_v_in [s_set] {
		s_set->insert(token_ref_to_ustring(head_str));
	}
;
// String set literal, an unnamed (temporary) StrSet.
// calls: [] ['a'] ['a', 'b'] ["a"] ["a", "b"] ['a', "b"] or variable $A
// NOTE(review): no alternative for the variable form is present in this rule;
// confirm whether one was intended.
// A new StrSet is filled by str_set_v_in (or left empty for "[]") and a
// Wccl::Constant is built from it.
str_set_v
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Constant<Wccl::StrSet> > val]
{
	boost::shared_ptr<Wccl::StrSet> set(new Wccl::StrSet);
}
	: LBRACKET RBRACKET {
		val.reset(new Wccl::Constant<Wccl::StrSet>(*set.get()));
	}
	| LBRACKET str_set_v_in [set] RBRACKET {
		val.reset(new Wccl::Constant<Wccl::StrSet>(*set.get()));
	}
;
// The legacy std::string based symbol set rules that follow (sym_set_elem_s
// through sym_set_v) are disabled; the block comment opened here is closed by
// the comment terminator that follows sym_set_v.
/*
// ----------------------------------------------------------------------------
sym_set_elem_s [std::string& value]
: s1: SYMBOL { value += token_ref_to_std_string(s1); }
| s2: SYMBOL COMMA sym_set_elem_s [value] { value += token_ref_to_std_string(s2); }
| s3: SYMBOL COMMA sym_set_elem_g [value] { value += token_ref_to_std_string(s3); }
;
sym_set_elem_g [std::string& value]
: G_MARK s1: SYMBOL G_MARK { value += token_ref_to_std_string(s1); }
| G_MARK s2: SYMBOL G_MARK COMMA sym_set_elem_g [value] { value += token_ref_to_std_string(s2); }
| G_MARK s3: SYMBOL G_MARK COMMA sym_set_elem_s [value] { value += token_ref_to_std_string(s3); }
;
sym_set_in [std::string& value]
: sym_set_elem_s [value]
| sym_set_elem_g [value]
;
sym_set_v [std::string& value]
: LCURLY RCURLY
| LCURLY sym_set_in [value] RCURLY
;
*/
// ----------------------------------------------------------------------------
// Boolean value: the literal True, the literal False, or a boolean variable.
// NOTE(review): the boolean rule is declared to return a pointer to
// Wccl::Constant<Wccl::Bool> while val points to Wccl::Bool; confirm that the
// last alternative compiles.
boolean_v
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Bool> val]
	: "True" {
		val.reset(new Wccl::Bool(true));
	}
	| "False" {
		val.reset(new Wccl::Bool(false));
	}
	| val = boolean [vars]
;
// ----------------------------------------------------------------------------
// position value:
/*
position_v [std::string& value]
: i: INT { value = token_ref_to_std_string(i); }
| "begin" { value = "begin"; }
| "end" { value = "end"; }
| "nowhere" { value = "nowhere"; }
| position [value]
;
*/
// ----------------------------------------------------------------------------
// Literal text: stores the text of a STRING or of a SYMBOL token in value.
v_literal [std::string& value]
	: str_tok: STRING {
		value = token_ref_to_std_string(str_tok);
	}
	| sym_tok: SYMBOL {
		value = token_ref_to_std_string(sym_tok);
	}
;
/////////////////////////////////////////////////////////////////////////////////////
// constants
// set of values
/*
st::shared_ptr<Wccl::StrSet> ret]s_literal [std::string& v]
*/
/*
es_any [Wccl::Variables& vars] returns [boost::shared_ptr<Wccl::StrSet> ret]
et_set [std::string& v]
{
std::string v1, v2;
}
: "in" LPAREN es_any [v1] COMMA es_any [v2] RPAREN { v = ("name " + v1 + " " + v2); }
| "inter" LPAREN es_any [v1] COMMA es_any [v2] RPAREN { v = ("inter " + v1 + " " + v2); }
| "equal" LPAREN es_any [v1] COMMA es_any [v2] RPAREN { v = ("equal " + v1 + " " + v2); }
;
et_string [std::string& v]
: "isbig" LPAREN es_any [v] RPAREN
| "hasnum" LPAREN es_any [v] RPAREN
;
et_action [std::string& v]
{
std::string v1, v2;
}
: "delete" LPAREN et_any [v] RPAREN
| "select" LPAREN et_any [v] RPAREN
| "relabel" LPAREN es_any [v1] COMMA et_any [v2] RPAREN { v = ("relabel " + v1 + " " + v2); }
| "unify" LPAREN es_any [v1] COMMA i: INT RPAREN { v = ("relabel " + v1 + " on position " + token_ref_to_std_string(i)); }
| "mark" LPAREN s1: SYMBOL RPAREN { v = ("mark " + token_ref_to_std_string(s1)); }
| "unmark" LPAREN s2: SYMBOL RPAREN { v = ("unmark " + token_ref_to_std_string(s2)); }
| "startnew" LPAREN s3: SYMBOL RPAREN { v = ("startnew " + token_ref_to_std_string(s3)); }
| "lextend" LPAREN s4: SYMBOL RPAREN { v = ("lextend " + token_ref_to_std_string(s4)); }
| "rextend" LPAREN s5: SYMBOL RPAREN { v = ("rextend " + token_ref_to_std_string(s5)); }
;
et_iter [std::string& v]
{
std::string v1, v2, v3, v4;
}
: "only" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN
| "atleast" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] COMMA i:INT RPAREN
| "llook" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN
| "rlook" LPAREN position_ref [v1] COMMA position_ref [v2] COMMA position_ref [v3] COMMA et_any [v4] RPAREN
| "setvar" LPAREN position_ref [v1] COMMA position_ref [v2] RPAREN
| "lskip" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] COMMA et_any [v3] RPAREN
| "lphrase" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] RPAREN
| "rphrase" LPAREN position_ref [v1] COMMA SYMBOL COMMA position_ref [v2] RPAREN
| "accept" LPAREN seq_et [v1] RPAREN
;
et_agr [std::string& name]
{
std::string p1, p2, v;
}
: "agr" LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA i1: INT RPAREN
| "agrpp" LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA i2: INT RPAREN
| "wagr" LPAREN position_ref [p1] COMMA position_ref [p2] COMMA es_any [v] COMMA i3: INT RPAREN
;
// annotation checking predicates
et_annot [std::string& v]
: "phrase" LPAREN position_ref [v] COMMA s: SYMBOL RPAREN
;
// constraints
et_any [std::string& v]
: et_bool [v]
| et_set [v]
| et_string [v]
| et_action [v]
| et_iter [v]
| et_agr [v]
| et_annot [v]
;
*/
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// String operators return boost::shared_ptr<Wccl::Function<Wccl::StrSet> >
///////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------
// Any string operator: lower(...), upper(...), affix(...) or a string set
// literal. The orth and base alternatives are commented out.
string_operators
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Function<Wccl::StrSet> > ret]
/*
	: ret = op_orth [vars]
	| ret = op_base [vars]
*/
	: ret = op_lower [vars]
	| ret = op_upper [vars]
	| ret = op_affix [vars]
	| ret = str_set_v [vars]
;
// Implementations of string operators:
// ----------------------------------------------------------------------------
// Matches orth[position_ref]. The action is not implemented yet, so ret is
// never assigned by this rule.
op_orth [Wccl::Variables& vars] returns [boost::shared_ptr<Wccl::StrSet> ret]
{
	boost::shared_ptr<Wccl::PositionRef> tmpPosRef;
}
	: "orth" LBRACKET tmpPosRef = position_ref [vars] RBRACKET {
		// TODO
	}
;
// ----------------------------------------------------------------------------
// Matches base[position_ref]. The action is not implemented yet, so ret is
// never assigned by this rule.
op_base [Wccl::Variables& vars] returns [boost::shared_ptr<Wccl::StrSet> ret]
{
	boost::shared_ptr<Wccl::PositionRef> tmpPosRef;
}
	: "base" LBRACKET tmpPosRef = position_ref [vars] RBRACKET {
		// TODO
	}
;
// ----------------------------------------------------------------------------
// Matches lower(<string operator>) and wraps the inner operator in a
// Wccl::ToLower.
// returns boost::shared_ptr<Wccl::Function<Wccl::StrSet> >
op_lower
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Function<Wccl::StrSet> > ret]
{
	boost::shared_ptr<Wccl::Function<Wccl::StrSet> > o_ret;
}
	: "lower" LPAREN o_ret = string_operators[vars] RPAREN {
		ret.reset(new Wccl::ToLower(o_ret));
	}
;
// ----------------------------------------------------------------------------
// Matches upper(<string operator>) and wraps the inner operator in a
// Wccl::ToUpper.
op_upper
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Function<Wccl::StrSet> > ret]
{
	boost::shared_ptr<Wccl::Function<Wccl::StrSet> > o_ret;
}
	: "upper" LPAREN o_ret = string_operators[vars] RPAREN {
		ret.reset(new Wccl::ToUpper(o_ret));
	}
;
// ----------------------------------------------------------------------------
// Matches affix(<string operator>, INT) and wraps the inner operator in a
// Wccl::Affix, passing the integer value of the INT token as second argument.
op_affix
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Function<Wccl::StrSet> > ret]
{
	boost::shared_ptr<Wccl::Function<Wccl::StrSet> > o_ret;
}
	: "affix" LPAREN o_ret = string_operators[vars] COMMA offset: INT RPAREN {
		ret.reset(new Wccl::Affix(o_ret, token_ref_to_int(offset)));
	}
;
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Predicates returns boost::shared_ptr<Wccl::Function<Wccl::Bool> >
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Any predicate. Currently this only forwards to logical_predicates, passing
// both vars and the by-reference function pointer pr through unchanged.
predicates
	[Wccl::Variables& vars, boost::shared_ptr<Wccl::Function<Wccl::Bool> >& pr]
	returns [boost::shared_ptr<Wccl::Bool> ret]
	:
	ret = logical_predicates [vars, pr]
;
// Implementations of predicates:
// ----------------------------------------------------------------------------
// Logical predicates. Only the "and" form is handled. The assignment to pr is
// commented out, so pr is left untouched by this rule.
logical_predicates
	[Wccl::Variables& vars, boost::shared_ptr<Wccl::Function<Wccl::Bool> >& pr]
	returns [boost::shared_ptr<Wccl::Bool> ret]
{
	boost::shared_ptr<Wccl::LogicalPredicate::BoolFunctionPtr> operands;
}
	: ret = lpred_and [vars] {
		// pr.reset(new Wccl::And(operands));
	}
;
// ----------------------------------------------------------------------------
// Matches and(p1, p2, ...) with one or more comma-separated logical
// predicates. ret is overwritten by every operand, so it ends up holding the
// value returned for the last one; the function pointer of each operand goes
// into a local that is not used afterwards.
lpred_and
	[Wccl::Variables& vars]
	returns [boost::shared_ptr<Wccl::Bool> ret]
{
	boost::shared_ptr<Wccl::Function<Wccl::Bool> > operand_fn;
}
	: "and" LPAREN
		ret = logical_predicates [vars, operand_fn]
		(
			COMMA ret = logical_predicates [vars, operand_fn]
		)*
	  RPAREN
;
/*
lpred_not
lpred_or
*/
// Legacy std::string based prototype of the boolean operators; kept for
// reference only and disabled by the block comment.
/*
boolean_op [std::string& name]
	: "and" LPAREN seq_et [name] RPAREN
	| "not" LPAREN seq_et [name] RPAREN
	| "or" LPAREN seq_et [name] RPAREN
;
*/
// ----------------------------------------------------------------------------------
// ANTLR LEXER
// ----------------------------------------------------------------------------------
// Lexer definition. The options below configure ANTLR code generation.
class ANTLRLexer extends Lexer;
options {
// Use up to two characters of lookahead.
k = 2;
// Publish the token vocabulary under the name ANTLRExpr.
exportVocab = ANTLRExpr;
// Characters the lexer accepts: octal 3 through octal 377.
charVocabulary = '\3'..'\377';
// Literal testing is off by default; the SYMBOL rule switches it on for
// itself.
testLiterals = false;
}
// String literal enclosed in double quotes or in single quotes.
// The enclosing quote characters carry the "!" suffix, so they are matched
// but left out of the token text; the parser actions therefore receive only
// the characters between the quotes.
STRING
options {
	paraphrase = "a string";
}
	: '"'!  (~'"')*  '"'!
	| '\''! (~'\'')* '\''!
;
// STRING_APOS
// options {
// paraphrase = "a string without apostrophe";
// }
// : (~'"')*
// ;
// STRING_QUOT
// options {
// paraphrase = "a string without quotation";
// }
// : (~'\'')*
// ;
// Integer: an optional sign followed by one or more decimal digits.
INT
options { paraphrase = "Integer"; }
	: ( '-' | '+' )? ( '0'..'9' )+
;
// A single-quote character on its own.
// NOTE(review): STRING also starts with this character; confirm that this
// token is still needed.
QUOT_MARK
options {
	paraphrase = "Quote mark";
}
	: '\''
;
// A double-quote character on its own.
// NOTE(review): the token name and paraphrase say apostrophe although the
// rule matches a double quote, and STRING also starts with this character;
// confirm the intended naming and whether this token is still needed.
APOS_MARK
options {
	paraphrase = "Apostrophe mark";
}
	: '"'
;
// ----------------------------------------------------------------------------
// A single question mark.
Q_MARK
options { paraphrase = "Query mark"; }
	: '?'
;
// A single exclamation mark.
E_MARK
options {
	paraphrase = "Exclamation mark";
}
	: '!'
;
// A single grave accent (backtick).
G_MARK
options { paraphrase = "Gravis mark"; }
	: '`'
;

// Opening and closing square brackets.
LBRACKET
options { paraphrase = "'['"; }
	: '['
;

RBRACKET
options { paraphrase = "']'"; }
	: ']'
;

// Opening and closing parentheses.
LPAREN
options { paraphrase = "'('"; }
	: '('
;

RPAREN
options { paraphrase = "')'"; }
	: ')'
;

// Opening and closing curly braces.
LCURLY
options { paraphrase = "'{'"; }
	: '{'
;

RCURLY
options { paraphrase = "'}'"; }
	: '}'
;
// Dollar sign, used as the prefix of variable references in the parser rules.
DOLLAR
options { paraphrase = "'$'"; }
	: '$'
;

// At sign.
AT_MARK
options { paraphrase = "'@'"; }
	: '@'
;

// Comma, used as the argument and element separator in the parser rules.
COMMA
options { paraphrase = "','"; }
	: ','
;
// Identifier: a letter or underscore followed by letters, underscores or
// digits. Literal testing is switched on for this rule.
SYMBOL
options {
	paraphrase = "symbol";
	testLiterals = true;
}
	: ( 'a'..'z' | 'A'..'Z' | '_' )
	  ( 'a'..'z' | 'A'..'Z' | '_' | '0'..'9' )*
;
// Whitespace: a space, a tab, a CR LF pair or a lone LF. Both line-ending
// forms call newline(). The token type is set to SKIP.
WS
	: (	' '
		| '\t'
		| '\r' '\n' { newline(); }
		| '\n'      { newline(); }
	  )
	  { $setType(antlr::Token::SKIP); }
;
// Line comment: two slashes followed by everything up to the end of the line.
// The terminating line feed is optional so that a comment on the last line of
// the input, with no line feed after it, is still accepted. newline() is
// called only when a line feed was actually consumed. The token type is set
// to SKIP.
COMMENT
options {
	paraphrase = "Comment";
}
	: "//" (~'\n')* ( '\n' { newline(); } )? { $setType(antlr::Token::SKIP); }
;
// Hash sign.
HASH
options { paraphrase = "'#'"; }
	: '#'
;

// The two-character separator made of a colon and a minus sign.
DSEPARATOR
options { paraphrase = "':-'"; }
	: ":-"
;