Newer
Older
//don't try to add all the headers inside our namespace
#include <libwccl/parser/ParserException.h>
#include <cstdio>
#include <antlr/Token.hpp>
#include <boost/lexical_cast.hpp>
// values/variables
#include <libwccl/variables.h>
#include <libwccl/values/bool.h>
#include <libwccl/values/tset.h>
#include <libwccl/values/strset.h>
#include <libwccl/values/position.h>
// sentence context
#include <libwccl/sentencecontext.h>
// operators
#include <libwccl/ops/operator.h>
#include <libwccl/ops/functions/constant.h>
#include <libwccl/ops/functions/vargetter.h>
#include <libwccl/ops/functions/conditional.h>
#include <libwccl/ops/functions/bool/varsetter.h>
ilor
committed
#include <libwccl/ops/functions/bool/predicates/debug.h>
#include <libwccl/ops/functions/bool/predicates/or.h>
#include <libwccl/ops/functions/bool/predicates/nor.h>
#include <libwccl/ops/functions/bool/predicates/and.h>
#include <libwccl/ops/functions/bool/predicates/regex.h>
#include <libwccl/ops/functions/bool/predicates/intersects.h>
#include <libwccl/ops/functions/bool/predicates/issubsetof.h>
#include <libwccl/ops/functions/bool/predicates/isinside.h>
#include <libwccl/ops/functions/bool/predicates/isoutside.h>
#include <libwccl/ops/functions/bool/predicates/equals.h>
#include <libwccl/ops/functions/bool/predicates/weakagreement.h>
#include <libwccl/ops/functions/bool/predicates/pointagreement.h>
#include <libwccl/ops/functions/bool/predicates/strongagreement.h>
#include <libwccl/ops/functions/strset/affix.h>
#include <libwccl/ops/functions/strset/getorth.h>
#include <libwccl/ops/functions/strset/toupper.h>
#include <libwccl/ops/functions/strset/tolower.h>
#include <libwccl/ops/functions/strset/getlemmas.h>
#include <libwccl/ops/functions/tset/getsymbols.h>
#include <libwccl/ops/functions/tset/getwordclass.h>
#include <libwccl/ops/functions/tset/getsymbolsinrange.h>
#include <libwccl/ops/functions/position/relativeposition.h>
#include <libwccl/ops/functions/position/lasttoken.h>
#include <libwccl/ops/functions/position/firsttoken.h>
#include <libwccl/ops/functions/bool/iterations/only.h>
#include <libwccl/ops/functions/bool/iterations/atleast.h>
#include <libwccl/ops/functions/bool/iterations/leftlook.h>
#include <libwccl/ops/functions/bool/iterations/rightlook.h>
ilor
committed
#include <libwccl/ops/matchrule.h>
#include <libwccl/ops/rulesequence.h>
//
#include <libwccl/ops/tagactions/unify.h>
#include <libwccl/ops/tagactions/delete.h>
#include <libwccl/ops/tagactions/select.h>
#include <libwccl/ops/tagactions/relabel.h>
#include <libwccl/ops/tagactions/mark.h>
#include <libwccl/ops/tagactions/unmark.h>
// Match operators
#include <libwccl/values/tokenmatch.h>
#include <libwccl/values/annotationmatch.h>
#include <libwccl/values/matchvector.h>
#include <libwccl/ops/match/applyoperator.h>
#include <libwccl/ops/match/conditions/optionalmatch.h>
#include <libwccl/ops/match/conditions/repeatedmatch.h>
#include <libwccl/ops/match/conditions/isannotatedas.h>
#include <libwccl/ops/match/conditions/matchtext.h>
#include <libwccl/ops/match/conditions/conjconditions.h>
#include <libwccl/ops/match/conditions/tokencondition.h>
#include <libwccl/ops/match/conditions/oneof.h>
#include <libwccl/ops/match/actions/markmatch.h>
#include <libwccl/ops/match/actions/unmarkmatch.h>
#include <libwccl/ops/functions/match/submatch.h>
// Unicode String
#include <unicode/uniset.h>
#include <unicode/unistr.h>
ANTLR_BEGIN_NAMESPACE(Wccl)
Paweł Kędzia
committed
genHashLines = false;
namespace = "Wccl";
// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
Paweł Kędzia
committed
exportVocab = ANTLRExpr;
const UnicodeString token_ref_to_ustring(antlr::RefToken& rstr) const {
return UnicodeString::fromUTF8(((antlr::Token*)rstr)->getText().c_str()).unescape();
Paweł Kędzia
committed
const UnicodeString str_token_ref_to_ustring(antlr::RefToken& rstr) const {
UnicodeString ret_ustr, ustr = token_ref_to_ustring(rstr);
if (ustr.length() < 3) {
return "";
}
ustr.extract(1, ustr.length() - 2, ret_ustr);
return ret_ustr;
}
Paweł Kędzia
committed
//
const std::string str_token_rem_grav(antlr::RefToken& rstr) const {
size_t len = 0;
std::string ret = token_ref_to_std_string(rstr);
if ((len = ret.length()) < 2) {
return ret;
}
if (ret[0] == '`' && ret[len - 1] == '`') {
return ret.substr(1, len - 2);
}
return ret;
}
//
const std::string token_ref_to_std_string(antlr::RefToken& rstr) const {
return (((antlr::Token*)rstr)->getText());
//
int token_ref_to_int(antlr::RefToken& rstr) {
return atoi(((antlr::Token*)rstr)->getText().c_str());
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------
// Rule for parsing string set operator with scope.
// Returns boost::shared_ptr<Operator<StrSet> >
parse_strset_operator
Paweł Kędzia
committed
[const Corpus2::Tagset &tagset]
returns [boost::shared_ptr<Operator<StrSet> > res]
Variables vars;
boost::shared_ptr<Function<StrSet> > body;
: body = strset_operator [tagset, vars] {
res.reset(new Operator<StrSet>(body, vars));
// ----------------------------------------------------------------------------
// Rule for parsing bool operator with scope.
// Returns boost::shared_ptr<Operator<Bool> >
parse_bool_operator
Paweł Kędzia
committed
[const Corpus2::Tagset &tagset]
returns [boost::shared_ptr<Operator<Bool> > res]
Variables vars;
boost::shared_ptr<Function<Bool> > body;
: body = bool_operator [tagset, vars] {
res.reset(new Operator<Bool>(body, vars));
// ----------------------------------------------------------------------------
// Rule for parsing symbol set operator with scope.
// Returns boost::shared_ptr<Operator<TSet> >
parse_symset_operator
Paweł Kędzia
committed
[const Corpus2::Tagset &tagset]
returns [boost::shared_ptr<Operator<TSet> > res]
Variables vars;
boost::shared_ptr<Function<TSet> > body;
: body = symset_operator [tagset, vars] {
res.reset(new Operator<TSet>(body, vars));
// ----------------------------------------------------------------------------
// Rule for parsing position operator with scope.
// Returns boost::shared_ptr<Operator<Position> >
parse_position_operator
returns [boost::shared_ptr<Operator<Position> > res]
Variables vars;
boost::shared_ptr<Function<Position> > body;
: body = position_operator [tagset, vars] {
res.reset(new Operator<Position>(body, vars));
// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// Rule for parsing single WCCL Rule
parse_single_rule
[const Corpus2::Tagset &tagset]
{
Variables vars;
}
: rle = rule [tagset, vars]
;
// Rule for parsing rules section in the wccl file
// Returns boost::shared_ptr<RuleSequence>
parse_rule_sequence
[const Corpus2::Tagset& tagset]
returns [boost::shared_ptr<RuleSequence> rule_seq]
{
Variables vars;
}
: rule_seq = rules[tagset, vars]
;
// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// Rule for parsing the match rules
ilor
committed
// Returns boost::shared_ptr<MatchRule>
parse_match_rule
[const Corpus2::Tagset& tagset]
ilor
committed
returns [boost::shared_ptr<MatchRule> ret_match]
vars.get_put<Match>("_M");
}
: ret_match = match_rule_operator[tagset, vars]
;
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// VALUES
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------
// Single or muliple (comma separated) elements in string set, may be:
// 'a' "a" [] ['a'] ['a', 'b'] ["a"] ["a", "b"] ['a', "b"]
// Parsing strset literal and returning plain strset value.
// Returns boost::shared_ptr<StrSet>
strset_literal
returns [boost::shared_ptr<StrSet> s_set]
Paweł Kędzia
committed
{
s_set.reset(new StrSet());
Paweł Kędzia
committed
}
: s0: STRING {
s_set->insert(token_ref_to_ustring(s0));
| LBRACKET
(
s1: STRING {
s_set->insert(token_ref_to_ustring(s1));
}
(
COMMA s2: STRING {
s_set->insert(token_ref_to_ustring(s2));
}
)*
)?
RBRACKET
;
// String set value as constrant string set:
// Returns boost::shared_ptr<Constant<StrSet> >
strset_value
returns [boost::shared_ptr<Constant<StrSet> > val]
boost::shared_ptr<StrSet> set;
: set = strset_literal {
val.reset(new Constant<StrSet>(*set));
Paweł Kędzia
committed
// ----------------------------------------------------------------------------
// Element of sym set. This rule, inserts element into symbol set
// with corresponding tagset.
// WARNING! This rule can throw ParserException! Be careful!
symset_elem
[const Corpus2::Tagset& tagset, boost::shared_ptr<TSet>& t_set]
}
catch(Corpus2::TagParseError &e) {
throw(ParserException(e.info()));
}
// Symset literal. Symset element may be:
// a, `a ` (this is guaranteed by lexer rule - SYMBOL) or {a} {`a`} {a, b}
// {`a`, `b`} {a, `b`} {`a`, b}
// Parsing symset literal and returning plain symset value.
// Returns boost::shared_ptr<TSet>
symset_literal
returns [boost::shared_ptr<TSet> t_set]
Paweł Kędzia
committed
{
t_set.reset(new TSet());
Paweł Kędzia
committed
}
: symset_elem [tagset, t_set]
(
symset_elem [tagset, t_set] (COMMA symset_elem [tagset, t_set])*
)?
// Symset value, as constant symbol set
// Returns boost::shared_ptr<Constant<TSet> >
symset_value
returns [boost::shared_ptr<Constant<TSet> > val]
boost::shared_ptr<TSet> set;
: set = symset_literal [tagset] {
val.reset(new Constant<TSet>(*set));
Paweł Kędzia
committed
// ----------------------------------------------------------------------------
// Bool literal. May be True or False. Parsing bool literal and returning
// plain bool value.
// Returns boost::shared_ptr<Bool>
Paweł Kędzia
committed
bool_literal
returns [boost::shared_ptr<Bool> val]
: "True" { val.reset(new Bool(Bool(true ))); }
| "False" { val.reset(new Bool(Bool(false))); }
Paweł Kędzia
committed
;
// Bool value, as constat bool Value
// Returns boost::shared_ptr<Constant<Bool> >
bool_value
returns [boost::shared_ptr<Constant<Bool> > val]
Paweł Kędzia
committed
{
boost::shared_ptr<Bool> bool_lit;
Paweł Kędzia
committed
}
: bool_lit = bool_literal {
val.reset(new Constant<Bool>(*bool_lit));
Paweł Kędzia
committed
}
Paweł Kędzia
committed
// ----------------------------------------------------------------------------
// Position literal may be:
// (+|-)?(0-9)+ or begin or end or nowhere
// Parsing position literal and returning plain position value.
// returns boost::shared_ptr<Position>
Paweł Kędzia
committed
position_literal
returns [boost::shared_ptr<Position> val]
val.reset(new Position(Position(i)));
val.reset(new Position(Position(Position::Begin)));
Paweł Kędzia
committed
}
val.reset(new Position(Position(Position::End)));
val.reset(new Position(Position(Position::Nowhere)));
Paweł Kędzia
committed
}
;
// Position as constant position value
// Returns boost::shared_ptr<Constant<Position> >
Paweł Kędzia
committed
position_value
returns [boost::shared_ptr<Constant<Position> > val]
Paweł Kędzia
committed
{
boost::shared_ptr<Position> pos_lit;
Paweł Kędzia
committed
}
: pos_lit = position_literal {
val.reset(new Constant<Position>(*pos_lit));
// ----------------------------------------------------------------------------
// Value used into match operator such as TOK[position] and ANN[position, name]
// Returns boost::shared_ptr<Match>
Adam Wardynski
committed
match_literal
returns [boost::shared_ptr<Match> val]
{
boost::shared_ptr<MatchData> m;
}
Adam Wardynski
committed
: m = match_data_literal {
val.reset(new Match(m));
}
;
// Constant match value
// Returns boost::shared_ptr<Constant<Match> >
match_value_const
returns [boost::shared_ptr<Constant<Match> > val]
{
boost::shared_ptr<Match> m;
}
Adam Wardynski
committed
: m = match_literal {
val.reset(new Constant<Match>(*m));
}
;
// ----------------------------------------------------------------------------
// Value used into match operator such as TOK[position] and ANN[position, name]
// Returns boost::shared_ptr<MatchData>
Adam Wardynski
committed
match_data_literal
returns [boost::shared_ptr<MatchData> val]
Adam Wardynski
committed
: val = token_match_literal
| val = ann_match_literal
| val = match_vector_literal
Adam Wardynski
committed
// token match literal - TOK[position]
// Returns boost::shared_ptr<TokenMatch>
Adam Wardynski
committed
token_match_literal
returns [boost::shared_ptr<TokenMatch> val]
{
boost::shared_ptr<Position> p;
}
: "TOK" LBRACKET p = position_literal RBRACKET {
val.reset(new TokenMatch(*p));
}
;
Adam Wardynski
committed
// annotation match literal - ANN[position, name]
// Returns boost::shared_ptr<AnnotationMatch>
Adam Wardynski
committed
ann_match_literal
returns [boost::shared_ptr<AnnotationMatch> val]
{
boost::shared_ptr<Position> p;
}
: "ANN" LBRACKET p = position_literal COMMA channel : STRING RBRACKET {
val.reset(new AnnotationMatch(*p, token_ref_to_std_string(channel)));
}
;
Adam Wardynski
committed
// annotation match vector literal: MATCH() or MATCH(token, ann, MATCH())
// Returns boost::shared_ptr<MatchVector>
Adam Wardynski
committed
match_vector_literal
returns [boost::shared_ptr<MatchVector> val]
{
val.reset(new MatchVector());
}
Adam Wardynski
committed
: "MATCH" LPAREN (match_vector_literal_item[val])? RPAREN
;
// Body of the MATCH value. It only adds vector items to the MatchVector
// Item may be single or multiple
Adam Wardynski
committed
match_vector_literal_item [boost::shared_ptr<MatchVector>& mvector]
{
boost::shared_ptr<Match> m_val;
}
Adam Wardynski
committed
: m_val = match_literal {
mvector->append(m_val);
}
(
COMMA
Adam Wardynski
committed
m_val = match_literal {
mvector->append(m_val);
}
)*
;
// ----------------------------------------------------------------------------
// Number may be unsigned or signed: 1, +1, -1
number
returns [int ret]
{
ret = 0;
}
: s: SIGNED_INT { ret = token_ref_to_int(s); }
| u: UNSIGNED_INT { ret = token_ref_to_int(u); }
;
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// VARIABLES
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------
// Position: $Name
// Get position variable (however, before put into) from variables
// Returns boost::shared_ptr<VariableAccessor<Position> >
[Variables& vars]
returns [boost::shared_ptr<VariableAccessor<Position> > pos_acc]
Paweł Kędzia
committed
: POS_PREFIX n: SYMBOL {
vars.get_put<Position>(str_token_rem_grav(n));
VariableAccessor<Position> acc =
vars.create_accessor<Position>(str_token_rem_grav(n));
pos_acc.reset(new VariableAccessor<Position>(acc));
// VarGetter for Position variable. This rule wrapped position_variable_acc.
// Returs boost::shared_ptr<VarGetter<Position> >
[Variables& vars]
returns [boost::shared_ptr<VarGetter<Position> > op]
boost::shared_ptr<VariableAccessor<Position> > pos_acc;
}
: pos_acc = position_variable_acc [vars] {
op.reset(new VarGetter<Position>(*pos_acc));
Paweł Kędzia
committed
// ----------------------------------------------------------------------------
Loading full blame...