diff --git a/examples/indecl-ops.ccl b/examples/indecl-ops.ccl new file mode 100644 index 0000000000000000000000000000000000000000..e022d32debdbe4e53db42ef989544f38e2bcb443 --- /dev/null +++ b/examples/indecl-ops.ccl @@ -0,0 +1,13 @@ +import("indecl.lex", "indecl") // import file as "indecl" + +@b:"indecl" ( // tests for particular classes from the lexicon + inter(lex(lower(orth[0]), "indecl"), ["adv"]); + inter(lex(lower(orth[0]), "indecl"), ["interj"]); + inter(lex(lower(orth[0]), "indecl"), ["part"]); + inter(lex(lower(orth[0]), "indecl"), ["prep"]) +) + +@s:"indecl_label" ( // gets the label from the lexicon + lex(lower(orth[0]), "indecl") +) + diff --git a/examples/indecl.lex b/examples/indecl.lex new file mode 100644 index 0000000000000000000000000000000000000000..e78d6fb59ee189c1c9f593c2c4f962592b875a68 --- /dev/null +++ b/examples/indecl.lex @@ -0,0 +1,8 @@ +by part +och interj +ach interj +dla prep +bez prep +z prep +dziś adv +wczoraj adv diff --git a/examples/ne-match.ccl b/examples/ne-match.ccl new file mode 100644 index 0000000000000000000000000000000000000000..1129219e8ba2f0e52913d5fa6dc2bc6927e73676 --- /dev/null +++ b/examples/ne-match.ccl @@ -0,0 +1,54 @@ +match_rules( + + // „wyżyna X”, gdzie X jest oznaczone anotacją reladj_gaz_based + // poza tym wymagane jest uzgodnienie + apply( + match( + regex( base[0], 'wyżyna' ), + is( 'reladj_gaz_based' ) + ), + cond( + ann(:1, 'capitalized_noun' ), + agr(first(:1), first(:2), {nmb,gnd,cas}) + //equal( nmb[first(:1)], nmb[first(:2)] ), + //equal( cas[first(:1)], cas[first(:2)] ), + //equal( gnd[first(:1)], gnd[first(:2)] ) + ), + actions( + mark(M, 'HIGHLAND_NAM') + ) + ); + + // „wyżyna X”, gdzie X może być subst i może być nom + // znakuje drugi element jako HIGHLAND_NAM, + // pierwszy element nie może być oznakowany jako capitalized_noun + apply( + match( + regex( base[0], 'wyżyna'), + and( inter(class[0], {subst}), inter(cas[0], {nom}) ) + ), + cond( + not( ann(:1, 'capitalized_noun' ) ) + ), + 
actions( + mark(:2, 'HIGHLAND_NAM') + ) + ); + + // „MIASTO Y”, gdzie MIASTO to anotacja city_trigger (jedno lub wielotokenowa), + // a Y to first_capital_word, poza tym mamy post-condition, że Y nie zawiera się + // w anotacji city_nam_gaz + apply( + match( + is( 'city_trigger' ), + is( 'first_capital_word' ) + ), + cond( + not( annsub(:2, 'city_nam_gaz') ) + ), + actions( + mark(:2, 'CITY_NAM') + ) + ) + +) diff --git a/examples/np-match.ccl b/examples/np-match.ccl new file mode 100644 index 0000000000000000000000000000000000000000..127fb7a4f11c89457c79ed8a07ea30416add72d0 --- /dev/null +++ b/examples/np-match.ccl @@ -0,0 +1,33 @@ +match_rules( + apply( + match( + optional(equal(base[0], "nie")), + repeat( + inter(class[0], {adj, ppas, pact}) + ) + ), + cond( + agr(first(:2), last(M), {nmb,gnd,cas}) + ), + actions( + mark(M, "AdjP") + ) + ); + + apply( + match( + optional(is("AdjP")), + inter(class[0], {subst, ger, depr}) + ), + cond( + or( + empty(:1), + agrpp(last(:1), first(:2), {nmb,gnd,cas}) + ) + ), + actions( + mark(M, M, :2, "NP") + ) + ) +) + diff --git a/examples/simple-ops.ccl b/examples/simple-ops.ccl new file mode 100644 index 0000000000000000000000000000000000000000..19482624c948c3b3f4c15a6ffeb733c63f154050 --- /dev/null +++ b/examples/simple-ops.ccl @@ -0,0 +1,12 @@ +@s:"orths" ( + orth[-1]; orth[0]; orth[1] +) + +@t:"wclass" ( + class[-1]; class[0]; class[1] +) + +@b:"agr2" ( + agr(-1,1,{nmb,gnd,cas}) +) + diff --git a/examples/takipi_rules.ccl b/examples/takipi_rules.ccl new file mode 100644 index 0000000000000000000000000000000000000000..8701884d0a4fb298dcfea621bb4385bdaa6514f9 --- /dev/null +++ b/examples/takipi_rules.ccl @@ -0,0 +1,692 @@ +tag_rules( + + rule("10", + + and( + + equal(class[0],{adj,conj,pred,qub,subst}), + equal(class[-1],{prep}), + inter(cas[-1],cas[0]), + not( + in(orth[-1],["Niby","niby"]), + agrpp(0, 1, {sg,n,cas}) + ), + not( + and( + equal(class[1],{ppron3}), + inter(cas[1],{gen}), + agrpp(0, 2, {sg,n,cas}) + ) + ) + ), 
+ + delete(not( + equal(class[0],{subst}) + )) + + ); + + rule("40", + + and( + equal(orth[0],["wszystkim"]), + in(orth[-1],["przede","Przede"]) + ), + + delete(not( + and( + equal(class[0],{subst}), + equal(cas[0],{inst}), + equal(gnd[0],{n}) + ) + )) + + ); + + rule("41", + + and( + in(orth[0],["ze","Ze","ZE"]), + in(orth[1],["wszech","WSZECH"]) + ), + + delete(not(equal(cas[0],{gen}))) + + ); + + rule("42", + + and( + in(orth[0],["W","w"]), + in(orth[1],["stanie","STANIE"]) + ), + + delete(not(equal(cas[0],{loc}))) + + ); + + rule("70", + + and( + in(class[0],{adj,ger,subst}), + inter(cas[0],{gen}), + in(orth[-1],["dużo","maÅ‚o","mnóstwo","parÄ™","niewiele","wiele"]) + ), + + delete(not( + equal(cas[0],{gen}) + )) + + ); + + rule("96", + + and( + in(orth[0],["z","Z","ze","Ze"]), + or( + in(orth[1],["dala","daleka","bliska","godzinÄ™"]), + and( + + inter(class[1],{num}), + not(inter(cas[1],{gen,inst})) + ) + ) + ), + + delete(not( + equal(class[0],{qub}) + )) + + ); + + rule("97", + + and( + in(orth[0],["z","Z","ze","Ze"]), + or( + inter(cas[1],{gen,inst}), + regex(orth[1], "\\p{Lu}.*"), + regex(orth[0], ".*[0-9].*") + ) + ), + + delete(not( + equal(class[0],{prep}) + )) + + ); + + rule("98", + + and( + in(orth[0],["z","Z","ze","Ze"]), + in(orth[1], ["tak", "Tak"]) + ), + + delete(not( + equal(class[0],{prep}) + )) + + ); + + rule("99", + + and( + in(orth[0],["niedawna","dawna"]), + in({prep},class[-1]) + ), + + delete(not(equal(class[0],{qub}))) + + ); + + rule("100", + + and( + equal(class[-1],{prep}), + not( + and( + not( + regex(orth[-1], "\\p{Lu}.*") + ), + regex(orth[0], "\\p{Lu}.*") + ), + in(orth[-1],["niby","Niby"]), + inter(class[0],{ppron3,num}), + in(orth[0],["tysiÄ…c","tego","niedawna"]), + and( + in(orth[-1],["po","Po"]), + equal(cas[0],{dat}) + ), + + and( + in(orth[-1],["o","O"]), + in(orth[0],["rany", "RANY"]) + ), + equal(class[0],{prep}), + + and( + in(orth[-1],["mimo","Mimo","pomimo","Pomimo"]), + equal(orth[0],["to"]) + ) + ), + 
in(cas[0],{nom,gen,acc,dat,loc,inst,voc}) + ), + + delete( + not( + equal(cas[0],{}), + equal(cas[0],{gen}), + in(cas[0],cas[-1]) + )) + + ); + + rule("101", + + and( + equal(class[-1],{prep}), + equal(class[0],{ppron3}), + not( + inter(class[1],{adj,ger,ign,num,numcol,pact,ppas,subst,xxs,xxx}), + and( + inter(class[1],{adv,qub}), + inter(class[2],{adj,ger,ign,num,numcol,pact,ppas,subst,xxs,xxx}) + ) + ) + ), + + delete(not( + equal(cas[0],{}), + in(cas[0],cas[-1]) + )) + + ); + + rule("103", + + and( + equal(orth[-1],["w"]), + equal(orth[0],["tysiÄ…c"]), + equal(class[1],{num}) + ), + + delete(not( + equal(cas[0],{}), + equal(cas[0],{nom}) + )) + + ); + + rule("105", + + and( + equal(class[0],{subst}), + not( + equal(orth[0],["tysiÄ…c"]), + and( + equal(class[-1],{prep}), + regex(orth[0], "\\p{Lu}.*"), + not( + regex(orth[-1], "\\p{Lu}.*") + ) + ) + ), + llook(-1,begin,$Prep,equal(class[$Prep],{prep})), + not( + inter(base[$Prep],["niby"]) + ), + only($Prep + 1,-1,$A,and( + equal(class[$A],{adj}), + not( + inter(base[$A],["który","jakiÅ›","jaki"]) + ) + )) + ), + + delete(not( + equal(cas[0],{gen}), + in(cas[0],cas[-1]) + )) + + ); + + rule("130", + + and( + and( + equal(class[-1],{num}), + not( + inter(base[-1],["oba","półtora","wiele"]) + ) + ), + and( + in(class[0],{subst}), + not( + inter(base[0],["jeden","procent","deka","gram","kilo","para","wolta"]) + ), + agrpp(0, -1, {nmb,gnd,cas}) + ) + ), + + delete(equal(nmb[0],{sg})) + + ); + + rule("140", + + and( + inter(class[0],{fin}), + equal(class[-1],{fin}), + not( + and( + in(orth[-1],["jest","znaczy"]), + equal(orth[-2],["to"]) + ), + and( + equal(orth[-1],["wydaje"]), + equal(orth[0],["może"]) + ) + ) + ), + + delete(equal(class[0],{fin})) + + ); + + rule("141", + + and( + inter(class[0],{fin}), + equal(class[-1],{qub}), + equal(class[-2],{fin}) + ), + + delete(equal(class[0],{fin})) + + ); + + rule("142", + + and( + inter(class[0],{fin}), + equal(class[-1],{qub}), + equal(class[-2],{qub}), + 
equal(class[-3],{fin}) + ), + + delete(equal(class[0],{fin})) + + ); + + rule("150", + + and( + inter(class[0],{fin}), + equal(class[1],{fin}) + ), + + delete(equal(class[0],{fin})) + + ); + + rule("151", + + and( + inter(class[0],{fin}), + equal(class[1],{qub}), + equal(class[2],{fin}) + ), + + delete(equal(class[0],{fin})) + + ); + + rule("152", + + and( + inter(class[0],{fin}), + equal(class[1],{qub}), + equal(class[2],{qub}), + equal(class[3],{fin}) + ), + + delete(equal(class[0],{fin})) + + ); + + rule("160", + + and( + equal(class[-1],{prep}), + not( + in(orth[-1],["Niby","niby"]) + ), + inter(class[0],{fin,praet}) + ), + + delete(in(class[0],{fin,praet})) + + ); + + rule("170", + + and( + equal(class[-1],{prep}), + equal(class[0],{adj}), + not( + inter(base[0],["który"]), + and( + equal(orth[-1],["po"]), + in(orth[0],["pierwsze","drugie","trzecie","czwarte","piÄ…te"]) + ), + and( + inter(base[0],["niektóry","inny"]), + inter(nmb[0],{pl}) + ) + ), + or( + equal(class[1],{adj}), + and( + equal(class[1],{subst}), + in(cas[1],{nom,acc,dat,loc,inst,voc}), + not( + in(base[1],["tysiÄ…c","milion","miliard"]) + ) + ) + ), + agrpp(0, 1, {nmb,gnd,cas}) + ), + + delete(not( + agrpp(0, 1, {nmb,gnd,cas}) + )) + + ); + + rule("175", + + and( + equal(orth[-1],["po"]), + in(orth[0],["pierwsze","drugie","trzecie","czwarte","piÄ…te"]) + ), + + delete( + not( + and( + equal(class[0],{adj}), + equal(nmb[0],{sg}), + equal(cas[0],{acc}), + equal(gnd[0],{n}) + ) + )) + + ); + + rule("180", + + and( + equal(orth[0],["po"]), + equal(class[1],{adjp}) + ), + + delete(not( + equal(cas[0],{loc}) + )) + + ); + + rule("181", + + and( + in(orth[0],["prawie","Prawie","PRAWIE"]), + not( + and( + in({prep},class[-1]), + inter(cas[-1],cas[0]) + ) + ) + ), + + delete(not( + equal(class[0],{qub}) + )) + + ); + + rule("190", + + and( + equal(class[0],{prep}), + in(class[1],{adj,ppas,pact,subst,depr,ger,num,ppron12,ppron3,numcol}), + not( + and( + inter(cas[1],{gen}), + not( + equal(cas[2],{}) + ) 
+ ), + and( + in(orth[0],["Po","po"]), + equal(cas[1],{dat}) + ), + and( + in(orth[-1],["Z","z"]), + equal(orth[0],["nad"]) + ), + and( + in(orth[-1],["Od","od"]), + equal(orth[0],["przed"]) + ), + and( + in(orth[0],["Ze","ze","PomiÄ™dzy","pomiÄ™dzy"]), + inter(class[1],{num}) + ), + and( + in(base[1],["tysiÄ…c","milion","miliard"]), + inter(cas[1],{nom}) + ), + and( + not( + regex(orth[0], "\\p{Lu}.*") + ), + regex(orth[1], "\\p{Lu}.*") + ) + ) + ), + + delete(not( + in(cas[0],cas[1]) + )) + + ); + + rule("200", + + and( + inter(cas[0],{acc}), + not( + inter(class[0],{prep}) + ), + equal(orth[-1],["jako"]) + ), + + delete(not( + in(cas[0],{gen,acc,nom}), + equal(cas[0],{}) + )) + + ); + + rule("210", + + and( + inter(base[0],["który"]), + equal(orth[-1],[","]), + not( + equal(cas[-2],{}), + inter(cas[-2],{gen}) + ), + agrpp(-2, 0, {nmb,gnd}) + ), + + delete(not( + agrpp(-2, 0, {nmb,gnd}) + )) + + ); + + rule("250", + + and( + inter(base[2],["który"]), + equal(orth[1],[","]), + not( + equal(cas[0],{}), + inter(cas[0],{gen}), + and( + not( + agrpp(2, 0, {nmb,gnd}) + ), + inter( gnd[0],gnd[1]) + ) + ), + agrpp(2, 0, {nmb,gnd}) + ), + + delete(not( + agrpp(2, 0, {nmb,gnd}) + )) + + ); + + rule("270", + + and( + in(class[-1],{subst,ger}), + in(class[0],{adj,pact,ppas}), + equal(orth[1],[","]), + inter(base[2],["który"]), + not( + inter(base[0],["ten"]) + ), + agrpp(-1, 0, {nmb,gnd,cas}) + ), + + delete(not( + agrpp(-1, 0, {nmb,gnd,cas}) + )) + + ); + + rule("280", + + and( + equal(class[-1],{}), + equal(class[0],{adj}), + in(class[1],{subst,ger}), + agrpp(0, 1, {nmb,gnd,cas}) + ), + + delete(not( + in(gnd[0],gnd[1]) + )) + + ); + + rule("300", + + and( + inter(acm[-1],{rec}), + inter(gnd[0],{m1}), + not( + inter(class[0],{num}), + in(orth[0],["jeden"]) + ) + ), + + delete(and( + equal(gnd[0],{m1}), + equal(cas[0],{nom}) + )) + + ); + + rule("320", + + and( + equal(orth[0],["obok"]), + equal(class[1],{interp}), + in(orth[1],[",","."]) + ), + + 
delete(equal(class[0],{prep})) + + ); + + rule("330", + + and( + equal(class[-1],{prep}), + not( + equal(class[0],{prep}) + ), + inter(class[0],{prep}), + not( + in(orth[-1],["Niby","niby"]), + and( + in(orth[-1],["Na","na"]), + equal(orth[1],["dzieÅ„"]) + ), + and( + equal(orth[0],["ponad"]), + in(orth[-1],["O","o","Przez","przez","Na","na"]) + ), + and( + in(orth[0],["z"]), + in(orth[-1],["Przed","przed"]) + ), + and( + in(orth[0],["przed"]), + in(orth[-1],["Od","od"]) + ) + ) + ), + + delete(equal(class[0],{prep})) + + ); + + rule("340", + + and( + equal(class[0],{adj}), + in(class[1],{subst,ger}), + agrpp(0, 1, {nmb,gnd,cas}), + not( + + in(base[0],["jeden","czyj","nasz","swój","twój","mój","godny","niegodny"]), + and( + inter(base[0],["który","jaki"]), + or( + equal(class[-1],{interp}), + equal(class[-2],{interp}) + ) + ), + inter(class[-1],{prep,ign}), + and( + equal(orth[-1],["-"]), + inter(class[-2],{adja}) + ), + agrpp(-1, 0, {nmb,gnd,cas}) + ) + ), + + delete(not( + agrpp(0, 1, {nmb,gnd,cas}) + )) + + ) + +) + diff --git a/libwccl/CMakeLists.txt b/libwccl/CMakeLists.txt index 2e4b2bc29aedb37db29bb5b85cdfe8ddd9d1a24a..120220144a2ef6bcb7e3f31231a6abc568a3a3db 100644 --- a/libwccl/CMakeLists.txt +++ b/libwccl/CMakeLists.txt @@ -77,6 +77,7 @@ SET(libwccl_STAT_SRC ops/match/conditions/repeatedmatch.cpp ops/match/conditions/tokencondition.cpp ops/matchrule.cpp + ops/matchrulesequence.cpp ops/opsequence.cpp ops/tagaction.cpp ops/tagactions/delete.cpp diff --git a/libwccl/lexicon/lexfilegrammar.g b/libwccl/lexicon/lexfilegrammar.g index 19640c6667a661b29189388001097d2983f2fce4..ce3664818000e66f3eff7c68d0cb0d9cf31ecb56 100644 --- a/libwccl/lexicon/lexfilegrammar.g +++ b/libwccl/lexicon/lexfilegrammar.g @@ -53,10 +53,10 @@ private: /////////////////////////////////////////////////////////////////////////////// parse_lexicon_file - [const std::string& name] + [const std::string& name, const std::string& file_name] returns [boost::shared_ptr<Lexicon> lex] { - 
lex.reset(new Lexicon(name)); + lex.reset(new Lexicon(name, file_name)); } : (lexicon_entry [*lex])* EOF diff --git a/libwccl/lexicon/lexicon.h b/libwccl/lexicon/lexicon.h index c02f6284805c7901c2baa10cc4d5b5954e785d22..8cc9d2ae435fc521417b096a2ee7d7eb89e625a3 100644 --- a/libwccl/lexicon/lexicon.h +++ b/libwccl/lexicon/lexicon.h @@ -14,8 +14,9 @@ class Lexicon : boost::noncopyable public: typedef boost::unordered_map<UnicodeString, UnicodeString> map_t; - Lexicon(const std::string& name) - : name_(name) + Lexicon(const std::string& name, const std::string& file_name) + : name_(name), + file_name_(file_name) { BOOST_ASSERT(!name_.empty()); } @@ -40,6 +41,10 @@ public: return name_; } + std::string file_name() const { + return file_name_; + } + bool has_key(const UnicodeString& key) const { return map_.find(key) != map_.end(); } @@ -55,6 +60,7 @@ public: private: map_t map_; const std::string name_; + const std::string file_name_; }; } /* end ns Wccl */ diff --git a/libwccl/lexicon/lexiconparser.cpp b/libwccl/lexicon/lexiconparser.cpp index acb54cc41e804bc9e1afffaca47b02ebfa2a492a..578d322bf3bb69ea16c309bc652d3412306903d1 100644 --- a/libwccl/lexicon/lexiconparser.cpp +++ b/libwccl/lexicon/lexiconparser.cpp @@ -22,7 +22,7 @@ boost::shared_ptr<Lexicon> LexiconParser::parse_lexicon( ANTLRLexiconLexer lexer(is); ANTLRLexiconParser parser(lexer); - return parser.parse_lexicon_file(lexicon_name); + return parser.parse_lexicon_file(lexicon_name, filename); } } /* end ns Wccl */ diff --git a/libwccl/lexicon/lexicons.h b/libwccl/lexicon/lexicons.h index 1fd0c5ff01b2975458de1a2ee69a4fcac4818dbf..96f4fa1b2d0f4499558df39a4750d87950698ea9 100644 --- a/libwccl/lexicon/lexicons.h +++ b/libwccl/lexicon/lexicons.h @@ -26,6 +26,10 @@ public: void insert(const boost::shared_ptr<Lexicon>& lexicon); + const map_t& get_lexicons() const { + return lexicons_; + } + private: map_t lexicons_; }; diff --git a/libwccl/ops/functions/bool/predicates/ann.cpp 
b/libwccl/ops/functions/bool/predicates/ann.cpp index c2faa04cbbc6d4f45de1c480f757a51036614066..dc07e957a133b2bb9d5a044d244adf949dec8a41 100644 --- a/libwccl/ops/functions/bool/predicates/ann.cpp +++ b/libwccl/ops/functions/bool/predicates/ann.cpp @@ -16,11 +16,11 @@ Ann::BaseRetValPtr Ann::apply_internal(const FunExecContext& context) const boost::shared_ptr<const Match> check_from = check_from_->apply(context); boost::shared_ptr<const Match> check_to = (check_from_ == check_to_) ? check_from : check_to_->apply(context); - int abs_left = check_from->first_token(as).get_value(); + int abs_left = check_from->first_token(as); if (abs_left < 0) { throw WcclError("Received starting match that points outside sentence."); } - int abs_right = check_to->last_token(as).get_value(); + int abs_right = check_to->last_token(as); if (abs_right >= context.sentence_context().size()) { throw WcclError("Received ending match that points outside sentence."); } diff --git a/libwccl/ops/functions/bool/predicates/annsub.cpp b/libwccl/ops/functions/bool/predicates/annsub.cpp index bcaefc254bd9afbbc3c650f1be2788e2da39abdc..cb03321eac6d43b3df7273deb7ad862af8fe0c42 100644 --- a/libwccl/ops/functions/bool/predicates/annsub.cpp +++ b/libwccl/ops/functions/bool/predicates/annsub.cpp @@ -16,11 +16,11 @@ AnnSub::BaseRetValPtr AnnSub::apply_internal(const FunExecContext& context) cons boost::shared_ptr<const Match> check_from = check_from_->apply(context); boost::shared_ptr<const Match> check_to = (check_from_ == check_to_) ? 
check_from : check_to_->apply(context); - int abs_left = check_from->first_token(as).get_value(); + int abs_left = check_from->first_token(as); if (abs_left < 0) { throw WcclError("Received starting match that points outside sentence."); } - int abs_right = check_to->last_token(as).get_value(); + int abs_right = check_to->last_token(as); if (abs_right >= context.sentence_context().size()) { throw WcclError("Received ending match that points outside sentence."); } diff --git a/libwccl/ops/functions/position/firsttoken.cpp b/libwccl/ops/functions/position/firsttoken.cpp index 8c900756afc3c6029c85b8a75c73d4b3bdb3bf63..b92b66d608574bf2488495d942219e5ca014f3e8 100644 --- a/libwccl/ops/functions/position/firsttoken.cpp +++ b/libwccl/ops/functions/position/firsttoken.cpp @@ -21,7 +21,9 @@ FirstToken::BaseRetValPtr FirstToken::apply_internal( if(match->empty()) { return detail::DefaultFunction<Position>()->apply(context); } - return boost::make_shared<Position>(match->first_token(s)); + int abs_pos = match->first_token(s); + int rel_pos = abs_pos - context.sentence_context().get_position(); + return boost::make_shared<Position>(rel_pos); } std::string FirstToken::to_string(const Corpus2::Tagset &tagset) const diff --git a/libwccl/ops/functions/position/lasttoken.cpp b/libwccl/ops/functions/position/lasttoken.cpp index 6d4ca00193da63f17cd35f717054eb21d93ffbd0..6b4f43ecd83251faf7a4cdadd6792b5bec953e0d 100644 --- a/libwccl/ops/functions/position/lasttoken.cpp +++ b/libwccl/ops/functions/position/lasttoken.cpp @@ -14,13 +14,15 @@ LastToken::BaseRetValPtr LastToken::apply_internal( if (!s) { throw InvalidArgument( "context", - "Supplied context does not have valid Corpus2::AnnotatedSentence."); + "Supplied context does not have a valid Corpus2::AnnotatedSentence."); } const Function<Match>::RetValPtr match = match_expr_->apply(context); if(match->empty()) { return detail::DefaultFunction<Position>()->apply(context); } - return boost::make_shared<Position>(match->last_token(s)); 
+ int abs_pos = match->last_token(s); + int rel_pos = abs_pos - context.sentence_context().get_position(); + return boost::make_shared<Position>(rel_pos); } std::string LastToken::to_string(const Corpus2::Tagset &tagset) const diff --git a/libwccl/ops/match/actions/markmatch.cpp b/libwccl/ops/match/actions/markmatch.cpp index 7970e710e49b84dad031566963f410992f40fde9..e80515179823c1f1e7633b9a54020c6d146ad3af 100644 --- a/libwccl/ops/match/actions/markmatch.cpp +++ b/libwccl/ops/match/actions/markmatch.cpp @@ -21,12 +21,12 @@ void MarkMatch::execute(const ActionExecContext& context) const boost::shared_ptr<const Match> head_match = (match_from_ == head_match_) ? match_from : head_match_->apply(context); - int abs_left = match_from->first_token(as).get_value(); + int abs_left = match_from->first_token(as); if (abs_left < 0) { throw WcclError("Received starting match that points outside sentence."); } - int abs_right = match_to->last_token(as).get_value(); + int abs_right = match_to->last_token(as); if (abs_right >= sc.size()) { throw WcclError("Received ending match that points outside sentence."); } @@ -34,7 +34,7 @@ void MarkMatch::execute(const ActionExecContext& context) const throw WcclError("Received starting match points after the received ending match."); } - int abs_head = head_match->first_token(as).get_value(); + int abs_head = head_match->first_token(as); if (abs_head < abs_left || abs_head > abs_right) { throw WcclError("Received head match points outside range defined by start and end matches."); } diff --git a/libwccl/ops/match/actions/unmarkmatch.cpp b/libwccl/ops/match/actions/unmarkmatch.cpp index 863bb0e67b7b1b68c4fa376a28ad517cf9fc2a28..3f0a8efe704c8e69aa42b1cee9ab95fed4c86ac8 100644 --- a/libwccl/ops/match/actions/unmarkmatch.cpp +++ b/libwccl/ops/match/actions/unmarkmatch.cpp @@ -18,7 +18,7 @@ void UnmarkMatch::execute(const ActionExecContext& context) const throw InvalidArgument("context", "Sentence does not have annotation channel \"" + 
chan_name_ + "\"."); } - int abs_pos = match_->apply(context)->first_token(as).get_value(); + int abs_pos = match_->apply(context)->first_token(as); if(sc.is_outside(abs_pos)) { throw WcclError("Received starting match that points outside sentence."); } diff --git a/libwccl/ops/matchrulesequence.cpp b/libwccl/ops/matchrulesequence.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eec6e42296fddd9770d6e810a4f1e68ee4e2089c --- /dev/null +++ b/libwccl/ops/matchrulesequence.cpp @@ -0,0 +1,46 @@ +#include <libwccl/ops/matchrulesequence.h> +#include <libpwrutils/foreach.h> + +namespace Wccl { + +void MatchRuleSequence::apply_all(const boost::shared_ptr<Corpus2::AnnotatedSentence>& sentence) +{ + if(!sentence || sentence->empty()) { + throw InvalidArgument( + "sentence", + "Received an empty sentence."); + } + foreach (MatchRule& rule, *this) { + rule.apply(sentence); + } +} + +std::string MatchRuleSequence::to_string(const Corpus2::Tagset& tagset) const +{ + std::ostringstream os; + os << "match_rules(\n"; + for (size_t i = 0; i < size(); ++i) { + if (i != 0) { + os << ";\n"; + } + os << at(i).to_string(tagset); + } + os << ")"; + return os.str(); +} + +std::ostream& MatchRuleSequence::write_to(std::ostream &os) const +{ + os << "match_rules(\n"; + for (size_t i = 0; i < size(); ++i) { + if (i != 0) { + os << ";\n"; + } + os << at(i); + } + os << ")"; + return os; +} + +} /* end ns Wccl */ + diff --git a/libwccl/ops/matchrulesequence.h b/libwccl/ops/matchrulesequence.h new file mode 100644 index 0000000000000000000000000000000000000000..db908a16ba1758bbb8fe75bcafccc563adfe5c0f --- /dev/null +++ b/libwccl/ops/matchrulesequence.h @@ -0,0 +1,82 @@ +#ifndef LIBWCCL_OPS_MATCHRULESEQUENCE_H +#define LIBWCCL_OPS_MATCHRULESEQUENCE_H + +#include <libwccl/ops/matchrule.h> + +namespace Wccl { + +/** + * Represents a sequence of parsed WCCL Match rules. 
It's a convenience wrapper around + * vector of MatchRule objects, that allows automatic execution of all contained MatchRules + * one by one. + * @note The class methods are not thread-safe + */ +class MatchRuleSequence : public std::vector<MatchRule>, public Expression +{ +public: + MatchRuleSequence(const std::vector<MatchRule>& rules); + + MatchRuleSequence(); + + /** + * Executes all contained Rules sequentially, once for each position + * starting from 0 to given sentence's end. + * This method returns nothing; it does not report whether any of the Rules + * changed the sentence. + * @param sentence Sentence to execute on. + * @see apply_all() - equivalent method; the \link operator()() operator() \endlink allows + * more convenient functional notation, however if you only have a pointer + * you might prefer the apply_all() method as shown below. The choice is yours. + * @throws InvalidArgument if the sentence pointer is null or the sentence is empty. + * \code + * // functional notation + * ruleseq(sentence); + * // versus + * ruleseq.apply_all(sentence); + * // or if you have a pointer... + * (*ruleseq_ptr)(sentence); + * // versus + * ruleseq_ptr->apply_all(sentence); + * \endcode + */ + void operator()(const boost::shared_ptr<Corpus2::AnnotatedSentence>& sentence); + + /** + * Executes all contained Rules sequentially, once for each position + * starting from 0 to given sentence's end. + * This method returns nothing; it does not report whether any of the Rules + * changed the sentence. + * @param sentence Sentence to execute on. + * @see \link operator()() operator() \endlink - an equivalent of this method that + * allows functional notation, treating MatchRuleSequence directly as a function object + * @throws InvalidArgument if the sentence pointer is null or the sentence is empty. 
+ */ + void apply_all(const boost::shared_ptr<Corpus2::AnnotatedSentence>& sentence); + + std::string to_string(const Corpus2::Tagset& tagset) const; +protected: + std::ostream& write_to(std::ostream& os) const; +}; + + + +// +//--- implementation details --- +// +inline +MatchRuleSequence::MatchRuleSequence(const std::vector<MatchRule>& rules) + : std::vector<MatchRule>(rules) { +} + +inline +MatchRuleSequence::MatchRuleSequence() + : std::vector<MatchRule>() { +} + +inline +void MatchRuleSequence::operator()(const boost::shared_ptr<Corpus2::AnnotatedSentence>& sentence) { + apply_all(sentence); +} + +} /* end ns Wccl */ +#endif // LIBWCCL_OPS_MATCHRULESEQUENCE_H diff --git a/libwccl/ops/tagrulesequence.cpp b/libwccl/ops/tagrulesequence.cpp index 73912941ba2ddafae855f68a66dbca5cba2d7daa..353a37e821b45c4d635bd04481a622424f4f9a13 100644 --- a/libwccl/ops/tagrulesequence.cpp +++ b/libwccl/ops/tagrulesequence.cpp @@ -44,10 +44,10 @@ int TagRuleSequence::execute_until_done(const boost::shared_ptr<Corpus2::Sentenc std::string TagRuleSequence::to_string(const Corpus2::Tagset& tagset) const { std::ostringstream os; - os << "rules("; + os << "tag_rules("; for (size_t i = 0; i < size(); ++i) { if (i != 0) { - os << ", \n"; + os << ";\n"; } os << at(i).to_string(tagset); } @@ -57,10 +57,10 @@ std::string TagRuleSequence::to_string(const Corpus2::Tagset& tagset) const std::ostream& TagRuleSequence::write_to(std::ostream &os) const { - os << "rules("; + os << "tag_rules("; for (size_t i = 0; i < size(); ++i) { if (i != 0) { - os << ", \n"; + os << ";\n"; } os << at(i); } diff --git a/libwccl/ops/tagrulesequence.h b/libwccl/ops/tagrulesequence.h index 4a1ab672876b3b6224e793246e7da5b4fe02729b..34d5534ebac76610f97b20195e969fb39e19f7c7 100644 --- a/libwccl/ops/tagrulesequence.h +++ b/libwccl/ops/tagrulesequence.h @@ -6,15 +6,15 @@ namespace Wccl { /** - * Represents a sequence of parsed WCCL Rules. 
It's a conveniency wrapper around - * vector of Rule objects, that allows automatic execution of all contained Rules + * Represents a sequence of parsed WCCL tag rules. It's a conveniency wrapper around + * vector of TagRule objects, that allows automatic execution of all contained TagRules * for all positions of a Sentence. * @note The class methods are not thread-safe */ class TagRuleSequence : public std::vector<TagRule>, public Expression { public: - TagRuleSequence(std::vector<TagRule> rules); + TagRuleSequence(const std::vector<TagRule>& rules); TagRuleSequence(); @@ -77,7 +77,7 @@ protected: //--- implementation details --- // inline -TagRuleSequence::TagRuleSequence(std::vector<TagRule> rules) +TagRuleSequence::TagRuleSequence(const std::vector<TagRule>& rules) : std::vector<TagRule>(rules) { } diff --git a/libwccl/parser/Parser.cpp b/libwccl/parser/Parser.cpp index 8cf342df0c1751ae9b9d9d33fe05f8ed64fbe2bd..de20b637e8aa0ede5afa18a4adaaf09b81042258 100644 --- a/libwccl/parser/Parser.cpp +++ b/libwccl/parser/Parser.cpp @@ -6,6 +6,8 @@ #include <antlr/MismatchedTokenException.hpp> #include <antlr/TokenStreamRecognitionException.hpp> +#include <libcorpus2/tagsetmanager.h> + namespace Wccl { /** @@ -15,6 +17,10 @@ Parser::Parser(const Corpus2::Tagset& t) : tagset_(t) { } +Parser::Parser(const std::string& tagset_name) : tagset_(Corpus2::get_named_tagset(tagset_name)) +{ +} + /** * */ diff --git a/libwccl/parser/Parser.h b/libwccl/parser/Parser.h index 3d642eaf63984341be306e977c47a557c802fbfd..bcf22d958a09afcf92a0b1ec50a7adc6729f0fbb 100644 --- a/libwccl/parser/Parser.h +++ b/libwccl/parser/Parser.h @@ -32,7 +32,8 @@ namespace Wccl{ class Parser { public: - Parser(const Corpus2::Tagset&); + explicit Parser(const Corpus2::Tagset&); + explicit Parser(const std::string& tagset_name); ~Parser(); // --------------------------------------------------------------------------- diff --git a/libwccl/parser/grammar.g b/libwccl/parser/grammar.g index 
074020eeaefa442a181ce8036d414c1fc02e3f42..f342038bb7c8fec032db8cc17222478275fa8e08 100644 --- a/libwccl/parser/grammar.g +++ b/libwccl/parser/grammar.g @@ -282,7 +282,6 @@ parse_match_rule { Lexicons empty_lex; ParsingScope scope(tagset, empty_lex); - scope.variables().get_put<Match>("_M"); } : ret_match = match_rule_operator[scope] ; @@ -295,15 +294,10 @@ parse_wccl_file returns [boost::shared_ptr<WcclFile> wccl_file] { wccl_file = boost::make_shared<WcclFile>(tagset, search_path); - boost::shared_ptr<TagRuleSequence> rule_seq; } : (imports_section [*wccl_file])? - (any_operator_section [*wccl_file] )* - ( - rule_seq = parse_tag_rule_sequence [tagset] { wccl_file->set_tag_rules(rule_seq); } - (any_operator_section [*wccl_file] )* - )? - EOF + (wccl_file_section [*wccl_file])+ + EOF ; @@ -495,10 +489,9 @@ match_data_literal token_match_literal returns [boost::shared_ptr<TokenMatch> val] { - boost::shared_ptr<Position> p; } - : "TOK" LBRACKET p = position_literal RBRACKET { - val.reset(new TokenMatch(*p)); + : "TOK" LBRACKET u: UNSIGNED_INT RBRACKET { + val.reset(new TokenMatch(token_ref_to_int(u))); } ; @@ -507,10 +500,9 @@ token_match_literal ann_match_literal returns [boost::shared_ptr<AnnotationMatch> val] { - boost::shared_ptr<Position> p; } - : "ANN" LBRACKET p = position_literal COMMA channel : STRING RBRACKET { - val.reset(new AnnotationMatch(*p, token_ref_to_std_string(channel))); + : "ANN" LBRACKET u : UNSIGNED_INT COMMA channel : STRING RBRACKET { + val.reset(new AnnotationMatch(token_ref_to_int(u), token_ref_to_std_string(channel))); } ; @@ -1900,6 +1892,49 @@ import [WcclFile& wccl_file] } ; +wccl_file_section [WcclFile& wccl_file] + : any_operator_section [wccl_file] + | tag_rules_section [wccl_file] + | match_rules_section [wccl_file] +; + +tag_rules_section [WcclFile& wccl_file] +{ + boost::shared_ptr<TagRuleSequence> rule_seq; +} + : rule_seq = parse_tag_rule_sequence [wccl_file.tagset()] { + if (wccl_file.has_tag_rules()) { + throw 
ParserException("Only one tag_rules section allowed in a WCCL file."); + } + wccl_file.set_tag_rules(rule_seq); + } +; + +match_rules_section [WcclFile& wccl_file] +{ + ParsingScope scope(wccl_file); + boost::shared_ptr<MatchRule> match_rule; + boost::shared_ptr<MatchRuleSequence> rule_seq = boost::make_shared<MatchRuleSequence>(); +} + : "match_rules" { + if (wccl_file.has_match_rules()) { + throw ParserException("Only one match_rules section allowed in a WCCL file."); + } + } + LPAREN + match_rule = match_rule_operator [scope] { + rule_seq->push_back(*match_rule); + scope.reset_variables(); + } + ( + SEMI match_rule = match_rule_operator [scope] { + rule_seq->push_back(*match_rule); + scope.reset_variables(); + } + )* + RPAREN { wccl_file.set_match_rules(rule_seq); } +; + any_operator_section [WcclFile& wccl_file] { @@ -2117,11 +2152,12 @@ tag_rule_sequence ( SEMI rle = tag_rule [scope] { rule_seq->push_back(*rle); + scope.reset_variables(); } )* ; -// Temporary name. +// ---------------------------------------------------------------------------- // This is wrapper for tag_rule_sequence in rules section in the wccl file tag_rules [ParsingScope& scope] @@ -2288,6 +2324,7 @@ match_rule_operator returns [boost::shared_ptr<MatchRule> ret_op] { boost::shared_ptr<ApplyOperator> apply; + scope.variables().get_put<Match>("_M"); } : apply = match_apply_operator [scope] { ret_op = boost::make_shared<MatchRule>(scope.variables(), apply); diff --git a/libwccl/values/annotationmatch.cpp b/libwccl/values/annotationmatch.cpp index 54fb777c7d07969024e423371b5b009cb055ff7c..d13db4e26a12611f4a488d41262499d42c361f47 100644 --- a/libwccl/values/annotationmatch.cpp +++ b/libwccl/values/annotationmatch.cpp @@ -1,43 +1,43 @@ #include <libwccl/values/annotationmatch.h> +#include <libwccl/values/position.h> +#include <boost/lexical_cast.hpp> namespace Wccl { std::string AnnotationMatch::to_raw_string() const { - return "ANN[" + position_.to_raw_string() + "," + channel_ + "]"; + return 
"ANN[" + boost::lexical_cast<std::string>(abs_pos_) + "," + channel_ + "]"; } -Position AnnotationMatch::first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const +int AnnotationMatch::first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const { - size_t fpos = position_.get_value(); const Corpus2::AnnotationChannel& chan = s->get_channel(channel_); - int seg = chan.get_segment_at(fpos); + int seg = chan.get_segment_at(abs_pos_); if (seg > 0) { - for (size_t i = 0; i < fpos; ++i) { + for (int i = 0; i < abs_pos_; ++i) { if (chan.get_segment_at(i) == seg) { - return Position(i); + return i; } } - return position_; + return abs_pos_; } else { - return Position(); + return Position::Nowhere; } } -Position AnnotationMatch::last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const +int AnnotationMatch::last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const { - size_t fpos = position_.get_value(); const Corpus2::AnnotationChannel& chan = s->get_channel(channel_); - int seg = chan.get_segment_at(fpos); + int seg = chan.get_segment_at(abs_pos_); if (seg > 0) { - for (size_t i = s->size() - 1; i > fpos; --i) { + for (int i = s->size() - 1; i > abs_pos_; --i) { if (chan.get_segment_at(i) == seg) { - return Position(i); + return i; } } - return position_; + return abs_pos_; } else { - return Position(); + return Position::Nowhere; } } diff --git a/libwccl/values/annotationmatch.h b/libwccl/values/annotationmatch.h index 90015fed0bd966c7b0db53ecc4cdd854f7b5113f..7abc957e766fc8b98d65ef1fff9883a73b74794e 100644 --- a/libwccl/values/annotationmatch.h +++ b/libwccl/values/annotationmatch.h @@ -9,16 +9,10 @@ class AnnotationMatch : public MatchData { public: - explicit AnnotationMatch(Position position, const std::string channel) - : position_(position), channel_(channel) + explicit AnnotationMatch(int abs_pos, const std::string& channel) + : abs_pos_(abs_pos), channel_(channel) { - BOOST_ASSERT(position_.get_value() 
!= Position::Nowhere); - } - - explicit AnnotationMatch(int pos,const std::string channel) - : position_(pos), channel_(channel) - { - BOOST_ASSERT(position_.get_value() != Position::Nowhere); + BOOST_ASSERT(abs_pos_ >= 0); } /// MatchData override. @@ -27,10 +21,10 @@ public: } /// MatchData override. - Position first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const; + int first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const; /// MatchData override. - Position last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const; + int last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const; /// MatchData override std::string to_raw_string() const; @@ -42,7 +36,7 @@ protected: } private: - Position position_; + int abs_pos_; std::string channel_; }; diff --git a/libwccl/values/match.h b/libwccl/values/match.h index 5be080f617e9459f8da069c2441619584ef10c82..ee6a588efeec10cb94dfeef56bea381c9b88b004 100644 --- a/libwccl/values/match.h +++ b/libwccl/values/match.h @@ -81,9 +81,9 @@ public: /** * Getter for the first token matched. If the match is empty, must return - * Nowhere. + * Position::Nowhere. */ - Position first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const { + int first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const { return match_->first_token(s); } @@ -91,7 +91,7 @@ public: * Getter for the last token matched. If the match is empty, must return * Nowhere. 
*/ - Position last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const { + int last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const { return match_->last_token(s); } diff --git a/libwccl/values/matchdata.h b/libwccl/values/matchdata.h index f113256ade77789626b0df9c249797f6b5a40317..d415f9f6b93bb11b07b8892651d495487133af55 100644 --- a/libwccl/values/matchdata.h +++ b/libwccl/values/matchdata.h @@ -1,7 +1,6 @@ #ifndef LIBWCCL_VALUES_MATCHDATA_H #define LIBWCCL_VALUES_MATCHDATA_H -#include <libwccl/values/position.h> #include <libwccl/exception.h> #include <libcorpus2/ann/annotatedsentence.h> @@ -24,15 +23,15 @@ public: virtual bool empty() const = 0; /** * Getter for the first token matched. If the match is empty, must return - * Nowhere. + * Position::Nowhere. */ - virtual Position first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const = 0; + virtual int first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const = 0; /** * Getter for the last token matched. If the match is empty, must return - * Nowhere. + * Position::Nowhere. */ - virtual Position last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const = 0; + virtual int last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const = 0; /** * Getter for a submatch at given index (indexing starts from 1). 
diff --git a/libwccl/values/matchvector.cpp b/libwccl/values/matchvector.cpp index 6415ed0d4ac4bd2a7b1d19b928868ffa3c9ce62b..7fb0ab019bb209ed32f1e0c2b8155c34250bee40 100644 --- a/libwccl/values/matchvector.cpp +++ b/libwccl/values/matchvector.cpp @@ -22,44 +22,44 @@ std::string MatchVector::to_raw_string() const return ss.str(); } -Position MatchVector::first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const +int MatchVector::first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const { if (matches_.empty()) { - return Position(Position::Nowhere); + return Position::Nowhere; } else { // Negative positions are invalid, including specials like Nowhere, // so we can't just find minimum value but minimum *non-negative* value. // Note: yes, the code assumes the special values like Nowhere are indeed negative. - Position p = matches_.front()->first_token(s); + int p = matches_.front()->first_token(s); size_t i = 1; - while ((p.get_value() < 0) && (i < matches_.size())) { + while ((p < 0) && (i < matches_.size())) { p = matches_[i]->first_token(s); ++i; } while (i < matches_.size()) { - Position c = matches_[i]->first_token(s); - if ((c.get_value() >= 0) && (c.get_value() < p.get_value())) { + int c = matches_[i]->first_token(s); + if ((c >= 0) && (c < p)) { p = c; } ++i; } - return p; + return p >= 0 ? p : Position::Nowhere; } } -Position MatchVector::last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const +int MatchVector::last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const { if (matches_.empty()) { - return Position(Position::Nowhere); + return Position::Nowhere; } else { - Position p = matches_.front()->last_token(s); + int p = matches_.front()->last_token(s); for (size_t i = 1; i < matches_.size(); ++i) { - Position c = matches_[i]->last_token(s); - if (c.get_value() > p.get_value()) { + int c = matches_[i]->last_token(s); + if (c > p) { p = c; } } - return p; + return p >= 0 ? 
p : Position::Nowhere; } } diff --git a/libwccl/values/matchvector.h b/libwccl/values/matchvector.h index 2519ae6804bac4a4d091926919035c3607f978b3..c97f7d9cd2b76b79de04ed804e604cff7223140a 100644 --- a/libwccl/values/matchvector.h +++ b/libwccl/values/matchvector.h @@ -26,10 +26,10 @@ public: bool empty() const; /// MatchData override. - Position first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const; + int first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const; /// MatchData override. - Position last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const; + int last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const; /// MatchData override std::string to_raw_string() const; diff --git a/libwccl/values/tokenmatch.cpp b/libwccl/values/tokenmatch.cpp index 0852ad96367538980e54698af4d4dbeb87acd4d9..7cd31cf0fd9a7962ee37549366d70e5674a2fee2 100644 --- a/libwccl/values/tokenmatch.cpp +++ b/libwccl/values/tokenmatch.cpp @@ -1,10 +1,11 @@ #include <libwccl/values/tokenmatch.h> +#include <boost/lexical_cast.hpp> namespace Wccl { std::string TokenMatch::to_raw_string() const { - return "TOK[" + position_.to_raw_string() + "]"; + return "TOK[" + boost::lexical_cast<std::string>(abs_pos_) + "]"; } } /* end ns Wccl */ diff --git a/libwccl/values/tokenmatch.h b/libwccl/values/tokenmatch.h index 46e085cdd39f2e2624a13c233c515c0694d5c7f3..99382f70c861e86ec5660ca6553207791e59f209 100644 --- a/libwccl/values/tokenmatch.h +++ b/libwccl/values/tokenmatch.h @@ -9,16 +9,10 @@ class TokenMatch : public MatchData { public: - explicit TokenMatch(Position position) - : position_(position) + explicit TokenMatch(int abs_pos) + : abs_pos_(abs_pos) { - BOOST_ASSERT(position_.get_value() != Position::Nowhere); - } - - explicit TokenMatch(int pos) - : position_(pos) - { - BOOST_ASSERT(position_.get_value() != Position::Nowhere); + BOOST_ASSERT(abs_pos_ >= 0); } /// MatchData override. 
@@ -27,13 +21,13 @@ public: } /// MatchData override. - Position first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const { - return position_; + int first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const { + return abs_pos_; } /// MatchData override. - Position last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const { - return position_; + int last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const { + return abs_pos_; } /// MatchData override @@ -46,7 +40,7 @@ protected: } private: - Position position_; + int abs_pos_; }; } /* end ns Wccl */ diff --git a/libwccl/wcclfile.cpp b/libwccl/wcclfile.cpp index 9509c5d329659e3eaaaf9959adcb0b2552d7ea12..22cb8edee7cbe91389bda809c3b5fbca9a85bb02 100644 --- a/libwccl/wcclfile.cpp +++ b/libwccl/wcclfile.cpp @@ -36,14 +36,39 @@ boost::shared_ptr<const TagRuleSequence> WcclFile::get_tag_rules_ptr() const return tag_rules_; } +boost::shared_ptr<MatchRuleSequence> WcclFile::get_match_rules_ptr() +{ + if (!has_match_rules()) { + throw WcclError("There are no match rules."); + } + return match_rules_; +} + +boost::shared_ptr<const MatchRuleSequence> WcclFile::get_match_rules_ptr() const +{ + if (!has_match_rules()) { + throw WcclError("There are no match rules."); + } + return match_rules_; +} + std::ostream& WcclFile::write_to(std::ostream& os) const { + if (has_lexicons()) { + foreach(const Lexicons::map_t::value_type& v, lexicons_->get_lexicons()) { + os << "import(\"" << v.second->file_name() << "\", \"" + << v.second->name() << "\")\n"; + } + } foreach(const boost::shared_ptr<FunctionalOpSequence>& s, all_sections_) { os << s->to_string(tagset_) << '\n'; } if (has_tag_rules()) { os << tag_rules_->to_string(tagset_) << '\n'; } + if (has_match_rules()) { + os << match_rules_->to_string(tagset_) << '\n'; + } return os; } diff --git a/libwccl/wcclfile.h b/libwccl/wcclfile.h index 2919b49c5ff142fc56846cf422727c10c0b7cd18..e012b58b2e2357de68f534387c83fe02b4fba753 ---
a/libwccl/wcclfile.h +++ b/libwccl/wcclfile.h @@ -8,12 +8,19 @@ #include <libwccl/values/tset.h> #include <libwccl/wcclfileopsections.h> #include <libwccl/ops/tagrulesequence.h> +#include <libwccl/ops/matchrulesequence.h> #include <libwccl/lexicon/lexicons.h> #include <libwccl/exception.h> #include <libpwrutils/pathsearch.h> namespace Wccl { +/** + * Class representing contents of parsed WCCL files + * - imported lexicons, any typed and untyped named + * operator sections, up to one tag rules section, + * and up to one match rules section. + */ class WcclFile : WcclFileOpSections<UntypedOpSequence>, WcclFileOpSections<OpSequence<StrSet> >, @@ -25,63 +32,284 @@ class WcclFile public: WcclFile(const Corpus2::Tagset& tagset, const std::string& search_path); + ///////////////////// + // Untyped and typed operator sections: @X:"sectioname" ( op1; op2 ) + ///////////////////// + + /** + * @returns All untyped sections from the WCCL file: @"name" ( anyop0; ...; anyopN ) + */ const std::vector<boost::shared_ptr<UntypedOpSequence> >& untyped_sections(); + /** + * @returns All sections of given type T from the WCCL file: @T:"name" ( op<T>0; ...; op<T>N ) + */ template<class T> const typename std::vector<boost::shared_ptr<OpSequence<T> > >& sections(); + /** + * @returns True if the WCCL file contains untyped section of given name. + * False otherwise. + */ bool has_untyped_section(const std::string& name) const; + /** + * @returns True if the WCCL file contains section of given name and type T. + * False otherwise. + */ template<class T> bool has_section(const std::string& name) const; + /** + * @returns Names of all untyped sections in this WCCL file, in the order they + * were found in the file. + */ std::vector<std::string> untyped_section_names() const; + /** + * @returns Names of all sections of given type T from this WCCL file, in the + * order they were found in the file. 
+ */ template<class T> std::vector<std::string> section_names() const; + /** + * @returns Untyped section of given name. + * @throws InvalidArgument if there is no untyped section of given name. + */ UntypedOpSequence& get_untyped_section(const std::string& name); + /** + * @returns Untyped section of given name (const). + * @throws InvalidArgument if there is no untyped section of given name. + */ const UntypedOpSequence& get_untyped_section(const std::string& name) const; + /** + * @returns Section of given name and type T. + * @throws InvalidArgument if there is no section of given name and type T. + */ template<class T> OpSequence<T>& get_section(const std::string& name); + /** + * @returns Section of given name and type T (const). + * @throws InvalidArgument if there is no section of given name and type T. + */ template<class T> const OpSequence<T>& get_section(const std::string& name) const; - + /** + * @returns Shared pointer to an untyped section of the given name. + * @throws InvalidArgument if there is no untyped section of given name. + */ boost::shared_ptr<UntypedOpSequence> get_untyped_section_ptr(const std::string& name); + /** + * @returns Shared pointer to an untyped section of the given name (const). + * @throws InvalidArgument if there is no untyped section of given name. + */ boost::shared_ptr<const UntypedOpSequence> get_untyped_section_ptr(const std::string& name) const; + /** + * @returns Shared pointer to a section of the given name and type T. + * @throws InvalidArgument if there is no section of given name and type T. + */ template<class T> boost::shared_ptr<OpSequence<T> > get_section_ptr(const std::string& name); + /** + * @returns Shared pointer to a section of the given name and type T (const). + * @throws InvalidArgument if there is no section of given name and type T. 
+ */ template<class T> boost::shared_ptr<const OpSequence<T> > get_section_ptr(const std::string& name) const; - + /** + * @returns Operator from an untyped section of given name, present at given index. + * @note Index is zero-based and default is 0 (no idx given = the first operator in the sequence). + * @throws InvalidArgument if there is no untyped section of given name or if idx is out of range. + */ FunctionalOperator& get_untyped_op(const std::string& name, size_t idx = 0); + /** + * @returns Operator from an untyped section of given name, present at given index (const). + * @note Index is zero-based and default is 0 (no idx given = the first operator in the sequence). + * @throws InvalidArgument if there is no untyped section of given name or if idx is out of range. + */ const FunctionalOperator& get_untyped_op(const std::string& name, size_t idx = 0) const; + /** + * @returns Operator from a section of given name and type T, present at given index. + * @note Index is zero-based and default is 0 (no idx given = the first operator in the sequence). + * @throws InvalidArgument if there is no section of given name and type T or if idx is out of range. + */ template<class T> Operator<T>& get_op(const std::string& name, size_t idx = 0); + /** + * @returns Operator from a section of given name and type T, present at given index (const). + * @note Index is zero-based and default is 0 (no idx given = the first operator in the sequence). + * @throws InvalidArgument if there is no section of given name and type T or if idx is out of range. + */ template<class T> const Operator<T>& get_op(const std::string& name, size_t idx = 0) const; - + /** + * @returns Shared pointer to an operator from an untyped section of given name, present at given index. + * @note Index is zero-based and default is 0 (no idx given = the first operator in the sequence). + * @throws InvalidArgument if there is no untyped section of given name or if idx is out of range. 
+ */ boost::shared_ptr<FunctionalOperator> get_untyped_op_ptr(const std::string& name, size_t idx = 0); + /** + * @returns Shared pointer to an operator from an untyped section of given name, present at given index (const). + * @note Index is zero-based and default is 0 (no idx given = the first operator in the sequence). + * @throws InvalidArgument if there is no untyped section of given name or if idx is out of range. + */ boost::shared_ptr<const FunctionalOperator> get_untyped_op_ptr(const std::string& name, size_t idx = 0) const; + /** + * @returns Shared pointer to an operator from a section of given name and type T, present at given index. + * @note Index is zero-based and default is 0 (no idx given = the first operator in the sequence). + * @throws InvalidArgument if there is no section of given name and type T, or if idx is out of range. + */ template<class T> boost::shared_ptr<Operator<T> > get_op_ptr(const std::string& name, size_t idx = 0); + /** + * @returns Shared pointer to an operator from a section of given name and type T, present at given index (const). + * @note Index is zero-based and default is 0 (no idx given = the first operator in the sequence). + * @throws InvalidArgument if there is no section of given name and type T, or if idx is out of range. + */ template<class T> boost::shared_ptr<const Operator<T> > get_op_ptr(const std::string& name, size_t idx = 0) const; + /** + * @returns Vector of all name-operator pairs from untyped sections, in the order they were found in the file. + * @note Generated names of operators are derived from section name they are contained within, + * suffixed with hyphen and zero-based index. E.g. the first operator in section named "seq" + * will have name "seq-0", second operator's name will be "seq-1" etc. + */ UntypedOpSequence::name_op_v_t gen_name_untyped_op_pairs(); + /** + * @returns Vector of all name-operator pairs from untyped sections, in the order they were found in the file (const). 
+ * @note Generated names of operators are derived from section name they are contained within, + * suffixed with hyphen and zero-based index. E.g. the first operator in section named "seq" + * will have name "seq-0", second operator's name will be "seq-1" etc. + */ UntypedOpSequence::name_op_v_c_t gen_name_untyped_op_pairs() const; + /** + * @returns Vector of all name-operator pairs from sections of given type T, in the order they were found in the file. + * @note Generated names of operators are derived from section name they are contained within, + * suffixed with hyphen and zero-based index. E.g. the first operator in section named "seq" + * will have name "seq-0", second operator's name will be "seq-1" etc. + */ template<class T> typename OpSequence<T>::name_op_v_t gen_name_op_pairs(); + /** + * @returns Vector of all name-operator pairs from sections of given type T, in the order they were found in the file (const). + * @note Generated names of operators are derived from section name they are contained within, + * suffixed with hyphen and zero-based index. E.g. the first operator in section named "seq" + * will have name "seq-0", second operator's name will be "seq-1" etc. + */ template<class T> typename OpSequence<T>::name_op_v_c_t gen_name_op_pairs() const; + /** + * @returns Vector of all name-operator pairs from all sections of the file (both typed and untyped), + * in the order they were found in the file. + * @note Generated names of operators are derived from section name they are contained within, + * suffixed with hyphen and zero-based index. E.g. the first operator in section named "seq" + * will have name "seq-0", second operator's name will be "seq-1" etc. + */ FunctionalOpSequence::name_op_v_t gen_all_op_pairs(); + /** + * @returns Vector of all name-operator pairs from all sections of the file (both typed and untyped), + * in the order they were found in the file (const). 
+ * @note Generated names of operators are derived from section name they are contained within, + * suffixed with hyphen and zero-based index. E.g. the first operator in section named "seq" + * will have name "seq-0", second operator's name will be "seq-1" etc. + */ FunctionalOpSequence::name_op_v_c_t gen_all_op_pairs() const; - void import_lexicon(const boost::shared_ptr<Lexicon>& lexicon); + + /////////////////////////// + // Lexicons, import sections: import("path", "name") + /////////////////////////// + + /** + * @returns True if there is an imported Lexicon of given name. False otherwise. + */ bool has_lexicon(const std::string& name) const; + /** + * @returns True if there are any Lexicons imported. + */ + bool has_lexicons() const; + /** + * @returns Shared pointer to a Lexicon of given name (const). + * @throws InvalidArgument if there is no Lexicon of given name imported. + */ boost::shared_ptr<const Lexicon> get_lexicon_ptr(const std::string& name) const; + /** + * @returns Lexicon of given name (const). + * @throws InvalidArgument if there is no Lexicon of given name imported. + */ const Lexicon& get_lexicon(const std::string& name) const; + /** + * @returns Shared pointer to collection of all imported Lexicons (const). + */ boost::shared_ptr<const Lexicons> get_lexicons_ptr() const; + /** + * @returns Collection of all imported Lexicons (const). + */ const Lexicons& get_lexicons() const; + + /////////////////////////// + // Tag rules section: tag_rules ( rule1; rule2 ) + /////////////////////////// + + /** + * @returns True if this WcclFile has tag_rules section. False otherwise. + */ + bool has_tag_rules() const; + /** + * @returns The tag rules section from this WCCL file. + * @throws WcclError if there is no tag rules section. + */ + const TagRuleSequence& get_tag_rules() const; + /** + * @returns The tag rules section from this WCCL file. + * @throws WcclError if there is no tag rules section. 
+ */ + boost::shared_ptr<TagRuleSequence> get_tag_rules_ptr(); + /** + * @returns Shared pointer to the tag rules section from this WCCL file (const). + * @throws WcclError if there is no tag rules section. + */ + boost::shared_ptr<const TagRuleSequence> get_tag_rules_ptr() const; + + /////////////////////////// + // Match rules section: match_rules ( apply1; apply2 ) + /////////////////////////// + + /** + * @returns True if this WcclFile has match_rules section. False otherwise. + */ + bool has_match_rules() const; + + /** + * @returns The match rules section from this WCCL file. + * @throws WcclError if there is no match_rules section. + */ + const MatchRuleSequence& get_match_rules() const; + /** + * @returns Shared pointer to the match rules section from this WCCL file. + * @throws WcclError if there is no match_rules section. + */ + boost::shared_ptr<MatchRuleSequence> get_match_rules_ptr(); + /** + * @returns Shared pointer to the match rules section from this WCCL file (const). + * @throws WcclError if there is no match_rules section. + */ + boost::shared_ptr<const MatchRuleSequence> get_match_rules_ptr() const; + + /////////////////////////// + // Miscellaneous + /////////////////////////// + friend std::ostream& operator<<(std::ostream& ostream, const WcclFile& wccl_file); + std::string to_string() const; + + const Corpus2::Tagset& tagset() const; + const PwrNlp::PathSearcher<Wccl::FileNotFound> path() const { return path_; } + PwrNlp::PathSearcher<Wccl::FileNotFound> path() { return path_; } + + /////////////////////////// + // Internal parser-related methods to add sections.
+ /////////////////////////// void add_untyped_section(const boost::shared_ptr<UntypedOpSequence>& section); void add_untyped_section(const boost::shared_ptr<const UntypedOpSequence>& section); void add_untyped_section(const UntypedOpSequence& section); @@ -92,25 +320,17 @@ public: template<class T> void add_section(const OpSequence<T>& section); - bool has_tag_rules() const; + void import_lexicon(const boost::shared_ptr<Lexicon>& lexicon); void set_tag_rules(const boost::shared_ptr<TagRuleSequence>& tag_rules); - - const TagRuleSequence& get_tag_rules() const; - boost::shared_ptr<TagRuleSequence> get_tag_rules_ptr(); - boost::shared_ptr<const TagRuleSequence> get_tag_rules_ptr() const; - - friend std::ostream& operator<<(std::ostream& ostream, const WcclFile& wccl_file); - std::string to_string() const; - - const Corpus2::Tagset& tagset() const; - const PwrNlp::PathSearcher<Wccl::FileNotFound> path() const { return path_; } - PwrNlp::PathSearcher<Wccl::FileNotFound> path() { return path_; } + + void set_match_rules(const boost::shared_ptr<MatchRuleSequence>& match_rules); private: std::ostream& write_to(std::ostream& ostream) const; std::vector<boost::shared_ptr<FunctionalOpSequence> > all_sections_; boost::shared_ptr<TagRuleSequence> tag_rules_; + boost::shared_ptr<MatchRuleSequence> match_rules_; boost::shared_ptr<Lexicons> lexicons_; const Corpus2::Tagset& tagset_; PwrNlp::PathSearcher<Wccl::FileNotFound> path_; @@ -307,6 +527,12 @@ bool WcclFile::has_lexicon(const std::string& name) const return lexicons_->has_lexicon(name); } +inline +bool WcclFile::has_lexicons() const +{ + return lexicons_; +} + inline boost::shared_ptr<const Lexicon> WcclFile::get_lexicon_ptr(const std::string& name) const { @@ -394,6 +620,27 @@ void WcclFile::set_tag_rules(const boost::shared_ptr<TagRuleSequence>& tag_rules tag_rules_ = tag_rules; } +inline +bool WcclFile::has_match_rules() const +{ + return match_rules_; +} + +inline +const MatchRuleSequence& WcclFile::get_match_rules() 
const +{ + return *get_match_rules_ptr(); +} + +inline +void WcclFile::set_match_rules(const boost::shared_ptr<MatchRuleSequence>& match_rules) +{ + if (has_match_rules()) { + throw WcclError("Match rules already added."); + } + match_rules_ = match_rules; +} + inline std::ostream& operator <<(std::ostream& ostream, const WcclFile& wccl_file) { return wccl_file.write_to(ostream); diff --git a/swig/CMakeLists.txt b/swig/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..acbdc02cea84abb4a8eb0b89f22437c58a6ff549 --- /dev/null +++ b/swig/CMakeLists.txt @@ -0,0 +1,120 @@ +########## wccl wrappers -- SWIG ############### + +PROJECT(WcclSwigWrap) + +find_package(Corpus2 1.0.8 REQUIRED) +set(CORPUS2_LIBS ${Corpus2_LIBRARY}) + +find_package(PwrUtils 1.0.1 REQUIRED) +set(PWRUTILS_LIBS ${PwrUtils_LIBRARY}) + +# find_package(Wccl 1.0.1 REQUIRED) +# set(WCCL_LIBS ${Wccl_LIBRARY}) +set(CORPUS2_PWR_WCCL_LIBS ${CORPUS2_LIBS} ${PWRUTILS_LIBS} ${WCCL_LIBS}) + +include_directories (${Libwccl_SOURCE_DIR} "../libwccl") +link_directories(${Libwccl_BINARY_DIR}) + +FIND_PACKAGE(SWIG REQUIRED) +INCLUDE(${SWIG_USE_FILE}) + +find_package(PythonLibs) +find_package(PythonInterp) +# idea taken from pyplot build system +execute_process( + COMMAND + ${PYTHON_EXECUTABLE} -c "from distutils import sysconfig; print sysconfig.get_python_lib(1,0,prefix='${CMAKE_INSTALL_EXEC_PREFIX}')" + OUTPUT_VARIABLE PYTHON_INSTDIR + OUTPUT_STRIP_TRAILING_WHITESPACE +) +message(STATUS "INFO: " "python lib: ${PYTHON_INSTDIR}" ) + +INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_PATH}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# ----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- + +SET(CMAKE_SWIG_FLAGS "") +SET(SWIG_SRC_DIR ${SWIG_DIR}/wccl) + +# ----------------------------------------------------------------------------- +# wccl +SET_SOURCE_FILES_PROPERTIES(wccl.i PROPERTIES
CPLUSPLUS ON) +SET_SOURCE_FILES_PROPERTIES(wccl.i PROPERTIES SWIG_FLAGS "-includeall" ) +SWIG_ADD_MODULE(wccl python wccl.i ) +SWIG_LINK_LIBRARIES(wccl ${PYTHON_LIBRARIES} ${CORPUS2_PWR_WCCL_LIBS}) + +# ----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- + +set_target_properties( + _wccl + PROPERTIES + INSTALL_NAME_DIR "${PYTHON_INSTDIR}" +) + +set(PERM_SCRIPTS + OWNER_READ + OWNER_WRITE + OWNER_EXECUTE + GROUP_READ + GROUP_EXECUTE + WORLD_READ + WORLD_EXECUTE +) + +# ----------------------------------------------------------------------------- +# Install python modules +# ----------------------------------------------------------------------------- + +install( + TARGETS _wccl + LIBRARY + DESTINATION ${PYTHON_INSTDIR} + PERMISSIONS ${PERM_SCRIPTS} +) + +install( + FILES ${CMAKE_CURRENT_BINARY_DIR}/wccl.py + DESTINATION ${PYTHON_INSTDIR} + PERMISSIONS ${PERM_SCRIPTS} +) + +# ----------------------------------------------------------------------------- +# Install SWIG files +# ----------------------------------------------------------------------------- + +install( + FILES libcclactionexeccontext.i + libcclannotationmatch.i + libcclbool.i + libcclexpression.i + libcclfunctionaloperator.i + libcclfunctionalopsequence.i + libccllexicon.i + libccllexicons.i + libcclmatchdata.i + libcclmatch.i + libcclmatchrule.i + libcclmatchrulesequence.i + libcclmatchvector.i + libccloperator.i + libcclparsedexpression.i + libcclparser.i + libcclposition.i + libcclsentencecontext.i + libcclstrset.i + libccltagaction.i + libccltagrule.i + libccltagrulesequence.i + libccltokenmatch.i + libccltset.i + libcclvalue.i + libcclvariables.i + libcclwcclfile.i + libcclwcclfileopsections.i + wccl.i + DESTINATION ${SWIG_SRC_DIR} + PERMISSIONS ${PERM_SCRIPTS} +) diff --git a/swig/Makefile b/swig/Makefile index 3c69872e9bd436a997dc34169ca413d8c2b9d5ec..7a89f0495df24bed6d056625f6d5af645926b1ab 
100644 --- a/swig/Makefile +++ b/swig/Makefile @@ -24,13 +24,26 @@ CBIN=libcclvalue.o \ libcclmatchdata.o \ libccltokenmatch.o \ libcclannotationmatch.o \ + libcclmatchvector.o \ libcclexpression.o \ libcclparsedexpression.o \ libcclfunctionaloperator.o \ libccloperator.o \ libcclvariables.o \ libcclsentencecontext.o \ - libcclparser.o + libcclparser.o \ + libcclactionexeccontext.o \ + libccltagaction.o \ + libccltagrule.o \ + libccltagrulesequence.o \ + libcclmatchrule.o \ + libcclmatchrulesequence.o \ + libcclfunctionalopsequence.o \ + libccllexicon.o \ + libccllexicons.o \ + libcclwcclfileopsections.o \ + libcclwcclfile.o \ + wccl.o CBINOUT=_libcclvalue.so \ _libcclstrset.so \ @@ -45,15 +58,27 @@ CBINOUT=_libcclvalue.so \ _libcclmatchdata.so \ _libccltokenmatch.so \ _libcclannotationmatch.so \ + _libcclmatchvector.so \ _libcclsentencecontext.so \ _libccloperator.so \ _libcclparser.so \ + _libcclactionexeccontext.so \ + _libccltagaction.so \ + _libccltagrule.so \ + _libccltagrulesequence.so \ + _libcclmatchrule.so \ + _libcclmatchrulesequence.so \ + _libcclfunctionalopsequence.so \ + _libccllexicon.so \ + _libccllexicons.so \ + _libcclwcclfileopsections.so \ + _libcclwcclfile.so \ + _wccl.so \ _boost_shared_ptr.so CWRAP=libcclvalue_wrap.cxx \ libcclstrset_wrap.cxx \ libcclbool_wrap.cxx \ - libcclparser_wrap.cxx \ libcclposition_wrap.cxx \ libccltset_wrap.cxx \ libcclexpression_wrap.cxx \ @@ -64,9 +89,22 @@ CWRAP=libcclvalue_wrap.cxx \ libcclmatchdata_wrap.cxx \ libccltokenmatch_wrap.cxx \ libcclannotationmatch_wrap.cxx \ + libcclmatchvector_wrap.cxx \ libcclsentencecontext_wrap.cxx \ - libcorpussentence_wrap.cxx \ libccloperator_wrap.cxx \ + libcclparser_wrap.cxx \ + libcclactionexeccontext_wrap.cxx \ + libccltagaction_wrap.cxx \ + libccltagrule_wrap.cxx \ + libccltagrulesequence_wrap.cxx \ + libcclmatchrule_wrap.cxx \ + libcclmatchrulesequence_wrap.cxx \ + libcclfunctionalopsequence_wrap.cxx \ + libccllexicon_wrap.cxx \ + libccllexicons_wrap.cxx \ + 
libcclwcclfileopsections_wrap.cxx \ + libcclwcclfile_wrap.cxx \ + wccl_wrap.cxx \ boost_shared_ptr_wrap.cxx CWRAPBIN=libcclvalue_wrap.o \ @@ -82,15 +120,27 @@ CWRAPBIN=libcclvalue_wrap.o \ libcclmatchdata_wrap.o \ libccltokenmatch_wrap.o \ libcclannotationmatch_wrap.o \ + libcclmatchvector_wrap.o \ libcclsentencecontext_wrap.o \ libccloperator_wrap.o \ libcclparser_wrap.o \ + libcclactionexeccontext_wrap.o \ + libccltagaction_wrap.o \ + libccltagrule_wrap.o \ + libccltagrulesequence_wrap.o \ + libcclmatchrule_wrap.o \ + libcclmatchrulesequence_wrap.o \ + libcclfunctionalopsequence_wrap.o \ + libccllexicon_wrap.o \ + libccllexicons_wrap.o \ + libcclwcclfileopsections_wrap.o \ + libcclwcclfile_wrap.o \ + wccl_wrap.o \ boost_shared_ptr_wrap.o PYMODULES=libcclvalue.py \ libcclstrset.py \ libcclbool.py \ - libcclparser.py \ libcclposition.py \ libccltset.py \ libcclmatch.py \ @@ -101,8 +151,22 @@ PYMODULES=libcclvalue.py \ libcclvariables.py \ libcclsentencecontext.py \ libcclannotationmatch.py \ + libcclmatchvector.py \ libcclfunctionaloperator.py \ libccloperator.py \ + libcclparser.py \ + libcclactionexeccontext.py \ + libccltagaction.py \ + libccltagrule.py \ + libccltagrulesequence.py \ + libcclmatchrule.py \ + libcclmatchrulesequence.py \ + libcclfunctionalopsequence.py \ + libccllexicon.py \ + libccllexicons.py \ + libcclwcclfileopsections.py \ + libcclwcclfile.py \ + wccl.py \ boost_shared_ptr.py PYCBIN=libcclvalue.pyc \ @@ -118,9 +182,22 @@ PYCBIN=libcclvalue.pyc \ libcclvariables.pyc \ libcclsentencecontext.pyc \ libcclannotationmatch.pyc \ + libcclmatchvector.pyc \ libcclfunctionaloperator.pyc \ libccloperator.pyc \ libcclparser.pyc \ + libcclactionexeccontext.pyc \ + libccltagaction.pyc \ + libccltagrule.pyc \ + libccltagrulesequence.pyc \ + libcclmatchrule.pyc \ + libcclmatchrulesequence.pyc \ + libcclfunctionalopsequence.pyc \ + libccllexicon.pyc \ + libccllexicons.pyc \ + libcclwcclfileopsections.pyc \ + libcclwcclfile.pyc \ + wccl.pyc \ 
boost_shared_ptr.pyc all:boost_shared_ptr.o $(CBIN) @@ -205,6 +282,13 @@ libcclannotationmatch.o: $(CPP) -shared libcclannotationmatch_wrap.o \ $(CCLBIN) $(ANTLRLIB) -o _libcclannotationmatch.so +# MatchVector +libcclmatchvector.o: + $(SWIG) $(SWIGOPTS_LANG) libcclmatchvector.i + $(CPP) -c libcclmatchvector_wrap.cxx -I$(PYTHONDIR) -I$(WCCLDIR) $(CPPFLAGS) + $(CPP) -shared libcclmatchvector_wrap.o \ + $(CCLBIN) $(ANTLRLIB) -o _libcclmatchvector.so + # Expression libcclexpression.o: $(SWIG) $(SWIGOPTS_LANG) libcclexpression.i @@ -256,6 +340,93 @@ libcclparser.o: $(CPP) -shared libcclparser_wrap.o \ $(CCLBIN) $(ANTLRLIB) -o _libcclparser.so +# ----------------------------------------------------------------------------- + +# ActionExecContext +libcclactionexeccontext.o: + $(SWIG) $(SWIGOPTS_LANG) libcclactionexeccontext.i + $(CPP) -c libcclactionexeccontext_wrap.cxx -I$(PYTHONDIR) -I$(WCCLDIR) $(CPPFLAGS) + $(CPP) -shared libcclactionexeccontext_wrap.o \ + $(CCLBIN) $(ANTLRLIB) -o _libcclactionexeccontext.so + +# TagAction +libccltagaction.o: + $(SWIG) $(SWIGOPTS_LANG) libccltagaction.i + $(CPP) -c libccltagaction_wrap.cxx -I$(PYTHONDIR) -I$(WCCLDIR) $(CPPFLAGS) + $(CPP) -shared libccltagaction_wrap.o \ + $(CCLBIN) $(ANTLRLIB) -o _libccltagaction.so + +# TagRule +libccltagrule.o: + $(SWIG) $(SWIGOPTS_LANG) libccltagrule.i + $(CPP) -c libccltagrule_wrap.cxx -I$(PYTHONDIR) -I$(WCCLDIR) $(CPPFLAGS) + $(CPP) -shared libccltagrule_wrap.o \ + $(CCLBIN) $(ANTLRLIB) -o _libccltagrule.so + +# TagRuleSequence +libccltagrulesequence.o: + $(SWIG) $(SWIGOPTS_LANG) libccltagrulesequence.i + $(CPP) -c libccltagrulesequence_wrap.cxx -I$(PYTHONDIR) -I$(WCCLDIR) $(CPPFLAGS) + $(CPP) -shared libccltagrulesequence_wrap.o \ + $(CCLBIN) $(ANTLRLIB) -o _libccltagrulesequence.so + +# MatchRule +libcclmatchrule.o: + $(SWIG) $(SWIGOPTS_LANG) libcclmatchrule.i + $(CPP) -c libcclmatchrule_wrap.cxx -I$(PYTHONDIR) -I$(WCCLDIR) $(CPPFLAGS) + $(CPP) -shared libcclmatchrule_wrap.o \ + $(CCLBIN) 
$(ANTLRLIB) -o _libcclmatchrule.so + +# MatchRuleSequence +libcclmatchrulesequence.o: + $(SWIG) $(SWIGOPTS_LANG) libcclmatchrulesequence.i + $(CPP) -c libcclmatchrulesequence_wrap.cxx -I$(PYTHONDIR) -I$(WCCLDIR) $(CPPFLAGS) + $(CPP) -shared libcclmatchrulesequence_wrap.o \ + $(CCLBIN) $(ANTLRLIB) -o _libcclmatchrulesequence.so + +# FunctionalOpSequence +libcclfunctionalopsequence.o: + $(SWIG) $(SWIGOPTS_LANG) libcclfunctionalopsequence.i + $(CPP) -c libcclfunctionalopsequence_wrap.cxx -I$(PYTHONDIR) -I$(WCCLDIR) $(CPPFLAGS) + $(CPP) -shared libcclfunctionalopsequence_wrap.o \ + $(CCLBIN) $(ANTLRLIB) -o _libcclfunctionalopsequence.so + +# Lexicon +libccllexicon.o: + $(SWIG) $(SWIGOPTS_LANG) libccllexicon.i + $(CPP) -c libccllexicon_wrap.cxx -I$(PYTHONDIR) -I$(WCCLDIR) $(CPPFLAGS) + $(CPP) -shared libccllexicon_wrap.o \ + $(CCLBIN) $(ANTLRLIB) -o _libccllexicon.so + +# Lexicons +libccllexicons.o: + $(SWIG) $(SWIGOPTS_LANG) libccllexicons.i + $(CPP) -c libccllexicons_wrap.cxx -I$(PYTHONDIR) -I$(WCCLDIR) $(CPPFLAGS) + $(CPP) -shared libccllexicons_wrap.o \ + $(CCLBIN) $(ANTLRLIB) -o _libccllexicons.so + +# WcclFileOpSections +libcclwcclfileopsections.o: + $(SWIG) $(SWIGOPTS_LANG) libcclwcclfileopsections.i + $(CPP) -c libcclwcclfileopsections_wrap.cxx -I$(PYTHONDIR) -I$(WCCLDIR) $(CPPFLAGS) + $(CPP) -shared libcclwcclfileopsections_wrap.o \ + $(CCLBIN) $(ANTLRLIB) -o _libcclwcclfileopsections.so + +# WcclFile +libcclwcclfile.o: + $(SWIG) $(SWIGOPTS_LANG) libcclwcclfile.i + $(CPP) -c libcclwcclfile_wrap.cxx -I$(PYTHONDIR) -I$(WCCLDIR) $(CPPFLAGS) + $(CPP) -shared libcclwcclfile_wrap.o \ + $(CCLBIN) $(ANTLRLIB) -o _libcclwcclfile.so + +# ----------------------------------------------------------------------------- +# Wccl Library +wccl.o: + $(SWIG) $(SWIGOPTS_LANG) wccl.i + $(CPP) -c wccl_wrap.cxx -I$(PYTHONDIR) -I$(WCCLDIR) $(CPPFLAGS) + $(CPP) -shared wccl_wrap.o \ + $(CCLBIN) $(ANTLRLIB) -o _wccl.so + # 
----------------------------------------------------------------------------- clean: rm -f $(CBIN) $(CBINOUT) $(CWRAP) $(CWRAPBIN) $(PYMODULES) $(PYCBIN) diff --git a/swig/libcclactionexeccontext.i b/swig/libcclactionexeccontext.i new file mode 100644 index 0000000000000000000000000000000000000000..528164bc7ce3c4f43b11f5b16b0eabb240751d1d --- /dev/null +++ b/swig/libcclactionexeccontext.i @@ -0,0 +1,31 @@ +#ifndef SWIG_LIBWCCL_ACTIONEXECCONTEXT_I +#define SWIG_LIBWCCL_ACTIONEXECCONTEXT_I + +%module libcclactionexeccontext +%{ + #include <libwccl/ops/tagaction.h> +%} + +%include "libcclvariables.i" +%include "libcclsentencecontext.i" + +namespace Wccl { + class ActionExecContext { + public: + ActionExecContext( + SentenceContext& sentence_context, + const boost::shared_ptr<Variables>& vars); + /* --------------------------------------------------------------------- */ + + SentenceContext& sentence_context() const; + /* --------------------------------------------------------------------- */ + + const boost::shared_ptr<Variables>& variables() const; + }; +} + +using namespace boost; +using namespace Wccl; +using namespace std; + +#endif /* SWIG_LIBWCCL_ACTIONEXECCONTEXT_I */ diff --git a/swig/libcclannotationmatch.i b/swig/libcclannotationmatch.i index 791be9f5269346fb72b3df9fd59ac721765327de..7a2c9176b8f524b813c96f8379acd4830b7144cc 100644 --- a/swig/libcclannotationmatch.i +++ b/swig/libcclannotationmatch.i @@ -6,32 +6,32 @@ #include <libwccl/values/annotationmatch.h> %} -%include "std_string.i" -%include "libcclposition.i" %include "libcclmatchdata.i" %include "libcorpus/libcorpusannotatedsentence.i" +%include "std_string.i" + %nodefaultctor Wccl::AnnotationMatch; namespace Wccl { class AnnotationMatch : public MatchData { public: - explicit AnnotationMatch(Position position, const std::string channel); explicit AnnotationMatch(int pos, const std::string channel); /* --------------------------------------------------------------------- */ bool empty() const; /* 
--------------------------------------------------------------------- */ - Position first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const; - Position last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const; + int first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const; + int last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const; /* --------------------------------------------------------------------- */ std::string to_raw_string() const; }; } -using namespace std; +using namespace boost; using namespace Wccl; +using namespace std; #endif /* SWIG_LIBWCCL_ANNOTATIONMATCH_I */ diff --git a/swig/libcclexpression.i b/swig/libcclexpression.i index 128f7a9f0492a0a05defd21bab6d3d501b54e14b..84816e4b9eff26b659e87b6bc4164f62ebadf982 100644 --- a/swig/libcclexpression.i +++ b/swig/libcclexpression.i @@ -7,11 +7,11 @@ #include <boost/noncopyable.hpp> %} -%include "libcorpustagset.i" +%include "libcorpus/libcorpustagset.i" %include "std_string.i" namespace Wccl { - class Expression : boost::noncopyable { + class Expression { public: virtual std::string to_string(const Corpus2::Tagset& tagset) const = 0; std::string to_raw_string() const; diff --git a/swig/libcclfunctionaloperator.i b/swig/libcclfunctionaloperator.i index a36c72d8d2d750fcbdaea8354c411cd510e2bce4..a8a9b8d33b6f989155c763a4986e52ee640390c4 100644 --- a/swig/libcclfunctionaloperator.i +++ b/swig/libcclfunctionaloperator.i @@ -14,10 +14,15 @@ %include "std_string.i" %include "boost_shared_ptr.i" +// %template(ValuePtr) boost::shared_ptr<Wccl::Value>; +%template(FunctionalOperatorPtr) boost::shared_ptr<Wccl::FunctionalOperator>; + namespace Wccl { class FunctionalOperator : public ParsedExpression { public: - // virtual boost::shared_ptr<const Value> base_apply(const SentenceContext& sc) = 0; + virtual boost::shared_ptr<const Value> base_apply(const SentenceContext& sc) = 0; + + /* --------------------------------------------------------------------- */ 
boost::shared_ptr<FunctionalOperator> clone_ptr() const; boost::shared_ptr<FunctionalOperator> clone_clean_ptr() const; @@ -27,8 +32,6 @@ namespace Wccl { }; } -%template (FunctionalOperatorSharedPtr) boost::shared_ptr<Wccl::FunctionalOperator>; - using namespace boost; using namespace std; using namespace Wccl; diff --git a/swig/libcclfunctionalopsequence.i b/swig/libcclfunctionalopsequence.i new file mode 100644 index 0000000000000000000000000000000000000000..d88f920eb74dc0ba50a32593653b26a62761dafe --- /dev/null +++ b/swig/libcclfunctionalopsequence.i @@ -0,0 +1,205 @@ +#ifndef SWIG_LIBWCCL_FUNCTIONALOPSEQUENCE_I +#define SWIG_LIBWCCL_FUNCTIONALOPSEQUENCE_I + +%module libcclfunctionalopsequence +%{ + #include <libwccl/ops/opsequence.h> +%} + +%include "libccloperator.i" +%include "libcclexpression.i" +%include "libcclfunctionaloperator.i" + +%include "std_pair.i" +%include "std_string.i" +%include "std_vector.i" + +%include "boost_shared_ptr.i" + +%nodefaultctor Wccl::FunctionalOpSequence; + +%template(fun_op_ptr_t) boost::shared_ptr<FunctionalOperator>; +%template(fun_op_ptr_c_t) boost::shared_ptr<const FunctionalOperator>; +%template(name_op_pair_t) std::pair<std::string, boost::shared_ptr<FunctionalOperator> > ; +%template(name_op_pair_c_t) std::pair<std::string, boost::shared_ptr<const FunctionalOperator> >; +%template(name_op_v_t) std::vector<std::pair<std::string, boost::shared_ptr<FunctionalOperator> > >; +%template(name_op_v_c_t) std::vector<std::pair<std::string, boost::shared_ptr<const FunctionalOperator> > >; + +namespace Wccl { + class FunctionalOpSequence : public Expression { + public: + typedef shared_ptr<FunctionalOperator> fun_op_ptr_t; + typedef shared_ptr<const FunctionalOperator> fun_op_ptr_c_t; + typedef std::pair<std::string, fun_op_ptr_t> name_op_pair_t; + typedef std::pair<std::string, fun_op_ptr_c_t> name_op_pair_c_t; + typedef std::vector<name_op_pair_t> name_op_v_t; + typedef std::vector<name_op_pair_c_t> name_op_v_c_t; + /* 
--------------------------------------------------------------------- */ + + std::string name() const; + /* --------------------------------------------------------------------- */ + + virtual size_t size() const = 0; + virtual bool empty() const = 0; + /* --------------------------------------------------------------------- */ + + virtual FunctionalOperator& get(size_t idx) = 0; + virtual const FunctionalOperator& get(size_t idx) const = 0; + /* --------------------------------------------------------------------- */ + + virtual fun_op_ptr_t get_untyped_ptr(size_t idx) = 0; + virtual fun_op_ptr_c_t get_untyped_ptr(size_t idx) const = 0; + /* --------------------------------------------------------------------- */ + + name_op_v_t& add_name_op_pairs_untyped(name_op_v_t& pairs); + name_op_v_c_t& add_name_op_pairs_untyped(name_op_v_c_t& pairs) const; + /* --------------------------------------------------------------------- */ + + name_op_v_t gen_name_op_pairs_untyped(); + name_op_v_c_t gen_name_op_pairs_untyped() const; + /* --------------------------------------------------------------------- */ + + std::string gen_op_name(size_t idx) const; + /* --------------------------------------------------------------------- */ + + name_op_pair_t gen_name_op_pair_untyped(size_t idx); + name_op_pair_c_t gen_name_op_pair_untyped(size_t idx) const; + /* --------------------------------------------------------------------- */ + + shared_ptr<FunctionalOpSequence> clone() const; + /* --------------------------------------------------------------------- */ + + virtual ~FunctionalOpSequence(); + /* --------------------------------------------------------------------- */ + + protected: + FunctionalOpSequence(const std::string& name); + virtual FunctionalOpSequence* clone_internal() const = 0; + }; // FunctionalOpSequence + + /* ----------------------------------------------------------------------- */ + + + class UntypedOpSequence : public FunctionalOpSequence { + public: + typedef 
FunctionalOperator op_t; + + UntypedOpSequence(const std::string& name); + /* --------------------------------------------------------------------- */ + + bool empty() const; + size_t size() const; + /* --------------------------------------------------------------------- */ + + void append(const fun_op_ptr_t& op); + /* --------------------------------------------------------------------- */ + + op_t& get(size_t idx); + const op_t& get(size_t idx) const; + /* --------------------------------------------------------------------- */ + + fun_op_ptr_t get_untyped_ptr(size_t idx); + fun_op_ptr_c_t get_untyped_ptr(size_t idx) const; + /* --------------------------------------------------------------------- */ + + fun_op_ptr_t get_ptr(size_t idx); + fun_op_ptr_c_t get_ptr(size_t idx) const; + /* --------------------------------------------------------------------- */ + + name_op_v_t& add_name_op_pairs(name_op_v_t& pairs); + name_op_v_c_t& add_name_op_pairs(name_op_v_c_t& pairs) const; + /* --------------------------------------------------------------------- */ + + name_op_v_t gen_name_op_pairs(); + name_op_v_c_t gen_name_op_pairs() const; + /* --------------------------------------------------------------------- */ + + name_op_pair_t gen_name_op_pair(size_t idx); + name_op_pair_c_t gen_name_op_pair(size_t idx) const; + /* --------------------------------------------------------------------- */ + + std::string to_string(const Corpus2::Tagset& tagset) const; + /* --------------------------------------------------------------------- */ + + UntypedOpSequence(const UntypedOpSequence& seq); + /* --------------------------------------------------------------------- */ + + shared_ptr<UntypedOpSequence> clone() const; + /* --------------------------------------------------------------------- */ + + protected: + std::ostream& write_to(std::ostream& ostream) const; + UntypedOpSequence* clone_internal() const; + }; // UntypedOpSequence + + /* 
----------------------------------------------------------------------- */ + + %rename(op_t) Wccl::Operator<T>; + %rename(op_ptr_t) boost::shared_ptr<Wccl::Operator<T> >; + %rename(op_ptr_c_t) boost::shared_ptr<const Wccl::Operator<T> >; + + template<class T> class OpSequence : public FunctionalOpSequence { + public: + typedef Operator<T> op_t; + typedef typename shared_ptr<Operator<T> > op_ptr_t; + typedef typename shared_ptr<const Operator<T> > op_ptr_c_t; + typedef typename std::pair<std::string, op_ptr_t> name_op_pair_t; + typedef typename std::pair<std::string, op_ptr_c_t> name_op_pair_c_t; + typedef typename std::vector<name_op_pair_t> name_op_v_t; + typedef typename std::vector<name_op_pair_c_t> name_op_v_c_t; + /* --------------------------------------------------------------------- */ + + OpSequence(const std::string& name); + /* --------------------------------------------------------------------- */ + + bool empty() const; + size_t size() const; + /* --------------------------------------------------------------------- */ + + void append(const op_ptr_t& op); + /* --------------------------------------------------------------------- */ + + op_t& get(size_t idx); + const op_t& get(size_t idx) const; + /* --------------------------------------------------------------------- */ + + op_ptr_t get_ptr(size_t idx); + op_ptr_c_t get_ptr(size_t idx) const; + /* --------------------------------------------------------------------- */ + + FunctionalOpSequence::fun_op_ptr_t get_untyped_ptr(size_t idx); + FunctionalOpSequence::fun_op_ptr_c_t get_untyped_ptr(size_t idx) const; + /* --------------------------------------------------------------------- */ + + name_op_v_t gen_name_op_pairs(); + name_op_v_c_t gen_name_op_pairs() const; + /* --------------------------------------------------------------------- */ + + name_op_v_t& add_name_op_pairs(name_op_v_t& pairs); + name_op_v_c_t& add_name_op_pairs(name_op_v_c_t& pairs) const; + /* 
--------------------------------------------------------------------- */ + + name_op_pair_t gen_name_op_pair(size_t idx); + name_op_pair_c_t gen_name_op_pair(size_t idx) const; + /* --------------------------------------------------------------------- */ + + std::string to_string(const Corpus2::Tagset& tagset) const; + /* --------------------------------------------------------------------- */ + + OpSequence(const OpSequence<T>& seq); + /* --------------------------------------------------------------------- */ + + shared_ptr<OpSequence<T> > clone() const; + /* --------------------------------------------------------------------- */ + + protected: + std::ostream& write_to(std::ostream& os) const; + virtual OpSequence<T>* clone_internal() const; + }; // OpSequence +} + +using namespace boost; +using namespace Wccl; +using namespace std; + +#endif /* SWIG_LIBWCCL_FUNCTIONALOPSEQUENCE_I */ diff --git a/swig/libccllexicon.i b/swig/libccllexicon.i new file mode 100644 index 0000000000000000000000000000000000000000..69cc3240af27e9aacfb7303fa37c8b0e7d1611fa --- /dev/null +++ b/swig/libccllexicon.i @@ -0,0 +1,69 @@ +#ifndef SWIG_LIBWCCL_LEXICON_I +#define SWIG_LIBWCCL_LEXICON_I + +%module libccllexicon +%{ + #include <libwccl/lexicon/lexicon.h> + + #include <libcorpus2/lexeme.h> + #include <boost/unordered_map.hpp> +%} + +%include "libcclstrset.i" + +%include "std_string.i" +%include "boost_shared_ptr.i" + +%rename(map_t) boost::unordered_map<UnicodeString, UnicodeString>; + +%nodefaultctor Wccl::Lexicon; + +namespace Wccl { + class Lexicon { + public: + typedef boost::unordered_map<UnicodeString, UnicodeString> map_t; + + /* --------------------------------------------------------------------- */ + + Lexicon(const std::string& name, const std::string& file_name); + + /* --------------------------------------------------------------------- */ + + const UnicodeString& translate(const UnicodeString& key) const; + // boost::shared_ptr<StrSet> translate(const StrSet& set) 
const; + + // TODO + // std::string translate_utf8(const std::string&); + + /* --------------------------------------------------------------------- */ + + std::string name() const; + std::string file_name() const; + + /* --------------------------------------------------------------------- */ + + bool has_key(const UnicodeString& key) const; + + // TODO + // bool has_key_utf8(const std::string& key) const + + /* --------------------------------------------------------------------- */ + + void insert(const UnicodeString& key, const UnicodeString& value); + void insert(const UnicodeString& key); + + // TODO + // void insert_utf8(const std::string& key, const std::string& value); + // void insert_utf8(const std::string& key); + + /* --------------------------------------------------------------------- */ + + const map_t& map() const; + }; +} + +using namespace boost; +using namespace Wccl; +using namespace std; + +#endif /* SWIG_LIBWCCL_LEXICON_I */ diff --git a/swig/libccllexicons.i b/swig/libccllexicons.i new file mode 100644 index 0000000000000000000000000000000000000000..a25137943d6119590c2f6489a1ede13572b0ce65 --- /dev/null +++ b/swig/libccllexicons.i @@ -0,0 +1,48 @@ +#ifndef SWIG_LIBWCCL_LEXICONS_I +#define SWIG_LIBWCCL_LEXICONS_I + +%module libccllexicons +%{ + #include <libwccl/lexicon/lexicons.h> +%} + +%include "libccllexicon.i" + +%include "std_string.i" +%include "boost_shared_ptr.i" + +%rename(map_t) boost::unordered_map<std::string, boost::shared_ptr<Lexicon> >; + +namespace Wccl { + class Lexicons { + public: + typedef unordered_map<std::string, shared_ptr<Lexicon> > map_t; + + /* --------------------------------------------------------------------- */ + + Lexicons(); + + /* --------------------------------------------------------------------- */ + + bool has_lexicon(const std::string& name) const; + + /* --------------------------------------------------------------------- */ + + const Lexicon& get(const std::string& name) const; + shared_ptr<const 
Lexicon> get_ptr(const std::string& name) const; + + /* --------------------------------------------------------------------- */ + + void insert(const shared_ptr<Lexicon>& lexicon); + + /* --------------------------------------------------------------------- */ + + const map_t& get_lexicons() const; + }; +} + +using namespace boost; +using namespace Wccl; +using namespace std; + +#endif /* SWIG_LIBWCCL_LEXICONS_I */ diff --git a/swig/libcclmatch.i b/swig/libcclmatch.i index 41ba586bd2fc24102b69d20a9bc29dc212d2081d..f64cefe2c8b71ad4ee145af6703b441a6d2d12ca 100644 --- a/swig/libcclmatch.i +++ b/swig/libcclmatch.i @@ -3,15 +3,23 @@ %module libcclbool %{ - #include <libwccl/values/value.h> #include <libwccl/values/match.h> %} %include "libcclvalue.i" +%include "libcclmatchdata.i" +%include "libccltokenmatch.i" +%include "libcclannotationmatch.i" +%include "libcclmatchvector.i" +%include "libcorpus/libcorpusannotatedsentence.i" + %include "std_string.i" +// %include "boost_shared_ptr.i" %feature("notabstract") Wccl::Match; +// %template (MatchPtr) boost::shared_ptr<Wccl::Match>; + namespace Wccl { class Match : public Value { public: @@ -20,12 +28,37 @@ namespace Wccl { std::string make_var_repr(const std::string &var_name) const { return var_repr(var_name); } + /* --------------------------------------------------------------------- */ Match(); + /* --------------------------------------------------------------------- */ + + Match(const boost::shared_ptr<MatchData>& data); + Match(const boost::shared_ptr<TokenMatch>& data); + Match(const boost::shared_ptr<AnnotationMatch>& data); + Match(const boost::shared_ptr<MatchVector>& data); + /* --------------------------------------------------------------------- */ + + Match(const MatchData& data); + Match(const Match& match); + /* --------------------------------------------------------------------- */ + + %rename(OperatorEqMatch) *::operator=(const Match& other); + /* 
--------------------------------------------------------------------- */ + + // const MatchData& get_value() const; + MatchData& get_value(); + /* --------------------------------------------------------------------- */ + + void set_value(const MatchData& m); + /* --------------------------------------------------------------------- */ + + bool empty() const; + /* --------------------------------------------------------------------- */ - // TODO - // virtual Position first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const; - // virtual Position last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const; + int first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const; + int last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const; + /* --------------------------------------------------------------------- */ std::string to_raw_string() const; }; diff --git a/swig/libcclmatchdata.i b/swig/libcclmatchdata.i index 08893106f536fa2416731fcb79d718d68c639d91..f25d53a5b15e14c4282d9bbf9c324e6bbbdad7b3 100644 --- a/swig/libcclmatchdata.i +++ b/swig/libcclmatchdata.i @@ -6,34 +6,42 @@ #include <libwccl/values/matchdata.h> %} -%include "boost_shared_ptr.i" %include "libcclmatch.i" %include "libcorpus/libcorpusannotatedsentence.i" -// %template(MatchDataPtr) boost::shared_ptr<Wccl::MatchData>; +%include "std_string.i" +%include "boost_shared_ptr.i" + +%nodefaultctor Wccl::MatchData; + +%template(MatchDataPtr) boost::shared_ptr<Wccl::MatchData>; namespace Wccl { class MatchData { public: - ~MatchData(); + virtual bool empty() const = 0; /* --------------------------------------------------------------------- */ - virtual bool empty() const = 0; - virtual Position first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const = 0; - virtual Position last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const = 0; + virtual int first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const = 0; 
+ virtual int last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const = 0; /* --------------------------------------------------------------------- */ + // virtual const boost::shared_ptr<const Match> submatch(size_t) const; virtual const boost::shared_ptr<Match>& submatch(size_t); /* --------------------------------------------------------------------- */ boost::shared_ptr<MatchData> clone() const; + + /* --------------------------------------------------------------------- */ + + virtual std::string to_raw_string() const = 0; /* --------------------------------------------------------------------- */ - virtual std::string to_raw_string() const = 0; + virtual ~MatchData() {} }; } diff --git a/swig/libcclmatchrule.i b/swig/libcclmatchrule.i new file mode 100644 index 0000000000000000000000000000000000000000..f2ca44c1761436396a7553bfe05e7611e97b73ce --- /dev/null +++ b/swig/libcclmatchrule.i @@ -0,0 +1,55 @@ +#ifndef SWIG_LIBWCCL_MATCHRULE_I +#define SWIG_LIBWCCL_MATCHRULE_I + +%module libcclmatchrule +%{ + #include <libwccl/ops/matchrule.h> +%} + +%include "libcclvariables.i" +%include "libcclparsedexpression.i" +%include "libcorpus/libcorpusannotatedsentence.i" + +%include "std_string.i" +%include "boost_shared_ptr.i" + +namespace Wccl { + class MatchRule : public ParsedExpression { + public: + MatchRule( + const Variables& variables, + const boost::shared_ptr<ApplyOperator>& apply + ); + + MatchRule(const MatchRule& other, bool clean = false); + MatchRule(); + + /* --------------------------------------------------------------------- */ + + %rename(OpFunMatchRule) operator()(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s); + void operator()(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s); + + void apply(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s); + + /* --------------------------------------------------------------------- */ + + MatchRule clone() const; + MatchRule clone_clean() const; + + boost::shared_ptr<MatchRule> 
clone_ptr() const; + boost::shared_ptr<MatchRule> clone_clean_ptr() const; + + /* --------------------------------------------------------------------- */ + + %rename(OpEqMatchRule) *::operator=(const MatchRule& other); + + /* --------------------------------------------------------------------- */ + + std::string to_string(const Corpus2::Tagset& tagset) const; + }; +} + +using namespace std; +using namespace Wccl; + +#endif /* SWIG_LIBWCCL_MATCHRULE_I */ diff --git a/swig/libcclmatchrulesequence.i b/swig/libcclmatchrulesequence.i new file mode 100644 index 0000000000000000000000000000000000000000..590bd907df27b48518fa7a5bde173691b3d52d0f --- /dev/null +++ b/swig/libcclmatchrulesequence.i @@ -0,0 +1,44 @@ +#ifndef SWIG_LIBWCCL_MATCHRULESEQUENCE_I +#define SWIG_LIBWCCL_MATCHRULESEQUENCE_I + +%module libcclmatchrulesequence +%{ + #include <libwccl/ops/matchrulesequence.h> +%} + +%include "libcclmatchrule.i" +%include "libcclexpression.i" +%include "libcorpus/libcorpusannotatedsentence.i" + +%include "std_string.i" +%include "std_vector.i" +%include "boost_shared_ptr.i" + +%template(MatchRuleVector) std::vector<MatchRule>; + +namespace Wccl { + class MatchRuleSequence : public std::vector<MatchRule>, public Expression { + public: + MatchRuleSequence(); + MatchRuleSequence(const std::vector<MatchRule>& rules); + + /* --------------------------------------------------------------------- */ + + %rename(OpFunMatchRuleSequence) operator()(const shared_ptr<Corpus2::AnnotatedSentence>& sentence); + void operator()(const shared_ptr<Corpus2::AnnotatedSentence>& sentence); + + /* --------------------------------------------------------------------- */ + + void apply_all(const shared_ptr<Corpus2::AnnotatedSentence>& sentence); + + /* --------------------------------------------------------------------- */ + + std::string to_string(const Corpus2::Tagset& tagset) const; + }; +} + +using namespace boost; +using namespace Wccl; +using namespace std; + +#endif /* 
SWIG_LIBWCCL_MATCHRULESEQUENCE_I */ diff --git a/swig/libcclmatchvector.i b/swig/libcclmatchvector.i new file mode 100644 index 0000000000000000000000000000000000000000..f5fa3bba7ce26a4541fa48443762749615a15c98 --- /dev/null +++ b/swig/libcclmatchvector.i @@ -0,0 +1,56 @@ +#ifndef SWIG_LIBWCCL_MATCHVECTOR_I +#define SWIG_LIBWCCL_MATCHVECTOR_I + +%module libcclmatchvector +%{ + #include <libwccl/values/matchvector.h> +%} + +%include "libcclmatch.i" +%include "libcclmatchdata.i" +%include "libccltokenmatch.i" +%include "libcclannotationmatch.i" +%include "libcorpus/libcorpusannotatedsentence.i" + +%include "std_string.i" +%include "boost_shared_ptr.i" + +%template(MatchVectorPtr) boost::shared_ptr<Wccl::MatchVector>; + +namespace Wccl { + class MatchVector { + public: + MatchVector(); + ~MatchVector(); + /* --------------------------------------------------------------------- */ + + bool empty() const; + /* --------------------------------------------------------------------- */ + + int first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const; + int last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>& s) const; + /* --------------------------------------------------------------------- */ + + std::string to_raw_string() const; + /* --------------------------------------------------------------------- */ + + void append(const boost::shared_ptr<Match>& m); + void append(const boost::shared_ptr<MatchData>& m); + void append(const boost::shared_ptr<MatchVector>& m); + void append(const boost::shared_ptr<TokenMatch>& m); + void append(const boost::shared_ptr<AnnotationMatch>& m); + /* --------------------------------------------------------------------- */ + + size_t size() const; + void clear(); + /* --------------------------------------------------------------------- */ + + const boost::shared_ptr<Match>& submatch(size_t idx); + // const boost::shared_ptr<const Match> submatch(size_t idx) const; + }; +} + +using namespace std; +using 
namespace Wccl; + +#endif /* SWIG_LIBWCCL_MATCHVECTOR_I */ diff --git a/swig/libccloperator.i b/swig/libccloperator.i index 609d91715a35d2ea7b424eb7232dac9418ac2b7a..6f32a56533de816ee857c9555ac9110002eec511 100644 --- a/swig/libccloperator.i +++ b/swig/libccloperator.i @@ -21,31 +21,33 @@ %feature("notabstract") Wccl::Operator; -%rename(__op_cop__) Wccl::Operator::operator=(const Operator& other); -%rename(__op_fun__) Wccl::Operator::operator()(const SentenceContext& sentence_context); - namespace Wccl { template <class T> class Operator : public FunctionalOperator { public: Operator(); - Operator& operator=(const Operator& other); -/* - Operator(const boost::shared_ptr<const Function<T> >& body, const Variables& variables); -*/ - Operator(const Operator& other, bool clean = false); + /* --------------------------------------------------------------------- */ + %rename(OperatorEq) *::operator=(const Operator& other); + + /* --------------------------------------------------------------------- */ + %rename(OpFun) operator()(const SentenceContext& sentence_context); boost::shared_ptr<const T> operator()(const SentenceContext& sentence_context); + + /* --------------------------------------------------------------------- */ boost::shared_ptr<const T> apply(const SentenceContext& sentence_context); boost::shared_ptr<T> copy_apply(const SentenceContext& sentence_context); boost::shared_ptr<const Value> base_apply(const SentenceContext& sc); + /* --------------------------------------------------------------------- */ Operator clone() const; Operator clone_clean() const; + /* --------------------------------------------------------------------- */ boost::shared_ptr<Operator<T> > clone_ptr() const; boost::shared_ptr<Operator<T> > clone_clean_ptr() const; + /* --------------------------------------------------------------------- */ std::string to_string(const Corpus2::Tagset& tagset) const; protected: @@ -58,10 +60,10 @@ namespace Wccl { %template (StrSetOperator) 
Operator<StrSet>; %template (PositionOperator) Operator<Position>; - %template (SharedPtrBoolOperator) boost::shared_ptr<Operator<Bool> >; - %template (SharedPtrTSetOperator) boost::shared_ptr<Operator<TSet> >; - %template (SharedPtrStrSetOperator) boost::shared_ptr<Operator<StrSet> >; - %template (SharedPtrPositionOperator) boost::shared_ptr<Operator<Position> >; + %template (BoolOperatorPtr) boost::shared_ptr<Operator<Bool> >; + %template (TSetOperatorPtr) boost::shared_ptr<Operator<TSet> >; + %template (StrSetOperatorPtr) boost::shared_ptr<Operator<StrSet> >; + %template (PositionOperatorPtr) boost::shared_ptr<Operator<Position> >; } using namespace boost; diff --git a/swig/libcclparsedexpression.i b/swig/libcclparsedexpression.i index 12e3cfb6a688d4e03c1acf41b0d82e671dac93bb..f6a4e879a0a858ee09cdb4ce5aa1782e6e86dd97 100644 --- a/swig/libcclparsedexpression.i +++ b/swig/libcclparsedexpression.i @@ -18,13 +18,15 @@ %include "std_string.i" %include "boost_shared_ptr.i" -%rename(__op_get__) Wccl::ParsedExpression::operator[](const std::string& var_name) const; +%template (ParsedExpressionPtr) boost::shared_ptr<Wccl::ParsedExpression>; namespace Wccl { class ParsedExpression : public Expression { public: + %rename(Getvalue) operator[](const std::string& var_name) const; const Value& operator[](const std::string& var_name) const; - + + /* --------------------------------------------------------------------- */ template<class T> T& get(const std::string& var_name); %template(get_bool) get<Bool>; %template(get_tset) get<TSet>; @@ -32,6 +34,7 @@ namespace Wccl { %template(get_position) get<Position>; %template(get_match) get<Match>; + /* --------------------------------------------------------------------- */ template<class T> void set(const std::string& var_name, const T& value); %template(set_bool) set<Bool>; %template(set_tset) set<TSet>; @@ -39,9 +42,14 @@ namespace Wccl { %template(set_position) set<Position>; %template(set_match) set<Match>; + /* 
--------------------------------------------------------------------- */ void clean(); + + /* --------------------------------------------------------------------- */ boost::shared_ptr<ParsedExpression> clone_ptr() const; boost::shared_ptr<ParsedExpression> clone_clean_ptr() const; + + /* --------------------------------------------------------------------- */ std::string variables_string(const Corpus2::Tagset& tagset) const; std::ostream& dump_variables(std::ostream& ostream, const Corpus2::Tagset& tagset) const; diff --git a/swig/libcclparser.i b/swig/libcclparser.i index fc2284bbf6efe82dcca453060980e12062fce2c2..658d57a658cdced8a5b5f4fac2c6599a5d9603ca 100644 --- a/swig/libcclparser.i +++ b/swig/libcclparser.i @@ -6,21 +6,140 @@ #include <libwccl/parser/Parser.h> %} -%include "libcorpustagsetmanager.i" +%include "libccltagrule.i" +%include "libccltagrulesequence.i" +%include "libcclmatchrule.i" +%include "libcclwcclfile.i" +%include "libcclfunctionaloperator.i" + +%include "libcorpus/libcorpustagset.i" %include "std_string.i" %include "std_vector.i" +%include "boost_shared_ptr.i" namespace Wccl { class Parser { public: - Parser(const Corpus2::Tagset&); - ~TagsetManager(); + explicit Parser(const Corpus2::Tagset&); + explicit Parser(const std::string& tagset_name); /* Second constructor: takes a tagset name instead of a Tagset reference. */ + ~Parser(); + + /* --------------------------------------------------------------------- */ /* Every parse* declaration in this class is preceded by the same unnamed %exception block: it wraps the call ($action) in try/catch and reports a PwrNlp::PwrNlpError as a Python IndexError whose message is e.info(). The handler uses the Python C API (PyErr_SetString), so it is specific to the Python target. NOTE(review): per the SWIG manual an unnamed %exception stays in effect until it is redefined or cleared with "%exception;", and it is never cleared in this file, so it presumably also applies to tagset() and to declarations in interface files processed after this one -- confirm this is intended. */ + + %exception { + try { + $action + } catch (PwrNlp::PwrNlpError &e) { + PyErr_SetString(PyExc_IndexError, e.info().c_str()); + return NULL; + } + } + shared_ptr<Operator<StrSet> > + parseStringOperator (const std::string& operator_string) const; + + %exception { + try { + $action + } catch (PwrNlp::PwrNlpError &e) { + PyErr_SetString(PyExc_IndexError, e.info().c_str()); + return NULL; + } + } + shared_ptr<Operator<Bool> > + parseBoolOperator (const std::string& operator_string) const; + + %exception { + try { + $action + } catch (PwrNlp::PwrNlpError &e) { + PyErr_SetString(PyExc_IndexError, e.info().c_str()); 
+ return NULL; + } + } + shared_ptr<Operator<TSet> > + parseSymSetOperator (const std::string& operator_string) const; + + %exception { + try { + $action + } catch (PwrNlp::PwrNlpError &e) { + PyErr_SetString(PyExc_IndexError, e.info().c_str()); + return NULL; + } + } + shared_ptr<Operator<Position> > + parsePositionOperator(const std::string& operator_string) const; + + /* --------------------------------------------------------------------- */ + + %exception { + try { + $action + } catch (PwrNlp::PwrNlpError &e) { + PyErr_SetString(PyExc_IndexError, e.info().c_str()); + return NULL; + } + } + shared_ptr<FunctionalOperator> + parseAnyOperator(const std::string& operator_string) const; + + /* --------------------------------------------------------------------- */ + + %exception { + try { + $action + } catch (PwrNlp::PwrNlpError &e) { + PyErr_SetString(PyExc_IndexError, e.info().c_str()); + return NULL; + } + } + shared_ptr<TagRuleSequence> + parseTagRuleSequence(const std::string& rule_string) const; + + %exception { + try { + $action + } catch (PwrNlp::PwrNlpError &e) { + PyErr_SetString(PyExc_IndexError, e.info().c_str()); + return NULL; + } + } + shared_ptr<TagRule> + parseSingleRule(const std::string& rule_string) const; + + %exception { + try { + $action + } catch (PwrNlp::PwrNlpError &e) { + PyErr_SetString(PyExc_IndexError, e.info().c_str()); + return NULL; + } + } + shared_ptr<MatchRule> + parseMatchRule(const std::string& rule_string) const; /* --------------------------------------------------------------------- */ + + %exception { + try { + $action + } catch (PwrNlp::PwrNlpError &e) { + PyErr_SetString(PyExc_IndexError, e.info().c_str()); + return NULL; + } + } + shared_ptr<WcclFile> parseWcclFile( + const std::string& file_contents_string, + const std::string& search_path = ".") const; + + /* --------------------------------------------------------------------- */ + + const Corpus2::Tagset& tagset() const; }; } +using namespace boost; using namespace 
std; using namespace Corpus2; diff --git a/swig/libcclsentencecontext.i b/swig/libcclsentencecontext.i index b1bb4459e9bbe96460fbf1037e28be6dabd136e0..e3202d8ac16261c70f06ed60126b3411dbfbe0d2 100644 --- a/swig/libcclsentencecontext.i +++ b/swig/libcclsentencecontext.i @@ -6,9 +6,9 @@ #include <libwccl/sentencecontext.h> %} -%include "libcorpustoken.i" -%include "libcorpuslexeme.i" -%include "libcorpussentence.i" +%include "libcorpus/libcorpustoken.i" +%include "libcorpus/libcorpuslexeme.i" +%include "libcorpus/libcorpussentence.i" %include "libcclposition.i" diff --git a/swig/libccltagaction.i b/swig/libccltagaction.i new file mode 100644 index 0000000000000000000000000000000000000000..7d0e477a1f001c05bfbbb8b5fd5f11f853e95a12 --- /dev/null +++ b/swig/libccltagaction.i @@ -0,0 +1,28 @@ +#ifndef SWIG_LIBWCCL_TAGACTION_I +#define SWIG_LIBWCCL_TAGACTION_I + +%module libccltagaction +%{ + #include <libwccl/ops/tagaction.h> +%} + +%include "libcclbool.i" +%include "libcclexpression.i" +%include "libcclactionexeccontext.i" + +%include "std_string.i" + +%nodefaultctor Wccl::TagAction; /* Suppresses generation of a default-constructor wrapper for TagAction, whose two declared members are pure virtual. */ + +namespace Wccl { /* SWIG declaration of the Wccl::TagAction interface, derived from Expression: name() returns a string and execute(context) returns a Bool for a given ActionExecContext. */ + class TagAction : public Expression { + public: + virtual std::string name() const = 0; + virtual Bool execute(const ActionExecContext& context) const = 0; + }; +} + +using namespace std; +using namespace Wccl; + +#endif /* SWIG_LIBWCCL_TAGACTION_I */ diff --git a/swig/libccltagrule.i b/swig/libccltagrule.i new file mode 100644 index 0000000000000000000000000000000000000000..b694be1a89c4285d6d3368cf0bb72ad21d9d04c2 --- /dev/null +++ b/swig/libccltagrule.i @@ -0,0 +1,74 @@ +#ifndef SWIG_LIBWCCL_TAGRULE_I +#define SWIG_LIBWCCL_TAGRULE_I + +%module libccltagrule +%{ + #include <libwccl/ops/tagrule.h> +%} + +%include "libcclbool.i" +%include "libccltagaction.i" +%include "libcclvariables.i" +%include "libcclsentencecontext.i" +%include "libcclparsedexpression.i" + +%include "libcorpus/libcorpustagset.i" + +%include "std_string.i" +%include "boost_shared_ptr.i" + 
+%template(TagRulePtr) boost::shared_ptr<Wccl::TagRule>; + +namespace Wccl { /* SWIG declaration of Wccl::TagRule, a ParsedExpression that is invoked on a SentenceContext through operator() or execute() and returns a Bool. */ + class TagRule : public ParsedExpression { + public: + /* + TagRule( + const std::string& name, + const Variables& variables, + const boost::shared_ptr<const std::vector<boost::shared_ptr<TagAction> > >& actions, + const boost::shared_ptr<const Function<Bool> >& condition = TrueCondition() + ); + */ + + TagRule(); + TagRule(const TagRule& other, bool clean = false); + + /* --------------------------------------------------------------------- */ + + %rename(OpFunTagRule) operator()(SentenceContext& sentence_context); /* The parameter list must match the declaration that follows (non-const reference): SWIG only applies a parameterised %rename to a declaration with the same parameter types. */ + Bool operator()(SentenceContext& sentence_context); + /* --------------------------------------------------------------------- */ + + Bool execute(SentenceContext& sentence_context); + + std::string name() const; + /* --------------------------------------------------------------------- */ + + TagRule clone() const; + TagRule clone_clean() const; + /* --------------------------------------------------------------------- */ + + shared_ptr<TagRule> clone_ptr() const; + shared_ptr<TagRule> clone_clean_ptr() const; + /* --------------------------------------------------------------------- */ + + %rename(OperatorEqTagRule) *::operator=(const TagRule& other); + /* --------------------------------------------------------------------- */ + + std::string to_string(const Corpus2::Tagset& tagset) const; + /* --------------------------------------------------------------------- */ + + protected: + TagRule* clone_internal() const; + + private: + // static const boost::shared_ptr<const Function<Bool> > TrueCondition(); + }; +} + +using namespace boost; +using namespace Wccl; +using namespace std; + +#endif /* SWIG_LIBWCCL_TAGRULE_I */ diff --git a/swig/libccltagrulesequence.i b/swig/libccltagrulesequence.i new file mode 100644 index 0000000000000000000000000000000000000000..f85f04f721dfb713c4eac05425171b25653648ed --- /dev/null +++ b/swig/libccltagrulesequence.i @@ -0,0 +1,47 @@ 
+#ifndef SWIG_LIBWCCL_TAGRULESEQUENCE_I +#define SWIG_LIBWCCL_TAGRULESEQUENCE_I + +%module libccltagrulesequence +%{ + #include <libwccl/ops/tagrulesequence.h> +%} + +%include "libcclbool.i" +%include "libccltagrule.i" +%include "libcclexpression.i" +%include "libcorpus/libcorpussentence.i" + +%include "std_string.i" +%include "std_vector.i" +%include "boost_shared_ptr.i" + +%template(StdVectorTagRule) std::vector<TagRule>; +%template(TagRuleSequencPtr) boost::shared_ptr<Wccl::TagRuleSequence>; /* NOTE(review): "TagRuleSequencPtr" looks like a misspelling of TagRuleSequencePtr (compare TagRulePtr in libccltagrule.i); renaming it changes the exported wrapper name, so confirm with users of the bindings before fixing. */ + +namespace Wccl { /* SWIG declaration of Wccl::TagRuleSequence: a std::vector<TagRule> that is also an Expression and can be run on a Corpus2::Sentence via operator(), execute_once() or execute_until_done(). */ + class TagRuleSequence : public std::vector<TagRule>, public Expression { + public: + TagRuleSequence(const std::vector<TagRule>& rules); + TagRuleSequence(); + + /* --------------------------------------------------------------------- */ + + %rename(OpFunTagRuleSequence) operator()( + const shared_ptr<Corpus2::Sentence>& sentence + ); + Bool operator()(const shared_ptr<Corpus2::Sentence>& sentence); + + /* --------------------------------------------------------------------- */ + + Bool execute_once(const shared_ptr<Corpus2::Sentence>& sentence); + int execute_until_done(const shared_ptr<Corpus2::Sentence>& sentence, int max_iter = 1000); + + std::string to_string(const Corpus2::Tagset& tagset) const; + }; +} + +using namespace boost; +using namespace Wccl; +using namespace std; + +#endif /* SWIG_LIBWCCL_TAGRULESEQUENCE_I */ diff --git a/swig/libccltokenmatch.i b/swig/libccltokenmatch.i index 32757f86b051bc116a1cdb2e870c7432cd287ff0..a4045e958103bf88f380027de1ae250c65340735 100644 --- a/swig/libccltokenmatch.i +++ b/swig/libccltokenmatch.i @@ -6,7 +6,6 @@ #include <libwccl/values/tokenmatch.h> %} -%include "libcclposition.i" %include "libcclmatchdata.i" %include "libcorpus/libcorpusannotatedsentence.i" @@ -15,15 +14,14 @@ namespace Wccl { class TokenMatch : public MatchData { public: - explicit TokenMatch(Position position); explicit TokenMatch(int pos); - + /* --------------------------------------------------------------------- */ bool empty() const; 
/* --------------------------------------------------------------------- */ - Position first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const; - Position last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const; + int first_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const; + int last_token(const boost::shared_ptr<Corpus2::AnnotatedSentence>&) const; /* --------------------------------------------------------------------- */ std::string to_raw_string() const; @@ -33,4 +31,5 @@ namespace Wccl { using namespace std; using namespace Wccl; + #endif /* SWIG_LIBWCCL_TOKENMATCH_I */ diff --git a/swig/libcclvariables.i b/swig/libcclvariables.i index 24d6d7493c1c93a181523a14ecc299face003d5f..3048feb24eda8137896233e485f90eae4da0f015 100644 --- a/swig/libcclvariables.i +++ b/swig/libcclvariables.i @@ -16,6 +16,21 @@ %include "std_string.i" %include "boost_shared_ptr.i" +%template(ValuePtr) boost::shared_ptr<Wccl::Value>; +%template(BoolPtr) boost::shared_ptr<Wccl::Bool>; +%template(PositionPtr) boost::shared_ptr<Wccl::Position>; +%template(StrSetPtr) boost::shared_ptr<Wccl::StrSet>; +%template(TSetPtr) boost::shared_ptr<Wccl::TSet>; +%template(MatchPtr) boost::shared_ptr<Wccl::Match>; +%template(VariablesPtr) boost::shared_ptr<Wccl::Variables>; +%template(ConstValuePtr) boost::shared_ptr<Wccl::Value const>; +%template(ConstBoolPtr) boost::shared_ptr<Wccl::Bool const>; +%template(ConstPositionPtr) boost::shared_ptr<Wccl::Position const>; +%template(ConstStrSetPtr) boost::shared_ptr<Wccl::StrSet const>; +%template(ConstTSetPtr) boost::shared_ptr<Wccl::TSet const>; +%template(ConstMatchPtr) boost::shared_ptr<Wccl::Match const>; +%template(ConstVariablesPtr) boost::shared_ptr<Wccl::Variables const>; + namespace Wccl { /* ----------------------------------------------------------------------- */ /* Helper detail class */ @@ -48,13 +63,13 @@ namespace Wccl { /* ----------------------------------------------------------------------- */ 
/* Variables */ - class Variables : + class Variables /*: detail::Vmap<Value>, detail::Vmap<Bool>, detail::Vmap<Position>, detail::Vmap<StrSet>, detail::Vmap<TSet>, - detail::Vmap<Match> + detail::Vmap<Match> */ { public: Variables(); @@ -153,8 +168,6 @@ namespace Wccl { }; } -%template(VariablesSharedPtr) boost::shared_ptr<Wccl::Variables>; - using namespace boost; using namespace std; using namespace Wccl; diff --git a/swig/libcclwcclfile.i b/swig/libcclwcclfile.i new file mode 100644 index 0000000000000000000000000000000000000000..81dda6a08462f88622e372e96866c2a00333203f --- /dev/null +++ b/swig/libcclwcclfile.i @@ -0,0 +1,261 @@ +#ifndef SWIG_LIBWCCL_WCCLFILE_I +#define SWIG_LIBWCCL_WCCLFILE_I + +%module libcclwcclfile +%{ + #include <libwccl/wcclfile.h> +%} + +%include "libcclbool.i" +%include "libccltset.i" +%include "libcclmatch.i" +%include "libcclstrset.i" +%include "libcclposition.i" +%include "libccllexicon.i" +%include "libccllexicons.i" +%include "libccltagrulesequence.i" +%include "libcclmatchrulesequence.i" +%include "libcclfunctionaloperator.i" +%include "libcclwcclfileopsections.i" +%include "libcclfunctionalopsequence.i" + +%include "libcorpus/libcorpustagset.i" + +%include "std_string.i" +%include "std_vector.i" +%include "boost_shared_ptr.i" + +%template(UntypedOpSequencePtrVector) std::vector<boost::shared_ptr<Wccl::UntypedOpSequence> >; +%template(BoolOpSequencePtrVector) std::vector<boost::shared_ptr<Wccl::OpSequence<Wccl::Bool> > >; +%template(StrSetOpSequencePtrVector) std::vector<boost::shared_ptr<Wccl::OpSequence<Wccl::StrSet> > >; +%template(TSetOpSequencePtrVector) std::vector<boost::shared_ptr<Wccl::OpSequence<Wccl::TSet> > >; +%template(PositionOpSequencePtrVector) std::vector<boost::shared_ptr<Wccl::OpSequence<Wccl::Position> > >; +%template(MatchOpSequencePtrVector) std::vector<boost::shared_ptr<Wccl::OpSequence<Wccl::Match> > >; + +%nodefaultctor Wccl::WcclFile; /* Suppresses generation of a default-constructor wrapper; the only constructor declared below takes a tagset and a search path. */ + +%template(OpSequenceStrSet) 
Wccl::WcclFileOpSections<Wccl::OpSequence<Wccl::StrSet> >; /* NOTE(review): libcclwcclfileopsections.i, included above, instantiates this same template as StrSetOpSequence -- confirm that a second name for it is needed. */ + +namespace Wccl { /* SWIG declaration of Wccl::WcclFile. Only the WcclFileOpSections<UntypedOpSequence> base is declared (the typed OpSequence bases are commented out); typed access goes through the member templates declared in the class and their %template instantiations for Bool, TSet, StrSet, Position and Match. The get_op<T> instantiations and the shared_ptr-taking add/set members are commented out. */ + class WcclFile + : public WcclFileOpSections<UntypedOpSequence> /*, + WcclFileOpSections<OpSequence<StrSet> >, + WcclFileOpSections<OpSequence<TSet> >, + WcclFileOpSections<OpSequence<Bool> >, + WcclFileOpSections<OpSequence<Position> >, + WcclFileOpSections<OpSequence<Match> > + */ { + public: + WcclFile(const Corpus2::Tagset& tagset, const std::string& search_path); + + /* --------------------------------------------------------------------- */ + /* --------------------------------------------------------------------- */ + /* --------------------------------------------------------------------- */ + + /* + const std::vector<shared_ptr<UntypedOpSequence> >& untyped_sections(); + */ + + /* + template<class T> const typename std::vector<shared_ptr<OpSequence<T> > >& sections(); + %template(sections_bool) sections<Bool>; + %template(sections_tset) sections<TSet>; + %template(sections_strset) sections<StrSet>; + %template(sections_position) sections<Position>; + %template(sections_match) sections<Match>; + */ + + bool has_untyped_section(const std::string& name) const; + + template<class T> bool has_section(const std::string& name) const; + %template(has_section_bool) has_section<Bool>; + %template(has_section_tset) has_section<TSet>; + %template(has_section_strset) has_section<StrSet>; + %template(has_section_position) has_section<Position>; + %template(has_section_match) has_section<Match>; + + /* --------------------------------------------------------------------- */ + + std::vector<std::string> untyped_section_names() const; + + template<class T> std::vector<std::string> section_names() const; + %template(section_names_bool) section_names<Bool>; + %template(section_names_tset) section_names<TSet>; + %template(section_names_strset) section_names<StrSet>; + %template(section_names_position) section_names<Position>; + %template(section_names_match) section_names<Match>; + + /* 
--------------------------------------------------------------------- */ + + // const UntypedOpSequence& get_untyped_section(const std::string& name) const; + UntypedOpSequence& get_untyped_section(const std::string& name); + + /* --------------------------------------------------------------------- */ + + // template<class T> const OpSequence<T>& get_section(const std::string& name) const; + template<class T> OpSequence<T>& get_section(const std::string& name); + %template(get_section_bool) get_section<Bool>; + %template(get_section_tset) get_section<TSet>; + %template(get_section_strset) get_section<StrSet>; + %template(get_section_position) get_section<Position>; + %template(get_section_match) get_section<Match>; + + shared_ptr<UntypedOpSequence> get_untyped_section_ptr(const std::string& name); + shared_ptr<const UntypedOpSequence> get_untyped_section_ptr(const std::string& name) const; + + /* --------------------------------------------------------------------- */ + + // template<class T> shared_ptr<const OpSequence<T> > get_section_ptr(const std::string& name) const; + template<class T> shared_ptr<OpSequence<T> > get_section_ptr(const std::string& name); + %template(get_section_ptr_bool) get_section_ptr<Bool>; + %template(get_section_ptr_tset) get_section_ptr<TSet>; + %template(get_section_ptr_strset) get_section_ptr<StrSet>; + %template(get_section_ptr_position) get_section_ptr<Position>; + %template(get_section_ptr_match) get_section_ptr<Match>; + + /* --------------------------------------------------------------------- */ + + // const FunctionalOperator& get_untyped_op(const std::string& name, size_t idx = 0) const; + FunctionalOperator& get_untyped_op(const std::string& name, size_t idx = 0); + + /* --------------------------------------------------------------------- */ + + // template<class T> const Operator<T>& get_op(const std::string& name, size_t idx = 0) const; + template<class T> Operator<T>& get_op(const std::string& name, size_t idx = 0); + /* 
+ %template(get_op_bool) get_op<Bool>; + %template(get_op_tset) get_op<TSet>; + %template(get_op_strset) get_op<StrSet>; + %template(get_op_position) get_op<Position>; + %template(get_op_match) get_op<Match>; + */ + + /* --------------------------------------------------------------------- */ + + // shared_ptr<const FunctionalOperator> get_untyped_op_ptr(const std::string& name, size_t idx = 0) const; + shared_ptr<FunctionalOperator> get_untyped_op_ptr(const std::string& name, size_t idx = 0); + + /* --------------------------------------------------------------------- */ + + // template<class T> shared_ptr<const Operator<T> > get_op_ptr(const std::string& name, size_t idx = 0) const; + template<class T> shared_ptr<Operator<T> > get_op_ptr(const std::string& name, size_t idx = 0); + %template(get_op_ptr_bool) get_op_ptr<Bool>; + %template(get_op_ptr_tset) get_op_ptr<TSet>; + %template(get_op_ptr_strset) get_op_ptr<StrSet>; + %template(get_op_ptr_position) get_op_ptr<Position>; + %template(get_op_ptr_match) get_op_ptr<Match>; + + /* --------------------------------------------------------------------- */ + + UntypedOpSequence::name_op_v_t gen_name_untyped_op_pairs(); + UntypedOpSequence::name_op_v_c_t gen_name_untyped_op_pairs() const; + + /* --------------------------------------------------------------------- */ + + // template<class T> typename OpSequence<T>::name_op_v_c_t gen_name_op_pairs() const; + template<class T> typename OpSequence<T>::name_op_v_t gen_name_op_pairs(); + %template(gen_name_op_pairs_bool) gen_name_op_pairs<Bool>; + %template(gen_name_op_pairs_tset) gen_name_op_pairs<TSet>; + %template(gen_name_op_pairs_strset) gen_name_op_pairs<StrSet>; + %template(gen_name_op_pairs_position) gen_name_op_pairs<Position>; + %template(gen_name_op_pairs_match) gen_name_op_pairs<Match>; + + /* --------------------------------------------------------------------- */ + + // FunctionalOpSequence::name_op_v_c_t gen_all_op_pairs() const; + 
FunctionalOpSequence::name_op_v_t gen_all_op_pairs(); + + + /* --------------------------------------------------------------------- */ + /* --------------------------------------------------------------------- */ + /* --------------------------------------------------------------------- */ + + bool has_lexicon(const std::string& name) const; + bool has_lexicons() const; + + shared_ptr<const Lexicon> get_lexicon_ptr(const std::string& name) const; + const Lexicon& get_lexicon(const std::string& name) const; + + shared_ptr<const Lexicons> get_lexicons_ptr() const; + const Lexicons& get_lexicons() const; + + /* --------------------------------------------------------------------- */ + /* --------------------------------------------------------------------- */ + /* --------------------------------------------------------------------- */ + + bool has_tag_rules() const; + + const TagRuleSequence& get_tag_rules() const; + + // shared_ptr<const TagRuleSequence> get_tag_rules_ptr() const; + shared_ptr<TagRuleSequence> get_tag_rules_ptr(); + + /* --------------------------------------------------------------------- */ + /* --------------------------------------------------------------------- */ + /* --------------------------------------------------------------------- */ + + bool has_match_rules() const; + + const MatchRuleSequence& get_match_rules() const; + + // shared_ptr<const MatchRuleSequence> get_match_rules_ptr() const; + shared_ptr<MatchRuleSequence> get_match_rules_ptr(); + + /* --------------------------------------------------------------------- */ + /* --------------------------------------------------------------------- */ + /* --------------------------------------------------------------------- */ + + const Corpus2::Tagset& tagset() const; + + /* + // const PwrNlp::PathSearcher<Wccl::FileNotFound> path() const { return path_; } + PwrNlp::PathSearcher<Wccl::FileNotFound> path(); + */ + + /* --------------------------------------------------------------------- 
*/ + /* --------------------------------------------------------------------- */ + /* --------------------------------------------------------------------- */ + + /* + // void add_untyped_section(const boost::shared_ptr<const UntypedOpSequence>& section); + void add_untyped_section(const shared_ptr<UntypedOpSequence>& section); + */ + + void add_untyped_section(const UntypedOpSequence& section); + + /* + // template<class T> void add_section(const shared_ptr<const OpSequence<T> >& section); + template<class T> void add_section(const shared_ptr<OpSequence<T> >& section); + %template(add_section_ptr_bool) add_section<Bool>; + %template(add_section_ptr_tset) add_section<TSet>; + %template(add_section_ptr_strset) add_section<StrSet>; + %template(add_section_ptr_position) add_section<Position>; + %template(add_section_ptr_match) add_section<Match>; + */ + + template<class T> void add_section(const OpSequence<T>& section); + %template(add_section_bool) add_section<Bool>; + %template(add_section_tset) add_section<TSet>; + %template(add_section_strset) add_section<StrSet>; + %template(add_section_position) add_section<Position>; + %template(add_section_match) add_section<Match>; + + /* + void import_lexicon(const boost::shared_ptr<Lexicon>& lexicon); + */ + + /* + void set_tag_rules(const shared_ptr<TagRuleSequence>& tag_rules); + */ + + /* + void set_match_rules(const shared_ptr<MatchRuleSequence>& match_rules); + */ + }; +} + +using namespace boost; +using namespace Wccl; +using namespace std; + +#endif /* SWIG_LIBWCCL_WCCLFILE_I */ diff --git a/swig/libcclwcclfileopsections.i b/swig/libcclwcclfileopsections.i new file mode 100644 index 0000000000000000000000000000000000000000..f5ca97642e8f8c51413fadbfe0af88ea86450410 --- /dev/null +++ b/swig/libcclwcclfileopsections.i @@ -0,0 +1,88 @@ +#ifndef SWIG_LIBWCCL_WCCLFILEOPSECTIONS_I +#define SWIG_LIBWCCL_WCCLFILEOPSECTIONS_I + +%module libcclwcclfileopsections +%{ + #include <libwccl/wcclfileopsections.h> +%} + +%include 
"libcclbool.i" +%include "libcclmatch.i" +%include "libcclposition.i" +%include "libcclstrset.i" +%include "libccltset.i" +%include "libcclvalue.i" +%include "libcclfunctionalopsequence.i" + +%include "std_string.i" +%include "std_vector.i" +%include "boost_shared_ptr.i" + +namespace Wccl { /* SWIG declaration of the WcclFileOpSections<T> template. Only the typedefs are public; every member function, including the constructor and append(), is declared after "protected:". NOTE(review): SWIG does not normally generate wrappers for protected members, so the instantiations at the end of this namespace presumably expose little beyond the type itself -- confirm. */ + template<class T> class WcclFileOpSections { + public: + typedef typename T::op_t op_t; + typedef typename boost::shared_ptr<op_t> op_ptr_t; + typedef typename boost::shared_ptr<const op_t> op_ptr_c_t; + typedef typename boost::shared_ptr<T> ptr_t; + typedef typename boost::shared_ptr<const T> ptr_c_t; + typedef typename std::vector<ptr_t> ptr_v_t; + typedef typename boost::unordered_map<std::string, ptr_t> map_t; + typedef typename T::name_op_v_t name_op_v_t; + typedef typename T::name_op_v_c_t name_op_v_c_t; + /* --------------------------------------------------------------------- */ + + protected: + bool has_section(const std::string& name) const; + const ptr_v_t& sections(); + /* --------------------------------------------------------------------- */ + + size_t size() const; + bool empty() const; + /* --------------------------------------------------------------------- */ + + std::vector<std::string> section_names() const; + /* --------------------------------------------------------------------- */ + + T& get_section(const std::string& name); + const T& get_section(const std::string& name) const; + /* --------------------------------------------------------------------- */ + + ptr_t get_section_ptr(const std::string& name); + ptr_c_t get_section_ptr(const std::string& name) const; + /* --------------------------------------------------------------------- */ + + op_t& get_op(const std::string& name, size_t idx = 0); + const op_t& get_op(const std::string& name, size_t idx = 0) const; + /* --------------------------------------------------------------------- */ + + op_ptr_t get_op_ptr(const std::string& name, size_t idx = 0); + op_ptr_c_t get_op_ptr(const std::string& 
name, size_t idx = 0) const; + /* --------------------------------------------------------------------- */ + + name_op_v_t& add_name_op_pairs(name_op_v_t& pairs); + name_op_v_c_t& add_name_op_pairs(name_op_v_c_t& pairs) const; + /* --------------------------------------------------------------------- */ + + name_op_v_t gen_name_op_pairs(); + name_op_v_c_t gen_name_op_pairs() const; + /* --------------------------------------------------------------------- */ + + WcclFileOpSections(); + + void append(const ptr_t& section); + /* --------------------------------------------------------------------- */ + }; + + %template(UntypedOpSequenceWcclFileOpSections) WcclFileOpSections<UntypedOpSequence>; + %template(StrSetOpSequence) WcclFileOpSections<OpSequence<StrSet> >; /* NOTE(review): this name and the four that follow read like OpSequence<T> wrappers, but each instantiates WcclFileOpSections<OpSequence<T> >; confirm the names are intended and do not collide with instantiations made in libcclfunctionalopsequence.i. */ + %template(TSetOpSequence) WcclFileOpSections<OpSequence<TSet> >; + %template(BoolOpSequence) WcclFileOpSections<OpSequence<Bool> >; + %template(PositionOpSequence) WcclFileOpSections<OpSequence<Position> >; + %template(MatchOpSequence) WcclFileOpSections<OpSequence<Match> >; +} + +using namespace std; +using namespace Wccl; + +#endif /* SWIG_LIBWCCL_WCCLFILEOPSECTIONS_I */ diff --git a/swig/wccl.i b/swig/wccl.i new file mode 100644 index 0000000000000000000000000000000000000000..9eadbc020731426842ca8b2d2e00bf575ae46813 --- /dev/null +++ b/swig/wccl.i @@ -0,0 +1,38 @@ +#ifndef SWIG_WCCL_I +#define SWIG_WCCL_I + +%module wccl +%{ + // +%} + +%include "libcclactionexeccontext.i" +%include "libcclannotationmatch.i" +%include "libcclbool.i" +%include "libcclexpression.i" +%include "libcclfunctionaloperator.i" +%include "libcclmatchdata.i" +%include "libcclmatch.i" +%include "libcclmatchrule.i" +%include "libcclmatchvector.i" +%include "libccloperator.i" +%include "libcclparsedexpression.i" +%include "libcclparser.i" +%include "libcclposition.i" +%include "libcclsentencecontext.i" +%include "libcclstrset.i" +%include "libccltagaction.i" +%include "libccltagrule.i" +%include "libccltagrulesequence.i" +%include 
"libccltokenmatch.i" +%include "libccltset.i" +%include "libcclvalue.i" +%include "libcclvariables.i" +%include "libcclfunctionalopsequence.i" +%include "libcclwcclfileopsections.i" +%include "libcclmatchrulesequence.i" +%include "libccllexicon.i" +%include "libccllexicons.i" +%include "libcclwcclfile.i" + +#endif /* SWIG_LIBWCCL_I */ diff --git a/tests/ann_op.cpp b/tests/ann_op.cpp index 3b24a010fcd54a6d59c87be9e9c2a06a3e8da6ad..5fcd11ec75b5ac0127899a6230b88ec4d6fdc34f 100644 --- a/tests/ann_op.cpp +++ b/tests/ann_op.cpp @@ -42,8 +42,8 @@ struct AnnSubFix : public Wccl::PositionFixture BOOST_FIXTURE_TEST_CASE(ann_not, AnnSubFix) { boost::shared_ptr< Constant<Match> > m0, m1; - m0.reset(new Constant<Match>(Match(TokenMatch(pos_value(0))))); - m1.reset(new Constant<Match>(Match(TokenMatch(pos_value(1))))); + m0.reset(new Constant<Match>(Match(TokenMatch(pos_value(0).get_value())))); + m1.reset(new Constant<Match>(Match(TokenMatch(pos_value(1).get_value())))); Ann a(m0, m1, "ch1"); boost::shared_ptr<const Bool> rv = a.apply(cx); BOOST_REQUIRE(rv); @@ -57,8 +57,8 @@ BOOST_FIXTURE_TEST_CASE(ann_not, AnnSubFix) BOOST_FIXTURE_TEST_CASE(ann_yes, AnnSubFix) { boost::shared_ptr< Constant<Match> > m0, m1; - m0.reset(new Constant<Match>(Match(TokenMatch(pos_value(2))))); - m1.reset(new Constant<Match>(Match(TokenMatch(pos_value(3))))); + m0.reset(new Constant<Match>(Match(TokenMatch(pos_value(2).get_value())))); + m1.reset(new Constant<Match>(Match(TokenMatch(pos_value(3).get_value())))); Ann a(m0, m1, "ch1"); boost::shared_ptr<const Bool> rv = a.apply(cx); BOOST_REQUIRE(rv); @@ -72,8 +72,8 @@ BOOST_FIXTURE_TEST_CASE(ann_yes, AnnSubFix) BOOST_FIXTURE_TEST_CASE(ann_sub, AnnSubFix) { boost::shared_ptr< Constant<Match> > m0, m1; - m0.reset(new Constant<Match>(Match(TokenMatch(pos_value(2))))); - m1.reset(new Constant<Match>(Match(TokenMatch(pos_value(2))))); + m0.reset(new Constant<Match>(Match(TokenMatch(pos_value(2).get_value())))); + m1.reset(new 
Constant<Match>(Match(TokenMatch(pos_value(2).get_value())))); Ann a(m0, m1, "ch1"); boost::shared_ptr<const Bool> rv = a.apply(cx); BOOST_REQUIRE(rv); @@ -90,8 +90,8 @@ BOOST_FIXTURE_TEST_CASE(ann_sub, AnnSubFix) BOOST_FIXTURE_TEST_CASE(ann_to_string, AnnSubFix) { boost::shared_ptr< Constant<Match> > m0, m1; - m0.reset(new Constant<Match>(Match(TokenMatch(pos_value(0))))); - m1.reset(new Constant<Match>(Match(TokenMatch(pos_value(1))))); + m0.reset(new Constant<Match>(Match(TokenMatch(pos_value(0).get_value())))); + m1.reset(new Constant<Match>(Match(TokenMatch(pos_value(1).get_value())))); Ann ann(m0, m1, "ch"); BOOST_CHECK_EQUAL("ann(TOK[0], TOK[1], \"ch\")", ann.to_string(tagset)); } @@ -99,8 +99,8 @@ BOOST_FIXTURE_TEST_CASE(ann_to_string, AnnSubFix) BOOST_FIXTURE_TEST_CASE(ann_to_string_one, AnnSubFix) { boost::shared_ptr< Constant<Match> > m0, m1; - m0.reset(new Constant<Match>(Match(TokenMatch(pos_value(0))))); - m1.reset(new Constant<Match>(Match(TokenMatch(pos_value(1))))); + m0.reset(new Constant<Match>(Match(TokenMatch(pos_value(0).get_value())))); + m1.reset(new Constant<Match>(Match(TokenMatch(pos_value(1).get_value())))); Ann ann(m0, "ch"); BOOST_CHECK_EQUAL("ann(TOK[0], \"ch\")", ann.to_string(tagset)); } @@ -108,8 +108,8 @@ BOOST_FIXTURE_TEST_CASE(ann_to_string_one, AnnSubFix) BOOST_FIXTURE_TEST_CASE(annsub_to_string, AnnSubFix) { boost::shared_ptr< Constant<Match> > m0, m1; - m0.reset(new Constant<Match>(Match(TokenMatch(pos_value(0))))); - m1.reset(new Constant<Match>(Match(TokenMatch(pos_value(1))))); + m0.reset(new Constant<Match>(Match(TokenMatch(pos_value(0).get_value())))); + m1.reset(new Constant<Match>(Match(TokenMatch(pos_value(1).get_value())))); AnnSub ann(m0, m1, "ch"); BOOST_CHECK_EQUAL("annsub(TOK[0], TOK[1], \"ch\")", ann.to_string(tagset)); } @@ -117,8 +117,8 @@ BOOST_FIXTURE_TEST_CASE(annsub_to_string, AnnSubFix) BOOST_FIXTURE_TEST_CASE(annsub_to_string_one, AnnSubFix) { boost::shared_ptr< Constant<Match> > m0, m1; - m0.reset(new 
Constant<Match>(Match(TokenMatch(pos_value(0))))); - m1.reset(new Constant<Match>(Match(TokenMatch(pos_value(1))))); + m0.reset(new Constant<Match>(Match(TokenMatch(pos_value(0).get_value())))); + m1.reset(new Constant<Match>(Match(TokenMatch(pos_value(1).get_value())))); AnnSub ann(m0, "ch"); BOOST_CHECK_EQUAL("annsub(TOK[0], \"ch\")", ann.to_string(tagset)); } diff --git a/tests/isempty.cpp b/tests/isempty.cpp index a55593a3b92d03b955d7e1ccfdb8e9b1baf21c02..0256bd570281e21390915f64419a81886bf52cb5 100644 --- a/tests/isempty.cpp +++ b/tests/isempty.cpp @@ -70,7 +70,7 @@ BOOST_FIXTURE_TEST_CASE(default_match, IsEmptyFix) BOOST_FIXTURE_TEST_CASE(token_match, IsEmptyFix) { - Match token_match(TokenMatch(Position(0))); + Match token_match(TokenMatch(0)); boost::shared_ptr<Function<Match> > match_expr(new Constant<Match>(token_match)); IsEmpty<Match> e(match_expr); BOOST_CHECK(!e.apply(cx)->get_value()); diff --git a/tests/match.cpp b/tests/match.cpp index 9b3ab09ee9b8da47a745fc1f0dbb23bfde605e2d..9a7008bdcf30719664e661b6a47a547ff47571a8 100644 --- a/tests/match.cpp +++ b/tests/match.cpp @@ -20,8 +20,8 @@ BOOST_AUTO_TEST_CASE(empty) boost::shared_ptr<Corpus2::AnnotatedSentence> ptr; Wccl::Match m; BOOST_CHECK(m.empty()); - BOOST_CHECK_EQUAL(m.first_token(ptr).get_value(), Wccl::Position::Nowhere); - BOOST_CHECK_EQUAL(m.last_token(ptr).get_value(), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m.first_token(ptr), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m.last_token(ptr), Wccl::Position::Nowhere); BOOST_CHECK_EQUAL(m.to_raw_string(), "MATCH()"); } @@ -30,8 +30,8 @@ BOOST_AUTO_TEST_CASE(token) boost::shared_ptr<Corpus2::AnnotatedSentence> ptr; Wccl::TokenMatch m(1); BOOST_CHECK(!m.empty()); - BOOST_CHECK_EQUAL(m.first_token(ptr).get_value(), 1); - BOOST_CHECK_EQUAL(m.last_token(ptr).get_value(), 1); + BOOST_CHECK_EQUAL(m.first_token(ptr), 1); + BOOST_CHECK_EQUAL(m.last_token(ptr), 1); BOOST_CHECK_EQUAL(m.to_raw_string(), "TOK[1]"); } @@ -40,29 +40,29 @@ 
BOOST_AUTO_TEST_CASE(vector1) boost::shared_ptr<Corpus2::AnnotatedSentence> ptr; Wccl::MatchVector m; BOOST_CHECK(m.empty()); - BOOST_CHECK_EQUAL(m.first_token(ptr).get_value(), Wccl::Position::Nowhere); - BOOST_CHECK_EQUAL(m.last_token(ptr).get_value(), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m.first_token(ptr), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m.last_token(ptr), Wccl::Position::Nowhere); BOOST_CHECK_EQUAL(m.to_raw_string(), "MATCH()"); m.append(boost::make_shared<Wccl::MatchVector>()); BOOST_CHECK(m.empty()); - BOOST_CHECK_EQUAL(m.first_token(ptr).get_value(), Wccl::Position::Nowhere); - BOOST_CHECK_EQUAL(m.last_token(ptr).get_value(), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m.first_token(ptr), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m.last_token(ptr), Wccl::Position::Nowhere); BOOST_CHECK_EQUAL(m.to_raw_string(), "MATCH(MATCH())"); m.append(boost::make_shared<Wccl::MatchVector>()); BOOST_CHECK(m.empty()); - BOOST_CHECK_EQUAL(m.first_token(ptr).get_value(), Wccl::Position::Nowhere); - BOOST_CHECK_EQUAL(m.last_token(ptr).get_value(), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m.first_token(ptr), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m.last_token(ptr), Wccl::Position::Nowhere); BOOST_CHECK_EQUAL(m.to_raw_string(), "MATCH(MATCH(),MATCH())"); Wccl::MatchVector m2; m2.append(boost::make_shared<Wccl::MatchVector>(m)); BOOST_CHECK(m2.empty()); - BOOST_CHECK_EQUAL(m2.first_token(ptr).get_value(), Wccl::Position::Nowhere); - BOOST_CHECK_EQUAL(m2.last_token(ptr).get_value(), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m2.first_token(ptr), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m2.last_token(ptr), Wccl::Position::Nowhere); BOOST_CHECK_EQUAL(m2.to_raw_string(), "MATCH(MATCH(MATCH(),MATCH()))"); m2.append(boost::make_shared<Wccl::MatchVector>()); BOOST_CHECK(m2.empty()); - BOOST_CHECK_EQUAL(m2.first_token(ptr).get_value(), Wccl::Position::Nowhere); - BOOST_CHECK_EQUAL(m2.last_token(ptr).get_value(), Wccl::Position::Nowhere); 
+ BOOST_CHECK_EQUAL(m2.first_token(ptr), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m2.last_token(ptr), Wccl::Position::Nowhere); BOOST_CHECK_EQUAL(m2.to_raw_string(), "MATCH(MATCH(MATCH(),MATCH()),MATCH())"); } @@ -72,44 +72,44 @@ BOOST_AUTO_TEST_CASE(matchvector_first_last) Wccl::MatchVector m; m.append(boost::make_shared<TokenMatch>(5)); BOOST_CHECK_EQUAL(m.to_raw_string(), "MATCH(TOK[5])"); - BOOST_CHECK_EQUAL(m.first_token(ptr).get_value(), 5); - BOOST_CHECK_EQUAL(m.last_token(ptr).get_value(), 5); + BOOST_CHECK_EQUAL(m.first_token(ptr), 5); + BOOST_CHECK_EQUAL(m.last_token(ptr), 5); m.append(boost::make_shared<TokenMatch>(6)); - BOOST_CHECK_EQUAL(m.first_token(ptr).get_value(), 5); - BOOST_CHECK_EQUAL(m.last_token(ptr).get_value(), 6); + BOOST_CHECK_EQUAL(m.first_token(ptr), 5); + BOOST_CHECK_EQUAL(m.last_token(ptr), 6); BOOST_CHECK_EQUAL(m.to_raw_string(), "MATCH(TOK[5],TOK[6])"); m.append(boost::make_shared<TokenMatch>(4)); - BOOST_CHECK_EQUAL(m.first_token(ptr).get_value(), 4); - BOOST_CHECK_EQUAL(m.last_token(ptr).get_value(), 6); + BOOST_CHECK_EQUAL(m.first_token(ptr), 4); + BOOST_CHECK_EQUAL(m.last_token(ptr), 6); BOOST_CHECK_EQUAL(m.to_raw_string(), "MATCH(TOK[5],TOK[6],TOK[4])"); Wccl::MatchVector m2; m2.append(boost::make_shared<TokenMatch>(5)); boost::shared_ptr<Wccl::MatchVector> m1 = boost::make_shared<Wccl::MatchVector>(m); m2.append(m1); - BOOST_CHECK_EQUAL(m2.first_token(ptr).get_value(), 4); - BOOST_CHECK_EQUAL(m2.last_token(ptr).get_value(), 6); + BOOST_CHECK_EQUAL(m2.first_token(ptr), 4); + BOOST_CHECK_EQUAL(m2.last_token(ptr), 6); BOOST_CHECK_EQUAL(m2.to_raw_string(), "MATCH(TOK[5],MATCH(TOK[5],TOK[6],TOK[4]))"); m2.append(boost::make_shared<TokenMatch>(2)); - BOOST_CHECK_EQUAL(m2.first_token(ptr).get_value(), 2); - BOOST_CHECK_EQUAL(m2.last_token(ptr).get_value(), 6); + BOOST_CHECK_EQUAL(m2.first_token(ptr), 2); + BOOST_CHECK_EQUAL(m2.last_token(ptr), 6); m2.append(boost::make_shared<TokenMatch>(7)); - 
BOOST_CHECK_EQUAL(m2.first_token(ptr).get_value(), 2); - BOOST_CHECK_EQUAL(m2.last_token(ptr).get_value(), 7); + BOOST_CHECK_EQUAL(m2.first_token(ptr), 2); + BOOST_CHECK_EQUAL(m2.last_token(ptr), 7); m1->append(boost::make_shared<TokenMatch>(1)); - BOOST_CHECK_EQUAL(m2.first_token(ptr).get_value(), 1); - BOOST_CHECK_EQUAL(m2.last_token(ptr).get_value(), 7); + BOOST_CHECK_EQUAL(m2.first_token(ptr), 1); + BOOST_CHECK_EQUAL(m2.last_token(ptr), 7); Wccl::MatchVector m3; BOOST_CHECK_EQUAL(m3.to_raw_string(), "MATCH()"); - BOOST_CHECK_EQUAL(m3.first_token(ptr).get_value(), Wccl::Position::Nowhere); - BOOST_CHECK_EQUAL(m3.last_token(ptr).get_value(), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m3.first_token(ptr), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m3.last_token(ptr), Wccl::Position::Nowhere); m3.append(boost::make_shared<MatchVector>()); BOOST_CHECK_EQUAL(m3.to_raw_string(), "MATCH(MATCH())"); - BOOST_CHECK_EQUAL(m3.first_token(ptr).get_value(), Wccl::Position::Nowhere); - BOOST_CHECK_EQUAL(m3.last_token(ptr).get_value(), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m3.first_token(ptr), Wccl::Position::Nowhere); + BOOST_CHECK_EQUAL(m3.last_token(ptr), Wccl::Position::Nowhere); m3.append(boost::make_shared<TokenMatch>(1)); BOOST_CHECK_EQUAL(m3.to_raw_string(), "MATCH(MATCH(),TOK[1])"); - BOOST_CHECK_EQUAL(m3.first_token(ptr).get_value(), 1); - BOOST_CHECK_EQUAL(m3.last_token(ptr).get_value(), 1); + BOOST_CHECK_EQUAL(m3.first_token(ptr), 1); + BOOST_CHECK_EQUAL(m3.last_token(ptr), 1); } BOOST_AUTO_TEST_CASE(varmatch) @@ -117,9 +117,9 @@ BOOST_AUTO_TEST_CASE(varmatch) boost::shared_ptr<Corpus2::AnnotatedSentence> ptr; Wccl::Variables v; v.put<Wccl::Match>("a", new Wccl::Match(boost::shared_ptr<MatchData>(new Wccl::TokenMatch(1)))); - BOOST_CHECK_EQUAL(v.get<Wccl::Match>("a")->first_token(ptr).get_value(), 1); + BOOST_CHECK_EQUAL(v.get<Wccl::Match>("a")->first_token(ptr), 1); BOOST_CHECK(v.get_put<Wccl::Match>("b")->empty()); - 
BOOST_CHECK_EQUAL(v.get_put<Wccl::Match>("b")->first_token(ptr).get_value(), + BOOST_CHECK_EQUAL(v.get_put<Wccl::Match>("b")->first_token(ptr), Wccl::Position::Nowhere); } diff --git a/tests/rules-data/match/head/cclmatch.xml b/tests/rules-data/match/head/cclmatch.xml new file mode 100644 index 0000000000000000000000000000000000000000..97422c15d1ed4115efe0781940055a7c4714741d --- /dev/null +++ b/tests/rules-data/match/head/cclmatch.xml @@ -0,0 +1,62 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd"> +<cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb"> +<chunkList> + <chunk> + <sentence> + <tok> + <orth>Tu</orth> + <lex><base>tu</base><ctag>qub</ctag></lex> + </tok> + <tok> + <orth>jest</orth> + <lex><base>być</base><ctag>verb</ctag></lex> + </tok> + <tok> + <orth>czarny</orth> + <lex><base>czarny</base><ctag>adj</ctag></lex> + </tok> + <tok> + <orth>bÅ‚yszczÄ…cy</orth> + <lex><base>bÅ‚yszczeć</base><ctag>adj</ctag></lex> + </tok> + <tok> + <orth>gÅ‚oÅ›nik</orth> + <lex><base>gÅ‚oÅ›nik</base><ctag>noun</ctag></lex> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + </tok> + </sentence> + <sentence> + <tok> + <orth>Tam</orth> + <lex><base>tam</base><ctag>qub</ctag></lex> + </tok> + <tok> + <orth>jest</orth> + <lex><base>być</base><ctag>verb</ctag></lex> + </tok> + <tok> + <orth>nowa</orth> + <lex><base>nowy</base><ctag>adj</ctag></lex> + </tok> + <tok> + <orth>pompa</orth> + <lex><base>pompa</base><ctag>noun</ctag></lex> + </tok> + <tok> + <orth>próżniowa</orth> + <lex><base>próżniowy</base><ctag>adj</ctag></lex> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + </tok> + </sentence> + </chunk> +</chunkList> +</cesAna> diff --git a/tests/rules-data/match/head/cclmatch1.ccl b/tests/rules-data/match/head/cclmatch1.ccl new file mode 100644 index 
0000000000000000000000000000000000000000..7c7aa4761633262bfa9a1855f841136462579eb9 --- /dev/null +++ b/tests/rules-data/match/head/cclmatch1.ccl @@ -0,0 +1,13 @@ +apply +( + match + ( + optional(repeat(equal(class[0], {adj}))), + equal(class[0], {noun}), + optional(repeat(equal(class[0], {adj}))) + ), + actions + ( + mark(M, M, :2, "NP") + ) +) diff --git a/tests/rules-data/match/head/cclmatch1.out.xml b/tests/rules-data/match/head/cclmatch1.out.xml new file mode 100644 index 0000000000000000000000000000000000000000..b6871bd699e26dd0d4a778d7283b5abcc88f205a --- /dev/null +++ b/tests/rules-data/match/head/cclmatch1.out.xml @@ -0,0 +1,74 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd"> +<cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb"> +<chunkList> + <chunk> + <sentence> + <tok> + <orth>Tu</orth> + <lex><base>tu</base><ctag>qub</ctag></lex> + <ann chan="NP">0</ann> + </tok> + <tok> + <orth>jest</orth> + <lex><base>być</base><ctag>verb</ctag></lex> + <ann chan="NP">0</ann> + </tok> + <tok> + <orth>czarny</orth> + <lex><base>czarny</base><ctag>adj</ctag></lex> + <ann chan="NP">1</ann> + </tok> + <tok> + <orth>bÅ‚yszczÄ…cy</orth> + <lex><base>bÅ‚yszczeć</base><ctag>adj</ctag></lex> + <ann chan="NP">1</ann> + </tok> + <tok> + <orth>gÅ‚oÅ›nik</orth> + <lex><base>gÅ‚oÅ›nik</base><ctag>noun</ctag></lex> + <ann chan="NP" head="1">1</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + <ann chan="NP">0</ann> + </tok> + </sentence> + <sentence> + <tok> + <orth>Tam</orth> + <lex><base>tam</base><ctag>qub</ctag></lex> + <ann chan="NP">0</ann> + </tok> + <tok> + <orth>jest</orth> + <lex><base>być</base><ctag>verb</ctag></lex> + <ann chan="NP">0</ann> + </tok> + <tok> + <orth>nowa</orth> + <lex><base>nowy</base><ctag>adj</ctag></lex> + <ann chan="NP">1</ann> + </tok> + <tok> + <orth>pompa</orth> + <lex><base>pompa</base><ctag>noun</ctag></lex> + <ann chan="NP" 
head="1">1</ann> + </tok> + <tok> + <orth>próżniowa</orth> + <lex><base>próżniowy</base><ctag>adj</ctag></lex> + <ann chan="NP">1</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + <ann chan="NP">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> +</cesAna> diff --git a/tests/rules-data/match/head/simple.is-the-tagset b/tests/rules-data/match/head/simple.is-the-tagset new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/tests/rules-data/match/head/simple.is-the-tagset @@ -0,0 +1 @@ + diff --git a/tests/rules-data/match/michal/cclmatch.xml b/tests/rules-data/match/michal/cclmatch.xml new file mode 100644 index 0000000000000000000000000000000000000000..c23ba8b14745c9ffc1adbbe17d67cb872e5c4fd8 --- /dev/null +++ b/tests/rules-data/match/michal/cclmatch.xml @@ -0,0 +1,124 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd"> +<cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb"> +<chunkList> + <chunk> + <sentence> + <tok> + <orth>To</orth> + <lex><base>ten</base><ctag>adj:sg:nom:n:pos</ctag></lex> + <lex><base>ten</base><ctag>adj:sg:acc:n:pos</ctag></lex> + <lex><base>to</base><ctag>conj</ctag></lex> + <lex><base>to</base><ctag>pred</ctag></lex> + <lex><base>to</base><ctag>qub</ctag></lex> + <lex><base>to</base><ctag>subst:sg:nom:n</ctag></lex> + <lex><base>to</base><ctag>subst:sg:acc:n</ctag></lex> + <lex><base>to</base><ctag>subst:sg:voc:n</ctag></lex> + <ann chan="capitalized_noun" head="1">1</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <tok> + <orth>Wyżyna</orth> + <lex><base>wyżyna</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="capitalized_noun" head="1">2</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + 
<tok> + <orth>PÅ‚aska</orth> + <lex><base>pÅ‚aski</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="capitalized_noun" head="1">3</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based" head="1">1</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex><base>,</base><ctag>interp</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <tok> + <orth>Wyżyna</orth> + <lex><base>wyżyna</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="capitalized_noun" head="1">4</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <tok> + <orth>WypukÅ‚a</orth> + <lex><base>wypukÅ‚y</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="capitalized_noun" head="1">5</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based" head="1">2</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex><base>,</base><ctag>interp</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <tok> + <orth>wyżyna</orth> + <lex><base>wyżyna</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <tok> + <orth>Kot</orth> + <lex><base>kot</base><ctag>subst:sg:nom:m2</ctag></lex> + <ann chan="capitalized_noun" head="1">7</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word" head="1">1</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <tok> + <orth>i</orth> + <lex><base>i</base><ctag>conj</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann 
chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <tok> + <orth>miasto</orth> + <lex><base>miasto</base><ctag>subst:sg:nom:n</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger" head="1">1</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <tok> + <orth>GdaÅ„sk</orth> + <lex><base>gdaÅ„sk</base><ctag>subst:sg:nom:m3</ctag></lex> + <ann chan="capitalized_noun" head="1">8</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word" head="1">2</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> +</cesAna> diff --git a/tests/rules-data/match/michal/cclmatch1.ccl b/tests/rules-data/match/michal/cclmatch1.ccl new file mode 100644 index 0000000000000000000000000000000000000000..d2ea62b80385cf4e2095adaae4d42b88fabbaca2 --- /dev/null +++ b/tests/rules-data/match/michal/cclmatch1.ccl @@ -0,0 +1,16 @@ +apply( + match( + regex( base[0], 'wyżyna' ), + is( 'reladj_gaz_based' ) + ), + cond( + ann(:1, 'capitalized_noun' ), + equal( nmb[first(:1)], nmb[first(:2)] ), + equal( cas[first(:1)], cas[first(:2)] ), + equal( gnd[first(:1)], gnd[first(:2)] ) + ), + actions( + mark(M, 'HIGHLAND_NAM') + ) +) + diff --git a/tests/rules-data/match/michal/cclmatch1.out.xml b/tests/rules-data/match/michal/cclmatch1.out.xml new file mode 100644 index 0000000000000000000000000000000000000000..3ec16111de4bfeee240c6982e19fd2d40f1eb108 --- /dev/null +++ b/tests/rules-data/match/michal/cclmatch1.out.xml @@ -0,0 +1,133 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd"> +<cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb"> +<chunkList> + 
<chunk> + <sentence> + <tok> + <orth>To</orth> + <lex><base>ten</base><ctag>adj:sg:nom:n:pos</ctag></lex> + <lex><base>ten</base><ctag>adj:sg:acc:n:pos</ctag></lex> + <lex><base>to</base><ctag>conj</ctag></lex> + <lex><base>to</base><ctag>pred</ctag></lex> + <lex><base>to</base><ctag>qub</ctag></lex> + <lex><base>to</base><ctag>subst:sg:nom:n</ctag></lex> + <lex><base>to</base><ctag>subst:sg:acc:n</ctag></lex> + <lex><base>to</base><ctag>subst:sg:voc:n</ctag></lex> + <ann chan="capitalized_noun" head="1">1</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM">0</ann> + </tok> + <tok> + <orth>Wyżyna</orth> + <lex><base>wyżyna</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="capitalized_noun" head="1">2</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM" head="1">1</ann> + </tok> + <tok> + <orth>PÅ‚aska</orth> + <lex><base>pÅ‚aski</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="capitalized_noun" head="1">3</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based" head="1">1</ann> + <ann chan="HIGHLAND_NAM">1</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex><base>,</base><ctag>interp</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM">0</ann> + </tok> + <tok> + <orth>Wyżyna</orth> + <lex><base>wyżyna</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="capitalized_noun" head="1">4</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM" head="1">2</ann> + </tok> + <tok> + <orth>WypukÅ‚a</orth> + <lex><base>wypukÅ‚y</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann 
chan="capitalized_noun" head="1">5</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based" head="1">2</ann> + <ann chan="HIGHLAND_NAM">2</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex><base>,</base><ctag>interp</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM">0</ann> + </tok> + <tok> + <orth>wyżyna</orth> + <lex><base>wyżyna</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM">0</ann> + </tok> + <tok> + <orth>Kot</orth> + <lex><base>kot</base><ctag>subst:sg:nom:m2</ctag></lex> + <ann chan="capitalized_noun" head="1">7</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word" head="1">1</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM">0</ann> + </tok> + <tok> + <orth>i</orth> + <lex><base>i</base><ctag>conj</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <tok> + <orth>miasto</orth> + <lex><base>miasto</base><ctag>subst:sg:nom:n</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger" head="1">1</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <tok> + <orth>GdaÅ„sk</orth> + <lex><base>gdaÅ„sk</base><ctag>subst:sg:nom:m3</ctag></lex> + <ann chan="capitalized_noun" head="1">8</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word" head="1">2</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> 
+ <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> +</cesAna> diff --git a/tests/rules-data/match/michal/cclmatch2.ccl b/tests/rules-data/match/michal/cclmatch2.ccl new file mode 100644 index 0000000000000000000000000000000000000000..b02d73ff33475a200388c331fbf74a0d1fc4e5f8 --- /dev/null +++ b/tests/rules-data/match/michal/cclmatch2.ccl @@ -0,0 +1,15 @@ + + +apply( + match( + regex( base[0], 'wyżyna'), + and( inter(class[0], {subst}), inter(cas[0], {nom}) ) + ), + cond( + not( ann(:1, 'capitalized_noun' ) ) + ), + actions( + mark(:2, 'HIGHLAND_NAM') + ) +) + diff --git a/tests/rules-data/match/michal/cclmatch2.out.xml b/tests/rules-data/match/michal/cclmatch2.out.xml new file mode 100644 index 0000000000000000000000000000000000000000..de4ddd61344224cac98f3038da2314f54578359e --- /dev/null +++ b/tests/rules-data/match/michal/cclmatch2.out.xml @@ -0,0 +1,133 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd"> +<cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb"> +<chunkList> + <chunk> + <sentence> + <tok> + <orth>To</orth> + <lex><base>ten</base><ctag>adj:sg:nom:n:pos</ctag></lex> + <lex><base>ten</base><ctag>adj:sg:acc:n:pos</ctag></lex> + <lex><base>to</base><ctag>conj</ctag></lex> + <lex><base>to</base><ctag>pred</ctag></lex> + <lex><base>to</base><ctag>qub</ctag></lex> + <lex><base>to</base><ctag>subst:sg:nom:n</ctag></lex> + <lex><base>to</base><ctag>subst:sg:acc:n</ctag></lex> + <lex><base>to</base><ctag>subst:sg:voc:n</ctag></lex> + <ann chan="capitalized_noun" head="1">1</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM">0</ann> + </tok> + <tok> + <orth>Wyżyna</orth> + <lex><base>wyżyna</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="capitalized_noun" head="1">2</ann> + <ann chan="city_trigger">0</ann> + <ann 
chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM">0</ann> + </tok> + <tok> + <orth>PÅ‚aska</orth> + <lex><base>pÅ‚aski</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="capitalized_noun" head="1">3</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based" head="1">1</ann> + <ann chan="HIGHLAND_NAM">0</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex><base>,</base><ctag>interp</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM">0</ann> + </tok> + <tok> + <orth>Wyżyna</orth> + <lex><base>wyżyna</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="capitalized_noun" head="1">4</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM">0</ann> + </tok> + <tok> + <orth>WypukÅ‚a</orth> + <lex><base>wypukÅ‚y</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="capitalized_noun" head="1">5</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based" head="1">2</ann> + <ann chan="HIGHLAND_NAM">0</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex><base>,</base><ctag>interp</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM">0</ann> + </tok> + <tok> + <orth>wyżyna</orth> + <lex><base>wyżyna</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM">0</ann> + </tok> + <tok> + <orth>Kot</orth> + <lex><base>kot</base><ctag>subst:sg:nom:m2</ctag></lex> + <ann chan="capitalized_noun" 
head="1">7</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word" head="1">1</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="HIGHLAND_NAM" head="1">1</ann> + </tok> + <tok> + <orth>i</orth> + <lex><base>i</base><ctag>conj</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <tok> + <orth>miasto</orth> + <lex><base>miasto</base><ctag>subst:sg:nom:n</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger" head="1">1</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <tok> + <orth>GdaÅ„sk</orth> + <lex><base>gdaÅ„sk</base><ctag>subst:sg:nom:m3</ctag></lex> + <ann chan="capitalized_noun" head="1">8</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word" head="1">2</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> +</cesAna> diff --git a/tests/rules-data/match/michal/cclmatch3.ccl b/tests/rules-data/match/michal/cclmatch3.ccl new file mode 100644 index 0000000000000000000000000000000000000000..817f1b54e5077e8e8844e1bdc1267afef2e185ed --- /dev/null +++ b/tests/rules-data/match/michal/cclmatch3.ccl @@ -0,0 +1,12 @@ +apply( + match( + is( 'city_trigger' ), + is( 'first_capital_word' ) + ), + cond( + not( annsub(:2, 'city_nam_gaz') ) + ), + actions( + mark(:2, 'CITY_NAM') + ) +) diff --git a/tests/rules-data/match/michal/cclmatch3.out.xml b/tests/rules-data/match/michal/cclmatch3.out.xml new file mode 100644 index 0000000000000000000000000000000000000000..5d348dc0919b3710230a8e6b9e843c816e7e1d93 --- /dev/null +++ 
b/tests/rules-data/match/michal/cclmatch3.out.xml @@ -0,0 +1,137 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd"> +<cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb"> +<chunkList> + <chunk> + <sentence> + <tok> + <orth>To</orth> + <lex><base>ten</base><ctag>adj:sg:nom:n:pos</ctag></lex> + <lex><base>ten</base><ctag>adj:sg:acc:n:pos</ctag></lex> + <lex><base>to</base><ctag>conj</ctag></lex> + <lex><base>to</base><ctag>pred</ctag></lex> + <lex><base>to</base><ctag>qub</ctag></lex> + <lex><base>to</base><ctag>subst:sg:nom:n</ctag></lex> + <lex><base>to</base><ctag>subst:sg:acc:n</ctag></lex> + <lex><base>to</base><ctag>subst:sg:voc:n</ctag></lex> + <ann chan="capitalized_noun" head="1">1</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="CITY_NAM">0</ann> + </tok> + <tok> + <orth>Wyżyna</orth> + <lex><base>wyżyna</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="capitalized_noun" head="1">2</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="CITY_NAM">0</ann> + </tok> + <tok> + <orth>PÅ‚aska</orth> + <lex><base>pÅ‚aski</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="capitalized_noun" head="1">3</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based" head="1">1</ann> + <ann chan="CITY_NAM">0</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex><base>,</base><ctag>interp</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="CITY_NAM">0</ann> + </tok> + <tok> + <orth>Wyżyna</orth> + <lex><base>wyżyna</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="capitalized_noun" head="1">4</ann> + <ann chan="city_trigger">0</ann> + <ann 
chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="CITY_NAM">0</ann> + </tok> + <tok> + <orth>WypukÅ‚a</orth> + <lex><base>wypukÅ‚y</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="capitalized_noun" head="1">5</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based" head="1">2</ann> + <ann chan="CITY_NAM">0</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex><base>,</base><ctag>interp</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="CITY_NAM">0</ann> + </tok> + <tok> + <orth>wyżyna</orth> + <lex><base>wyżyna</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="CITY_NAM">0</ann> + </tok> + <tok> + <orth>Kot</orth> + <lex><base>kot</base><ctag>subst:sg:nom:m2</ctag></lex> + <ann chan="capitalized_noun" head="1">7</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word" head="1">1</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="CITY_NAM">0</ann> + </tok> + <tok> + <orth>i</orth> + <lex><base>i</base><ctag>conj</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="CITY_NAM">0</ann> + </tok> + <tok> + <orth>miasto</orth> + <lex><base>miasto</base><ctag>subst:sg:nom:n</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger" head="1">1</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="CITY_NAM">0</ann> + </tok> + <tok> + <orth>GdaÅ„sk</orth> + <lex><base>gdaÅ„sk</base><ctag>subst:sg:nom:m3</ctag></lex> + <ann chan="capitalized_noun" head="1">8</ann> + <ann 
chan="city_trigger">0</ann> + <ann chan="first_capital_word" head="1">2</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="CITY_NAM" head="1">1</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + <ann chan="capitalized_noun">0</ann> + <ann chan="city_trigger">0</ann> + <ann chan="first_capital_word">0</ann> + <ann chan="reladj_gaz_based">0</ann> + <ann chan="CITY_NAM">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> +</cesAna> diff --git a/tests/rules-data/match/postcond2/cclmatch2.out.ccl b/tests/rules-data/match/postcond2/cclmatch2.out.xml similarity index 100% rename from tests/rules-data/match/postcond2/cclmatch2.out.ccl rename to tests/rules-data/match/postcond2/cclmatch2.out.xml diff --git a/tests/rules-data/match/postcond2/cclmatch5.ccl b/tests/rules-data/match/postcond2/cclmatch5.ccl new file mode 100644 index 0000000000000000000000000000000000000000..51ff9f06e57b7557a4c43747d7ee2cf593d56156 --- /dev/null +++ b/tests/rules-data/match/postcond2/cclmatch5.ccl @@ -0,0 +1,14 @@ +apply( + match( + repeat( + optional(equal(orth[0], "not:here")), + equal(class[0], adj), + optional(equal(orth[0], "not:there")) + ) + ), + // first(M) -> position + // regex… -> starting with w + cond(regex(orth[first(M)], "w.*")), + //cond(debug(orth[first(M)])), + actions(mark(M,"C")) +) diff --git a/tests/rules-data/match/postcond2/cclmatch5.out.xml b/tests/rules-data/match/postcond2/cclmatch5.out.xml new file mode 100644 index 0000000000000000000000000000000000000000..c45a8fc1dff12873863dbcd5c9b34c3404950b5d --- /dev/null +++ b/tests/rules-data/match/postcond2/cclmatch5.out.xml @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd"> +<cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb"> +<chunkList> + <chunk> + <sentence> + <tok> + <orth>Dwa</orth> + <lex><base>dwa</base><ctag>other</ctag></lex> + <ann chan="A">1</ann> + <ann chan="B" 
head="1">1</ann> + <ann chan="C">0</ann> + </tok> + <tok> + <orth>wielkie</orth> + <lex><base>wielki</base><ctag>adj</ctag></lex> + <ann chan="A">1</ann> + <ann chan="B" head="1">2</ann> + <ann chan="C" head="1">1</ann> + </tok> + <tok> + <orth>wÅ‚ochate</orth> + <lex><base>wÅ‚ochaty</base><ctag>adj</ctag></lex> + <ann chan="A">1</ann> + <ann chan="B">2</ann> + <ann chan="C">1</ann> + </tok> + <tok> + <orth>zapchlone</orth> + <lex><base>zapchlić</base><ctag>adj</ctag></lex> + <ann chan="A">1</ann> + <ann chan="B">2</ann> + <ann chan="C">1</ann> + </tok> + <tok> + <orth>koty</orth> + <lex><base>kot</base><ctag>noun</ctag></lex> + <ann chan="A" head="1">1</ann> + <ann chan="B" head="1">3</ann> + <ann chan="C">0</ann> + </tok> + <tok> + <orth>zjadÅ‚y</orth> + <lex><base>zjeść</base><ctag>verb</ctag></lex> + <ann chan="A">0</ann> + <ann chan="B" head="1">4</ann> + <ann chan="C">0</ann> + </tok> + <tok> + <orth>pięć</orth> + <lex><base>pięć</base><ctag>other</ctag></lex> + <ann chan="A" head="1">2</ann> + <ann chan="B">4</ann> + <ann chan="C">0</ann> + </tok> + <tok> + <orth>tÅ‚ustych</orth> + <lex><base>tÅ‚usty</base><ctag>adj</ctag></lex> + <ann chan="A">2</ann> + <ann chan="B">4</ann> + <ann chan="C">0</ann> + </tok> + <tok> + <orth>soczystych</orth> + <lex><base>soczysty</base><ctag>adj</ctag></lex> + <ann chan="A">2</ann> + <ann chan="B">4</ann> + <ann chan="C">0</ann> + </tok> + <tok> + <orth>much</orth> + <lex><base>mucha</base><ctag>noun</ctag></lex> + <ann chan="A">2</ann> + <ann chan="B">0</ann> + <ann chan="C">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> +</cesAna> diff --git a/tests/rules-data/match/postcond3/cclmatch.xml b/tests/rules-data/match/postcond3/cclmatch.xml new file mode 100644 index 0000000000000000000000000000000000000000..557855b7be41f2c0a94e87de4f5863267a13bb9c --- /dev/null +++ b/tests/rules-data/match/postcond3/cclmatch.xml @@ -0,0 +1,73 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd"> 
+<cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb"> +<chunkList> + <chunk> + <sentence> + <tok> + <orth>PoczÄ…tek</orth> + <lex><base>poczÄ…tek</base><ctag>subst:sg:nom:m3</ctag></lex> + <lex><base>poczÄ…tek</base><ctag>subst:sg:acc:m3</ctag></lex> + </tok> + <tok> + <orth>nowego</orth> + <lex><base>nowy</base><ctag>adj:sg:gen:m1:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:acc:m1:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:gen:m2:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:acc:m2:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:gen:m3:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:gen:n:pos</ctag></lex> + </tok> + <tok> + <orth>zdania</orth> + <lex><base>zdanie</base><ctag>subst:sg:gen:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:nom:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:acc:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:voc:n</ctag></lex> + <lex><base>zdać</base><ctag>ger:sg:gen:n:perf:aff</ctag></lex> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + </tok> + </sentence> + <sentence> + <tok> + <orth>Nowy</orth> + <lex><base>nowy</base><ctag>adj:sg:nom:m1:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:nom:m2:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:nom:m3:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:acc:m3:pos</ctag></lex> + </tok> + <tok> + <orth>poczÄ…tek</orth> + <lex><base>poczÄ…tek</base><ctag>subst:sg:nom:m3</ctag></lex> + <lex><base>poczÄ…tek</base><ctag>subst:sg:acc:m3</ctag></lex> + </tok> + <tok> + <orth>starego</orth> + <lex><base>stary</base><ctag>adj:sg:gen:m1:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:acc:m1:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:gen:m2:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:acc:m2:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:gen:m3:pos</ctag></lex> + 
<lex><base>stary</base><ctag>adj:sg:gen:n:pos</ctag></lex> + </tok> + <tok> + <orth>zdania</orth> + <lex><base>zdanie</base><ctag>subst:sg:gen:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:nom:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:acc:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:voc:n</ctag></lex> + <lex><base>zdać</base><ctag>ger:sg:gen:n:perf:aff</ctag></lex> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + </tok> + </sentence> + </chunk> +</chunkList> +</cesAna> diff --git a/tests/rules-data/match/postcond3/cclmatch1.ccl b/tests/rules-data/match/postcond3/cclmatch1.ccl new file mode 100644 index 0000000000000000000000000000000000000000..e69473a7ba46067543fd0dc88e763e8850c536a6 --- /dev/null +++ b/tests/rules-data/match/postcond3/cclmatch1.ccl @@ -0,0 +1,12 @@ +apply( + match( + optional(repeat(inter(class[0], {adj}))), + repeat(inter(class[0], {subst})) + ), + cond( + not(empty(:1)) + ), + actions( + mark(M, "NP") + ) +) diff --git a/tests/rules-data/match/postcond3/cclmatch1.out.xml b/tests/rules-data/match/postcond3/cclmatch1.out.xml new file mode 100644 index 0000000000000000000000000000000000000000..13ba3c81fdf9512598894a1c43fae1e75593af1a --- /dev/null +++ b/tests/rules-data/match/postcond3/cclmatch1.out.xml @@ -0,0 +1,82 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd"> +<cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb"> +<chunkList> + <chunk> + <sentence> + <tok> + <orth>PoczÄ…tek</orth> + <lex><base>poczÄ…tek</base><ctag>subst:sg:nom:m3</ctag></lex> + <lex><base>poczÄ…tek</base><ctag>subst:sg:acc:m3</ctag></lex> + <ann chan="NP">0</ann> + </tok> + <tok> + <orth>nowego</orth> + <lex><base>nowy</base><ctag>adj:sg:gen:m1:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:acc:m1:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:gen:m2:pos</ctag></lex> + 
<lex><base>nowy</base><ctag>adj:sg:acc:m2:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:gen:m3:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:gen:n:pos</ctag></lex> + <ann chan="NP" head="1">1</ann> + </tok> + <tok> + <orth>zdania</orth> + <lex><base>zdanie</base><ctag>subst:sg:gen:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:nom:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:acc:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:voc:n</ctag></lex> + <lex><base>zdać</base><ctag>ger:sg:gen:n:perf:aff</ctag></lex> + <ann chan="NP">1</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + <ann chan="NP">0</ann> + </tok> + </sentence> + <sentence> + <tok> + <orth>Nowy</orth> + <lex><base>nowy</base><ctag>adj:sg:nom:m1:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:nom:m2:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:nom:m3:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:acc:m3:pos</ctag></lex> + <ann chan="NP" head="1">1</ann> + </tok> + <tok> + <orth>poczÄ…tek</orth> + <lex><base>poczÄ…tek</base><ctag>subst:sg:nom:m3</ctag></lex> + <lex><base>poczÄ…tek</base><ctag>subst:sg:acc:m3</ctag></lex> + <ann chan="NP">1</ann> + </tok> + <tok> + <orth>starego</orth> + <lex><base>stary</base><ctag>adj:sg:gen:m1:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:acc:m1:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:gen:m2:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:acc:m2:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:gen:m3:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:gen:n:pos</ctag></lex> + <ann chan="NP" head="1">2</ann> + </tok> + <tok> + <orth>zdania</orth> + <lex><base>zdanie</base><ctag>subst:sg:gen:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:nom:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:acc:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:voc:n</ctag></lex> + 
<lex><base>zdać</base><ctag>ger:sg:gen:n:perf:aff</ctag></lex> + <ann chan="NP">2</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + <ann chan="NP">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> +</cesAna> diff --git a/tests/rules-data/match/postcond3/cclmatch2.ccl b/tests/rules-data/match/postcond3/cclmatch2.ccl new file mode 100644 index 0000000000000000000000000000000000000000..d8d9560b59da2f5dbb8a374db0f47d0296423f4d --- /dev/null +++ b/tests/rules-data/match/postcond3/cclmatch2.ccl @@ -0,0 +1,12 @@ +apply( + match( + optional(repeat(inter(class[0], {adj}))), + repeat(inter(class[0], {subst})) + ), + cond( + equal(orth[last(:2)], "zdania") + ), + actions( + mark(M, "NP") + ) +) diff --git a/tests/rules-data/match/postcond3/cclmatch2.out.xml b/tests/rules-data/match/postcond3/cclmatch2.out.xml new file mode 100644 index 0000000000000000000000000000000000000000..d23be4f7ca56f90645f68f25676326a3f02eb5cf --- /dev/null +++ b/tests/rules-data/match/postcond3/cclmatch2.out.xml @@ -0,0 +1,82 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd"> +<cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb"> +<chunkList> + <chunk> + <sentence> + <tok> + <orth>PoczÄ…tek</orth> + <lex><base>poczÄ…tek</base><ctag>subst:sg:nom:m3</ctag></lex> + <lex><base>poczÄ…tek</base><ctag>subst:sg:acc:m3</ctag></lex> + <ann chan="NP">0</ann> + </tok> + <tok> + <orth>nowego</orth> + <lex><base>nowy</base><ctag>adj:sg:gen:m1:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:acc:m1:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:gen:m2:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:acc:m2:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:gen:m3:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:gen:n:pos</ctag></lex> + <ann chan="NP" head="1">1</ann> + </tok> + <tok> + <orth>zdania</orth> + <lex><base>zdanie</base><ctag>subst:sg:gen:n</ctag></lex> + 
<lex><base>zdanie</base><ctag>subst:pl:nom:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:acc:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:voc:n</ctag></lex> + <lex><base>zdać</base><ctag>ger:sg:gen:n:perf:aff</ctag></lex> + <ann chan="NP">1</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + <ann chan="NP">0</ann> + </tok> + </sentence> + <sentence> + <tok> + <orth>Nowy</orth> + <lex><base>nowy</base><ctag>adj:sg:nom:m1:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:nom:m2:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:nom:m3:pos</ctag></lex> + <lex><base>nowy</base><ctag>adj:sg:acc:m3:pos</ctag></lex> + <ann chan="NP">0</ann> + </tok> + <tok> + <orth>poczÄ…tek</orth> + <lex><base>poczÄ…tek</base><ctag>subst:sg:nom:m3</ctag></lex> + <lex><base>poczÄ…tek</base><ctag>subst:sg:acc:m3</ctag></lex> + <ann chan="NP">0</ann> + </tok> + <tok> + <orth>starego</orth> + <lex><base>stary</base><ctag>adj:sg:gen:m1:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:acc:m1:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:gen:m2:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:acc:m2:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:gen:m3:pos</ctag></lex> + <lex><base>stary</base><ctag>adj:sg:gen:n:pos</ctag></lex> + <ann chan="NP" head="1">1</ann> + </tok> + <tok> + <orth>zdania</orth> + <lex><base>zdanie</base><ctag>subst:sg:gen:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:nom:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:acc:n</ctag></lex> + <lex><base>zdanie</base><ctag>subst:pl:voc:n</ctag></lex> + <lex><base>zdać</base><ctag>ger:sg:gen:n:perf:aff</ctag></lex> + <ann chan="NP">1</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex><base>.</base><ctag>interp</ctag></lex> + <ann chan="NP">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> +</cesAna> diff --git a/tests/rules-data/match/unmark/cclmatch-exc.ccl 
b/tests/rules-data/match/unmark/cclmatch2.ccl similarity index 87% rename from tests/rules-data/match/unmark/cclmatch-exc.ccl rename to tests/rules-data/match/unmark/cclmatch2.ccl index c67e4332986284395d3f19903474a6395ccd9855..c81f1a1fca1ab514da44ed87160cf66f1bb6d5c2 100644 --- a/tests/rules-data/match/unmark/cclmatch-exc.ccl +++ b/tests/rules-data/match/unmark/cclmatch2.ccl @@ -3,7 +3,6 @@ apply( optional(equal(class[0], other)), repeat(inter(class[0], adj)) ), - cond(debug(M)), actions( unmark(M, "B") ) diff --git a/tests/rules-data/match/unmark/cclmatch2.out.xml b/tests/rules-data/match/unmark/cclmatch2.out.xml new file mode 100644 index 0000000000000000000000000000000000000000..19cd8679df51efc9c63358f7fc3fa6d3ff172c3a --- /dev/null +++ b/tests/rules-data/match/unmark/cclmatch2.out.xml @@ -0,0 +1,70 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd"> +<cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb"> +<chunkList> + <chunk> + <sentence> + <tok> + <orth>Dwa</orth> + <lex><base>dwa</base><ctag>other</ctag></lex> + <ann chan="A">1</ann> + <ann chan="B">0</ann> + </tok> + <tok> + <orth>wielkie</orth> + <lex><base>wielki</base><ctag>adj</ctag></lex> + <ann chan="A">1</ann> + <ann chan="B" head="1">2</ann> + </tok> + <tok> + <orth>wÅ‚ochate</orth> + <lex><base>wÅ‚ochaty</base><ctag>adj</ctag></lex> + <ann chan="A">1</ann> + <ann chan="B">2</ann> + </tok> + <tok> + <orth>zapchlone</orth> + <lex><base>zapchlić</base><ctag>adj</ctag></lex> + <ann chan="A">1</ann> + <ann chan="B">2</ann> + </tok> + <tok> + <orth>koty</orth> + <lex><base>kot</base><ctag>noun</ctag></lex> + <ann chan="A" head="1">1</ann> + <ann chan="B" head="1">3</ann> + </tok> + <tok> + <orth>zjadÅ‚y</orth> + <lex><base>zjeść</base><ctag>verb</ctag></lex> + <ann chan="A">0</ann> + <ann chan="B">0</ann> + </tok> + <tok> + <orth>pięć</orth> + <lex><base>pięć</base><ctag>other</ctag></lex> + <ann chan="A" head="1">2</ann> + <ann 
chan="B">0</ann> + </tok> + <tok> + <orth>tÅ‚ustych</orth> + <lex><base>tÅ‚usty</base><ctag>adj</ctag></lex> + <ann chan="A">2</ann> + <ann chan="B">0</ann> + </tok> + <tok> + <orth>soczystych</orth> + <lex><base>soczysty</base><ctag>adj</ctag></lex> + <ann chan="A">2</ann> + <ann chan="B">0</ann> + </tok> + <tok> + <orth>much</orth> + <lex><base>mucha</base><ctag>noun</ctag></lex> + <ann chan="A">2</ann> + <ann chan="B">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> +</cesAna> diff --git a/wccl-apps/CMakeLists.txt b/wccl-apps/CMakeLists.txt index e07a49df24685820ba8044b67b203b6f8e4e8325..9122f0f0891a14c8f8694780b978b5ba83db3363 100644 --- a/wccl-apps/CMakeLists.txt +++ b/wccl-apps/CMakeLists.txt @@ -27,11 +27,9 @@ add_executable(wccl-rules wccl-rules.cpp) target_link_libraries (wccl-rules wccl ${Boost_LIBRARIES} antlr ${LIBS}) add_executable(wccl-parser wccl-parser.cpp) target_link_libraries (wccl-parser wccl ${Boost_LIBRARIES} antlr ${LIBS}) -add_executable(wccl-match wccl-match.cpp) -target_link_libraries (wccl-match wccl ${Boost_LIBRARIES} antlr ${LIBS}) if(UNIX) - install(TARGETS wccl-features wccl-run wccl-rules wccl-parser wccl-match + install(TARGETS wccl-features wccl-run wccl-rules wccl-parser RUNTIME DESTINATION bin ) endif(UNIX) diff --git a/wccl-apps/wccl-match.cpp b/wccl-apps/wccl-match.cpp deleted file mode 100644 index 68f7efbbaeba720600c41477b49f4ca4635bfad9..0000000000000000000000000000000000000000 --- a/wccl-apps/wccl-match.cpp +++ /dev/null @@ -1,243 +0,0 @@ -#include <cstdlib> -#include <fstream> -#include <iomanip> - -#include <libwccl/values/strset.h> -#include <libwccl/parser/Parser.h> -#include <libcorpus2/tagsetmanager.h> -#include <libcorpus2/util/tokentimer.h> - -#include <boost/bind.hpp> -#include <boost/algorithm/string.hpp> -#include <boost/make_shared.hpp> -#include <boost/filesystem.hpp> -#include <boost/program_options.hpp> -#include <libcorpus2/io/reader.h> -#include <libcorpus2/io/writer.h> - -namespace { - bool quiet = 
false; - - struct options { - bool first; - bool until_done; - int until_done_iterations; - }; -} - -class MatchRunner -{ -public: - MatchRunner(const Corpus2::Tagset& tagset) - : tagset_(tagset), parser_(tagset_), progress_(false) - { - } - - void use_progress(bool use) { - progress_ = use; - if (use) { - Corpus2::TokenTimer& timer = Corpus2::global_timer(); - timer.register_signal_handler(); - } - } - - bool load_more_rules(const std::string &filename); - - bool load_operator_string(const std::string &op_string); - - void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, - boost::shared_ptr<Corpus2::TokenWriter> writer); - - bool empty() { - return rules_.empty(); - } - -private: - const Corpus2::Tagset& tagset_; - Wccl::Parser parser_; - std::vector<std::string> rule_names_; - std::vector<boost::shared_ptr<Wccl::MatchRule> > rules_; - bool progress_; -}; - -bool MatchRunner::load_more_rules(const std::string& filename) -{ - boost::shared_ptr<Wccl::MatchRule> retOp; - try { - std::ifstream is(filename.c_str()); - if (!is.good()) { - throw Wccl::FileNotFound(filename, "", __FUNCTION__); - } - retOp = parser_.parseMatchRule(is); - if (retOp) { - boost::filesystem::path p(filename); - rule_names_.push_back(p.stem()); - rules_.push_back(retOp); - return true; - } else { - std::cerr << "Problem while parsing -- " - << "parser returned NULL!" 
<< std::endl; - } - } catch (PwrNlp::PwrNlpError& e) { - std::cerr << e.scope() << " Error: " << e.info() << std::endl; - } - return false; -} - -void MatchRunner::apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, - boost::shared_ptr<Corpus2::TokenWriter> writer) -{ - Corpus2::TokenTimer& timer = Corpus2::global_timer(); - while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) { - foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) { - boost::shared_ptr<Corpus2::AnnotatedSentence> as; - as = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(s); - if (!as) { - std::cerr << "Did not get an AnnotatedSentence from reader," - "'ann'' option broken?\n"; - return; - } - - foreach (const boost::shared_ptr<Wccl::MatchRule>& r, rules_) { - r->apply(as); - } - - timer.count_sentence(*as); - if (progress_) { - timer.check_slice(); - } - //writer->write_sentence(*as); - } - writer->write_chunk(*c); - } -} - -void usage(char* name) -{ - std::cerr << "This program runs WCCL match rules.\n"; - std::cerr << "Usage " << name << " [OPTIONS] FILES\n" - << "Files ending with .xml are treated as corpora, otherwise \n" - << "as CCL files. 
Use - to read corpus from stdin (as with -I)\n" - << "Note: the ann option is implied on all input formats\n"; -} - -int main(int argc, char** argv) -{ - std::string tagset_load = "kipi"; - std::string input_format; - std::string output_format; - bool progress = false; - options opts; - opts.first = false; - opts.until_done = false; - opts.until_done_iterations = 1000; - std::vector<std::string> corpora_files, ccl_files, files; - bool corpus_stdin = true; - using boost::program_options::value; - - std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " "); - std::string readers_help = "Input format, any of: " + readers + "\n"; - std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " "); - std::string writers_help = "Output format, any of: " + writers + "\n";; - - boost::program_options::options_description desc("Allowed options"); - desc.add_options() - ("tagset,t", value(&tagset_load), - "Tagset to use\n") - ("corpus,c", value(&corpora_files), - "Corpus file to load (XCES), do not load from stdin\n") - ("ccl-file,C", value(&ccl_files), - "CCL rule files\n") - ("files,f", value(&files), - "Files to load, looking at the extension to determine type\n") - ("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(), - "Read corpus from stdin") - ("input-format,i", value(&input_format)->default_value("xces"), - readers_help.c_str()) - ("output-format,o", value(&output_format)->default_value("ccl"), - writers_help.c_str()) - ("progress,p", value(&progress)->zero_tokens(), - "Show progress info") - ("quiet,q", value(&quiet)->zero_tokens(), - "Suppress messages\n") - ("until-done,u", value(&opts.until_done)->zero_tokens(), - "Until-done mode\n") - ("until-done-iterations", value(&opts.until_done_iterations), - "Until-done iteration limit\n") - ("first-sentence-only,1", value(&opts.first)->zero_tokens(), - "Only process first sentence\n") - ("help,h", "Show help") - ; - 
boost::program_options::variables_map vm; - boost::program_options::positional_options_description p; - p.add("files", -1); - - try { - boost::program_options::store( - boost::program_options::command_line_parser(argc, argv) - .options(desc).positional(p).run(), vm); - } catch (boost::program_options::error& e) { - std::cerr << e.what() << std::endl; - return 2; - } - boost::program_options::notify(vm); - - if (vm.count("help")) { - usage(argv[0]); - std::cout << desc << "\n"; - return 1; - } - - foreach (const std::string& f, files) { - if (f == "-") { - corpus_stdin = true; - } else if (boost::algorithm::ends_with(f, ".xml")) { - corpora_files.push_back(f); - } else { - ccl_files.push_back(f); - } - } - - // consider stdin only when no corpus files given - corpus_stdin = corpus_stdin && corpora_files.empty(); - - if (ccl_files.empty() || (corpora_files.empty() && !corpus_stdin)) { - usage(argv[0]); - return 2; - } - - try { - const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); - MatchRunner runner(tagset); - runner.use_progress(progress); - foreach (const std::string& file, ccl_files) { - runner.load_more_rules(file); - } - if (!runner.empty()) { - Corpus2::TokenTimer& timer = Corpus2::global_timer(); - timer.register_signal_handler(); - boost::shared_ptr<Corpus2::TokenWriter> writer; - writer = Corpus2::TokenWriter::create_stream_writer(output_format, std::cout, tagset); - boost::shared_ptr<Corpus2::TokenReader> reader; - foreach (std::string cf, corpora_files) { - reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, cf); - reader->set_option("ann"); - runner.apply_rules(reader, writer); - } - if (corpus_stdin) { - reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin); - reader->set_option("ann"); - runner.apply_rules(reader, writer); - } - if (progress) { - Corpus2::TokenTimer& timer = Corpus2::global_timer(); - timer.stats(); - } - } - } catch (PwrNlp::PwrNlpError& e) { - std::cerr << 
e.info() << std::endl; - return 2; - } - - return 0; -} diff --git a/wccl-apps/wccl-rules.cpp b/wccl-apps/wccl-rules.cpp index e357f72415c650a853368d972ea7dd04abcaf6be..63d33c48f74fd11ef4c7782da3c4177acd16d4cd 100644 --- a/wccl-apps/wccl-rules.cpp +++ b/wccl-apps/wccl-rules.cpp @@ -2,79 +2,119 @@ #include <fstream> #include <iomanip> - #include <libwccl/values/strset.h> #include <libwccl/parser/Parser.h> -#include <libwccl/ops/tagrulesequence.h> #include <libcorpus2/tagsetmanager.h> #include <libcorpus2/util/tokentimer.h> - #include <boost/bind.hpp> #include <boost/algorithm/string.hpp> #include <boost/make_shared.hpp> +#include <boost/filesystem.hpp> #include <boost/program_options.hpp> -#include <libcorpus2/io/xcesreader.h> -#include <libcorpus2/io/xceswriter.h> - -#include <antlr/NoViableAltException.hpp> -#include <antlr/MismatchedTokenException.hpp> +#include <libcorpus2/io/reader.h> +#include <libcorpus2/io/writer.h> namespace { bool quiet = false; - bool progress = false; struct options { bool first; - bool until_done; - int until_done_iterations; }; } -bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::TagRuleSequence& rules) +class RuleRunner +{ +public: + RuleRunner(const Corpus2::Tagset& tagset) + : tagset_(tagset), parser_(tagset_), progress_(false), search_path_(".") + , tag_rule_iterations_(0), total_match_rules_(0), total_tag_rules_(0) + { + } + + void use_progress(bool use) { + progress_ = use; + if (use) { + Corpus2::TokenTimer& timer = Corpus2::global_timer(); + timer.register_signal_handler(); + } + } + + void set_tag_rule_iterations(int i) { + tag_rule_iterations_ = i; + } + + std::pair<int,int> load_more_rules(const std::string &filename); + + void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, + boost::shared_ptr<Corpus2::TokenWriter> writer); + + bool empty() const { + return size() == 0; + } + + size_t size() const { + return total_match_rules_ + total_tag_rules_; + } + + size_t total_match_rules() 
const { + return total_match_rules_; + } + + size_t total_tag_rules() const { + return total_tag_rules_; + } + + void set_search_path(const std::string& path) { + search_path_ = path; + } + +private: + const Corpus2::Tagset& tagset_; + Wccl::Parser parser_; + std::vector<std::string> file_names_; + std::vector<boost::shared_ptr<Wccl::WcclFile> > parsed_files_; + bool progress_; + std::string search_path_; + int tag_rule_iterations_; + size_t total_match_rules_, total_tag_rules_; +}; + +std::pair<int,int> RuleRunner::load_more_rules(const std::string& filename) { - boost::shared_ptr<Wccl::TagRuleSequence> ret; + boost::shared_ptr<Wccl::WcclFile> parsed_file; try { std::ifstream is(filename.c_str()); if (!is.good()) { throw Wccl::FileNotFound(filename, "", __FUNCTION__); } - - ret = parser.parseTagRuleSequence(is); - if (ret) { - if (!quiet) { - std::cerr << "Loaded " << ret->size() << " rule(s) from " - << filename << "\n"; + parsed_file = parser_.parseWcclFile(is, search_path_); + if (parsed_file) { + boost::filesystem::path p(filename); + file_names_.push_back(p.stem()); + size_t match_rules = 0, tag_rules = 0; + if (parsed_file->has_tag_rules()) { + tag_rules = parsed_file->get_tag_rules().size(); } - std::copy(ret->begin(), ret->end(), std::back_inserter(rules)); - return true; + if (parsed_file->has_match_rules()) { + match_rules = parsed_file->get_match_rules().size(); + } + total_tag_rules_ += tag_rules; + total_match_rules_ += match_rules; + parsed_files_.push_back(parsed_file); + return std::make_pair(tag_rules, match_rules); } else { std::cerr << "Problem while parsing -- " << "parser returned NULL!" 
<< std::endl; } - } catch (antlr::MismatchedTokenException &e) { - std::cerr << e.getFileLineColumnString() - << " " << e.getMessage() << std::endl; - } catch(antlr::NoViableAltException &e) { - std::cerr << e.getFileLineColumnString() - << " " << e.getMessage() << std::endl; - } catch (Wccl::InvalidVariableName &e) { - std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl; - } catch (Wccl::VariableTypeMismatch &e) { - std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl; - } catch (Wccl::WcclError& e) { - std::cerr << "Wccl::WcclError:" << e.info() << std::endl; } catch (PwrNlp::PwrNlpError& e) { - std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl; - } catch (antlr::ANTLRException& e) { - std::cerr << "Antlr error " << e.getMessage() << std::endl; + std::cerr << e.scope() << " Error: " << e.info() << std::endl; } - return false; + return std::make_pair(0,0); } -void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, - boost::shared_ptr<Corpus2::TokenWriter> writer, Wccl::TagRuleSequence& rules, - const options& opts) +void RuleRunner::apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, + boost::shared_ptr<Corpus2::TokenWriter> writer) { Corpus2::TokenTimer& timer = Corpus2::global_timer(); while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) { @@ -86,45 +126,50 @@ void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader, "'ann'' option broken?\n"; return; } - if (opts.until_done) { - rules.execute_until_done(as, opts.until_done_iterations); - } else { - rules.execute_once(as); + + foreach (boost::shared_ptr<Wccl::WcclFile>& f, parsed_files_) { + if (f->has_tag_rules()) { + if (tag_rule_iterations_ == 0) { + f->get_tag_rules_ptr()->execute_once(as); + } else if (tag_rule_iterations_ < 0) { + f->get_tag_rules_ptr()->execute_until_done(as); + } else { + f->get_tag_rules_ptr()->execute_until_done(as, tag_rule_iterations_); + } + } + if (f->has_match_rules()) { + 
f->get_match_rules_ptr()->apply_all(as); + } } + timer.count_sentence(*as); - if (progress) { + if (progress_) { timer.check_slice(); } - if (opts.first) break; //writer->write_sentence(*as); } writer->write_chunk(*c); - if (opts.first) break; - } - if (progress) { - timer.stats(); } } void usage(char* name) { - std::cerr << "This program runs WCCL disambiguation rules.\n"; + std::cerr << "This program runs WCCL match and/or tag rules. Tag rules are applied first.\n"; std::cerr << "Usage " << name << " [OPTIONS] FILES\n" - << "Files ending with .xml are treated as corpora, otherwise \n" - << "as CCL files. Use - to read corpus from stdin (as with -I)\n" - << "Note: the ann option is implied on all input formats\n"; + << "Files ending with .xml are treated as corpora, otherwise " + << "as WCCL files. Use - to read corpus from stdin (as with -I)\n" + << "Note: the ,ann option is implied on all input formats\n"; } - int main(int argc, char** argv) { std::string tagset_load = "kipi"; std::string input_format; std::string output_format; + std::string search_path; + bool progress = false; options opts; opts.first = false; - opts.until_done = false; - opts.until_done_iterations = 1000; std::vector<std::string> corpora_files, ccl_files, files; bool corpus_stdin = true; using boost::program_options::value; @@ -144,20 +189,20 @@ int main(int argc, char** argv) "CCL rule files\n") ("files,f", value(&files), "Files to load, looking at the extension to determine type\n") + ("search-path,P", value(&search_path), + "WCCL resources (lexicons) search path") ("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(), - "Read corpus from stdin") + "Read corpus from stdin (requires that no corpora filenames are passed)") ("input-format,i", value(&input_format)->default_value("xces"), readers_help.c_str()) - ("output-format,o", value(&output_format)->default_value("xces"), + ("output-format,o", value(&output_format)->default_value("ccl"), writers_help.c_str()) ("progress,p", 
value(&progress)->zero_tokens(), "Show progress info") ("quiet,q", value(&quiet)->zero_tokens(), "Suppress messages\n") - ("until-done,u", value(&opts.until_done)->zero_tokens(), - "Until-done mode\n") - ("until-done-iterations", value(&opts.until_done_iterations), - "Until-done iteration limit\n") + ("until-done-iterations,u", value<int>()->implicit_value(1000), + "Until-done iteration limit, no arg for default limit(1000)\n") ("first-sentence-only,1", value(&opts.first)->zero_tokens(), "Only process first sentence\n") ("help,h", "Show help") @@ -202,32 +247,36 @@ int main(int argc, char** argv) try { const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); - Wccl::Parser parser(tagset); - Wccl::TagRuleSequence rules; - foreach (const std::string& f, ccl_files) { - size_t sz = rules.size(); - if (!load_more_rules(parser, f, rules)) { - std::cerr << "Warning: error while parsing " << f << "\n"; - } - if (rules.size() == sz) { - std::cerr << "Warning: no rules loaded from " << f << "\n"; + RuleRunner runner(tagset); + if (vm.count("until-done-iterations")) { + runner.set_tag_rule_iterations(vm["until-done-iterations"].as<int>()); + } + runner.use_progress(progress); + if (!search_path.empty()) { + runner.set_search_path(search_path); + } + foreach (const std::string& file, ccl_files) { + std::pair<int,int> res = runner.load_more_rules(file); + if (res.first == 0 && res.second == 0) { + std::cerr << "Warning: no rules loaded from " << file << "\n"; + } else if (!quiet) { + std::cerr << "Loaded " << res.first << " tag rule(s) and " + << res.second << " match rule(s) from " << file << "\n"; } } - if (!rules.empty()) { - Corpus2::TokenTimer& timer = Corpus2::global_timer(); - timer.register_signal_handler(); + if (!runner.empty()) { boost::shared_ptr<Corpus2::TokenWriter> writer; writer = Corpus2::TokenWriter::create_stream_writer(output_format, std::cout, tagset); boost::shared_ptr<Corpus2::TokenReader> reader; - foreach (const std::string& f, 
corpora_files) { - reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, f); + foreach (std::string cf, corpora_files) { + reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, cf); reader->set_option("ann"); - apply_rules(reader, writer, rules, opts); + runner.apply_rules(reader, writer); } if (corpus_stdin) { reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin); reader->set_option("ann"); - apply_rules(reader, writer, rules, opts); + runner.apply_rules(reader, writer); } if (progress) { Corpus2::TokenTimer& timer = Corpus2::global_timer(); @@ -235,7 +284,7 @@ int main(int argc, char** argv) } } } catch (PwrNlp::PwrNlpError& e) { - std::cerr << e.info() << std::endl; + std::cerr << e.scope() << "error: " << e.info() << std::endl; return 2; }