diff --git a/libmwereader/mwe.cpp b/libmwereader/mwe.cpp index ec4bbe07916d11947dca3a2995af948c62efd229..4fcc72bfcf8aa55d022972ff0a693b1ced5e98eb 100644 --- a/libmwereader/mwe.cpp +++ b/libmwereader/mwe.cpp @@ -13,6 +13,7 @@ LexicalUnit::LexicalUnit(const std::string &base, base_(base), nowhere_(Wccl::Position()) { + for(strmap::iterator iter = variables.begin(); iter != variables.end(); ++iter){ potential_bases_.insert(iter->second); diff --git a/libmwereader/mweparser.cpp b/libmwereader/mweparser.cpp index 54e8b2752ffd9ad6830c5a0f9a4fe93e3d6454a9..1edb9c684334180e359ad987658a2ece34337670 100644 --- a/libmwereader/mweparser.cpp +++ b/libmwereader/mweparser.cpp @@ -63,6 +63,7 @@ namespace Corpus2 { head_cond_); if(group_type_ == "fix"){ // group_name_ -> lower case + mwe_index_.add_lexicalunit( LexicalUnit::Ptr(new FixedLU(mwe_base_, main, head, variables_))); } else if(group_type_ == "flex"){ @@ -85,6 +86,8 @@ namespace Corpus2 { value = a.value; } } + if (value == "") + throw Wccl::WcclError("Attribute: "+name+" not found"); return value; } diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp index 6b0c86107e58453bf1b93f582a1ad6ad20425df2..670d13913dd9d9e8153122371f6df0166830eb86 100644 --- a/libmwereader/mwereader.cpp +++ b/libmwereader/mwereader.cpp @@ -66,12 +66,17 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( i--; } } - + new_orth_utf8.erase(new_orth_utf8.size()-1, 1); Corpus2::Token *tok = (*sent)[head]; tok->set_orth_utf8(new_orth_utf8); foreach(Lexeme& lex, tok->lexemes()) - if(lex.is_disamb()) + { + + if(lex.is_disamb()){ + lex.set_lemma_utf8(pLU->get_base()); + } + } std::vector <Token*>::iterator del_iter = tokens.begin(); while (del_iter != tokens.end()) { diff --git a/libmwereader/tests/data/fix_gap_mwe.xml b/libmwereader/tests/data/fix_gap_mwe.xml index 5c4e8688c06a9001c7ab337b029def1cd6092c9c..fe63652e6a87fb0bc7a885a4e3de36007ab3af2e 100644 --- a/libmwereader/tests/data/fix_gap_mwe.xml +++ b/libmwereader/tests/data/fix_gap_mwe.xml @@ -1,21 +1,21 @@ <?xml version='1.0' encoding='utf-8'?> <units_description tagset='kipi'> - <mwegroup name="AdjSubstFix" type="fix" class="subst"> + <mwegroup name="SubstAdjFix" type="fix" class="subst"> <condition> and( - inter(base[0],$s:Adj), - inter(base[1],$s:Subst), + inter(base[0],$s:Subst), + rlook(1, 10, $Pos, inter(base[$Pos],$s:Adj)), setvar($Pos1, 0), - setvar($Pos2, 1), - inter(class[0],{adj}), - inter(class[1],{subst,ger,depr}), - agrpp(0,1,{nmb,gnd,cas}) + setvar($Pos2, $Pos), + inter(class[0],{subst,ger,depr}), + inter(class[$Pos],{adj}), + agrpp(0,$Pos,{nmb,gnd,cas}) ) </condition> <instances> - <MWE name="dobra wola"> - <var name="Adj">dobry</var> - <var name="Subst">wola</var> + <MWE base="instrument muzyczny"> + <var name="Subst">instrument</var> + <var name="Adj">muzyczny</var> <head>inter(class[0],{subst,ger,depr})</head> </MWE> </instances> diff --git a/libmwereader/tests/data/fix_mwe.xml b/libmwereader/tests/data/fix_mwe.xml index 5c4e8688c06a9001c7ab337b029def1cd6092c9c..2ffbf3dadcc37affbd4dde4eed709c503ce196ae 100644 --- a/libmwereader/tests/data/fix_mwe.xml +++ b/libmwereader/tests/data/fix_mwe.xml @@ -13,7 +13,7 @@ ) </condition> <instances> - <MWE name="dobra wola"> + <MWE base="dobra wola"> <var name="Adj">dobry</var> <var name="Subst">wola</var> <head>inter(class[0],{subst,ger,depr})</head> diff --git a/libmwereader/tests/data/flex_gap_mwe.xml b/libmwereader/tests/data/flex_gap_mwe.xml index 107c15e3b12e08d924abc4bbeedde6d8299b79d5..ead2807b4c0951838bef045d851d7de36ed99aed 100644 --- a/libmwereader/tests/data/flex_gap_mwe.xml +++ b/libmwereader/tests/data/flex_gap_mwe.xml @@ -3,32 +3,31 @@ <mwegroup name="SubstAdjSgFlex" type="flex" class="subst"> <condition> or( + and( + inter(base[0],$s:Subst), + rlook(1, 10, $Pos, inter(base[$Pos],$s:Adj)), + setvar($Pos1, 0), + setvar($Pos2, $Pos), + inter(class[0],{subst,ger,depr}), + inter(class[$Pos],{adj}), + agrpp(0,$Pos,{nmb,gnd,cas}) + ),//and and( - inter(base[1],$s:Adj), - inter(base[0],$s:Subst), - setvar($Pos1, 1), - setvar($Pos2, 0), - inter(class[1],{adj}), - equal(nmb[0], {sg}), - in(class[0],{subst,ger,depr}), - agrpp(0,1,{nmb,gnd,cas}) - ),//and - and( - inter(base[0],$s:Adj), - inter(base[1],$s:Subst), - setvar($Pos1, 0), - setvar($Pos2, 1), - inter(class[0],{adj}), - equal(nmb[1],{sg}), - in(class[1],{subst,ger,depr}), - agrpp(0,1,{nmb,gnd,cas}) + inter(base[0],$s:Adj), + rlook(1, 10, $Pos, inter(base[$Pos],$s:Subst)), + setvar($Pos1, $Pos), + setvar($Pos2, 0), + inter(class[0],{adj}), + equal(nmb[$Pos],{sg}), + in(class[$Pos],{subst,ger,depr}), + agrpp(0,$Pos,{nmb,gnd,cas}) ) )//or </condition> <instances> - <MWE base="dzień dobry"> - <var name="Subst">dzień</var> - <var name="Adj">dobry</var> + <MWE base="praca naukowa"> + <var name="Subst">praca</var> + <var name="Adj">naukowy</var> <head>in(class[0],{subst,ger,depr})</head> </MWE> </instances> diff --git a/libmwereader/tests/data/test1.kipi.xml b/libmwereader/tests/data/test1.kipi.xml index ea76d4fbb582f82b1076cb05fe3ce2e879ea46b7..5b20000b63d881ada1d555ae861d5daa74c590b0 100644 --- a/libmwereader/tests/data/test1.kipi.xml +++ b/libmwereader/tests/data/test1.kipi.xml @@ -1587,5 +1587,475 @@ </tok> </chunk> </chunk> +<chunk type="p" xlink:href="#dv1p1"> +<chunk type="s"> +<tok> +<orth>Pozycja</orth> +<lex disamb="1"><base>pozycja</base><ctag>subst:sg:nom:f</ctag></lex> +</tok> +<ns/> +<tok> +<orth>,</orth> +<lex disamb="1"><base>,</base><ctag>interp</ctag></lex> +</tok> +<tok> +<orth>mimo</orth> +<lex><base>mima</base><ctag>subst:sg:voc:m1</ctag></lex> +<lex disamb="1"><base>mimo</base><ctag>prep:gen</ctag></lex> +</tok> +<tok> +<orth>iż</orth> +<lex disamb="1"><base>iż</base><ctag>conj</ctag></lex> +</tok> +<tok> +<orth>dotyczy</orth> +<lex disamb="1"><base>dotyczyć</base><ctag>fin:sg:ter:imperf</ctag></lex> +</tok> +<tok> +<orth>trudnych</orth> +<lex><base>trudny</base><ctag>adj:pl:gen:m1:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:gen:m2:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:gen:m3:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:gen:f:pos</ctag></lex> +<lex disamb="1"><base>trudny</base><ctag>adj:pl:gen:n:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:loc:m1:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:loc:m2:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:loc:m3:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:loc:f:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:loc:n:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:acc:m1:pos</ctag></lex> +</tok> +<tok> +<orth>zagadnień</orth> +<lex disamb="1"><base>zagadnienie</base><ctag>subst:pl:gen:n</ctag></lex> +</tok> +<ns/> +<tok> +<orth>,</orth> +<lex disamb="1"><base>,</base><ctag>interp</ctag></lex> +</tok> +<tok> +<orth>zawiera</orth> +<lex disamb="1"><base>zawierać</base><ctag>fin:sg:ter:imperf</ctag></lex> +</tok> +<tok> +<orth>jasne</orth> +<lex disamb="1"><base>jasny</base><ctag>adj:sg:nom:n:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:sg:acc:n:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:nom:m2:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:nom:m3:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:nom:f:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:nom:n:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:acc:m2:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:acc:m3:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:acc:f:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:acc:n:pos</ctag></lex> +</tok> +<tok> +<orth>i</orth> +<lex disamb="1"><base>i</base><ctag>conj</ctag></lex> +</tok> +<tok> +<orth>klarowne</orth> +<lex><base>klarowny</base><ctag>adj:sg:nom:n:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:sg:acc:n:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:nom:m2:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:nom:m3:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:nom:f:pos</ctag></lex> +<lex disamb="1"><base>klarowny</base><ctag>adj:pl:nom:n:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:acc:m2:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:acc:m3:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:acc:f:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:acc:n:pos</ctag></lex> +</tok> +<tok> +<orth>pytania</orth> +<lex><base>pytać</base><ctag>ger:sg:gen:n:imperf:aff</ctag></lex> +<lex><base>pytanie</base><ctag>subst:sg:gen:n</ctag></lex> +<lex disamb="1"><base>pytanie</base><ctag>subst:pl:nom:n</ctag></lex> +<lex><base>pytanie</base><ctag>subst:pl:acc:n</ctag></lex> +<lex><base>pytanie</base><ctag>subst:pl:voc:n</ctag></lex> +</tok> +<ns/> +<tok> +<orth>,</orth> +<lex disamb="1"><base>,</base><ctag>interp</ctag></lex> +</tok> +<tok> +<orth>zrozumiałe</orth> +<lex><base>zrozumiały</base><ctag>adj:sg:nom:n:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:sg:acc:n:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:nom:m2:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:nom:m3:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:nom:f:pos</ctag></lex> +<lex disamb="1"><base>zrozumiały</base><ctag>adj:pl:nom:n:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:acc:m2:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:acc:m3:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:acc:f:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:acc:n:pos</ctag></lex> +</tok> +<tok> +<orth>dla</orth> +<lex disamb="1"><base>dla</base><ctag>prep:gen</ctag></lex> +</tok> +<tok> +<orth>wszystkich</orth> +<lex><base>wszystek</base><ctag>adj:pl:gen:m1:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:gen:m2:pos</ctag></lex> +<lex disamb="1"><base>wszystek</base><ctag>adj:pl:gen:m3:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:gen:f:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:gen:n:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:loc:m1:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:loc:m2:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:loc:m3:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:loc:f:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:loc:n:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:acc:m1:pos</ctag></lex> +</tok> +<tok> +<orth>tych</orth> +<lex disamb="1"><base>ten</base><ctag>adj:pl:gen:m1:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:gen:m2:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:gen:m3:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:gen:f:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:gen:n:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:loc:m1:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:loc:m2:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:loc:m3:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:loc:f:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:loc:n:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:acc:m1:pos</ctag></lex> +</tok> +<ns/> +<tok> +<orth>,</orth> +<lex disamb="1"><base>,</base><ctag>interp</ctag></lex> +</tok> +<tok> +<orth>którzy</orth> +<lex disamb="1"><base>który</base><ctag>adj:pl:nom:m1:pos</ctag></lex> +</tok> +<tok> +<orth>zajmują</orth> +<lex disamb="1"><base>zajmować</base><ctag>fin:pl:ter:imperf</ctag></lex> +</tok> +<tok> +<orth>się</orth> +<lex disamb="1"><base>się</base><ctag>qub</ctag></lex> +</tok> +<tok> +<orth>immunologią</orth> +<lex disamb="1"><base>immunologia</base><ctag>subst:sg:inst:f</ctag></lex> +</tok> +<tok> +<orth>z</orth> +<lex disamb="1"><base>z</base><ctag>prep:gen:nwok</ctag></lex> +<lex><base>z</base><ctag>prep:inst:nwok</ctag></lex> +<lex><base>z</base><ctag>qub</ctag></lex> +</tok> +<tok> +<orth>racji</orth> +<lex disamb="1"><base>racja</base><ctag>subst:sg:gen:f</ctag></lex> +<lex><base>racja</base><ctag>subst:sg:dat:f</ctag></lex> +<lex><base>racja</base><ctag>subst:sg:loc:f</ctag></lex> +<lex><base>racja</base><ctag>subst:pl:gen:f</ctag></lex> +</tok> +<tok> +<orth>studiów</orth> +<lex disamb="1"><base>studium</base><ctag>subst:pl:gen:n</ctag></lex> +<lex disamb="1"><base>studio</base><ctag>subst:pl:gen:n</ctag></lex> +</tok> +<ns/> +<tok> +<orth>,</orth> +<lex disamb="1"><base>,</base><ctag>interp</ctag></lex> +</tok> +<tok> +<orth>pracy</orth> +<lex disamb="1"><base>praca</base><ctag>subst:sg:gen:f</ctag></lex> +<lex><base>praca</base><ctag>subst:sg:dat:f</ctag></lex> +<lex><base>praca</base><ctag>subst:sg:loc:f</ctag></lex> +</tok> +<tok> +<orth>z</orth> +<lex disamb="1"><base>z</base><ctag>prep:gen:nwok</ctag></lex> +<lex><base>z</base><ctag>prep:inst:nwok</ctag></lex> +<lex><base>z</base><ctag>qub</ctag></lex> +</tok> +<tok> +<orth>natury</orth> +<lex disamb="1"><base>natura</base><ctag>subst:sg:gen:f</ctag></lex> +<lex><base>natura</base><ctag>subst:pl:nom:f</ctag></lex> +<lex><base>natura</base><ctag>subst:pl:acc:f</ctag></lex> +<lex><base>natura</base><ctag>subst:pl:voc:f</ctag></lex> +</tok> +<tok> +<orth>naukowej</orth> +<lex disamb="1"><base>naukowy</base><ctag>adj:sg:gen:f:pos</ctag></lex> +<lex><base>naukowy</base><ctag>adj:sg:dat:f:pos</ctag></lex> +<lex><base>naukowy</base><ctag>adj:sg:loc:f:pos</ctag></lex> +</tok> +<tok> +<orth>czy</orth> +<lex disamb="1"><base>czy</base><ctag>conj</ctag></lex> +<lex><base>czy</base><ctag>qub</ctag></lex> +</tok> +<tok> +<orth>zawodowej</orth> +<lex disamb="1"><base>zawodowy</base><ctag>adj:sg:gen:f:pos</ctag></lex> +<lex><base>zawodowy</base><ctag>adj:sg:dat:f:pos</ctag></lex> +<lex><base>zawodowy</base><ctag>adj:sg:loc:f:pos</ctag></lex> +</tok> +<ns/> +<tok> +<orth>.</orth> +<lex disamb="1"><base>.</base><ctag>interp</ctag></lex> +</tok> +</chunk> +</chunk> +<chunk type="p" xlink:href="#dv1p1"> +<chunk type="s"> +<tok> +<orth>Pozycja</orth> +<lex disamb="1"><base>pozycja</base><ctag>subst:sg:nom:f</ctag></lex> +</tok> +<ns/> +<tok> +<orth>,</orth> +<lex disamb="1"><base>,</base><ctag>interp</ctag></lex> +</tok> +<tok> +<orth>mimo</orth> +<lex><base>mima</base><ctag>subst:sg:voc:m1</ctag></lex> +<lex disamb="1"><base>mimo</base><ctag>prep:gen</ctag></lex> +</tok> +<tok> +<orth>iż</orth> +<lex disamb="1"><base>iż</base><ctag>conj</ctag></lex> +</tok> +<tok> +<orth>dotyczy</orth> +<lex disamb="1"><base>dotyczyć</base><ctag>fin:sg:ter:imperf</ctag></lex> +</tok> +<tok> +<orth>trudnych</orth> +<lex><base>trudny</base><ctag>adj:pl:gen:m1:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:gen:m2:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:gen:m3:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:gen:f:pos</ctag></lex> +<lex disamb="1"><base>trudny</base><ctag>adj:pl:gen:n:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:loc:m1:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:loc:m2:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:loc:m3:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:loc:f:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:loc:n:pos</ctag></lex> +<lex><base>trudny</base><ctag>adj:pl:acc:m1:pos</ctag></lex> +</tok> +<tok> +<orth>zagadnień</orth> +<lex disamb="1"><base>zagadnienie</base><ctag>subst:pl:gen:n</ctag></lex> +</tok> +<ns/> +<tok> +<orth>,</orth> +<lex disamb="1"><base>,</base><ctag>interp</ctag></lex> +</tok> +<tok> +<orth>zawiera</orth> +<lex disamb="1"><base>zawierać</base><ctag>fin:sg:ter:imperf</ctag></lex> +</tok> +<tok> +<orth>jasne</orth> +<lex disamb="1"><base>jasny</base><ctag>adj:sg:nom:n:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:sg:acc:n:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:nom:m2:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:nom:m3:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:nom:f:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:nom:n:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:acc:m2:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:acc:m3:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:acc:f:pos</ctag></lex> +<lex><base>jasny</base><ctag>adj:pl:acc:n:pos</ctag></lex> +</tok> +<tok> +<orth>i</orth> +<lex disamb="1"><base>i</base><ctag>conj</ctag></lex> +</tok> +<tok> +<orth>klarowne</orth> +<lex><base>klarowny</base><ctag>adj:sg:nom:n:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:sg:acc:n:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:nom:m2:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:nom:m3:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:nom:f:pos</ctag></lex> +<lex disamb="1"><base>klarowny</base><ctag>adj:pl:nom:n:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:acc:m2:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:acc:m3:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:acc:f:pos</ctag></lex> +<lex><base>klarowny</base><ctag>adj:pl:acc:n:pos</ctag></lex> +</tok> +<tok> +<orth>pytania</orth> +<lex><base>pytać</base><ctag>ger:sg:gen:n:imperf:aff</ctag></lex> +<lex><base>pytanie</base><ctag>subst:sg:gen:n</ctag></lex> +<lex disamb="1"><base>pytanie</base><ctag>subst:pl:nom:n</ctag></lex> +<lex><base>pytanie</base><ctag>subst:pl:acc:n</ctag></lex> +<lex><base>pytanie</base><ctag>subst:pl:voc:n</ctag></lex> +</tok> +<ns/> +<tok> +<orth>,</orth> +<lex disamb="1"><base>,</base><ctag>interp</ctag></lex> +</tok> +<tok> +<orth>zrozumiałe</orth> +<lex><base>zrozumiały</base><ctag>adj:sg:nom:n:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:sg:acc:n:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:nom:m2:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:nom:m3:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:nom:f:pos</ctag></lex> +<lex disamb="1"><base>zrozumiały</base><ctag>adj:pl:nom:n:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:acc:m2:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:acc:m3:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:acc:f:pos</ctag></lex> +<lex><base>zrozumiały</base><ctag>adj:pl:acc:n:pos</ctag></lex> +</tok> +<tok> +<orth>dla</orth> +<lex disamb="1"><base>dla</base><ctag>prep:gen</ctag></lex> +</tok> +<tok> +<orth>wszystkich</orth> +<lex><base>wszystek</base><ctag>adj:pl:gen:m1:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:gen:m2:pos</ctag></lex> +<lex disamb="1"><base>wszystek</base><ctag>adj:pl:gen:m3:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:gen:f:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:gen:n:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:loc:m1:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:loc:m2:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:loc:m3:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:loc:f:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:loc:n:pos</ctag></lex> +<lex><base>wszystek</base><ctag>adj:pl:acc:m1:pos</ctag></lex> +</tok> +<tok> +<orth>tych</orth> +<lex disamb="1"><base>ten</base><ctag>adj:pl:gen:m1:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:gen:m2:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:gen:m3:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:gen:f:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:gen:n:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:loc:m1:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:loc:m2:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:loc:m3:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:loc:f:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:loc:n:pos</ctag></lex> +<lex><base>ten</base><ctag>adj:pl:acc:m1:pos</ctag></lex> +</tok> +<ns/> +<tok> +<orth>,</orth> +<lex disamb="1"><base>,</base><ctag>interp</ctag></lex> +</tok> +<tok> +<orth>którzy</orth> +<lex disamb="1"><base>który</base><ctag>adj:pl:nom:m1:pos</ctag></lex> +</tok> +<tok> +<orth>zajmują</orth> +<lex disamb="1"><base>zajmować</base><ctag>fin:pl:ter:imperf</ctag></lex> +</tok> +<tok> +<orth>się</orth> +<lex disamb="1"><base>się</base><ctag>qub</ctag></lex> +</tok> +<tok> +<orth>immunologią</orth> +<lex disamb="1"><base>immunologia</base><ctag>subst:sg:inst:f</ctag></lex> +</tok> +<tok> +<orth>z</orth> +<lex disamb="1"><base>z</base><ctag>prep:gen:nwok</ctag></lex> +<lex><base>z</base><ctag>prep:inst:nwok</ctag></lex> +<lex><base>z</base><ctag>qub</ctag></lex> +</tok> +<tok> +<orth>racji</orth> +<lex disamb="1"><base>racja</base><ctag>subst:sg:gen:f</ctag></lex> +<lex><base>racja</base><ctag>subst:sg:dat:f</ctag></lex> +<lex><base>racja</base><ctag>subst:sg:loc:f</ctag></lex> +<lex><base>racja</base><ctag>subst:pl:gen:f</ctag></lex> +</tok> +<tok> +<orth>studiów</orth> +<lex disamb="1"><base>studium</base><ctag>subst:pl:gen:n</ctag></lex> +<lex disamb="1"><base>studio</base><ctag>subst:pl:gen:n</ctag></lex> +</tok> +<ns/> +<tok> +<orth>,</orth> +<lex disamb="1"><base>,</base><ctag>interp</ctag></lex> +</tok> +<tok> +<orth>naukowej</orth> +<lex disamb="1"><base>naukowy</base><ctag>adj:sg:gen:f:pos</ctag></lex> +<lex><base>naukowy</base><ctag>adj:sg:dat:f:pos</ctag></lex> +<lex><base>naukowy</base><ctag>adj:sg:loc:f:pos</ctag></lex> +</tok> +<tok> +<orth>z</orth> +<lex disamb="1"><base>z</base><ctag>prep:gen:nwok</ctag></lex> +<lex><base>z</base><ctag>prep:inst:nwok</ctag></lex> +<lex><base>z</base><ctag>qub</ctag></lex> +</tok> +<tok> +<orth>natury</orth> +<lex disamb="1"><base>natura</base><ctag>subst:sg:gen:f</ctag></lex> +<lex><base>natura</base><ctag>subst:pl:nom:f</ctag></lex> +<lex><base>natura</base><ctag>subst:pl:acc:f</ctag></lex> +<lex><base>natura</base><ctag>subst:pl:voc:f</ctag></lex> +</tok> +<tok> +<orth>nie</orth> +<lex><base>on</base><ctag>ppron3:sg:acc:n:ter:akc:praep</ctag></lex> +<lex><base>on</base><ctag>ppron3:sg:acc:n:ter:nakc:praep</ctag></lex> +<lex><base>on</base><ctag>ppron3:pl:acc:m2:ter:akc:praep</ctag></lex> +<lex><base>on</base><ctag>ppron3:pl:acc:m2:ter:nakc:praep</ctag></lex> +<lex><base>on</base><ctag>ppron3:pl:acc:m3:ter:akc:praep</ctag></lex> +<lex><base>on</base><ctag>ppron3:pl:acc:m3:ter:nakc:praep</ctag></lex> +<lex><base>on</base><ctag>ppron3:pl:acc:f:ter:akc:praep</ctag></lex> +<lex><base>on</base><ctag>ppron3:pl:acc:f:ter:nakc:praep</ctag></lex> +<lex><base>on</base><ctag>ppron3:pl:acc:n:ter:akc:praep</ctag></lex> +<lex><base>on</base><ctag>ppron3:pl:acc:n:ter:nakc:praep</ctag></lex> +<lex disamb="1"><base>nie</base><ctag>qub</ctag></lex> +</tok> +<tok> +<orth>całkiem</orth> +<lex disamb="1"><base>całkiem</base><ctag>qub</ctag></lex> +</tok> +<tok> +<orth>pracy</orth> +<lex disamb="1"><base>praca</base><ctag>subst:sg:gen:f</ctag></lex> +<lex><base>praca</base><ctag>subst:sg:dat:f</ctag></lex> +<lex><base>praca</base><ctag>subst:sg:loc:f</ctag></lex> +</tok> +<tok> +<orth>czy</orth> +<lex><base>czy</base><ctag>conj</ctag></lex> +<lex disamb="1"><base>czy</base><ctag>qub</ctag></lex> +</tok> +<tok> +<orth>zawodowej</orth> +<lex disamb="1"><base>zawodowy</base><ctag>adj:sg:gen:f:pos</ctag></lex> +<lex><base>zawodowy</base><ctag>adj:sg:dat:f:pos</ctag></lex> +<lex><base>zawodowy</base><ctag>adj:sg:loc:f:pos</ctag></lex> +</tok> +<ns/> +<tok> +<orth>.</orth> +<lex disamb="1"><base>.</base><ctag>interp</ctag></lex> +</tok> +</chunk> +</chunk> </chunkList> </cesAna> diff --git a/libmwereader/tests/data/test1.xml b/libmwereader/tests/data/test1.xml index 6d37d572eb03d458fdf7910ec9b30bf7c063bc56..c4b6b9618c970b61da619d5d5c9b5f8fdb8a9668 100644 --- a/libmwereader/tests/data/test1.xml +++ b/libmwereader/tests/data/test1.xml @@ -5,4 +5,6 @@ <chunk type="p" xlink:href="#dv1p1">Projekt rozporządzenia Ministra Edukacji Narodowej w sprawie podstaw programowych kształcenia w zawodach: górnik eksploatacji podziemnej, górnik odkrywkowej eksploatacji złóż, monter instalacji gazowych, monter instrumentów całkowicie muzycznych, monter sieci komunalnych, stolarz, technik hydrolog, technik instrumentów muzycznych, technik meteorolog i technik papiernictwa Projekt rozporządzenia Ministra Edukacji Narodowej w sprawie sposobu i trybu organizowania indywidualnego obowiązkowego rocznego przygotowania przedszkolnego i indywidualnego nauczania dzieci i młodzieży Projekt rozporządzenia Ministra Edukacji Narodowej w sprawie rodzajów innych form wychowania przedszkolnego, warunków tworzenia i organizowania tych form oraz sposobu ich działania Projekt rozporządzenia Ministra Edukacji Narodowej w sprawie rodzajów innych form wychowania przedszkolnego, warunków tworzenia i organizowania tych form oraz sposobu ich działania.</chunk> <chunk type="p" xlink:href="#dv1p1">Pozycja, mimo iż dotyczy trudnych zagadnień, zawiera jasne i klarowne pytania, zrozumiałe dla wszystkich tych, którzy zajmują się immunologią z racji studiów, pracy nie całkiem naukowej czy zawodowej.</chunk> <chunk type="p" xlink:href="#dv1p1">Pozycja, mimo iż dotyczy trudnych zagadnień, zawiera jasne i klarowne pytania, zrozumiałe dla wszystkich tych, którzy zajmują się immunologią z racji studiów, naukowej nie całkiem pracy czy zawodowej.</chunk> +<chunk type="p" xlink:href="#dv1p1">Pozycja, mimo iż dotyczy trudnych zagadnień, zawiera jasne i klarowne pytania, zrozumiałe dla wszystkich tych, którzy zajmują się immunologią z racji studiów, pracy z natury naukowej czy zawodowej.</chunk> +<chunk type="p" xlink:href="#dv1p1">Pozycja, mimo iż dotyczy trudnych zagadnień, zawiera jasne i klarowne pytania, zrozumiałe dla wszystkich tych, którzy zajmują się immunologią z racji studiów, naukowej z natury nie całkiem pracy czy zawodowej.</chunk> </chunkList></cesAna> diff --git a/libmwereader/tests/data/test_mwe.xml b/libmwereader/tests/data/test_mwe.xml index 85368ac2ab5f4466a51c4d1a5f1879112fa39287..635b1f1442dd05b191e472d0bae6f50ff8fb9613 100644 --- a/libmwereader/tests/data/test_mwe.xml +++ b/libmwereader/tests/data/test_mwe.xml @@ -44,7 +44,7 @@ ) </condition> <instances> - <MWE name="dobre imię"> + <MWE base="dobre imię"> <var name="Adj">dobry</var> <var name="Subst">imię</var> <head>inter(class[0],{subst,ger,depr})</head> diff --git a/libmwereader/tests/mwefunctional.cpp b/libmwereader/tests/mwefunctional.cpp index 9ec490131f78602491b4be8c07537a1b7bdaada8..141d3fcabce21841b88c849399d978d65d6b2ceb 100644 --- a/libmwereader/tests/mwefunctional.cpp +++ b/libmwereader/tests/mwefunctional.cpp @@ -37,7 +37,6 @@ BOOST_FIXTURE_TEST_CASE( preferred_lexeme, Fixture) const Corpus2::Tagset& tset = Corpus2::get_named_tagset("kipi"); Corpus2::MWEReader mwr(tset, test_corpus.string()); mwr.set_option("inner:xces"); - mwr.set_option("mwefile:"+ (data_dir / "fix_mwe.xml").string()); Corpus2::Sentence::Ptr s1 = mwr.get_next_sentence(); @@ -71,8 +70,7 @@ BOOST_FIXTURE_TEST_CASE( fix_no_gap , Fixture) mwr.set_option("mwefile:"+ (data_dir / "fix_mwe.xml").string()); Corpus2::Sentence::Ptr s1 = mwr.get_next_sentence(); Corpus2::Token* mwu = s1->operator[](4); - std::string a = mwu->orth_utf8(); - BOOST_CHECK(a == "dobrej woli"); + BOOST_CHECK(mwu->orth_utf8() == "dobrej woli"); BOOST_CHECK(mwu->get_preferred_lexeme(tset).lemma_utf8() == "dobra wola"); } @@ -88,10 +86,10 @@ BOOST_FIXTURE_TEST_CASE( flex_no_gap , Fixture) mwr.get_next_sentence(); Corpus2::Sentence::Ptr s2 = mwr.get_next_sentence(); Corpus2::Sentence::Ptr s3 = mwr.get_next_sentence(); - Corpus2::Token* mwu = s2->operator[](13); + Corpus2::Token* mwu = s2->operator[](14); BOOST_CHECK(mwu->orth_utf8() == "dzień dobry"); BOOST_CHECK(mwu->get_preferred_lexeme(tset).lemma_utf8() == "dzień dobry"); - Corpus2::Token* mwu2 = s3->operator[](13); + Corpus2::Token* mwu2 = s3->operator[](14); BOOST_CHECK(mwu2->orth_utf8() == "dobry dzień"); BOOST_CHECK(mwu2->get_preferred_lexeme(tset).lemma_utf8() == "dzień dobry"); } @@ -109,7 +107,8 @@ BOOST_FIXTURE_TEST_CASE( fix_gap , Fixture) mwr.get_next_sentence(); Corpus2::Sentence::Ptr s4 = mwr.get_next_sentence(); Corpus2::Token* mwu = s4->operator[](27); - BOOST_CHECK(mwu->orth_utf8() == "instumentów muzycznych"); + + BOOST_CHECK(mwu->orth_utf8() == "instrumentów muzycznych"); BOOST_CHECK(mwu->get_preferred_lexeme(tset).lemma_utf8() == "instrument muzyczny"); } BOOST_FIXTURE_TEST_CASE( flex_gap , Fixture) @@ -119,7 +118,7 @@ BOOST_FIXTURE_TEST_CASE( flex_gap , Fixture) Corpus2::MWEReader mwr(tset, test_corpus.string()); mwr.set_option("inner:xces"); - mwr.set_option("mwefile:"+ (data_dir / "flex_mwe.xml").string()); + mwr.set_option("mwefile:"+ (data_dir / "flex_gap_mwe.xml").string()); mwr.get_next_sentence(); mwr.get_next_sentence(); mwr.get_next_sentence(); @@ -129,8 +128,38 @@ BOOST_FIXTURE_TEST_CASE( flex_gap , Fixture) Corpus2::Token* mwu = s5->operator[](27); BOOST_CHECK(mwu->orth_utf8() == "pracy naukowej"); BOOST_CHECK(mwu->get_preferred_lexeme(tset).lemma_utf8() == "praca naukowa"); - Corpus2::Token* mwu2 = s6->operator[](27); + Corpus2::Token* mwu2 = s6->operator[](29); BOOST_CHECK(mwu2->orth_utf8() == "naukowej pracy"); BOOST_CHECK(mwu2->get_preferred_lexeme(tset).lemma_utf8() == "praca naukowa"); } +//czy head jest w odpowiednm miejscu jesli rzeczownik jest tam w srodku +BOOST_FIXTURE_TEST_CASE( flex_gap_noun , Fixture) +{ + BOOST_MESSAGE("=====================\ntest: finding flex_gap_noun mwe"); + const Corpus2::Tagset& tset = Corpus2::get_named_tagset("kipi"); + Corpus2::MWEReader mwr(tset, test_corpus.string()); + mwr.set_option("inner:xces"); + + mwr.set_option("mwefile:"+ (data_dir / "flex_gap_mwe.xml").string()); + mwr.get_next_sentence(); + mwr.get_next_sentence(); + mwr.get_next_sentence(); + mwr.get_next_sentence(); + mwr.get_next_sentence(); + mwr.get_next_sentence(); + Corpus2::Sentence::Ptr s7 = mwr.get_next_sentence(); + Corpus2::Token* mwu = s7->operator[](27); + BOOST_CHECK(mwu->orth_utf8() == "pracy naukowej"); + BOOST_CHECK(mwu->get_preferred_lexeme(tset).lemma_utf8() == "praca naukowa"); + Corpus2::Sentence::Ptr s8 = mwr.get_next_sentence(); + Corpus2::Token* mwu2 = s8->operator[](31); + std::string a = mwu2->orth_utf8(); + BOOST_MESSAGE("+++++"+a); + BOOST_CHECK(mwu2->orth_utf8() == "naukowej pracy"); + BOOST_CHECK(mwu2->get_preferred_lexeme(tset).lemma_utf8() == "praca naukowa"); + +} + + + BOOST_AUTO_TEST_SUITE_END()