diff --git a/libmwereader/mwe.cpp b/libmwereader/mwe.cpp index 5799d2cf8b96e545fa0ae005fb797880de7d8fa1..ec4bbe07916d11947dca3a2995af948c62efd229 100644 --- a/libmwereader/mwe.cpp +++ b/libmwereader/mwe.cpp @@ -24,7 +24,7 @@ LexicalUnit::LexicalUnit(const std::string &base, } bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc, - std::set<size_t> &out_position, int &head_pos) const + std::set<int> &out_position, int &head_pos) const { // set variables for(variables_map::const_iterator ivars = variables_.begin(); diff --git a/libmwereader/mwe.h b/libmwereader/mwe.h index 439323f99c4d81b11106241ae69d65806727f3ff..0fec3e0e2f8be3172f75b87fa84aa9d927235458 100644 --- a/libmwereader/mwe.h +++ b/libmwereader/mwe.h @@ -35,7 +35,7 @@ public: * \returns true if this lexical unit was found here */ virtual bool IsHere(const Wccl::SentenceContext& sc, - std::set<size_t> &out_positions, int &head_pos) const; + std::set<int> &out_positions, int &head_pos) const; const std::string & get_base() const{ return base_;} const variables_map & get_variables() const{ return variables_;} diff --git a/libmwereader/mweparser.cpp b/libmwereader/mweparser.cpp index cda56745dc33ef33f2798fce26e74aac9a37ac1e..54e8b2752ffd9ad6830c5a0f9a4fe93e3d6454a9 100644 --- a/libmwereader/mweparser.cpp +++ b/libmwereader/mweparser.cpp @@ -42,6 +42,8 @@ namespace Corpus2 { return op; } + ////////////////////////////////////////////////////////////////////// + MWEParser::MWEParser(MWEIndex &index) : BasicSaxParser(), state_(NONE), mwe_index_(index) { @@ -71,7 +73,7 @@ namespace Corpus2 { + group_type_); } - + variables_.clear(); } std::string MWEParser::get_attribute(const AttributeList& attributes, diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp index e6328be4549d4e8d810d424f7491cf6d80776479..6b0c86107e58453bf1b93f582a1ad6ad20425df2 100644 --- a/libmwereader/mwereader.cpp +++ b/libmwereader/mwereader.cpp @@ -30,33 +30,65 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( { // TODO MWE stuff Sentence::Ptr pSentence = inner_reader_->get_next_sentence(); - Wccl::SentenceContext sc(pSentence); + return process_sentence(sc); + + } + + Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc) + { for(int i = 0; i < sc.size(); ++i){ sc.set_position(i); - Corpus2::Token *pToken = (*pSentence)[i]; - std::cout << pToken->orth_utf8() << " "; + Corpus2::Token *pToken = sc.at(i); + //std::cout << pToken->orth_utf8() << " "; std::vector<Lexeme>& lexemes = pToken->lexemes(); foreach(const Lexeme& lex, lexemes){ if(lex.is_disamb()){ std::string base = lex.lemma_utf8(); const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base); - if(potential.size()) - std::cout << "# "; foreach(LexicalUnit::Ptr pLU, potential){ - std::set<size_t> positions; + std::set<int> positions; int head; bool is_here = pLU->IsHere(sc, positions, head); - if(is_here) - std::cout << "** " << pLU->get_base() << "** "; + if(is_here){ + std::string new_orth_utf8; + Corpus2::Sentence::Ptr sent = sc.get_sentence_ptr(); + + std::vector<Token*> &tokens = sent->tokens(); + foreach(const int &pos, positions){ + Token* tok = tokens [pos]; + new_orth_utf8 += tok->orth_utf8() + " "; + if(pos != head){ + delete tok; + tokens[pos] = NULL; + if(pos >= i) + i--; + } + } + + Corpus2::Token *tok = (*sent)[head]; + tok->set_orth_utf8(new_orth_utf8); + foreach(Lexeme& lex, tok->lexemes()) + if(lex.is_disamb()) + lex.set_lemma_utf8(pLU->get_base()); + + std::vector <Token*>::iterator del_iter = tokens.begin(); + while (del_iter != tokens.end()) { + if( (*del_iter) == NULL) + del_iter = tokens.erase(del_iter); + else + ++del_iter; + } + } + } } } } - std::cout << "ENDL\n"; - return pSentence; + + return sc.get_sentence_ptr(); } boost::shared_ptr<Chunk> MWEReader::get_next_chunk() diff --git a/libmwereader/mwereader.h b/libmwereader/mwereader.h index 4c65c2e16c9741df6dbdd68a5a4638911a0b80a2..93d171987a39c67db37e74e688c9246387cc17cb 100644 --- a/libmwereader/mwereader.h +++ b/libmwereader/mwereader.h @@ -46,6 +46,9 @@ public: static bool registered; +protected: + Sentence::Ptr process_sentence(Wccl::SentenceContext & sc); + private: void load_mwes(const std::string& filename); diff --git a/libmwereader/mwertest.cpp b/libmwereader/mwertest.cpp index c5b9cebb1e2172e2e1095db476e5cd1aeecf65ef..3265fadd9499fc404b4aa0c7a8a01b1d56bc5c78 100644 --- a/libmwereader/mwertest.cpp +++ b/libmwereader/mwertest.cpp @@ -11,4 +11,6 @@ int main(int ac, char**av) MWEParser parser(temp_index); parser.parse_file(av[1]); + + } diff --git a/libmwereader/test_mwe.xml b/libmwereader/test_mwe.xml index 020213ffa30dd2e18419b6168fd5a416dae909d9..85368ac2ab5f4466a51c4d1a5f1879112fa39287 100644 --- a/libmwereader/test_mwe.xml +++ b/libmwereader/test_mwe.xml @@ -9,8 +9,8 @@ setvar($Pos2, 1), inter(class[0],{subst,ger,depr}), - inter(class[1],{subst,ger,depr}), - inter(cas[0], cas[1]) + inter(class[1],{subst,ger,depr})//, + //inter(cas[0], cas[1]) - głupie ograniczenie, np. Debatowali nad ceną netto ) </condition> <instances> @@ -24,6 +24,11 @@ <var name="Subst2">netto</var> <head>inter(base[0], "waga")</head> </MWE> + <MWE base="ratyfikacja traktatu"> + <var name="Subst1">ratyfikacja</var> + <var name="Subst2">traktat</var> + <head>inter(base[0], "traktat")</head> + </MWE> </instances> </mwegroup> <mwegroup name="AdjSubstFix" type="fix" class="subst">