From f5f4f356e9600c062819005904f0fc2b94f43c4f Mon Sep 17 00:00:00 2001 From: Bartosz Broda <bartosz.broda@gmail.com> Date: Mon, 10 Oct 2011 10:35:56 +0200 Subject: [PATCH] refactor of mwe reader code, removed bug in memory allocation along the way --- libmwereader/mwereader.cpp | 112 ++++++++++++++++--------------------- libmwereader/mwereader.h | 8 ++- 2 files changed, 56 insertions(+), 64 deletions(-) diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp index 4679078..821cbf7 100644 --- a/libmwereader/mwereader.cpp +++ b/libmwereader/mwereader.cpp @@ -67,81 +67,71 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( { currentSentence = inner_reader_->get_next_sentence(); if(currentSentence==0) - { return currentSentence; - } - Wccl::SentenceContext sc(currentSentence); - token_index=0; - return process_sentence(sc); + return process_sentence(currentSentence); } - Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc) + Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence) { - - size_t sssize = sc.size(); + Wccl::SentenceContext sc(sentence); for(int i = 0; i < sc.size() ;++i){ sc.set_position(i); Corpus2::Token *pToken = sc.at(i); - //std::cout << "AAAAAAAAAAA" << i << " ---- " << sc.size() << std::endl; - //if(!pToken) - //continue; - //std::cout << pToken->orth_utf8() << " "; std::vector<Lexeme>& lexemes = pToken->lexemes(); - if(!lexemes.size()){ - sssize = sc.size(); + if(lexemes.size() == 0) continue; - } + foreach(const Lexeme& lex, lexemes){ if(lex.is_disamb()){ std::string base = lex.lemma_utf8(); const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base); - //std::cout << "potential " << potential.size() << std::endl; foreach(LexicalUnit::Ptr pLU, potential){ std::set<int> positions; int head; - //std::cout << " is " << std::endl; bool is_here = pLU->IsHere(sc, positions, head); - //std::cout << " is out" << std::endl; - if(is_here){ - std::string new_orth_utf8; - Corpus2::Sentence::Ptr sent = sc.get_sentence_ptr(); - - std::vector<Token*> &tokens = sent->tokens(); - int orig_i = i; - foreach(const int &pos, positions){ - Token* tok = tokens [pos]; - new_orth_utf8 += tok->orth_utf8() + " "; - if(pos != head){ - delete tok; - tokens[pos] = NULL; - //std::cout << "BBBB " << pos << " " << i << std::endl; - - if(orig_i > pos) - { - i--; - std::cout << "\nTUTUXXXXXX\n"; - } - //std::cout << "XBBBB " << pos << " " << i << std::endl; - } - } - new_orth_utf8.erase(new_orth_utf8.size()-1, 1); - Corpus2::Token *tok = (*sent)[head]; - tok->set_orth_utf8(new_orth_utf8); - foreach(Lexeme& lex, tok->lexemes()){ - if(lex.is_disamb()){ - lex.set_lemma_utf8(pLU->get_base()); - } - } - tokens.erase(std::remove(tokens.begin(), tokens.end(), (Token*)NULL), tokens.end()); - } + if(is_here) + return process_sentence(clone_sentence_add_mwe(sentence, head, positions, pLU->get_base())); } } } - sssize = sc.size(); } + return sentence; + } + Sentence::Ptr MWEReader::clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence, + int head, const std::set<int>& all, + const std::string &new_base) + { + std::string new_orth = get_new_orth_utf8(sentence, all); + Sentence::Ptr new_sentence = boost::make_shared<Sentence>(); + std::vector<Token*> &tokens = sentence->tokens(); + for(int i = 0; i < (int)tokens.size(); i++){ + if(i == head){ + Corpus2::Token * t = tokens[i]->clone(); + t->set_orth_utf8(new_orth); + foreach(Lexeme& lex, t->lexemes()) + if(lex.is_disamb()) + lex.set_lemma_utf8(new_base); + new_sentence->append(t); + } else if( all.find(i) == all.end()) + new_sentence->append(tokens[i]->clone()); + // else -> do nothing + } + return new_sentence; + } - return sc.get_sentence_ptr(); + std::string MWEReader::get_new_orth_utf8(Corpus2::Sentence::Ptr sentence, + const std::set<int>& all) + { + std::string new_orth; + std::vector<Token*> &tokens = sentence->tokens(); + foreach(const int &pos, all){ + Token* tok = tokens [pos]; + new_orth += tok->orth_utf8() + " "; + } + new_orth.erase(new_orth.size()-1, 1); + + return new_orth; } boost::shared_ptr<Chunk> MWEReader::get_next_chunk() @@ -149,18 +139,14 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( currentChunk=inner_reader_->get_next_chunk(); if(currentChunk == NULL) return currentChunk; - std::vector< boost::shared_ptr<Corpus2::Sentence> > s1 = currentChunk->sentences(); - std::vector< boost::shared_ptr<Corpus2::Sentence> >::iterator it; - for(it=s1.begin(); it!=s1.end(); it++) - { - if(it==s1.begin()) - currentSentence=*it; - Wccl::SentenceContext sc(*it); - process_sentence(sc); - } - token_index=0; - return currentChunk; + boost::shared_ptr<Chunk> new_chunk = boost::make_shared<Chunk>(); + + foreach(Corpus2::Sentence::Ptr sentence, currentChunk->sentences()) + new_chunk->append( process_sentence(sentence) ); + + + return new_chunk; } void MWEReader::set_option(const std::string& option) diff --git a/libmwereader/mwereader.h b/libmwereader/mwereader.h index 9290c2e..29eb1bf 100644 --- a/libmwereader/mwereader.h +++ b/libmwereader/mwereader.h @@ -68,11 +68,17 @@ public: static bool registered; protected: - Sentence::Ptr process_sentence(Wccl::SentenceContext & sc); + Sentence::Ptr process_sentence(Corpus2::Sentence::Ptr sentence); private: void load_mwes(const std::string& filename); + Sentence::Ptr clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence, + int head, const std::set<int>& all, + const std::string &new_base); + std::string get_new_orth_utf8(Corpus2::Sentence::Ptr sentence, + const std::set<int>& all); + MWEIndex mwe_index_; /// ptr to inner reader doing the real work of reading a corpus TokenReaderPtr inner_reader_; -- GitLab