Refactor of the MWE reader code; removed a bug in memory allocation along the way

f5f4f356 · Bartosz Broda · 74289078 · f5f4f356 · f5f4f356
Commit f5f4f356 authored 13 years ago by Bartosz Broda
--- a/libmwereader/mwereader.cpp
+++ b/libmwereader/mwereader.cpp
@@ -67,81 +67,71 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 	{
 		currentSentence = inner_reader_->get_next_sentence();
 		if(currentSentence==0)
-		{
 			return currentSentence;
-		}
-		Wccl::SentenceContext sc(currentSentence);
-		token_index=0;
-		return process_sentence(sc);
+		return process_sentence(currentSentence);
 	}

-	Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc)
+	Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence) // returns a rebuilt sentence with each matched MWE merged into one token, or `sentence` itself when nothing matches
 	{
-
-		size_t sssize = sc.size();
+		Wccl::SentenceContext sc(sentence); // context over `sentence`; its position is advanced token by token in the loop below
 		for(int i = 0; i < sc.size() ;++i){
 			sc.set_position(i);
 			Corpus2::Token *pToken = sc.at(i);
-			//std::cout << "AAAAAAAAAAA" << i << " ---- " << sc.size()  << std::endl;
-			//if(!pToken)
-				//continue;
-			//std::cout << pToken->orth_utf8() << " ";
 			std::vector<Lexeme>& lexemes = pToken->lexemes();
-			if(!lexemes.size()){
-				sssize = sc.size();
+			if(lexemes.size() == 0) // token without lexemes: there is no lemma to look up
 				continue;
-			}
+			// only lemmas of disamb lexemes are looked up in mwe_index_; each candidate unit is then tested with IsHere
 			foreach(const Lexeme& lex, lexemes){
 				if(lex.is_disamb()){
 					std::string base = lex.lemma_utf8();
 					const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
-					//std::cout << "potential " << potential.size() << std::endl;
 					foreach(LexicalUnit::Ptr pLU, potential){
 						std::set<int> positions;
 						int head;
-						//std::cout << " is " << std::endl;
 						bool is_here = pLU->IsHere(sc, positions, head);
-						//std::cout << " is out" << std::endl;
-						if(is_here){
-							std::string new_orth_utf8;
-							Corpus2::Sentence::Ptr sent = sc.get_sentence_ptr();
-
-							std::vector<Token*> &tokens = sent->tokens();
-							int orig_i = i;
-							foreach(const int &pos, positions){
-								Token* tok = tokens [pos];
-								new_orth_utf8 += tok->orth_utf8() + " ";
-								if(pos != head){
-									delete tok;
-									tokens[pos] = NULL;
-									//std::cout << "BBBB " << pos  << " " << i << std::endl;
-
-									if(orig_i > pos)
-									{
-										i--;
-										std::cout << "\nTUTUXXXXXX\n";
-									}
-									//std::cout << "XBBBB " << pos  << " " << i << std::endl;
-								}
-							}
-							new_orth_utf8.erase(new_orth_utf8.size()-1, 1);
-							Corpus2::Token *tok = (*sent)[head];
-							tok->set_orth_utf8(new_orth_utf8);
-							foreach(Lexeme& lex, tok->lexemes()){
-								if(lex.is_disamb()){
-									lex.set_lemma_utf8(pLU->get_base());
-								}
-							}
-							tokens.erase(std::remove(tokens.begin(), tokens.end(), (Token*)NULL), tokens.end());
-						}
+						if(is_here) // NOTE(review): `head` is declared uninitialized; presumably IsHere sets it (and `positions`) on a match - confirm
+							return  process_sentence(clone_sentence_add_mwe(sentence, head, positions, pLU->get_base())); // merge the match into a cloned sentence, then rescan that clone from its first token
 					}
 				}
 			}
-			sssize = sc.size();
 		}
+		return sentence; // the whole scan finished without any match
+	}

+	/// Builds a new sentence from clones of the tokens of `sentence`, with the
+	/// tokens at positions `all` merged into a single token at position `head`.
+	///
+	/// The clone of the head token receives the joined orth produced by
+	/// get_new_orth_utf8() and `new_base` as the lemma of each of its disamb
+	/// lexemes. Every other position listed in `all` is left out of the result;
+	/// all remaining tokens are cloned unchanged, in their original order.
+	///
+	/// @param sentence sentence whose tokens are cloned
+	/// @param head     position of the token that becomes the merged token
+	/// @param all      positions of all tokens taking part in the merge
+	/// @param new_base lemma set on the disamb lexemes of the merged token
+	/// @return the newly created sentence
+	Sentence::Ptr  MWEReader::clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence,
+										  int head, const std::set<int>& all,
+										  const std::string &new_base)
+	{
+		std::string merged_orth = get_new_orth_utf8(sentence, all);
+		Sentence::Ptr result = boost::make_shared<Sentence>();
+		std::vector<Token*> &source_tokens = sentence->tokens();
+		const int token_count = static_cast<int>(source_tokens.size());
+		for(int idx = 0; idx < token_count; ++idx){
+			const bool is_head = (idx == head);
+			// A non-head component of the merged unit is not copied at all.
+			if(!is_head && all.count(idx) != 0)
+				continue;
+
+			Corpus2::Token *copy = source_tokens[idx]->clone();
+			if(is_head){
+				copy->set_orth_utf8(merged_orth);
+				foreach(Lexeme& lex, copy->lexemes()){
+					if(lex.is_disamb())
+						lex.set_lemma_utf8(new_base);
+				}
+			}
+			result->append(copy);
+		}
+		return result;
+	}

-		return sc.get_sentence_ptr();
+	/// Builds the orth of a merged multi-word token.
+	///
+	/// @param sentence sentence whose tokens are read
+	/// @param all      positions of the tokens whose orths are joined
+	/// @return the orths of the tokens at `all`, in ascending position order,
+	///         separated by single spaces; an empty string when `all` is empty
+	std::string MWEReader::get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
+								  const std::set<int>& all)
+	{
+		std::string new_orth;
+		std::vector<Token*> &tokens = sentence->tokens();
+		foreach(const int &pos, all){
+			Token* tok = tokens [pos];
+			new_orth += tok->orth_utf8() + " ";
+		}
+		// Drop the trailing separator. Guarded because with an empty `all`
+		// size()-1 wraps around and std::string::erase throws std::out_of_range.
+		if(!new_orth.empty())
+			new_orth.erase(new_orth.size()-1, 1);
+
+		return new_orth;
 	}

 	boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
@@ -149,18 +139,14 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 		currentChunk=inner_reader_->get_next_chunk();
 		if(currentChunk == NULL)
 			return currentChunk;
-		std::vector< boost::shared_ptr<Corpus2::Sentence> > s1 = currentChunk->sentences();
-		std::vector< boost::shared_ptr<Corpus2::Sentence> >::iterator it;
-		for(it=s1.begin(); it!=s1.end(); it++)
-		{
-			if(it==s1.begin())
-				currentSentence=*it;
-			Wccl::SentenceContext sc(*it);
-			process_sentence(sc);
-		}

-		token_index=0;
-		return currentChunk;
+		 boost::shared_ptr<Chunk> new_chunk = boost::make_shared<Chunk>();
+
+		 foreach(Corpus2::Sentence::Ptr sentence, currentChunk->sentences())
+			 new_chunk->append( process_sentence(sentence) );
+
+
+		return new_chunk;
 	}

 	void MWEReader::set_option(const std::string& option)

--- a/libmwereader/mwereader.h
+++ b/libmwereader/mwereader.h
@@ -68,11 +68,17 @@ public:
 	static bool registered;

 protected:
-	Sentence::Ptr process_sentence(Wccl::SentenceContext & sc);
+	Sentence::Ptr process_sentence(Corpus2::Sentence::Ptr sentence);

 private:
 	void load_mwes(const std::string& filename);

+	Sentence::Ptr  clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence,
+										  int head, const std::set<int>& all,
+										  const std::string &new_base);
+	std::string get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
+								  const std::set<int>& all);
+
 	MWEIndex mwe_index_;
 	/// ptr to inner reader doing the real work of reading a corpus
 	TokenReaderPtr inner_reader_;