diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp index e2399fa452c6c27d3974f6d1ce93705ff9ce8a7f..6586c3941c75ef20923ed6273ead776a8389987f 100644 --- a/libmwereader/mwereader.cpp +++ b/libmwereader/mwereader.cpp @@ -40,16 +40,24 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc) { - for(int i = 0; i < sc.size(); ++i){ + size_t sssize = sc.size(); + for(int i = 0; i < sc.size() ;++i){ sc.set_position(i); Corpus2::Token *pToken = sc.at(i); + //std::cout << "AAAAAAAAAAA" << i << " ---- " << sc.size() << std::endl; + //if(!pToken) + //continue; //std::cout << pToken->orth_utf8() << " "; std::vector<Lexeme>& lexemes = pToken->lexemes(); + if(!lexemes.size()){ + sssize = sc.size(); + continue; + } foreach(const Lexeme& lex, lexemes){ if(lex.is_disamb()){ std::string base = lex.lemma_utf8(); const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base); - std::cout << "potential " << potential.size() << std::endl; + //std::cout << "potential " << potential.size() << std::endl; foreach(LexicalUnit::Ptr pLU, potential){ std::set<int> positions; int head; @@ -61,14 +69,21 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( Corpus2::Sentence::Ptr sent = sc.get_sentence_ptr(); std::vector<Token*> &tokens = sent->tokens(); + int orig_i = i; foreach(const int &pos, positions){ Token* tok = tokens [pos]; new_orth_utf8 += tok->orth_utf8() + " "; if(pos != head){ delete tok; tokens[pos] = NULL; - if(pos >= i) + //std::cout << "BBBB " << pos << " " << i << std::endl; + + if(orig_i > pos) + { i--; + std::cout << "\nTUTUXXXXXX\n"; + } + //std::cout << "XBBBB " << pos << " " << i << std::endl; } } new_orth_utf8.erase(new_orth_utf8.size()-1, 1); @@ -84,6 +99,7 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( } } } + sssize = sc.size(); }