Skip to content
Snippets Groups Projects
Commit f5f4f356 authored by Bartosz Broda's avatar Bartosz Broda
Browse files

refactor of mwe reader code, removed bug in memory allocation along the way

parent 74289078
No related merge requests found
......@@ -67,81 +67,71 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
{
currentSentence = inner_reader_->get_next_sentence();
if(currentSentence==0)
{
return currentSentence;
}
Wccl::SentenceContext sc(currentSentence);
token_index=0;
return process_sentence(sc);
return process_sentence(currentSentence);
}
// NOTE(review): this span looks like a rendered diff in which the removed and
// the added version of process_sentence are interleaved (two signatures follow,
// and braces do not balance). It is not compilable as written -- confirm
// against the real file before relying on the comments below.
//
// process_sentence walks the tokens of a sentence. For every disambiguated
// lexeme it asks mwe_index_ for candidate lexical units by lemma, and when a
// candidate matches at the current position (IsHere) the tokens of the
// multi-word expression are merged into one token.
Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc)
Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence)
{
size_t sssize = sc.size();
// Variant taking a sentence pointer: wrap it in a context for matching.
Wccl::SentenceContext sc(sentence);
for(int i = 0; i < sc.size() ;++i){
// Move the context to token i before any matching is attempted.
sc.set_position(i);
Corpus2::Token *pToken = sc.at(i);
//std::cout << "AAAAAAAAAAA" << i << " ---- " << sc.size() << std::endl;
//if(!pToken)
//continue;
//std::cout << pToken->orth_utf8() << " ";
std::vector<Lexeme>& lexemes = pToken->lexemes();
// Tokens without any lexeme are skipped (both spellings of the check below).
if(!lexemes.size()){
sssize = sc.size();
if(lexemes.size() == 0)
continue;
}
foreach(const Lexeme& lex, lexemes){
if(lex.is_disamb()){
// Candidates are looked up by the lemma of the disambiguated lexeme.
std::string base = lex.lemma_utf8();
const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
//std::cout << "potential " << potential.size() << std::endl;
foreach(LexicalUnit::Ptr pLU, potential){
// positions and head are read after IsHere, so presumably IsHere fills them
// (member token indices and the head index) -- TODO confirm.
std::set<int> positions;
int head;
//std::cout << " is " << std::endl;
bool is_here = pLU->IsHere(sc, positions, head);
//std::cout << " is out" << std::endl;
// In-place merge path: edits the token vector of the sentence directly.
if(is_here){
std::string new_orth_utf8;
Corpus2::Sentence::Ptr sent = sc.get_sentence_ptr();
std::vector<Token*> &tokens = sent->tokens();
int orig_i = i;
// Concatenate the orths of all member tokens; every member except the head
// is deleted and its slot set to NULL for the compaction further down.
foreach(const int &pos, positions){
Token* tok = tokens [pos];
new_orth_utf8 += tok->orth_utf8() + " ";
if(pos != head){
delete tok;
tokens[pos] = NULL;
//std::cout << "BBBB " << pos << " " << i << std::endl;
// A token removed before the original index shifts the loop index back by one.
if(orig_i > pos)
{
i--;
std::cout << "\nTUTUXXXXXX\n";
}
//std::cout << "XBBBB " << pos << " " << i << std::endl;
}
}
// Drop the trailing space appended by the loop above.
new_orth_utf8.erase(new_orth_utf8.size()-1, 1);
Corpus2::Token *tok = (*sent)[head];
tok->set_orth_utf8(new_orth_utf8);
// The head keeps its lexemes; only disambiguated ones get the unit's base.
foreach(Lexeme& lex, tok->lexemes()){
if(lex.is_disamb()){
lex.set_lemma_utf8(pLU->get_base());
}
}
// Compact the vector by removing the slots set to NULL above.
tokens.erase(std::remove(tokens.begin(), tokens.end(), (Token*)NULL), tokens.end());
}
// Cloning path: build a new sentence with the expression merged and process
// that sentence again from the start (recursive call).
if(is_here)
return process_sentence(clone_sentence_add_mwe(sentence, head, positions, pLU->get_base()));
}
}
}
sssize = sc.size();
}
// Reached when the loop finishes without taking the early return above.
return sentence;
}
/// Builds a new sentence out of clones of the tokens of `sentence`, with the
/// multi-word expression described by `head` / `all` collapsed into one token.
///
/// The clone of the token at index `head` receives the joined orth of all
/// expression members, and its disambiguated lexemes get `new_base` as lemma.
/// Tokens whose index is in `all` (other than `head`) are left out; every
/// other token is cloned unchanged.
///
/// @param sentence  sentence whose tokens are cloned
/// @param head      index of the token standing for the whole expression
/// @param all       indices of the tokens that belong to the expression
/// @param new_base  lemma to set on the disambiguated lexemes of the head clone
/// @return the newly created sentence
Sentence::Ptr MWEReader::clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence,
int head, const std::set<int>& all,
const std::string &new_base)
{
    const std::string merged_orth = get_new_orth_utf8(sentence, all);
    Sentence::Ptr result = boost::make_shared<Sentence>();

    std::vector<Token*>& source_tokens = sentence->tokens();
    const int token_count = static_cast<int>(source_tokens.size());
    for (int idx = 0; idx != token_count; ++idx) {
        const bool is_head = (idx == head);
        // Non-head members of the expression are absorbed into the head token.
        if (!is_head && all.count(idx) != 0)
            continue;

        Corpus2::Token* copy = source_tokens[idx]->clone();
        if (is_head) {
            copy->set_orth_utf8(merged_orth);
            foreach (Lexeme& lexeme, copy->lexemes()) {
                if (lexeme.is_disamb())
                    lexeme.set_lemma_utf8(new_base);
            }
        }
        result->append(copy);
    }
    return result;
}
return sc.get_sentence_ptr();
/// Joins the orths of the tokens at the given positions into one string.
///
/// Positions are visited in the iteration order of the set (ascending for a
/// std::set<int>) and the orths are separated by exactly one space, with no
/// leading or trailing separator added.
///
/// @param sentence  sentence whose token vector is indexed by the positions
/// @param all       token indices to join; each is used to index the token
///                  vector directly, without a range check
/// @return the joined orth, or an empty string when `all` is empty
std::string MWEReader::get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
const std::set<int>& all)
{
    std::string new_orth;
    const std::vector<Token*>& tokens = sentence->tokens();
    // Put the separator *between* elements rather than trimming a trailing one:
    // trimming via erase(size() - 1, 1) wraps around on an empty result and
    // makes std::string::erase throw std::out_of_range when `all` is empty.
    bool first = true;
    foreach (const int& pos, all) {
        if (!first)
            new_orth += ' ';
        first = false;
        new_orth += tokens[pos]->orth_utf8();
    }
    return new_orth;
}
boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
......@@ -149,18 +139,14 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
currentChunk=inner_reader_->get_next_chunk();
if(currentChunk == NULL)
return currentChunk;
std::vector< boost::shared_ptr<Corpus2::Sentence> > s1 = currentChunk->sentences();
std::vector< boost::shared_ptr<Corpus2::Sentence> >::iterator it;
for(it=s1.begin(); it!=s1.end(); it++)
{
if(it==s1.begin())
currentSentence=*it;
Wccl::SentenceContext sc(*it);
process_sentence(sc);
}
token_index=0;
return currentChunk;
boost::shared_ptr<Chunk> new_chunk = boost::make_shared<Chunk>();
foreach(Corpus2::Sentence::Ptr sentence, currentChunk->sentences())
new_chunk->append( process_sentence(sentence) );
return new_chunk;
}
void MWEReader::set_option(const std::string& option)
......
......@@ -68,11 +68,17 @@ public:
static bool registered;
protected:
Sentence::Ptr process_sentence(Wccl::SentenceContext & sc);
Sentence::Ptr process_sentence(Corpus2::Sentence::Ptr sentence);
private:
void load_mwes(const std::string& filename);
/// Creates a new sentence from clones of the tokens of `sentence`: the clone
/// of the token at index `head` gets the joined orth of the tokens listed in
/// `all` and `new_base` as lemma of its disambiguated lexemes; the remaining
/// tokens listed in `all` are left out and every other token is cloned as is.
Sentence::Ptr clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence,
int head, const std::set<int>& all,
const std::string &new_base);
/// Returns the orths of the tokens of `sentence` at the positions in `all`,
/// taken in the iteration order of the set and separated by single spaces.
std::string get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
const std::set<int>& all);
MWEIndex mwe_index_;
/// ptr to inner reader doing the real work of reading a corpus
TokenReaderPtr inner_reader_;
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment