diff --git a/src/libmwereader/mwereader.cpp b/src/libmwereader/mwereader.cpp index a0acbcb66a930d3bfe7393b7fc6331126538123d..013841d1e78ca8dd9f70702f65a30c9dc3df5fd4 100644 --- a/src/libmwereader/mwereader.cpp +++ b/src/libmwereader/mwereader.cpp @@ -155,7 +155,7 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( // move context position to next token after MWE elements int curr_position = sentence_ctx->get_position(); if (curr_position + all.size() < sentence->size()) { - sentence_ctx->set_position(sentence_ctx->get_position() + all.size()); + sentence_ctx->set_position(sentence_ctx->get_position() + all.size() - 1); } } @@ -171,6 +171,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( // TODO: pass annotated sentence to methods // AnnotatedSentencePtr ann_sentence = AnnotatedSentence::wrap_sentence(sentence); int annotation_number = 0; + // keeps position of last annotated token + int last_ann_pos = -1; SentenceContextPtr sc = boost::make_shared<Wccl::SentenceContext>(sentence); for (sc->goto_start(); sc->is_current_inside(); sc->advance()) @@ -184,6 +186,10 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( if(lex.is_disamb()){ std::string base = lex.lemma_utf8(); const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base); + // variables holding values for selected lu + LexicalUnit::Ptr sel_pLU; + std::set<int> sel_positions; + int sel_head; BOOST_FOREACH (LexicalUnit::Ptr pLU, potential) { bool ok = true; @@ -202,20 +208,45 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( int head; bool is_here = pLU->IsHere(sc, positions, head); if(is_here) { - if (annotate) { - add_mwe_channel( - sc, head, positions, - ++annotation_number, - pLU->get_base()); - } - else { - sc = clone_sentence_add_mwe( - sc, head, positions, - pLU->get_base()); + if (positions.size() > sel_positions.size()) { + sel_pLU = pLU; + sel_positions = positions; + sel_head = head; 
} } } } + if (sel_pLU) { + int first_curr_pos; + int last_curr_pos; + if(!sel_positions.empty()) + first_curr_pos = *sel_positions.begin(); + // annotate only if tokens have not been already + // annotated; the check below relies on the first token and + // prevents re-annotating the last token in the + // sentence (from the previous annotation) + if (first_curr_pos > last_ann_pos) { + if (annotate) { + add_mwe_channel( + sc, + sel_head, + sel_positions, + ++annotation_number, + sel_pLU->get_base() + ); + } + else { + sc = clone_sentence_add_mwe( + sc, + sel_head, + sel_positions, + sel_pLU->get_base() + ); + } + last_curr_pos = *sel_positions.rbegin(); + last_ann_pos = last_curr_pos; + } + } } } }