diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp
index 6586c3941c75ef20923ed6273ead776a8389987f..0b91296c05d721de32162c35381d32566d572f99 100644
--- a/libmwereader/mwereader.cpp
+++ b/libmwereader/mwereader.cpp
@@ -21,20 +21,38 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 
+	/// Returns the next token of the processed input, moving on to the next
+	/// sentence whenever the current one has no unread tokens left.
+	/// Returns NULL when get_next_sentence() yields no further sentence.
 	Token* MWEReader::get_next_token()
 	{
-		// TODO MWE stuff
-		// get whole sentence -> process it -> return token by token
-		return inner_reader_->get_next_token();
+		// Loop instead of a single fetch: empty sentences are skipped, and a
+		// null currentSentence (left behind once no sentence was available)
+		// is never dereferenced.
+		while(!currentSentence || token_index >= currentSentence->tokens().size())
+		{
+			// get_next_sentence() resets token_index for a non-null sentence
+			currentSentence = get_next_sentence();
+			if(!currentSentence)
+				return NULL;
+		}
+		// NOTE(review): the token stays owned by currentSentence; confirm that
+		// callers do not delete it (otherwise a copy should be returned).
+		return currentSentence->tokens()[token_index++];
 	}
 
+	/// Fetches the next sentence from the inner reader and returns the result
+	/// of process_sentence() for it; returns a null pointer when the inner
+	/// reader yields no sentence.
 	Sentence::Ptr MWEReader::get_next_sentence()
 	{
-		// TODO MWE stuff
-		Sentence::Ptr pSentence = inner_reader_->get_next_sentence();
-		if(pSentence == NULL)
-			return Sentence::Ptr();
-		Wccl::SentenceContext sc(pSentence);
+		currentSentence = inner_reader_->get_next_sentence();
+		if(!currentSentence)
+		{
+			// nothing left to read: pass the null pointer on
+			return currentSentence;
+		}
+		Wccl::SentenceContext sc(currentSentence);
+		// token-by-token iteration restarts with every new sentence
+		token_index=0;
 		return process_sentence(sc);
-
 	}
 
 	Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc)
@@ -108,9 +126,28 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 
+	/// Fetches the next chunk from the inner reader, runs process_sentence()
+	/// on each of its sentences and returns the chunk; returns a null pointer
+	/// when the inner reader yields no chunk.
 	boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
 	{
-		// TODO MWE stuff
-		// get whole chunk -> process sentences -> return processed chunk
-		return inner_reader_->get_next_chunk();
+		currentChunk = inner_reader_->get_next_chunk();
+		if(!currentChunk)
+			return currentChunk;
+		// Bind by const reference instead of copying the vector into a local.
+		const std::vector< boost::shared_ptr<Corpus2::Sentence> >& sentences =
+			currentChunk->sentences();
+		// Token iteration continues from the first sentence of this chunk.
+		if(!sentences.empty())
+			currentSentence = sentences.front();
+		std::vector< boost::shared_ptr<Corpus2::Sentence> >::const_iterator it;
+		for(it = sentences.begin(); it != sentences.end(); ++it)
+		{
+			Wccl::SentenceContext sc(*it);
+			// NOTE(review): the returned pointer is dropped, so this relies on
+			// process_sentence() modifying the sentence in place -- confirm.
+			process_sentence(sc);
+		}
+		token_index=0;
+		return currentChunk;
 	}
 
 	void MWEReader::set_option(const std::string& option)
@@ -120,6 +157,9 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 			std::string inner = option.substr(6);
 			inner_reader_ = create_path_reader(inner, this->tagset(),
 				inner_filename_);
+			// start iteration from scratch with the newly created inner reader
+			token_index=0;
+			currentSentence= boost::make_shared<Sentence>();
 		}
 		if(boost::algorithm::starts_with(option, "mwefile:")) {
 			std::string mwefile = option.substr(8);
diff --git a/libmwereader/mwereader.h b/libmwereader/mwereader.h
index 61d9c52859a6649fb4158951cf6d76e4362ca97c..44429b8a76a5008b0a50564f2d5f1d007ab65e13 100644
--- a/libmwereader/mwereader.h
+++ b/libmwereader/mwereader.h
@@ -62,6 +62,12 @@ private:
 	/// path for inner reader
 	std::string inner_filename_;
 	/// inner reader option
+	/// index of the next token to return from currentSentence
+	size_t token_index;
+	/// sentence that get_next_token() is currently iterating over
+	Sentence::Ptr currentSentence;
+	/// chunk most recently fetched by get_next_chunk()
+	boost::shared_ptr<Chunk> currentChunk;
 };
 
 } // ns Corpus2
diff --git a/libmwereader/tests/mwefunctional.cpp b/libmwereader/tests/mwefunctional.cpp
index 5e0b082fe5348bb93b8ff52cf769caeb5da89aa4..47fbc66816027a1cacf133218b161287b5bf5ba4 100644
--- a/libmwereader/tests/mwefunctional.cpp
+++ b/libmwereader/tests/mwefunctional.cpp
@@ -162,6 +162,72 @@ BOOST_FIXTURE_TEST_CASE( flex_gap_noun , Fixture)
 }
 
 
+BOOST_FIXTURE_TEST_CASE( get_next_chunk, Fixture)
+{
+	const Corpus2::Tagset& tset = Corpus2::get_named_tagset("kipi");
+	Corpus2::MWEReader mwr1(tset, test_corpus.string());
+	mwr1.set_option("inner:xces");
+	mwr1.set_option("mwefile:"+ (data_dir / "fix_mwe.xml").string());
+
+	// Skip seven chunks; the eighth read must still yield a chunk and the
+	// ninth must not.
+	for(int i=0; i<7; i++)
+	{
+		mwr1.get_next_chunk();
+	}
+	BOOST_CHECK(mwr1.get_next_chunk()!=NULL);
+	BOOST_CHECK(mwr1.get_next_chunk()==NULL);
+}
+
+BOOST_FIXTURE_TEST_CASE( get_next_sentence, Fixture)
+{
+	const Corpus2::Tagset& tset = Corpus2::get_named_tagset("kipi");
+	Corpus2::MWEReader mwr1(tset, test_corpus.string());
+	mwr1.set_option("inner:xces");
+	mwr1.set_option("mwefile:"+ (data_dir / "fix_mwe.xml").string());
+
+	// Skip seven sentences; the eighth read must still yield a sentence and
+	// the ninth must not.
+	for(int i=0; i<7; i++)
+	{
+		mwr1.get_next_sentence();
+	}
+	BOOST_CHECK(mwr1.get_next_sentence()!=NULL);
+	BOOST_CHECK(mwr1.get_next_sentence()==NULL);
+}
+
+BOOST_FIXTURE_TEST_CASE( flex_no_gap_new , Fixture)
+{
+	BOOST_MESSAGE("=====================\ntest: finding flex mwe");
+	const Corpus2::Tagset& tset = Corpus2::get_named_tagset("kipi");
+	Corpus2::MWEReader mwr(tset, test_corpus.string());
+	mwr.set_option("inner:xces");
+	mwr.set_option("mwefile:"+ (data_dir / "flex_mwe.xml").string());
+
+	for(int i=0; i<120 ; i++)
+	{
+		Corpus2::Token* mwu = mwr.get_next_token();
+		switch(i)
+		{
+			case 26:
+				// stop this test case rather than dereference NULL
+				BOOST_REQUIRE(mwu != NULL);
+				BOOST_CHECK(mwu->orth_utf8() == "dzień dobry");
+				BOOST_CHECK(mwu->get_preferred_lexeme(tset).lemma_utf8() == "dzień dobry");
+				break;
+			case 48:
+				BOOST_REQUIRE(mwu != NULL);
+				BOOST_CHECK(mwu->orth_utf8() == "dobry dzień");
+				BOOST_CHECK(mwu->get_preferred_lexeme(tset).lemma_utf8() == "dzień dobry");
+				break;
+			default:
+				break;
+		}
+	}
+}
+
+//Check NULL
+
 
 
 BOOST_AUTO_TEST_SUITE_END()