Skip to content
Snippets Groups Projects
Commit 1a8eb2ba authored by Lukasz Bilenkij's avatar Lukasz Bilenkij
Browse files

Add processing in get_next_token/chunk/sentence methods

parent 9ac9fb0d
Branches
No related merge requests found
......@@ -21,20 +21,40 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
/**
 * Returns the next token of the MWE-processed stream, or NULL when the
 * inner reader has no more sentences.
 *
 * Tokens are served one by one from the cached sentence (currentSentence);
 * token_index is the position of the next token to hand out. When the
 * cached sentence is missing or exhausted, the next sentence is fetched
 * through get_next_sentence(), which also runs the MWE processing.
 *
 * NOTE(review): the returned pointer is taken straight from the cached
 * sentence's token vector -- confirm this matches the ownership contract
 * callers of get_next_token() expect.
 */
Token* MWEReader::get_next_token()
{
    // Loop rather than fetch once: a sentence without tokens is skipped
    // instead of being indexed out of range.
    while(!currentSentence || token_index >= currentSentence->tokens().size())
    {
        currentSentence = get_next_sentence();
        if(!currentSentence)
        {
            // End of input.
            return NULL;
        }
        token_index = 0;
    }
    const std::vector<Token*>& tokens = currentSentence->tokens();
    return tokens[token_index++];
}
/**
 * Reads exactly one sentence from the inner reader, caches it in
 * currentSentence and returns the result of process_sentence() for it.
 *
 * Returns a null pointer when the inner reader is exhausted. On success
 * token_index is reset so that token-wise reading restarts at the
 * beginning of the newly read sentence.
 */
Sentence::Ptr MWEReader::get_next_sentence()
{
    currentSentence = inner_reader_->get_next_sentence();
    if(!currentSentence)
    {
        // Nothing left to read; pass the null pointer on to the caller.
        return currentSentence;
    }
    Wccl::SentenceContext sc(currentSentence);
    token_index = 0;
    return process_sentence(sc);
}
Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc)
......@@ -108,9 +128,21 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
/**
 * Reads the next chunk from the inner reader, runs process_sentence() on
 * each of its sentences and returns the chunk.
 *
 * Returns a null pointer when the inner reader has no more chunks. The
 * first sentence of the chunk (if any) becomes currentSentence and
 * token_index is reset to 0.
 *
 * NOTE(review): the value returned by process_sentence() is discarded
 * here, so this relies on process_sentence() modifying the sentence in
 * place -- confirm against its implementation.
 */
boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
{
    currentChunk = inner_reader_->get_next_chunk();
    if(!currentChunk)
        return currentChunk;

    // Bind by reference: the sentence pointers are only read here, so there
    // is no need to copy the whole vector.
    const std::vector< boost::shared_ptr<Corpus2::Sentence> >& sentences =
        currentChunk->sentences();
    for(size_t i = 0; i < sentences.size(); ++i)
    {
        if(i == 0)
            currentSentence = sentences[i];
        Wccl::SentenceContext sc(sentences[i]);
        process_sentence(sc);
    }
    token_index = 0;
    return currentChunk;
}
void MWEReader::set_option(const std::string& option)
......@@ -120,6 +152,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
std::string inner = option.substr(6);
inner_reader_ = create_path_reader(inner, this->tagset(),
inner_filename_);
token_index=0;
currentSentence= boost::make_shared<Sentence>();
}
if(boost::algorithm::starts_with(option, "mwefile:")) {
std::string mwefile = option.substr(8);
......
......@@ -62,6 +62,9 @@ private:
/// path for inner reader
std::string inner_filename_;
/// inner reader option
size_t token_index;
Sentence::Ptr currentSentence;
boost::shared_ptr<Chunk> currentChunk;
};
} // ns Corpus2
......
......@@ -33,6 +33,7 @@ struct Fixture{
BOOST_FIXTURE_TEST_CASE( preferred_lexeme, Fixture)
{
BOOST_MESSAGE("test: finding preferred lexeme");
......@@ -47,6 +48,7 @@ BOOST_FIXTURE_TEST_CASE( preferred_lexeme, Fixture)
}
BOOST_FIXTURE_TEST_CASE( lexeme_no_white_spaces, Fixture)
{
BOOST_MESSAGE("=====================\ntest: no white space after or before");
......@@ -162,6 +164,78 @@ BOOST_FIXTURE_TEST_CASE( flex_gap_noun , Fixture)
}
// Reads the test corpus chunk by chunk through MWEReader: after seven
// discarded reads the eighth must still yield a chunk and the ninth must
// report end of input.
BOOST_FIXTURE_TEST_CASE( get_next_chunk, Fixture)
{
    const Corpus2::Tagset& tset = Corpus2::get_named_tagset("kipi");
    Corpus2::MWEReader reader(tset, test_corpus.string());
    reader.set_option("inner:xces");
    reader.set_option("mwefile:"+ (data_dir / "fix_mwe.xml").string());

    // Discard the first seven chunks.
    for(int skipped = 0; skipped < 7; ++skipped)
    {
        reader.get_next_chunk();
    }

    BOOST_CHECK(reader.get_next_chunk()!=NULL);
    BOOST_CHECK(reader.get_next_chunk()==NULL);
}
// Reads the test corpus sentence by sentence through MWEReader: after
// seven discarded reads the eighth must still yield a sentence and the
// ninth must report end of input.
BOOST_FIXTURE_TEST_CASE( get_next_sentence, Fixture)
{
    const Corpus2::Tagset& tset = Corpus2::get_named_tagset("kipi");
    Corpus2::MWEReader reader(tset, test_corpus.string());
    reader.set_option("inner:xces");
    reader.set_option("mwefile:"+ (data_dir / "fix_mwe.xml").string());

    // Discard the first seven sentences.
    for(int skipped = 0; skipped < 7; ++skipped)
    {
        reader.get_next_sentence();
    }

    BOOST_CHECK(reader.get_next_sentence()!=NULL);
    BOOST_CHECK(reader.get_next_sentence()==NULL);
}
// Reads 120 tokens through MWEReader with the flex MWE lexicon and checks
// that the tokens at positions 26 and 48 (0-based) are the merged units
// "dzień dobry" / "dobry dzień", both with the lemma "dzień dobry".
BOOST_FIXTURE_TEST_CASE( flex_no_gap_new , Fixture)
{
    BOOST_MESSAGE("=====================\ntest: finding flex mwe");
    const Corpus2::Tagset& tset = Corpus2::get_named_tagset("kipi");
    Corpus2::MWEReader mwr(tset, test_corpus.string());
    mwr.set_option("inner:xces");
    mwr.set_option("mwefile:"+ (data_dir / "flex_mwe.xml").string());
    for(int i=0; i<120 ; i++)
    {
        // Exactly one token is consumed per iteration, checked or not.
        Corpus2::Token* mwu = mwr.get_next_token();
        if(i == 26)
        {
            // Abort this test case instead of dereferencing NULL when the
            // reader runs out of tokens early.
            BOOST_REQUIRE(mwu != NULL);
            BOOST_CHECK(mwu->orth_utf8() == "dzień dobry");
            BOOST_CHECK(mwu->get_preferred_lexeme(tset).lemma_utf8() == "dzień dobry");
        }
        else if(i == 48)
        {
            BOOST_REQUIRE(mwu != NULL);
            BOOST_CHECK(mwu->orth_utf8() == "dobry dzień");
            BOOST_CHECK(mwu->get_preferred_lexeme(tset).lemma_utf8() == "dzień dobry");
        }
    }
}
//Check NULL
BOOST_AUTO_TEST_SUITE_END()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment