Newer
Older
#include <boost/algorithm/string.hpp>
namespace Corpus2{
bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
// Constructs the reader: forwards `tagset` to the TokenReader base class and
// stores `filename` in inner_filename_ (later passed to create_path_reader()
// by set_option()).
// NOTE(review): the opening brace of the constructor body is not present in
// this copy of the file -- confirm against the repository version.
MWEReader::MWEReader(const Tagset &tagset, const std::string &filename)
: TokenReader(tagset), inner_filename_(filename)
// TODO: implementation?
}
// Destructor. The body is intentionally empty for now; no explicit cleanup is
// performed here.
MWEReader::~MWEReader()
{
// TODO: implementation
}
// Hands out the next token exactly as produced by the wrapped inner reader.
//
// MWE processing is not applied on this path yet. The planned behaviour is
// to fetch a whole sentence, process it, and then return its tokens one by
// one.
Token* MWEReader::get_next_token()
{
    Token* const next_token = inner_reader_->get_next_token();
    return next_token;
}
// Reads the next sentence from the inner reader and runs MWE processing on it.
//
// Returns the sentence produced by process_sentence(). When the inner reader
// hands back a null pointer, that null pointer is returned unchanged.
Sentence::Ptr MWEReader::get_next_sentence()
{
    Sentence::Ptr pSentence = inner_reader_->get_next_sentence();
    // A null sentence (presumably the end-of-input marker -- confirm against
    // the TokenReader contract) must not be wrapped in a SentenceContext:
    // process_sentence() immediately calls sc.size() and sc.at() on it.
    if (!pSentence) {
        return pSentence;
    }
    Wccl::SentenceContext sc(pSentence);
    return process_sentence(sc);
}
// Scans the sentence held by `sc` for multi-word expressions (MWEs) and merges
// each match into a single token.
//
// For every token, the lemma of each disambiguated lexeme is looked up in
// mwe_index_. Each candidate lexical unit is tested with IsHere(); on a match
// the matched tokens' orths are concatenated, all matched tokens except the
// head are deleted, and the head token receives the combined orth.
//
// Returns the sentence pointer held by `sc` (the sentence is modified in place).
//
// NOTE(review): this block looks damaged in this copy of the file: `head` is
// used but never declared, several closing braces are missing, and two lines
// of non-code text appear near the end. Confirm against the repository version
// before relying on the details below.
Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc)
{
for(int i = 0; i < sc.size(); ++i){
// Point the context at the token currently being examined.
sc.set_position(i);
Corpus2::Token *pToken = sc.at(i);
//std::cout << pToken->orth_utf8() << " ";
std::vector<Lexeme>& lexemes = pToken->lexemes();
foreach(const Lexeme& lex, lexemes){
// Only lexemes flagged as disambiguated are used for the index lookup.
if(lex.is_disamb()){
std::string base = lex.lemma_utf8();
// Candidate lexical units the MWE index returns for this lemma.
const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
// Debug output: number of candidates found for this lemma.
std::cout << "potential " << potential.size() << std::endl;
foreach(LexicalUnit::Ptr pLU, potential){
std::set<int> positions;
// NOTE(review): presumably IsHere() fills `positions` with the matched token
// indices and `head` with the index of the head token -- `head` has no visible
// declaration here; verify.
bool is_here = pLU->IsHere(sc, positions, head);
if(is_here){
std::string new_orth_utf8;
Corpus2::Sentence::Ptr sent = sc.get_sentence_ptr();
std::vector<Token*> &tokens = sent->tokens();
// Build the merged orth from all matched tokens (each followed by a space, so
// the result ends with a trailing space) and delete every matched token except
// the head.
foreach(const int &pos, positions){
Token* tok = tokens [pos];
new_orth_utf8 += tok->orth_utf8() + " ";
if(pos != head){
delete tok;
// Mark the slot as empty; the NULL entries are erased from the vector below.
tokens[pos] = NULL;
// NOTE(review): the outer index is decremented when the removed position is at
// or after i; removals *before* i are the ones that shift the current token --
// confirm the direction of this comparison.
if(pos >= i)
i--;
}
}
// The head token survives and takes over the combined orth.
Corpus2::Token *tok = (*sent)[head];
tok->set_orth_utf8(new_orth_utf8);
// NOTE(review): `lex` is bound as a const reference above -- confirm that
// set_lemma_utf8() can be called on it.
lex.set_lemma_utf8(pLU->get_base());
// Drop the slots of the deleted tokens (erase-remove idiom).
// NOTE(review): this runs while the lexemes of pToken are still being iterated;
// if pToken was one of the deleted tokens, `lexemes` would dangle -- verify.
tokens.erase(std::remove(tokens.begin(), tokens.end(), (Token*)NULL), tokens.end());
// NOTE(review): the next two lines are not C++ -- they look like residue from a
// version-control web view and should be removed once confirmed.
ilor
committed
}
return sc.get_sentence_ptr();
}
// Hands out the next chunk exactly as produced by the wrapped inner reader.
//
// The sentences inside the chunk are not MWE-processed on this path yet. The
// planned behaviour is to fetch the chunk, process each of its sentences, and
// return the processed chunk.
boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
{
    boost::shared_ptr<Chunk> next_chunk = inner_reader_->get_next_chunk();
    return next_chunk;
}
// Applies a reader option given as a "name:value" string.
//
// Two forms are visible here:
//   - an inner-reader specification, from which the first 6 characters are
//     stripped before it is passed to create_path_reader();
//   - "mwefile:<path>", which loads MWE definitions from <path> via load_mwes().
//
// Throws Corpus2Error when an "mwefile:" option is handled while no inner
// reader has been created.
//
// NOTE(review): this block appears to have lost lines in this copy of the
// file: the condition guarding the first branch is missing (presumably a
// starts_with(option, "inner:") check, matching the 6-character prefix), and
// the brace after create_path_reader() closes the function early. Confirm
// against the repository version.
void MWEReader::set_option(const std::string& option)
{
// Strip the 6-character prefix; the remainder names the inner reader to create.
std::string inner = option.substr(6);
inner_reader_ = create_path_reader(inner, this->tagset(),
inner_filename_);
}
if(boost::algorithm::starts_with(option, "mwefile:")) {
// Everything after the 8-character "mwefile:" prefix is the MWE file name.
std::string mwefile = option.substr(8);
load_mwes(mwefile);
// NOTE(review): this check runs only after load_mwes() has already been
// called -- confirm whether it was meant to come first.
if(inner_reader_ == NULL)
throw Corpus2Error("Inner reader not initialised.");
// TODO MWE stuff
}
// Returns the value of a reader option.
//
// An option starting with "inner:" is echoed back unchanged when an inner
// reader is set; every other query is forwarded to the inner reader.
//
// NOTE(review): the closing brace of this function is not present in this copy
// of the file -- confirm against the repository version.
std::string MWEReader::get_option(const std::string& option) const
{
// "inner:"-prefixed options are returned as-is, provided an inner reader exists.
if(boost::algorithm::starts_with(option, "inner:")
&& inner_reader_ != NULL)
return option;
// TODO options for MWE
// NOTE(review): inner_reader_ is dereferenced here without the NULL check used
// in the condition above -- verify that callers always set an inner reader first.
return inner_reader_->get_option(option);
void MWEReader::load_mwes(const std::string &filename)
{