Newer
Older
/*
Copyright (C) 2011 Adam Wardyński, Tomasz Śniatowski, Paweł Kędzia,
Adam Radziszewski, Bartosz Broda
Part of the WCCL project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#include <boost/algorithm/string.hpp>
namespace Corpus2{
// Static member initialised with the result of registering MWEReader through
// TokenReader::register_path_reader.
// NOTE(review): the argument list of this call is cut off in this listing.
bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
// Constructor taking a tagset and a file name: the tagset is passed to the
// TokenReader base and the file name is stored in inner_filename_.
// NOTE(review): the constructor body is not visible in this listing.
MWEReader::MWEReader(const Tagset &tagset, const std::string &filename)
: TokenReader(tagset), inner_filename_(filename)
/**
 * Constructs an MWE reader that wraps an already created inner reader.
 *
 * @param tagset   tagset passed on to the TokenReader base
 * @param filename stored in inner_filename_
 * @param reader   reader used as the source of sentences and chunks
 */
MWEReader::MWEReader(const Tagset &tagset, const std::string &filename, TokenReaderPtr reader)
    : TokenReader(tagset), inner_filename_(filename)
{
    mwes_counter = 0;
    inner_reader_ = reader;
    // Mirror the reset that set_option() performs when it creates the inner
    // reader itself, so that get_next_token() never starts from an
    // indeterminate index or a null sentence when the reader is injected.
    token_index = 0;
    currentSentence = boost::make_shared<Sentence>();
}
// Destructor; it currently contains no cleanup code of its own.
MWEReader::~MWEReader()
{
// TODO: implementation
}
/**
 * Returns the next token of the MWE-processed stream.
 *
 * Tokens are served one by one from currentSentence; when that sentence is
 * exhausted (or not available yet) further sentences are pulled through
 * get_next_sentence().
 *
 * @return pointer to the next token, or NULL when get_next_sentence()
 *         yields a null sentence.
 *
 * NOTE(review): the returned pointer refers to a token stored in
 * currentSentence; confirm the ownership contract expected by callers.
 */
Token* MWEReader::get_next_token()
{
    // Keep fetching until there is a sentence with an unread token. This
    // covers a null currentSentence (e.g. a call made after the end of
    // input was already reported) and empty sentences, both of which
    // previously led to a null dereference or an out_of_range from at().
    while(!currentSentence || token_index >= currentSentence->tokens().size())
    {
        currentSentence = get_next_sentence();
        token_index = 0;
        if(!currentSentence)
        {
            return NULL;
        }
    }
    // The index was bounds-checked by the loop condition above; accessing
    // the vector in place also avoids copying it on every call.
    return currentSentence->tokens()[token_index++];
}
// Reads the next sentence from the inner reader into currentSentence and
// returns the result of running process_sentence() on it.
// NOTE(review): the closing brace of this function is not visible in this
// listing.
Sentence::Ptr MWEReader::get_next_sentence()
{
currentSentence = inner_reader_->get_next_sentence();
// A null sentence from the inner reader is returned as-is, without
// processing (presumably this marks the end of input).
if(currentSentence==0)
return currentSentence;
return process_sentence(currentSentence);
// Looks for a multi-word expression in the given sentence. On the first
// match the sentence is rebuilt with clone_sentence_add_mwe() and the
// rebuilt sentence is processed again recursively; when nothing matches,
// the sentence is returned unchanged.
// NOTE(review): this listing is missing lines (the opening brace, the loop
// that defines `i`, the statement guarded by the empty-lexemes check and
// several closing braces) - consult the complete file before editing.
Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence)
Wccl::SentenceContext sc(sentence);
Corpus2::Token *pToken = sc.at(i);
std::vector<Lexeme>& lexemes = pToken->lexemes();
if(lexemes.size() == 0)
foreach(const Lexeme& lex, lexemes){
// Only lexemes marked as disambiguated are used as lookup keys.
if(lex.is_disamb()){
std::string base = lex.lemma_utf8();
// Candidate lexical units registered in the index for this lemma.
const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
foreach(LexicalUnit::Ptr pLU, potential){
// positions and head are handed to IsHere() and used below only when it
// returns true; presumably IsHere() fills them in - TODO confirm.
std::set<int> positions;
int head;
bool is_here = pLU->IsHere(sc, positions, head);
if(is_here)
// Restart on the rebuilt sentence.
return process_sentence(clone_sentence_add_mwe(sentence, head, positions, pLU->get_base()));
return sentence;
}
/**
 * Builds a new sentence in which the tokens at the positions listed in
 * `all` are represented by a single token placed at index `head`.
 *
 * The token at `head` is cloned, receives the orth produced by
 * get_new_orth_utf8() for `all`, and every lexeme of it that is marked as
 * disamb gets `new_base` as its lemma. Tokens at the remaining positions of
 * `all` are left out; all other tokens are cloned unchanged, in order.
 *
 * @param sentence source sentence; its tokens are only cloned here
 * @param head     index of the token that carries the merged unit
 * @param all      indices of all tokens that belong to the unit
 * @param new_base lemma assigned to the disamb lexemes of the merged token
 * @return the newly created sentence
 */
Sentence::Ptr MWEReader::clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence,
        int head, const std::set<int>& all,
        const std::string &new_base)
{
    const std::string merged_orth = get_new_orth_utf8(sentence, all);
    Sentence::Ptr result = boost::make_shared<Sentence>();
    std::vector<Token*> &source = sentence->tokens();
    const int count = static_cast<int>(source.size());

    for(int idx = 0; idx < count; ++idx){
        if(idx != head){
            // Positions that belong to the unit (other than head) are dropped.
            if(all.count(idx) == 0)
                result->append(source[idx]->clone());
            continue;
        }

        // Head position: emit the merged token.
        Corpus2::Token *merged = source[idx]->clone();
        merged->set_orth_utf8(merged_orth);
        foreach(Lexeme& lexeme, merged->lexemes()){
            if(lexeme.is_disamb())
                lexeme.set_lemma_utf8(new_base);
        }
        result->append(merged);
    }
    return result;
}
/**
 * Joins the orths of the tokens at the given positions with single spaces.
 *
 * Positions are visited in the iteration order of the set (ascending).
 *
 * @param sentence sentence whose tokens are read
 * @param all      indices of the tokens to join; each is used to index
 *                 sentence->tokens() directly, so it must be a valid index
 * @return the joined orth; an empty string when `all` is empty
 */
std::string MWEReader::get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
        const std::set<int>& all)
{
    std::string new_orth;
    std::vector<Token*> &tokens = sentence->tokens();
    // The separator is written before every element except the first, so no
    // trailing space has to be erased afterwards. The former
    // erase(size() - 1, 1) wrapped around and threw std::out_of_range when
    // `all` was empty.
    bool first = true;
    foreach(const int &pos, all){
        if(!first)
            new_orth += " ";
        first = false;
        new_orth += tokens[pos]->orth_utf8();
    }
    return new_orth;
}
/**
 * Reads the next chunk from the inner reader into currentChunk and returns
 * a newly created chunk holding the result of process_sentence() for each
 * of its sentences, in the same order.
 *
 * @return the new chunk, or the null pointer obtained from the inner reader
 *
 * NOTE(review): only the sentences are carried over to the new chunk; check
 * whether Chunk holds other data that should be copied as well.
 */
boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
{
    currentChunk = inner_reader_->get_next_chunk();
    if(currentChunk){
        boost::shared_ptr<Chunk> processed = boost::make_shared<Chunk>();
        foreach(Corpus2::Sentence::Ptr original, currentChunk->sentences()){
            Sentence::Ptr rewritten = process_sentence(original);
            processed->append(rewritten);
        }
        return processed;
    }
    // Nothing was read: hand the null pointer back to the caller.
    return currentChunk;
}
// Applies a reader option given as a prefixed string. Prefixes visible here:
//   "mwefile:<path>"           - a single MWE file
//   "mwefile-list:<p1> <p2>.." - space-separated MWE files, loaded in order
// The first statements create the inner reader from the text after a
// 6-character prefix and reset the token cursor.
// Throws std::runtime_error when a listed file does not exist.
// NOTE(review): this listing is missing lines (the condition guarding the
// inner-reader branch, the rest of the create_path_reader call, the body of
// the single-file exists() branch and several braces). The two Corpus2Error
// checks at the end may belong to a separate definition - consult the
// complete file before editing.
void MWEReader::set_option(const std::string& option)
{
// substr(6) skips a 6-character prefix - presumably "inner:" - TODO confirm.
std::string inner = option.substr(6);
inner_reader_ = create_path_reader(inner, this->tagset(),
token_index=0;
// Start from an empty sentence.
currentSentence= boost::make_shared<Sentence>();
}
if(boost::algorithm::starts_with(option, "mwefile:")) {
// Drop the 8-character "mwefile:" prefix.
std::string mwefile = option.substr(8);
if(boost::filesystem::exists(mwefile)){
else
throw std::runtime_error("File "+ mwefile + " does not exists");
}
if(boost::algorithm::starts_with(option, "mwefile-list:")) {
// Drop the 13-character "mwefile-list:" prefix.
std::string mwefile = option.substr(13);
size_t found=mwefile.find(" ",0);
// Each iteration consumes the path in front of the next space.
while(found!=std::string::npos)
{
std::string file = mwefile.substr(0,found);
if(boost::filesystem::exists(file))
load_mwes(file);
else
// NOTE(review): the message is built from `mwefile` (the remaining list),
// not from `file`, which is the path that failed the check.
throw std::runtime_error("File "+ mwefile +" does not exists");
mwefile=mwefile.substr(found+1);
found=mwefile.find(" ",0);
}
// What is left after the loop is the last (or only) path of the list.
if(boost::filesystem::exists(mwefile))
load_mwes(mwefile);
else
throw std::runtime_error("File "+ mwefile +" does not exists");
// Sanity checks: an inner reader must be set and mwes_counter must be
// non-zero.
if(inner_reader_ == NULL)
throw Corpus2Error("Inner reader not initialised.");
if(mwes_counter==0)
throw Corpus2Error("MWE files were not loaded");
}
// Looks up an option. An option string that starts with "inner:" is
// returned unchanged when an inner reader is set; every other query is
// forwarded to the inner reader.
// NOTE(review): the closing brace of this function is not visible in this
// listing.
std::string MWEReader::get_option(const std::string& option) const
{
if(boost::algorithm::starts_with(option, "inner:")
&& inner_reader_ != NULL)
return option;
// TODO options for MWE
// NOTE(review): inner_reader_ is dereferenced here without a null check,
// although the condition above treats a NULL inner reader as possible.
return inner_reader_->get_option(option);
void MWEReader::load_mwes(const std::string &filename)
{
if(parser.get_tagset().name() != tagset().name())
throw std::runtime_error( "Tagset in mwe file does not match reader tagset!" );