#include "mwereader.h"
#include "mweparser.h"

#include <algorithm>
#include <set>
#include <string>
#include <vector>

#include <boost/algorithm/string.hpp>

namespace Corpus2{

namespace {

/**
 * Collapse the tokens at @p positions into the single token at @p head.
 *
 * The head token's orth becomes the space-joined orths of all tokens listed
 * in @p positions (in ascending position order) and the lemma of each of its
 * disamb lexemes is replaced by @p mwe_base. Every other listed token is
 * deleted and removed from @p tokens.
 *
 * @return false (leaving @p tokens untouched) when @p positions is empty or
 *         when @p head or any position does not refer to an existing token;
 *         true when the merge was carried out.
 */
bool merge_mwe_tokens(std::vector<Token*>& tokens,
		const std::set<int>& positions, int head,
		const std::string& mwe_base)
{
	if (positions.empty() || head < 0
			|| static_cast<size_t>(head) >= tokens.size()
			|| tokens[head] == NULL) {
		return false;
	}
	// Validate everything up front so a bad match cannot leave the
	// sentence half-merged.
	foreach (int pos, positions) {
		if (pos < 0 || static_cast<size_t>(pos) >= tokens.size()
				|| tokens[pos] == NULL) {
			return false;
		}
	}

	std::string merged_orth;
	foreach (int pos, positions) {
		Token* tok = tokens[pos];
		merged_orth += tok->orth_utf8() + " ";
		if (pos != head) {
			delete tok;
			tokens[pos] = NULL; // marked for removal below
		}
	}
	// Drop the trailing separator (positions is known to be non-empty).
	merged_orth.erase(merged_orth.size() - 1, 1);

	Token* head_tok = tokens[head];
	head_tok->set_orth_utf8(merged_orth);
	foreach (Lexeme& lex, head_tok->lexemes()) {
		if (lex.is_disamb()) {
			lex.set_lemma_utf8(mwe_base);
		}
	}

	tokens.erase(
		std::remove(tokens.begin(), tokens.end(), static_cast<Token*>(NULL)),
		tokens.end());
	return true;
}

} // anonymous namespace

bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
	"mwereader","inner,mwefile"); // TODO more help?

/**
 * Create a reader for @p filename. The file is not opened here: the inner
 * reader is only created once an "inner:<reader>" option is set.
 */
MWEReader::MWEReader(const Tagset &tagset, const std::string &filename)
	: TokenReader(tagset), inner_filename_(filename)
{
	// Keep the index well-defined even before set_option("inner:...").
	token_index = 0;
	// TODO implementation?
}

MWEReader::~MWEReader()
{
	// TODO implementation
}

/**
 * Return the next token of the MWE-processed stream, or NULL when there is
 * no current sentence or the inner reader has no more sentences.
 *
 * Sentences without tokens are skipped.
 *
 * NOTE(review): the returned pointer is the very object stored in the
 * current sentence, not a copy -- confirm who is expected to own it.
 */
Token* MWEReader::get_next_token()
{
	// Advance until the current sentence has an unread token or input ends.
	// get_next_sentence() resets token_index for every new sentence.
	while (currentSentence
			&& token_index >= currentSentence->tokens().size()) {
		currentSentence = get_next_sentence();
	}
	if (!currentSentence) {
		return NULL;
	}
	return currentSentence->tokens().at(token_index++);
}

/**
 * Read the next sentence from the inner reader and merge the multi-word
 * expressions found in it. Returns a null pointer at end of input.
 *
 * The sentence also becomes the current sentence used by get_next_token(),
 * and the token index is reset.
 */
Sentence::Ptr MWEReader::get_next_sentence()
{
	currentSentence = inner_reader_->get_next_sentence();
	token_index = 0;
	if (!currentSentence) {
		return currentSentence;
	}
	Wccl::SentenceContext sc(currentSentence);
	return process_sentence(sc);
}

/**
 * Scan the sentence held by @p sc and merge, in place, every multi-word
 * expression reported by the MWE index.
 *
 * For each token, the lemmas of its disamb lexemes are looked up in the
 * index; the first candidate unit whose IsHere() check succeeds is merged
 * into its head token. After a merge the scan continues with the token
 * following the current one.
 *
 * @return the (modified) sentence of @p sc.
 */
Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc)
{
	Sentence::Ptr sent = sc.get_sentence_ptr();
	for (int i = 0; i < static_cast<int>(sc.size()); ++i) {
		sc.set_position(i);
		Corpus2::Token *pToken = sc.at(i);
		if (pToken == NULL) {
			continue;
		}
		std::vector<Lexeme>& lexemes = pToken->lexemes();
		bool merged = false;
		for (size_t li = 0; li < lexemes.size() && !merged; ++li) {
			if (!lexemes[li].is_disamb()) {
				continue;
			}
			const std::string base = lexemes[li].lemma_utf8();
			const MWEIndex::luvec& potential =
				mwe_index_.get_potential_lu(base);
			foreach (const LexicalUnit::Ptr& pLU, potential) {
				std::set<int> positions;
				int head = -1;
				if (!pLU->IsHere(sc, positions, head)) {
					continue;
				}
				// Work out how the merge shifts the current index before
				// the token vector is modified.
				int removed_before = 0;
				bool current_removed = false;
				foreach (int pos, positions) {
					if (pos == head) {
						continue;
					}
					if (pos < i) {
						++removed_before;
					} else if (pos == i) {
						current_removed = true;
					}
				}
				if (!merge_mwe_tokens(sent->tokens(), positions, head,
						pLU->get_base())) {
					continue;
				}
				i -= removed_before;
				if (current_removed) {
					// The following token slid into this slot; step back so
					// the loop increment lands on it instead of skipping it.
					--i;
				}
				// pToken and lexemes may refer to a deleted token now, so
				// stop examining them and move on.
				merged = true;
				break;
			}
		}
	}
	return sent;
}

/**
 * Read the next chunk from the inner reader and merge multi-word
 * expressions in each of its sentences. Returns a null pointer at end of
 * input.
 *
 * The first sentence of the chunk (if any) becomes the current sentence and
 * the token index is reset.
 */
boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
{
	currentChunk = inner_reader_->get_next_chunk();
	if (!currentChunk) {
		return currentChunk;
	}

	// Bind by reference: the sentences are modified in place through their
	// shared pointers, so no copy of the vector is needed.
	const std::vector< boost::shared_ptr<Corpus2::Sentence> >& sentences =
		currentChunk->sentences();
	std::vector< boost::shared_ptr<Corpus2::Sentence> >::const_iterator it;
	for (it = sentences.begin(); it != sentences.end(); ++it) {
		if (it == sentences.begin()) {
			currentSentence = *it;
		}
		Wccl::SentenceContext sc(*it);
		process_sentence(sc);
	}
	token_index = 0;

	return currentChunk;
}

/**
 * Handle reader options:
 *  - "inner:<reader>"  creates the inner reader for the file given at
 *                      construction and resets the reading state,
 *  - "mwefile:<path>"  loads MWE definitions from <path>.
 * Other options are ignored.
 */
void MWEReader::set_option(const std::string& option)
{
	if (boost::algorithm::starts_with(option, "inner:")) {
		std::string inner = option.substr(6);
		inner_reader_ = create_path_reader(inner, this->tagset(),
			inner_filename_);
		token_index = 0;
		// Start from an empty sentence so the first get_next_token() call
		// fetches a real one.
		currentSentence = boost::make_shared<Sentence>();
	}
	if (boost::algorithm::starts_with(option, "mwefile:")) {
		std::string mwefile = option.substr(8);
		load_mwes(mwefile);
	}

	// TODO more MWE stuff
}

/**
 * Check that the reader is usable.
 * @throws Corpus2Error when no inner reader has been configured.
 */
void MWEReader::validate()
{
	if (!inner_reader_) {
		throw Corpus2Error("Inner reader not initialised.");
	}
	// TODO MWE stuff
}

/**
 * Return @p option itself for "inner:" options once an inner reader
 * exists; otherwise forward the query to the inner reader.
 *
 * Without an inner reader an empty string is returned.
 * NOTE(review): empty string is assumed to be an acceptable "not set"
 * answer for callers -- confirm against TokenReader::get_option.
 */
std::string MWEReader::get_option(const std::string& option) const
{
	if (!inner_reader_) {
		return std::string();
	}
	if (boost::algorithm::starts_with(option, "inner:")) {
		return option;
	}
	// TODO options for MWE
	return inner_reader_->get_option(option);
}

/**
 * Parse the MWE definition file @p filename into this reader's MWE index.
 */
void MWEReader::load_mwes(const std::string &filename)
{
	MWEParser parser(mwe_index_);
	parser.parse_file(filename);
}

}// ns Corpus2