/*
    Copyright (C) 2011 Adam Wardyński, Tomasz Śniatowski, Paweł Kędzia,
    Adam Radziszewski, Bartosz Broda
    Part of the WCCL project

    This program is free software; you can redistribute it and/or modify it
    under the terms of the GNU General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option)
    any later version.

    This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE.

    See the LICENSE and COPYING files for more details.
*/

#include "mwereader.h"
#include "mweparser.h"
#include <boost/algorithm/string.hpp>
#include <boost/filesystem.hpp>

namespace Corpus2{

bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
    "mwereader","inner,mwefile"); // TODO more help?

/**
 * Creates a reader without an inner reader. The inner reader is created
 * later by set_option("inner:...") using the stored file name.
 */
MWEReader::MWEReader(const Tagset &tagset, const std::string &filename)
    : TokenReader(tagset), inner_filename_(filename)
{
    mwes_counter = 0;
    // get_next_token() reads token_index, so it must never be indeterminate.
    token_index = 0;
}

/**
 * Creates a reader wrapping an already constructed inner reader.
 */
MWEReader::MWEReader(const Tagset &tagset, const std::string &filename,
                     TokenReaderPtr reader)
    : TokenReader(tagset), inner_filename_(filename)
{
    mwes_counter = 0;
    // get_next_token() reads token_index, so it must never be indeterminate.
    token_index = 0;
    inner_reader_ = reader;
}

MWEReader::~MWEReader()
{
    // TODO implementation
}

/**
 * Returns the next token of the MWE-processed stream, or NULL when the
 * inner reader has no more sentences.
 *
 * Tokens are served from a buffered sentence (currentSentence); when it is
 * exhausted (or not yet fetched) the next sentence is read and processed.
 * Sentences that contain no tokens are skipped.
 *
 * NOTE(review): the returned pointer refers to a token that is still stored
 * in the buffered sentence. Confirm against the TokenReader contract whether
 * callers expect to own the token (a clone() would then be needed here).
 */
Token* MWEReader::get_next_token()
{
    // Loop instead of a single refill: the buffer may be null (no sentence
    // read yet or end of input already reported) and a freshly read sentence
    // may be empty.
    while (!currentSentence
           || token_index >= currentSentence->tokens().size()) {
        currentSentence = get_next_sentence();
        token_index = 0;
        if (!currentSentence) {
            return NULL;
        }
    }
    return currentSentence->tokens()[token_index++];
}

/**
 * Reads the next sentence from the inner reader and returns it with the
 * known multi-word expressions merged. Returns a null pointer when the
 * inner reader returns one.
 *
 * The token buffer used by get_next_token() is intentionally left
 * untouched, so mixing both calls does not corrupt token iteration.
 */
Sentence::Ptr MWEReader::get_next_sentence()
{
    Sentence::Ptr sentence = inner_reader_->get_next_sentence();
    if (!sentence) {
        return sentence;
    }
    return process_sentence(sentence);
}

/**
 * Looks for multi-word expressions in the sentence and merges them.
 *
 * For every position, the lemma of each disambiguated lexeme is used to
 * fetch candidate lexical units from the MWE index. The first unit whose
 * IsHere() check succeeds is merged into a new sentence, and that new
 * sentence is processed again from the start. When nothing matches, the
 * sentence is returned as is.
 *
 * @param sentence sentence to scan
 * @return the input sentence when nothing matched, otherwise a rebuilt one
 */
Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence)
{
    Wccl::SentenceContext sc(sentence);

    for (int i = 0; i < sc.size(); ++i) {
        sc.set_position(i);
        Corpus2::Token *pToken = sc.at(i);
        std::vector<Lexeme>& lexemes = pToken->lexemes();

        foreach (const Lexeme& lex, lexemes) {
            if (!lex.is_disamb()) {
                continue;
            }
            const std::string base = lex.lemma_utf8();
            const MWEIndex::luvec& potential =
                mwe_index_.get_potential_lu(base);
            foreach (LexicalUnit::Ptr pLU, potential) {
                std::set<int> positions;
                int head = -1;
                if (pLU->IsHere(sc, positions, head)) {
                    // Restart on the rebuilt sentence: positions shift after
                    // a merge, and further units may match the new sentence.
                    return process_sentence(clone_sentence_add_mwe(
                        sentence, head, positions, pLU->get_base()));
                }
            }
        }
    }
    return sentence;
}

/**
 * Builds a new sentence in which the tokens at positions `all` are replaced
 * by a single token placed at position `head`.
 *
 * The head token is cloned, gets the joined orth of all member tokens, and
 * its disambiguated lexemes get `new_base` as the lemma. Tokens outside
 * `all` are cloned unchanged; the remaining members of `all` are dropped.
 *
 * @param sentence source sentence (its tokens are cloned, not moved)
 * @param head     index of the token that represents the merged unit
 * @param all      indices of every token belonging to the unit
 * @param new_base lemma assigned to the merged token
 */
Sentence::Ptr MWEReader::clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence,
                                                int head,
                                                const std::set<int>& all,
                                                const std::string &new_base)
{
    const std::string new_orth = get_new_orth_utf8(sentence, all);
    Sentence::Ptr new_sentence = boost::make_shared<Sentence>();
    std::vector<Token*> &tokens = sentence->tokens();
    const int token_count = static_cast<int>(tokens.size());

    for (int i = 0; i < token_count; ++i) {
        if (i == head) {
            Corpus2::Token *merged = tokens[i]->clone();
            merged->set_orth_utf8(new_orth);
            foreach (Lexeme& lex, merged->lexemes()) {
                if (lex.is_disamb()) {
                    lex.set_lemma_utf8(new_base);
                }
            }
            new_sentence->append(merged);
        } else if (all.find(i) == all.end()) {
            new_sentence->append(tokens[i]->clone());
        }
        // else: a non-head member of the unit, skipped on purpose
    }
    return new_sentence;
}

/**
 * Joins the orths of the tokens at positions `all` (in ascending position
 * order) with single spaces. Returns an empty string for an empty set.
 */
std::string MWEReader::get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
                                         const std::set<int>& all)
{
    std::string new_orth;
    std::vector<Token*> &tokens = sentence->tokens();
    bool first = true;

    // Separator goes before every element but the first; unlike trimming a
    // trailing space afterwards, this is also safe when `all` is empty.
    foreach (const int &pos, all) {
        if (!first) {
            new_orth += " ";
        }
        first = false;
        new_orth += tokens[pos]->orth_utf8();
    }
    return new_orth;
}

/**
 * Reads the next chunk from the inner reader and returns a new chunk made
 * of its sentences after MWE processing. Returns a null pointer when the
 * inner reader returns one.
 *
 * NOTE(review): only the sentences are carried over to the new chunk;
 * verify whether other chunk data should be copied as well.
 */
boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
{
    currentChunk = inner_reader_->get_next_chunk();
    if (!currentChunk) {
        return currentChunk;
    }

    boost::shared_ptr<Chunk> new_chunk = boost::make_shared<Chunk>();
    foreach (Corpus2::Sentence::Ptr sentence, currentChunk->sentences()) {
        new_chunk->append(process_sentence(sentence));
    }
    return new_chunk;
}

/**
 * Handles reader options:
 *  - "inner:NAME"          creates the inner reader NAME for the stored
 *                          file name and resets the token buffer,
 *  - "mwefile:PATH"        loads one MWE file (PATH is trimmed),
 *  - "mwefile-list:A B C"  loads several MWE files separated by single
 *                          spaces.
 * Other options are ignored here.
 *
 * @throws std::runtime_error when a listed file does not exist (and
 *         whatever load_mwes() throws)
 */
void MWEReader::set_option(const std::string& option)
{
    if (boost::algorithm::starts_with(option, "inner:")) {
        const std::string inner = option.substr(6);
        inner_reader_ = create_path_reader(inner, this->tagset(),
                                           inner_filename_);
        token_index = 0;
        currentSentence = boost::make_shared<Sentence>();
    } else if (boost::algorithm::starts_with(option, "mwefile:")) {
        std::string mwefile = option.substr(8);
        boost::algorithm::trim(mwefile);
        if (!boost::filesystem::exists(mwefile)) {
            throw std::runtime_error("File "+ mwefile + " does not exists");
        }
        load_mwes(mwefile);
    } else if (boost::algorithm::starts_with(option, "mwefile-list:")) {
        const std::string list = option.substr(13);
        std::string::size_type begin = 0;
        // Walk the list piece by piece; an empty piece (e.g. from a
        // trailing or doubled space) is reported as a missing file.
        while (begin <= list.size()) {
            std::string::size_type end = list.find(' ', begin);
            if (end == std::string::npos) {
                end = list.size();
            }
            const std::string file = list.substr(begin, end - begin);
            if (!boost::filesystem::exists(file)) {
                // Report the entry that is missing, not the rest of the list.
                throw std::runtime_error("File "+ file +" does not exists");
            }
            load_mwes(file);
            begin = end + 1;
        }
    }
    // TODO more MWE stuff
}

/**
 * Checks that the reader is usable.
 *
 * @throws Corpus2Error when there is no inner reader or no MWE file has
 *         been loaded
 */
void MWEReader::validate()
{
    if (!inner_reader_) {
        throw Corpus2Error("Inner reader not initialised.");
    }
    if (mwes_counter == 0) {
        throw Corpus2Error("MWE files were not loaded");
    }
    // TODO MWE stuff
}

/**
 * Returns the value of an option. Without an inner reader an empty string
 * is returned. Options starting with "inner:" are echoed back unchanged;
 * everything else is forwarded to the inner reader.
 */
std::string MWEReader::get_option(const std::string& option) const
{
    if (!inner_reader_) {
        // Nothing to echo or to forward to.
        return std::string();
    }
    if (boost::algorithm::starts_with(option, "inner:")) {
        return option;
    }
    // TODO options for MWE
    return inner_reader_->get_option(option);
}

/**
 * Parses an MWE file into the index and counts it as loaded.
 *
 * The file is counted only after the tagset check passes, so validate()
 * does not treat a rejected file as loaded.
 * NOTE(review): the parser writes into mwe_index_ before the tagset can be
 * compared, so entries of a rejected file presumably stay in the index.
 *
 * @throws std::runtime_error when the tagset name of the file differs from
 *         the reader tagset name
 */
void MWEReader::load_mwes(const std::string &filename)
{
    MWEParser parser(mwe_index_);
    parser.parse_file(filename);

    if (parser.get_tagset().name() != tagset().name()) {
        throw std::runtime_error(
            "Tagset in mwe file does not match reader tagset!");
    }
    ++mwes_counter;
}

}// ns Corpus2