/*
    Copyright (C) 2011 Adam Wardyński, Tomasz Śniatowski, Paweł Kędzia,
    Adam Radziszewski, Bartosz Broda
    Part of the WCCL project

    This program is free software; you can redistribute it and/or modify it
    under the terms of the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option)
    any later version.

    This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE.

    See the LICENSE, COPYING.LESSER and COPYING files for more details.
*/

#include "mwereader.h"
#include "mweparser.h"
#include <boost/algorithm/string.hpp>
#include <boost/filesystem.hpp>
#include <boost/unordered_set.hpp>
#include <boost/lexical_cast.hpp>

namespace Corpus2{

typedef boost::shared_ptr<Wccl::SentenceContext> SentenceContextPtr;
typedef boost::shared_ptr<AnnotatedSentence> AnnotatedSentencePtr;
typedef boost::shared_ptr<TokenMetaData> TokenMetaDataPtr;
typedef std::map<std::string, AnnotationChannel> ChanMapT;

bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
	"mwereader","inner,mwefile"); // TODO more help?

/**
 * Creates a reader without an inner reader; the inner reader is created
 * later by reset() (triggered by the "inner:" option or by setFile()).
 */
MWEReader::MWEReader(const Tagset &tagset, const std::string &filename)
	: TokenReader(tagset), inner_filename_(filename)
{
	mwes_counter = 0;
	token_index = 0;
	// NOTE(review): 'annotate' used to be left unset here although
	// process_sentence() reads it; token-merging mode is assumed to be the
	// intended default until the "annotations:" option says otherwise.
	annotate = false;
	chan_ann_name = "mwe";
}

/**
 * Creates a reader that pulls its input from an already constructed
 * inner reader.
 */
MWEReader::MWEReader(const Tagset &tagset, const std::string &filename,
		TokenReaderPtr reader)
	: TokenReader(tagset), inner_filename_(filename)
{
	mwes_counter = 0;
	token_index = 0;
	// NOTE(review): see the other constructor -- assumed default.
	annotate = false;
	inner_reader_ = reader;
	chan_ann_name = "mwe";
}

/// Switches the input file and re-creates the inner reader via reset().
void MWEReader::setFile(const std::string &filename)
{
	inner_filename_ = filename;
	reset();
}

MWEReader::~MWEReader()
{
	// TODO implementation
}

/// Sets the name of the annotation channel used when annotating MWEs.
void MWEReader::set_annotation_channel(const std::string & chan_name)
{
	chan_ann_name = chan_name;
}

/// Name of the token metadata attribute holding the MWE base:
/// the annotation channel name with "_base" appended.
std::string MWEReader::get_annotation_channel_base_name()
{
	return chan_ann_name + "_base";
}

/**
 * Returns a clone of the next token of the processed stream, fetching
 * further sentences through get_next_sentence() as needed.
 *
 * @return a newly cloned token, or NULL once get_next_sentence() yields
 *         no more sentences.
 */
Token* MWEReader::get_next_token()
{
	// currentSentence may be null (reader-taking constructor, or after the
	// end of input has been reached) and a fetched sentence may hold no
	// tokens at all, hence a loop instead of a single refill.
	while (!currentSentence
			|| token_index >= currentSentence->tokens().size()) {
		currentSentence = get_next_sentence();
		token_index = 0;
		if (!currentSentence) {
			return NULL;
		}
	}
	return currentSentence->tokens()[token_index++]->clone();
}

/**
 * Reads the next sentence from the inner reader and runs MWE processing
 * on it.
 *
 * @return the processed sentence, or a null pointer when the inner reader
 *         returned none. currentSentence is overwritten with the raw
 *         sentence read from the inner reader.
 */
Sentence::Ptr MWEReader::get_next_sentence()
{
	currentSentence = inner_reader_->get_next_sentence();
	if (!currentSentence) {
		return currentSentence;
	}
	return process_sentence(currentSentence);
}

/**
 * Marks a recognised MWE in the annotation channel of the sentence.
 *
 * @param sentence_ctx      context whose sentence is annotated; its position
 *                          is moved forward by all.size() when that stays
 *                          inside the sentence
 * @param head              index of the MWE head token
 * @param all               indices of all tokens forming the MWE
 * @param annotation_number segment number used where no segment is set yet
 * @param new_base          MWE base, stored in the head token's metadata
 */
void MWEReader::add_mwe_channel(SentenceContextPtr sentence_ctx,
		int head, const std::set<int>& all, int annotation_number,
		const std::string &new_base)
{
	Corpus2::Sentence::Ptr sentence = sentence_ctx->get_sentence_ptr();
	AnnotatedSentencePtr ann_sentence =
			AnnotatedSentence::wrap_sentence(sentence);
	std::vector<Token*> &tokens = ann_sentence->tokens();

	// create 'mwe' channel if not exists
	{
		// bound by reference: only a key lookup is needed, not a copy
		const ChanMapT& chan_map = ann_sentence->all_channels();
		if (chan_map.find(chan_ann_name) == chan_map.end()) {
			ann_sentence->create_channel(chan_ann_name);
		}
	}
	AnnotationChannel& channel = ann_sentence->get_channel(chan_ann_name);

	// if the head already carries a segment number, we keep it;
	// if not, the head gets the new annotation number
	// NOTE(review): annotation_number restarts for every sentence in
	// process_sentence(); confirm it cannot collide with segment numbers
	// already present in the channel.
	int head_ann_num = channel.get_segment_at(head);
	if (head_ann_num <= 0) {
		head_ann_num = annotation_number;
	}
	channel.set_segment_at(head, head_ann_num);

	// create metadata if not exists, for the '<channel>_base' property
	if (!tokens[head]->get_metadata()) {
		tokens[head]->create_metadata();
	}
	TokenMetaDataPtr md = tokens[head]->get_metadata();
	md->set_attribute(get_annotation_channel_base_name(), new_base);

	// annotate mwe elements with the annotation number of the head,
	// leaving already assigned segment numbers untouched
	for (std::set<int>::const_iterator pos_it = all.begin();
			pos_it != all.end(); ++pos_it) {
		int ann_num = channel.get_segment_at(*pos_it);
		if (ann_num <= 0) {
			ann_num = head_ann_num;
		}
		channel.set_segment_at(*pos_it, ann_num);
	}

	// move context position to next token after MWE elements
	const int curr_position = sentence_ctx->get_position();
	const size_t mwe_length = all.size();
	if (static_cast<size_t>(curr_position) + mwe_length < sentence->size()) {
		sentence_ctx->set_position(
				curr_position + static_cast<int>(mwe_length));
	}
}

/**
 * Looks for MWEs in a sentence and either annotates them (annotate mode)
 * or merges their tokens into one token (default mode).
 *
 * @param sentence sentence to process
 * @return the sentence held by the working context after processing; in
 *         merge mode this is a rebuilt sentence whenever an MWE was found.
 */
Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence)
{
	// lemmas of all disamb lexemes in the sentence; a lexical unit is only
	// tried when every base it may need is among them
	boost::unordered_set<std::string> available_bases;
	BOOST_FOREACH (Token* tok, sentence->tokens()) {
		BOOST_FOREACH (const Lexeme& lex, tok->lexemes()) {
			if (lex.is_disamb()) {
				available_bases.insert(lex.lemma_utf8());
			}
		}
	}

	// TODO: pass annotated sentence to methods
	// AnnotatedSentencePtr ann_sentence = AnnotatedSentence::wrap_sentence(sentence);
	// NOTE(review): add_mwe_channel() annotates the object returned by
	// AnnotatedSentence::wrap_sentence(), while this function returns the
	// context's own sentence pointer -- confirm both refer to the same
	// object when the input is not already an AnnotatedSentence.

	int annotation_number = 0;
	SentenceContextPtr sc = boost::make_shared<Wccl::SentenceContext>(sentence);
	for (sc->goto_start(); sc->is_current_inside(); sc->advance()) {
		// In merge mode 'sc' may be replaced below while 'lexemes' is still
		// being iterated. Holding the sentence here keeps it (and,
		// presumably, the tokens it owns) alive for the whole iteration.
		const Sentence::Ptr keep_alive = sc->get_sentence_ptr();

		Corpus2::Token *pToken = sc->current();
		std::vector<Lexeme>& lexemes = pToken->lexemes();
		if (lexemes.size() == 0) {
			continue;
		}

		BOOST_FOREACH (const Lexeme& lex, lexemes) {
			if (!lex.is_disamb()) {
				continue;
			}
			std::string lemma = lex.lemma_utf8();
			const MWEIndex::luvec& potential =
					mwe_index_.get_potential_lu(lemma);
			BOOST_FOREACH (LexicalUnit::Ptr pLU, potential) {
				bool all_bases_present = true;
				BOOST_FOREACH (const std::string & needed_base,
						pLU->get_potential_bases()) {
					if (available_bases.find(needed_base)
							== available_bases.end()) {
						all_bases_present = false;
						break;
					}
				}
				if (!all_bases_present) {
					continue;
				}

				std::set<int> positions;
				int head = -1;
				if (!pLU->IsHere(sc, positions, head)) {
					continue;
				}

				if (annotate) {
					add_mwe_channel(sc, head, positions,
							++annotation_number, pLU->get_base());
				} else {
					sc = clone_sentence_add_mwe(sc, head, positions,
							pLU->get_base());
				}
			}
		}
	}
	return sc->get_sentence_ptr();
}

/**
 * Builds a copy of the context's sentence in which the MWE tokens are
 * collapsed into the head token.
 *
 * The head clone gets the joined orth of all MWE tokens and new_base as
 * lemma of its disamb lexemes; the other MWE tokens are dropped; all
 * remaining tokens are cloned unchanged.
 *
 * @param sentence context of the sentence to rebuild
 * @param head     index of the MWE head token
 * @param all      indices of all tokens forming the MWE
 * @param new_base lemma to set on the head's disamb lexemes
 * @return context over the new sentence, positioned at the old position
 *         receded once per dropped token that preceded it
 */
SentenceContextPtr MWEReader::clone_sentence_add_mwe(SentenceContextPtr sentence,
		int head, const std::set<int>& all,
		const std::string &new_base)
{
	std::string new_orth = get_new_orth_utf8(sentence->get_sentence_ptr(), all);

	Sentence::Ptr new_sentence = boost::make_shared<AnnotatedSentence>();
	new_sentence->set_id(sentence->get_sentence_ptr()->id());
	SentenceContextPtr new_context =
			boost::make_shared<Wccl::SentenceContext>(new_sentence);
	new_context->set_position(sentence->get_position());

	std::vector<Token*> &tokens = sentence->get_sentence_ptr()->tokens();
	const int token_count = static_cast<int>(tokens.size());
	for (int i = 0; i < token_count; i++) {
		if (i == head) {
			Corpus2::Token * merged = tokens[i]->clone();
			merged->set_orth_utf8(new_orth);
			BOOST_FOREACH (Lexeme& lex, merged->lexemes()) {
				if (lex.is_disamb()) {
					lex.set_lemma_utf8(new_base);
				}
			}
			new_sentence->append(merged);
		} else if (all.find(i) == all.end()) {
			new_sentence->append(tokens[i]->clone());
		} else if (i < sentence->get_position()) {
			// a dropped token before the current position shifts it left
			new_context->recede();
		}
	}
	return new_context;
}

/**
 * Joins the orths of the tokens at the given positions with single spaces,
 * in ascending position order.
 *
 * @return the joined orth; an empty string when 'all' is empty.
 */
std::string MWEReader::get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
		const std::set<int>& all)
{
	std::string new_orth;
	const std::vector<Token*> &tokens = sentence->tokens();
	// Separator goes in front of every element but the first, so nothing
	// has to be erased afterwards (erasing threw for an empty set).
	bool first = true;
	BOOST_FOREACH (const int &pos, all) {
		if (!first) {
			new_orth += " ";
		}
		first = false;
		new_orth += tokens[pos]->orth_utf8();
	}
	return new_orth;
}

/**
 * Reads the next chunk from the inner reader and returns a new chunk that
 * carries the same "id" attribute and the processed sentences.
 *
 * @return the new chunk, or a null pointer when the inner reader has none.
 */
boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
{
	currentChunk = inner_reader_->get_next_chunk();
	if (!currentChunk) {
		return currentChunk;
	}

	boost::shared_ptr<Chunk> new_chunk = boost::make_shared<Chunk>();
	new_chunk->set_attribute("id", currentChunk->get_attribute("id"));
	BOOST_FOREACH (Corpus2::Sentence::Ptr sentence, currentChunk->sentences()) {
		new_chunk->append(process_sentence(sentence));
	}
	return new_chunk;
}

/**
 * Handles reader options:
 *  - "inner:<type>"            inner reader type; re-creates the reader
 *  - "annotations:<bool>"      annotate MWEs instead of merging tokens
 *  - "mwefile:<path>"          load one MWE file
 *  - "mwefile-list:<p1> <p2>"  load several MWE files, space separated
 *
 * @throws std::runtime_error when a named MWE file does not exist.
 */
void MWEReader::set_option(const std::string& option)
{
	if (boost::algorithm::starts_with(option, "inner:")) {
		inner_reader_type = option.substr(6);
		reset();
	}
	if (boost::algorithm::starts_with(option, "annotations:")) {
		annotate = boost::lexical_cast<bool>(option.substr(12));
	}
	if (boost::algorithm::starts_with(option, "mwefile:")) {
		std::string mwefile = option.substr(8);
		boost::algorithm::trim(mwefile);
		if (!boost::filesystem::exists(mwefile)) {
			throw std::runtime_error("File "+ mwefile + " does not exists");
		}
		load_mwes(mwefile);
	}
	if (boost::algorithm::starts_with(option, "mwefile-list:")) {
		std::string remaining = option.substr(13);
		for (;;) {
			const size_t space = remaining.find(" ", 0);
			// substr(0, npos) yields the whole rest: the last entry
			const std::string file = remaining.substr(0, space);
			if (!boost::filesystem::exists(file)) {
				// report the missing file itself, not the rest of the list
				throw std::runtime_error("File "+ file +" does not exists");
			}
			load_mwes(file);
			if (space == std::string::npos) {
				break;
			}
			remaining = remaining.substr(space + 1);
		}
	}
	// TODO more MWE stuff
}

/**
 * Checks that the reader is usable.
 *
 * @throws Corpus2Error when there is no inner reader or no MWE file has
 *         been loaded.
 */
void MWEReader::validate()
{
	if (!inner_reader_) {
		throw Corpus2Error("Inner reader not initialised.");
	}
	if (mwes_counter == 0) {
		throw Corpus2Error("MWE files were not loaded");
	}
	// TODO MWE stuff
}

/**
 * Returns the option string itself for "inner:" options, otherwise the
 * answer of the inner reader. Without an inner reader an empty string is
 * returned (previously a null pointer was dereferenced).
 */
std::string MWEReader::get_option(const std::string& option) const
{
	if (!inner_reader_) {
		return std::string();
	}
	if (boost::algorithm::starts_with(option, "inner:")) {
		return option;
	}
	// TODO options for MWE
	return inner_reader_->get_option(option);
}

/**
 * Parses an MWE file into the MWE index.
 *
 * @throws std::runtime_error when the tagset name of the MWE file differs
 *         from the reader's tagset name.
 */
void MWEReader::load_mwes(const std::string &filename)
{
	MWEParser parser(mwe_index_);
	parser.parse_file(filename);

	if (parser.get_tagset().name() != tagset().name()) {
		throw std::runtime_error(
				"Tagset in mwe file does not match reader tagset!" );
	}
	// counted only after the tagset check, so a rejected file does not make
	// validate() believe that MWEs were loaded
	mwes_counter++;
}

/// Re-creates the inner reader from the stored type and file name and
/// restarts token iteration on an empty sentence.
void MWEReader::reset()
{
	inner_reader_ = create_path_reader(inner_reader_type, this->tagset(),
			inner_filename_);
	token_index = 0;
	currentSentence = boost::make_shared<Sentence>();
}

}// ns Corpus2