diff --git a/libmwereader/mwe.cpp b/libmwereader/mwe.cpp index 6facfd33441e833a4fd068d577594dafc954cfff..0d9454d70a9b5271ea8a25e932673dd9a56e26a7 100644 --- a/libmwereader/mwe.cpp +++ b/libmwereader/mwe.cpp @@ -49,8 +49,13 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc, for(variables_map::const_iterator ivars = variables_.begin(); ivars != variables_.end(); ++ivars){ if(!boost::starts_with(ivars->first, "!")){ - //std::cout << ivars->first << " " << std::endl; + /*std::cout << " -- " << base_ << " -- " << ivars->first << " -- " << std::endl; + + for (unsigned i = 0; i < condition_->valid_variable_names().size(); i++) + std::cout << condition_->valid_variable_names()[i] << std::endl;*/ + condition_->set<Wccl::StrSet>(ivars->first, ivars->second); + //std::cout << " -- egi --" << std::endl; } } diff --git a/libmwereader/mweparser.cpp b/libmwereader/mweparser.cpp index 5487097e58519151ee7be81c14d7a9b2cf868a09..2b51ec4b5f37dfa61321e888fe734b7565e62417 100644 --- a/libmwereader/mweparser.cpp +++ b/libmwereader/mweparser.cpp @@ -17,6 +17,8 @@ or FITNESS FOR A PARTICULAR PURPOSE. 
#include "mweparser.h" +#include <algorithm> + #include <boost/foreach.hpp> #include <libcorpus2/tagsetmanager.h> @@ -75,15 +77,28 @@ namespace Corpus2 { wccl_operator_); MWEBuilder::BoolOpPtr head = mwe_builder_->get_head_condition( head_cond_); - if(group_type_ == "fix"){ // group_name_ -> lower case - mwe_index_.add_lexicalunit( LexicalUnit::Ptr(new FixedLU(mwe_base_, main, head, - variables_))); - } else if(group_type_ == "flex"){ - mwe_index_.add_lexicalunit(LexicalUnit::Ptr(new FlexLU(mwe_base_, main, head, - variables_))); - } else { - throw Wccl::WcclError("Unknown type of lexical unit:" - + group_type_); + + std::vector<std::string> valid_vars = main->valid_variable_names(); + for (str_map::iterator it = variables_.begin(); it != variables_.end(); ++it) + { + if (std::find(valid_vars.begin(), valid_vars.end(), it->first) != valid_vars.end()) + { + if(group_type_ == "fix"){ // group_name_ -> lower case + mwe_index_.add_lexicalunit( LexicalUnit::Ptr(new FixedLU(mwe_base_, main, head, + variables_))); + } else if(group_type_ == "flex"){ + mwe_index_.add_lexicalunit(LexicalUnit::Ptr(new FlexLU(mwe_base_, main, head, + variables_))); + } else { + throw Wccl::WcclError("Unknown type of lexical unit:" + + group_type_); + } + + } + else + { + std::cerr << "Warning: " << mwe_base_ << " has unknown variable " << it->first << "! Skipping." << std::endl; + } } variables_.clear(); } diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp index 561acd7caf3eb3e4456bd3d3d3e2266e7670d618..36ebc2cb51553828789d332c5783f2deb2fc6737 100644 --- a/libmwereader/mwereader.cpp +++ b/libmwereader/mwereader.cpp @@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. 
#include "mweparser.h" #include <boost/algorithm/string.hpp> #include <boost/filesystem.hpp> +#include <boost/unordered_set.hpp> namespace Corpus2{ @@ -38,6 +39,12 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( mwes_counter=0; inner_reader_ = reader; } + + void MWEReader::setFile(const std::string &filename) + { + inner_filename_ = filename; + reset(); + } MWEReader::~MWEReader() { @@ -80,10 +87,18 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence) { + boost::unordered_set<std::string> available_bases; + for (unsigned i = 0; i < sentence->size(); ++i) + for (unsigned j = 0; j < sentence->at(i)->lexemes().size(); ++j) + if (sentence->at(i)->lexemes()[j].is_disamb()) + available_bases.insert(sentence->at(i)->lexemes()[j].lemma_utf8()); + + + Wccl::SentenceContext sc(sentence); - for(int i = 0; i < sc.size() ;++i){ - sc.set_position(i); - Corpus2::Token *pToken = sc.at(i); + for (sc.goto_start(); sc.is_current_inside(); sc.advance()) + { + Corpus2::Token *pToken = sc.current(); std::vector<Lexeme>& lexemes = pToken->lexemes(); if(lexemes.size() == 0) continue; @@ -92,39 +107,65 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( if(lex.is_disamb()){ std::string base = lex.lemma_utf8(); const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base); - BOOST_FOREACH (LexicalUnit::Ptr pLU, potential){ - std::set<int> positions; - int head; - bool is_here = pLU->IsHere(sc, positions, head); - if(is_here) - return process_sentence(clone_sentence_add_mwe(sentence, head, positions, pLU->get_base())); + BOOST_FOREACH (LexicalUnit::Ptr pLU, potential) + { + bool ok = true; + BOOST_FOREACH (const std::string & base, pLU->get_potential_bases()) + { + if (available_bases.find(base) == available_bases.end()) + { + ok = false; + break; + } + } + + if (ok) + { + std::set<int> positions; + int head; + bool is_here = 
pLU->IsHere(sc, positions, head); + if(is_here) + sc = clone_sentence_add_mwe(sc, head, positions, pLU->get_base()); + } } } } } - return sentence; + return sc.get_sentence_ptr(); } - Sentence::Ptr MWEReader::clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence, + Wccl::SentenceContext MWEReader::clone_sentence_add_mwe(Wccl::SentenceContext sentence, int head, const std::set<int>& all, const std::string &new_base) { - std::string new_orth = get_new_orth_utf8(sentence, all); + std::string new_orth = get_new_orth_utf8(sentence.get_sentence_ptr(), all); Sentence::Ptr new_sentence = boost::make_shared<Sentence>(); - std::vector<Token*> &tokens = sentence->tokens(); - for(int i = 0; i < (int)tokens.size(); i++){ - if(i == head){ + + Wccl::SentenceContext new_context(new_sentence); + new_context.set_position(sentence.get_position()); + + std::vector<Token*> &tokens = sentence.get_sentence_ptr()->tokens(); + + for (int i = 0; i < (int)tokens.size(); i++) + { + if(i == head) + { Corpus2::Token * t = tokens[i]->clone(); t->set_orth_utf8(new_orth); + BOOST_FOREACH (Lexeme& lex, t->lexemes()) if(lex.is_disamb()) lex.set_lemma_utf8(new_base); + new_sentence->append(t); - } else if( all.find(i) == all.end()) + } + else if( all.find(i) == all.end()) new_sentence->append(tokens[i]->clone()); - // else -> do nothing + + else if (i < sentence.get_position()) + new_context.recede(); } - return new_sentence; + return new_context; } std::string MWEReader::get_new_orth_utf8(Corpus2::Sentence::Ptr sentence, @@ -159,11 +200,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( void MWEReader::set_option(const std::string& option) { if(boost::algorithm::starts_with(option, "inner:")) { - std::string inner = option.substr(6); - inner_reader_ = create_path_reader(inner, this->tagset(), - inner_filename_); - token_index=0; - currentSentence= boost::make_shared<Sentence>(); + inner_reader_type = option.substr(6); + reset(); } 
if(boost::algorithm::starts_with(option, "mwefile:")) { std::string mwefile = option.substr(8); @@ -224,6 +262,14 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( if(parser.get_tagset().name() != tagset().name()) throw std::runtime_error( "Tagset in mwe file does not match reader tagset!" ); } + + void MWEReader::reset() + { + inner_reader_ = create_path_reader(inner_reader_type, this->tagset(), + inner_filename_); + token_index=0; + currentSentence = boost::make_shared<Sentence>(); + } }// ns Corpus2 diff --git a/libmwereader/mwereader.h b/libmwereader/mwereader.h index c6ffce869e4294633ee18e15016848f3abb0d0cd..90eb00b99c629a69e8dcabc278f2276802b8c12e 100644 --- a/libmwereader/mwereader.h +++ b/libmwereader/mwereader.h @@ -31,10 +31,13 @@ public: /** * \param filename corpus filename (MWE file is given in options) */ - MWEReader(const Tagset& tagset, const std::string& filename); - MWEReader(const Tagset &tagset, const std::string &filename, TokenReaderPtr reader); + MWEReader(const Tagset& tagset, const std::string & filename); + MWEReader(const Tagset &tagset, const std::string & filename, TokenReaderPtr reader); ~MWEReader(); + + /// Allows reusage of the reader for multiple files. 
It is needed because the reader stores a huge index of MWEs + void setFile(const std::string & filename); /// retrieves whole sentence, finds MWEs, and return tokens Token* get_next_token(); @@ -74,17 +77,22 @@ protected: private: void load_mwes(const std::string& filename); - Sentence::Ptr clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence, + Wccl::SentenceContext clone_sentence_add_mwe(Wccl::SentenceContext sentence, int head, const std::set<int>& all, const std::string &new_base); std::string get_new_orth_utf8(Corpus2::Sentence::Ptr sentence, const std::set<int>& all); + + /// resets inner reader and all state bar MWE index + void reset(); MWEIndex mwe_index_; /// ptr to inner reader doing the real work of reading a corpus TokenReaderPtr inner_reader_; /// path for inner reader std::string inner_filename_; + /// type of inner reader + std::string inner_reader_type; /// inner reader option size_t token_index; /// contains last processed sentence diff --git a/libwccl/sentencecontext.h b/libwccl/sentencecontext.h index 9a6589da06c8b77bb87e37bfa5f8d23ce747c575..a921d0cfa39e17e4d95347a726e4dec5b177d880 100644 --- a/libwccl/sentencecontext.h +++ b/libwccl/sentencecontext.h @@ -127,6 +127,11 @@ public: void advance() { ++position_; } + + /// Position recede shorthand + void recede() { + --position_; + } /// Reset position to point to the first token void goto_start() {