Skip to content
Snippets Groups Projects
Commit c182020d authored by Michał Moczulski
Browse files

Faster MWEReader

parent 1e9e6539
Branches
No related tags found
No related merge requests found
...@@ -49,8 +49,13 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc, ...@@ -49,8 +49,13 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
for(variables_map::const_iterator ivars = variables_.begin(); for(variables_map::const_iterator ivars = variables_.begin();
ivars != variables_.end(); ++ivars){ ivars != variables_.end(); ++ivars){
if(!boost::starts_with(ivars->first, "!")){ if(!boost::starts_with(ivars->first, "!")){
//std::cout << ivars->first << " " << std::endl; /*std::cout << " -- " << base_ << " -- " << ivars->first << " -- " << std::endl;
for (unsigned i = 0; i < condition_->valid_variable_names().size(); i++)
std::cout << condition_->valid_variable_names()[i] << std::endl;*/
condition_->set<Wccl::StrSet>(ivars->first, ivars->second); condition_->set<Wccl::StrSet>(ivars->first, ivars->second);
//std::cout << " -- egi --" << std::endl;
} }
} }
......
...@@ -17,6 +17,8 @@ or FITNESS FOR A PARTICULAR PURPOSE. ...@@ -17,6 +17,8 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include "mweparser.h" #include "mweparser.h"
#include <algorithm>
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include <libcorpus2/tagsetmanager.h> #include <libcorpus2/tagsetmanager.h>
...@@ -75,6 +77,12 @@ namespace Corpus2 { ...@@ -75,6 +77,12 @@ namespace Corpus2 {
wccl_operator_); wccl_operator_);
MWEBuilder::BoolOpPtr head = mwe_builder_->get_head_condition( MWEBuilder::BoolOpPtr head = mwe_builder_->get_head_condition(
head_cond_); head_cond_);
std::vector<std::string> valid_vars = main->valid_variable_names();
for (str_map::iterator it = variables_.begin(); it != variables_.end(); ++it)
{
if (std::find(valid_vars.begin(), valid_vars.end(), it->first) != valid_vars.end())
{
if(group_type_ == "fix"){ // group_name_ -> lower case if(group_type_ == "fix"){ // group_name_ -> lower case
mwe_index_.add_lexicalunit( LexicalUnit::Ptr(new FixedLU(mwe_base_, main, head, mwe_index_.add_lexicalunit( LexicalUnit::Ptr(new FixedLU(mwe_base_, main, head,
variables_))); variables_)));
...@@ -85,6 +93,13 @@ namespace Corpus2 { ...@@ -85,6 +93,13 @@ namespace Corpus2 {
throw Wccl::WcclError("Unknown type of lexical unit:" throw Wccl::WcclError("Unknown type of lexical unit:"
+ group_type_); + group_type_);
} }
}
else
{
std::cerr << "Warning: " << mwe_base_ << " has unknown variable " << it->first << "! Skipping." << std::endl;
}
}
variables_.clear(); variables_.clear();
} }
......
...@@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. ...@@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include "mweparser.h" #include "mweparser.h"
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include <boost/unordered_set.hpp>
namespace Corpus2{ namespace Corpus2{
...@@ -39,6 +40,12 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( ...@@ -39,6 +40,12 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
inner_reader_ = reader; inner_reader_ = reader;
} }
void MWEReader::setFile(const std::string &filename)
{
inner_filename_ = filename;
reset();
}
MWEReader::~MWEReader() MWEReader::~MWEReader()
{ {
// TODO implementation // TODO implementation
...@@ -80,10 +87,18 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( ...@@ -80,10 +87,18 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence) Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence)
{ {
boost::unordered_set<std::string> available_bases;
for (unsigned i = 0; i < sentence->size(); ++i)
for (unsigned j = 0; j < sentence->at(i)->lexemes().size(); ++j)
if (sentence->at(i)->lexemes()[j].is_disamb())
available_bases.insert(sentence->at(i)->lexemes()[j].lemma_utf8());
Wccl::SentenceContext sc(sentence); Wccl::SentenceContext sc(sentence);
for(int i = 0; i < sc.size() ;++i){ for (sc.goto_start(); sc.is_current_inside(); sc.advance())
sc.set_position(i); {
Corpus2::Token *pToken = sc.at(i); Corpus2::Token *pToken = sc.current();
std::vector<Lexeme>& lexemes = pToken->lexemes(); std::vector<Lexeme>& lexemes = pToken->lexemes();
if(lexemes.size() == 0) if(lexemes.size() == 0)
continue; continue;
...@@ -92,39 +107,65 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( ...@@ -92,39 +107,65 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
if(lex.is_disamb()){ if(lex.is_disamb()){
std::string base = lex.lemma_utf8(); std::string base = lex.lemma_utf8();
const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base); const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
BOOST_FOREACH (LexicalUnit::Ptr pLU, potential){ BOOST_FOREACH (LexicalUnit::Ptr pLU, potential)
{
bool ok = true;
BOOST_FOREACH (const std::string & base, pLU->get_potential_bases())
{
if (available_bases.find(base) == available_bases.end())
{
ok = false;
break;
}
}
if (ok)
{
std::set<int> positions; std::set<int> positions;
int head; int head;
bool is_here = pLU->IsHere(sc, positions, head); bool is_here = pLU->IsHere(sc, positions, head);
if(is_here) if(is_here)
return process_sentence(clone_sentence_add_mwe(sentence, head, positions, pLU->get_base())); sc = clone_sentence_add_mwe(sc, head, positions, pLU->get_base());
}
} }
} }
} }
} }
return sentence; return sc.get_sentence_ptr();
} }
Sentence::Ptr MWEReader::clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence, Wccl::SentenceContext MWEReader::clone_sentence_add_mwe(Wccl::SentenceContext sentence,
int head, const std::set<int>& all, int head, const std::set<int>& all,
const std::string &new_base) const std::string &new_base)
{ {
std::string new_orth = get_new_orth_utf8(sentence, all); std::string new_orth = get_new_orth_utf8(sentence.get_sentence_ptr(), all);
Sentence::Ptr new_sentence = boost::make_shared<Sentence>(); Sentence::Ptr new_sentence = boost::make_shared<Sentence>();
std::vector<Token*> &tokens = sentence->tokens();
for(int i = 0; i < (int)tokens.size(); i++){ Wccl::SentenceContext new_context(new_sentence);
if(i == head){ new_context.set_position(sentence.get_position());
std::vector<Token*> &tokens = sentence.get_sentence_ptr()->tokens();
for (int i = 0; i < (int)tokens.size(); i++)
{
if(i == head)
{
Corpus2::Token * t = tokens[i]->clone(); Corpus2::Token * t = tokens[i]->clone();
t->set_orth_utf8(new_orth); t->set_orth_utf8(new_orth);
BOOST_FOREACH (Lexeme& lex, t->lexemes()) BOOST_FOREACH (Lexeme& lex, t->lexemes())
if(lex.is_disamb()) if(lex.is_disamb())
lex.set_lemma_utf8(new_base); lex.set_lemma_utf8(new_base);
new_sentence->append(t); new_sentence->append(t);
} else if( all.find(i) == all.end()) }
else if( all.find(i) == all.end())
new_sentence->append(tokens[i]->clone()); new_sentence->append(tokens[i]->clone());
// else -> do nothing
else if (i < sentence.get_position())
new_context.recede();
} }
return new_sentence; return new_context;
} }
std::string MWEReader::get_new_orth_utf8(Corpus2::Sentence::Ptr sentence, std::string MWEReader::get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
...@@ -159,11 +200,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( ...@@ -159,11 +200,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
void MWEReader::set_option(const std::string& option) void MWEReader::set_option(const std::string& option)
{ {
if(boost::algorithm::starts_with(option, "inner:")) { if(boost::algorithm::starts_with(option, "inner:")) {
std::string inner = option.substr(6); inner_reader_type = option.substr(6);
inner_reader_ = create_path_reader(inner, this->tagset(), reset();
inner_filename_);
token_index=0;
currentSentence= boost::make_shared<Sentence>();
} }
if(boost::algorithm::starts_with(option, "mwefile:")) { if(boost::algorithm::starts_with(option, "mwefile:")) {
std::string mwefile = option.substr(8); std::string mwefile = option.substr(8);
...@@ -225,5 +263,13 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( ...@@ -225,5 +263,13 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
throw std::runtime_error( "Tagset in mwe file does not match reader tagset!" ); throw std::runtime_error( "Tagset in mwe file does not match reader tagset!" );
} }
/// Recreates the inner reader from the stored reader type, this reader's
/// tagset and the stored file name, then sets the token index back to zero
/// and replaces the current sentence with a newly constructed Sentence.
void MWEReader::reset()
{
	// Build the inner reader first; the remaining state is only touched
	// once the reader has been created.
	this->inner_reader_ = create_path_reader(
		this->inner_reader_type,
		this->tagset(),
		this->inner_filename_);

	this->token_index = 0;
	this->currentSentence = boost::make_shared<Sentence>();
}
}// ns Corpus2 }// ns Corpus2
...@@ -36,6 +36,9 @@ public: ...@@ -36,6 +36,9 @@ public:
~MWEReader(); ~MWEReader();
/// Allows reuse of the reader for multiple files. This is needed because the reader stores a huge index of MWEs.
void setFile(const std::string & filename);
/// retrieves whole sentence, finds MWEs, and return tokens /// retrieves whole sentence, finds MWEs, and return tokens
Token* get_next_token(); Token* get_next_token();
...@@ -74,17 +77,22 @@ protected: ...@@ -74,17 +77,22 @@ protected:
private: private:
void load_mwes(const std::string& filename); void load_mwes(const std::string& filename);
Sentence::Ptr clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence, Wccl::SentenceContext clone_sentence_add_mwe(Wccl::SentenceContext sentence,
int head, const std::set<int>& all, int head, const std::set<int>& all,
const std::string &new_base); const std::string &new_base);
std::string get_new_orth_utf8(Corpus2::Sentence::Ptr sentence, std::string get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
const std::set<int>& all); const std::set<int>& all);
/// resets inner reader and all state bar MWE index
void reset();
MWEIndex mwe_index_; MWEIndex mwe_index_;
/// ptr to inner reader doing the real work of reading a corpus /// ptr to inner reader doing the real work of reading a corpus
TokenReaderPtr inner_reader_; TokenReaderPtr inner_reader_;
/// path for inner reader /// path for inner reader
std::string inner_filename_; std::string inner_filename_;
/// type of inner reader
std::string inner_reader_type;
/// inner reader option /// inner reader option
size_t token_index; size_t token_index;
/// contains last processed sentence /// contains last processed sentence
......
...@@ -128,6 +128,11 @@ public: ...@@ -128,6 +128,11 @@ public:
++position_; ++position_;
} }
/// Position recede shorthand: steps the current position back by one.
/// No bounds check is performed here.
void recede() {
	position_ -= 1;
}
/// Reset position to point to the first token /// Reset position to point to the first token
void goto_start() { void goto_start() {
position_ = 0; position_ = 0;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment