Skip to content
Snippets Groups Projects
Commit 72497a73 authored by Michał Kaliński's avatar Michał Kaliński
Browse files

Merge branch 'master' of nlp.pwr.wroc.pl:wccl

parents 7f7ab6ed c182020d
No related branches found
No related tags found
No related merge requests found
...@@ -49,8 +49,13 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc, ...@@ -49,8 +49,13 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
for(variables_map::const_iterator ivars = variables_.begin(); for(variables_map::const_iterator ivars = variables_.begin();
ivars != variables_.end(); ++ivars){ ivars != variables_.end(); ++ivars){
if(!boost::starts_with(ivars->first, "!")){ if(!boost::starts_with(ivars->first, "!")){
//std::cout << ivars->first << " " << std::endl; /*std::cout << " -- " << base_ << " -- " << ivars->first << " -- " << std::endl;
for (unsigned i = 0; i < condition_->valid_variable_names().size(); i++)
std::cout << condition_->valid_variable_names()[i] << std::endl;*/
condition_->set<Wccl::StrSet>(ivars->first, ivars->second); condition_->set<Wccl::StrSet>(ivars->first, ivars->second);
//std::cout << " -- egi --" << std::endl;
} }
} }
......
...@@ -17,6 +17,8 @@ or FITNESS FOR A PARTICULAR PURPOSE. ...@@ -17,6 +17,8 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include "mweparser.h" #include "mweparser.h"
#include <algorithm>
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include <libcorpus2/tagsetmanager.h> #include <libcorpus2/tagsetmanager.h>
...@@ -75,6 +77,12 @@ namespace Corpus2 { ...@@ -75,6 +77,12 @@ namespace Corpus2 {
wccl_operator_); wccl_operator_);
MWEBuilder::BoolOpPtr head = mwe_builder_->get_head_condition( MWEBuilder::BoolOpPtr head = mwe_builder_->get_head_condition(
head_cond_); head_cond_);
std::vector<std::string> valid_vars = main->valid_variable_names();
for (str_map::iterator it = variables_.begin(); it != variables_.end(); ++it)
{
if (std::find(valid_vars.begin(), valid_vars.end(), it->first) != valid_vars.end())
{
if(group_type_ == "fix"){ // group_name_ -> lower case if(group_type_ == "fix"){ // group_name_ -> lower case
mwe_index_.add_lexicalunit( LexicalUnit::Ptr(new FixedLU(mwe_base_, main, head, mwe_index_.add_lexicalunit( LexicalUnit::Ptr(new FixedLU(mwe_base_, main, head,
variables_))); variables_)));
...@@ -85,6 +93,13 @@ namespace Corpus2 { ...@@ -85,6 +93,13 @@ namespace Corpus2 {
throw Wccl::WcclError("Unknown type of lexical unit:" throw Wccl::WcclError("Unknown type of lexical unit:"
+ group_type_); + group_type_);
} }
}
else
{
std::cerr << "Warning: " << mwe_base_ << " has unknown variable " << it->first << "! Skipping." << std::endl;
}
}
variables_.clear(); variables_.clear();
} }
......
...@@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. ...@@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include "mweparser.h" #include "mweparser.h"
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include <boost/unordered_set.hpp>
namespace Corpus2{ namespace Corpus2{
...@@ -39,6 +40,12 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( ...@@ -39,6 +40,12 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
inner_reader_ = reader; inner_reader_ = reader;
} }
void MWEReader::setFile(const std::string &filename)
{
inner_filename_ = filename;
reset();
}
MWEReader::~MWEReader() MWEReader::~MWEReader()
{ {
// TODO implementation // TODO implementation
...@@ -80,10 +87,18 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( ...@@ -80,10 +87,18 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence) Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence)
{ {
boost::unordered_set<std::string> available_bases;
for (unsigned i = 0; i < sentence->size(); ++i)
for (unsigned j = 0; j < sentence->at(i)->lexemes().size(); ++j)
if (sentence->at(i)->lexemes()[j].is_disamb())
available_bases.insert(sentence->at(i)->lexemes()[j].lemma_utf8());
Wccl::SentenceContext sc(sentence); Wccl::SentenceContext sc(sentence);
for(int i = 0; i < sc.size() ;++i){ for (sc.goto_start(); sc.is_current_inside(); sc.advance())
sc.set_position(i); {
Corpus2::Token *pToken = sc.at(i); Corpus2::Token *pToken = sc.current();
std::vector<Lexeme>& lexemes = pToken->lexemes(); std::vector<Lexeme>& lexemes = pToken->lexemes();
if(lexemes.size() == 0) if(lexemes.size() == 0)
continue; continue;
...@@ -92,39 +107,65 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( ...@@ -92,39 +107,65 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
if(lex.is_disamb()){ if(lex.is_disamb()){
std::string base = lex.lemma_utf8(); std::string base = lex.lemma_utf8();
const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base); const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
BOOST_FOREACH (LexicalUnit::Ptr pLU, potential){ BOOST_FOREACH (LexicalUnit::Ptr pLU, potential)
{
bool ok = true;
BOOST_FOREACH (const std::string & base, pLU->get_potential_bases())
{
if (available_bases.find(base) == available_bases.end())
{
ok = false;
break;
}
}
if (ok)
{
std::set<int> positions; std::set<int> positions;
int head; int head;
bool is_here = pLU->IsHere(sc, positions, head); bool is_here = pLU->IsHere(sc, positions, head);
if(is_here) if(is_here)
return process_sentence(clone_sentence_add_mwe(sentence, head, positions, pLU->get_base())); sc = clone_sentence_add_mwe(sc, head, positions, pLU->get_base());
}
} }
} }
} }
} }
return sentence; return sc.get_sentence_ptr();
} }
Sentence::Ptr MWEReader::clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence, Wccl::SentenceContext MWEReader::clone_sentence_add_mwe(Wccl::SentenceContext sentence,
int head, const std::set<int>& all, int head, const std::set<int>& all,
const std::string &new_base) const std::string &new_base)
{ {
std::string new_orth = get_new_orth_utf8(sentence, all); std::string new_orth = get_new_orth_utf8(sentence.get_sentence_ptr(), all);
Sentence::Ptr new_sentence = boost::make_shared<Sentence>(); Sentence::Ptr new_sentence = boost::make_shared<Sentence>();
std::vector<Token*> &tokens = sentence->tokens();
for(int i = 0; i < (int)tokens.size(); i++){ Wccl::SentenceContext new_context(new_sentence);
if(i == head){ new_context.set_position(sentence.get_position());
std::vector<Token*> &tokens = sentence.get_sentence_ptr()->tokens();
for (int i = 0; i < (int)tokens.size(); i++)
{
if(i == head)
{
Corpus2::Token * t = tokens[i]->clone(); Corpus2::Token * t = tokens[i]->clone();
t->set_orth_utf8(new_orth); t->set_orth_utf8(new_orth);
BOOST_FOREACH (Lexeme& lex, t->lexemes()) BOOST_FOREACH (Lexeme& lex, t->lexemes())
if(lex.is_disamb()) if(lex.is_disamb())
lex.set_lemma_utf8(new_base); lex.set_lemma_utf8(new_base);
new_sentence->append(t); new_sentence->append(t);
} else if( all.find(i) == all.end()) }
else if( all.find(i) == all.end())
new_sentence->append(tokens[i]->clone()); new_sentence->append(tokens[i]->clone());
// else -> do nothing
else if (i < sentence.get_position())
new_context.recede();
} }
return new_sentence; return new_context;
} }
std::string MWEReader::get_new_orth_utf8(Corpus2::Sentence::Ptr sentence, std::string MWEReader::get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
...@@ -159,11 +200,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( ...@@ -159,11 +200,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
void MWEReader::set_option(const std::string& option) void MWEReader::set_option(const std::string& option)
{ {
if(boost::algorithm::starts_with(option, "inner:")) { if(boost::algorithm::starts_with(option, "inner:")) {
std::string inner = option.substr(6); inner_reader_type = option.substr(6);
inner_reader_ = create_path_reader(inner, this->tagset(), reset();
inner_filename_);
token_index=0;
currentSentence= boost::make_shared<Sentence>();
} }
if(boost::algorithm::starts_with(option, "mwefile:")) { if(boost::algorithm::starts_with(option, "mwefile:")) {
std::string mwefile = option.substr(8); std::string mwefile = option.substr(8);
...@@ -225,5 +263,13 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( ...@@ -225,5 +263,13 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
throw std::runtime_error( "Tagset in mwe file does not match reader tagset!" ); throw std::runtime_error( "Tagset in mwe file does not match reader tagset!" );
} }
void MWEReader::reset()
{
inner_reader_ = create_path_reader(inner_reader_type, this->tagset(),
inner_filename_);
token_index=0;
currentSentence = boost::make_shared<Sentence>();
}
}// ns Corpus2 }// ns Corpus2
...@@ -36,6 +36,9 @@ public: ...@@ -36,6 +36,9 @@ public:
~MWEReader(); ~MWEReader();
/// Allows reusing the reader for multiple files. This is needed because the reader stores a huge index of MWEs.
void setFile(const std::string & filename);
/// retrieves whole sentence, finds MWEs, and return tokens /// retrieves whole sentence, finds MWEs, and return tokens
Token* get_next_token(); Token* get_next_token();
...@@ -74,17 +77,22 @@ protected: ...@@ -74,17 +77,22 @@ protected:
private: private:
void load_mwes(const std::string& filename); void load_mwes(const std::string& filename);
Sentence::Ptr clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence, Wccl::SentenceContext clone_sentence_add_mwe(Wccl::SentenceContext sentence,
int head, const std::set<int>& all, int head, const std::set<int>& all,
const std::string &new_base); const std::string &new_base);
std::string get_new_orth_utf8(Corpus2::Sentence::Ptr sentence, std::string get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
const std::set<int>& all); const std::set<int>& all);
/// resets inner reader and all state bar MWE index
void reset();
MWEIndex mwe_index_; MWEIndex mwe_index_;
/// ptr to inner reader doing the real work of reading a corpus /// ptr to inner reader doing the real work of reading a corpus
TokenReaderPtr inner_reader_; TokenReaderPtr inner_reader_;
/// path for inner reader /// path for inner reader
std::string inner_filename_; std::string inner_filename_;
/// type of inner reader
std::string inner_reader_type;
/// inner reader option /// inner reader option
size_t token_index; size_t token_index;
/// contains last processed sentence /// contains last processed sentence
......
...@@ -128,6 +128,11 @@ public: ...@@ -128,6 +128,11 @@ public:
++position_; ++position_;
} }
/// Moves the current position one step back by decrementing position_.
/// No bounds check is performed here.
void recede() {
	position_ -= 1;
}
/// Reset position to point to the first token /// Reset position to point to the first token
void goto_start() { void goto_start() {
position_ = 0; position_ = 0;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment