Commit 72497a73 authored by Michał Kaliński

Merge branch 'master' of nlp.pwr.wroc.pl:wccl

parents 7f7ab6ed c182020d
......@@ -49,8 +49,13 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
for(variables_map::const_iterator ivars = variables_.begin();
ivars != variables_.end(); ++ivars){
if(!boost::starts_with(ivars->first, "!")){
//std::cout << ivars->first << " " << std::endl;
/*std::cout << " -- " << base_ << " -- " << ivars->first << " -- " << std::endl;
for (unsigned i = 0; i < condition_->valid_variable_names().size(); i++)
std::cout << condition_->valid_variable_names()[i] << std::endl;*/
condition_->set<Wccl::StrSet>(ivars->first, ivars->second);
//std::cout << " -- egi --" << std::endl;
}
}
......
......@@ -17,6 +17,8 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include "mweparser.h"
#include <algorithm>
#include <boost/foreach.hpp>
#include <libcorpus2/tagsetmanager.h>
......@@ -75,15 +77,28 @@ namespace Corpus2 {
wccl_operator_);
MWEBuilder::BoolOpPtr head = mwe_builder_->get_head_condition(
head_cond_);
if(group_type_ == "fix"){ // group_name_ -> lower case
mwe_index_.add_lexicalunit( LexicalUnit::Ptr(new FixedLU(mwe_base_, main, head,
variables_)));
} else if(group_type_ == "flex"){
mwe_index_.add_lexicalunit(LexicalUnit::Ptr(new FlexLU(mwe_base_, main, head,
variables_)));
} else {
throw Wccl::WcclError("Unknown type of lexical unit:"
+ group_type_);
std::vector<std::string> valid_vars = main->valid_variable_names();
for (str_map::iterator it = variables_.begin(); it != variables_.end(); ++it)
{
if (std::find(valid_vars.begin(), valid_vars.end(), it->first) != valid_vars.end())
{
if(group_type_ == "fix"){ // group_name_ -> lower case
mwe_index_.add_lexicalunit( LexicalUnit::Ptr(new FixedLU(mwe_base_, main, head,
variables_)));
} else if(group_type_ == "flex"){
mwe_index_.add_lexicalunit(LexicalUnit::Ptr(new FlexLU(mwe_base_, main, head,
variables_)));
} else {
throw Wccl::WcclError("Unknown type of lexical unit:"
+ group_type_);
}
}
else
{
std::cerr << "Warning: " << mwe_base_ << " has unknown variable " << it->first << "! Skipping." << std::endl;
}
}
variables_.clear();
}
......
......@@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include "mweparser.h"
#include <boost/algorithm/string.hpp>
#include <boost/filesystem.hpp>
#include <boost/unordered_set.hpp>
namespace Corpus2{
......@@ -38,6 +39,12 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
mwes_counter=0;
inner_reader_ = reader;
}
void MWEReader::setFile(const std::string &filename)
{
inner_filename_ = filename;
reset();
}
MWEReader::~MWEReader()
{
......@@ -80,10 +87,18 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence)
{
boost::unordered_set<std::string> available_bases;
for (unsigned i = 0; i < sentence->size(); ++i)
for (unsigned j = 0; j < sentence->at(i)->lexemes().size(); ++j)
if (sentence->at(i)->lexemes()[j].is_disamb())
available_bases.insert(sentence->at(i)->lexemes()[j].lemma_utf8());
Wccl::SentenceContext sc(sentence);
for(int i = 0; i < sc.size() ;++i){
sc.set_position(i);
Corpus2::Token *pToken = sc.at(i);
for (sc.goto_start(); sc.is_current_inside(); sc.advance())
{
Corpus2::Token *pToken = sc.current();
std::vector<Lexeme>& lexemes = pToken->lexemes();
if(lexemes.size() == 0)
continue;
......@@ -92,39 +107,65 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
if(lex.is_disamb()){
std::string base = lex.lemma_utf8();
const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
BOOST_FOREACH (LexicalUnit::Ptr pLU, potential){
std::set<int> positions;
int head;
bool is_here = pLU->IsHere(sc, positions, head);
if(is_here)
return process_sentence(clone_sentence_add_mwe(sentence, head, positions, pLU->get_base()));
BOOST_FOREACH (LexicalUnit::Ptr pLU, potential)
{
bool ok = true;
BOOST_FOREACH (const std::string & base, pLU->get_potential_bases())
{
if (available_bases.find(base) == available_bases.end())
{
ok = false;
break;
}
}
if (ok)
{
std::set<int> positions;
int head;
bool is_here = pLU->IsHere(sc, positions, head);
if(is_here)
sc = clone_sentence_add_mwe(sc, head, positions, pLU->get_base());
}
}
}
}
}
return sentence;
return sc.get_sentence_ptr();
}
Sentence::Ptr MWEReader::clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence,
Wccl::SentenceContext MWEReader::clone_sentence_add_mwe(Wccl::SentenceContext sentence,
int head, const std::set<int>& all,
const std::string &new_base)
{
std::string new_orth = get_new_orth_utf8(sentence, all);
std::string new_orth = get_new_orth_utf8(sentence.get_sentence_ptr(), all);
Sentence::Ptr new_sentence = boost::make_shared<Sentence>();
std::vector<Token*> &tokens = sentence->tokens();
for(int i = 0; i < (int)tokens.size(); i++){
if(i == head){
Wccl::SentenceContext new_context(new_sentence);
new_context.set_position(sentence.get_position());
std::vector<Token*> &tokens = sentence.get_sentence_ptr()->tokens();
for (int i = 0; i < (int)tokens.size(); i++)
{
if(i == head)
{
Corpus2::Token * t = tokens[i]->clone();
t->set_orth_utf8(new_orth);
BOOST_FOREACH (Lexeme& lex, t->lexemes())
if(lex.is_disamb())
lex.set_lemma_utf8(new_base);
new_sentence->append(t);
} else if( all.find(i) == all.end())
}
else if( all.find(i) == all.end())
new_sentence->append(tokens[i]->clone());
// else -> do nothing
else if (i < sentence.get_position())
new_context.recede();
}
return new_sentence;
return new_context;
}
std::string MWEReader::get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
......@@ -159,11 +200,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
void MWEReader::set_option(const std::string& option)
{
if(boost::algorithm::starts_with(option, "inner:")) {
std::string inner = option.substr(6);
inner_reader_ = create_path_reader(inner, this->tagset(),
inner_filename_);
token_index=0;
currentSentence= boost::make_shared<Sentence>();
inner_reader_type = option.substr(6);
reset();
}
if(boost::algorithm::starts_with(option, "mwefile:")) {
std::string mwefile = option.substr(8);
......@@ -224,6 +262,14 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
if(parser.get_tagset().name() != tagset().name())
throw std::runtime_error( "Tagset in mwe file does not match reader tagset!" );
}
void MWEReader::reset()
{
inner_reader_ = create_path_reader(inner_reader_type, this->tagset(),
inner_filename_);
token_index=0;
currentSentence = boost::make_shared<Sentence>();
}
}// ns Corpus2
......@@ -31,10 +31,13 @@ public:
/**
* \param filename corpus filename (MWE file is given in options)
*/
MWEReader(const Tagset& tagset, const std::string& filename);
MWEReader(const Tagset &tagset, const std::string &filename, TokenReaderPtr reader);
MWEReader(const Tagset& tagset, const std::string & filename);
MWEReader(const Tagset &tagset, const std::string & filename, TokenReaderPtr reader);
~MWEReader();
/// Allows reusing the reader for multiple files. This is needed because the reader stores a huge index of MWEs.
void setFile(const std::string & filename);
/// retrieves whole sentence, finds MWEs, and return tokens
Token* get_next_token();
......@@ -74,17 +77,22 @@ protected:
private:
void load_mwes(const std::string& filename);
Sentence::Ptr clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence,
Wccl::SentenceContext clone_sentence_add_mwe(Wccl::SentenceContext sentence,
int head, const std::set<int>& all,
const std::string &new_base);
std::string get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
const std::set<int>& all);
/// resets inner reader and all state bar MWE index
void reset();
MWEIndex mwe_index_;
/// ptr to inner reader doing the real work of reading a corpus
TokenReaderPtr inner_reader_;
/// path for inner reader
std::string inner_filename_;
/// type of inner reader
std::string inner_reader_type;
/// inner reader option
size_t token_index;
/// contains last processed sentence
......
......@@ -127,6 +127,11 @@ public:
void advance() {
// Moves the current position forward by one; no bounds check is done here.
++position_;
}
/// Position recede shorthand: moves the current position back by one.
/// No bounds check is performed here, so the position may end up before
/// the start of the sentence.
void recede() {
--position_;
}
/// Reset position to point to the first token
void goto_start() {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment