Commit 7c4939c7 authored by rancher's avatar rancher

merged with mwe_fix

parent f28ca379
......@@ -24,10 +24,8 @@ namespace Corpus2{
LexicalUnit::LexicalUnit(const std::string &base,
LexicalUnit::BoolOpPtr condition,
LexicalUnit::BoolOpPtr head_cond,
LexicalUnit::strmap variables)
: condition_(condition),
head_cond_(head_cond),
base_(base),
nowhere_(Wccl::Position())
{
......@@ -42,32 +40,25 @@ LexicalUnit::LexicalUnit(const std::string &base,
}
bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
bool LexicalUnit::IsHere(const boost::shared_ptr<Wccl::SentenceContext> sc,
std::set<int> &out_position, int &head_pos) const
{
// set variables, skip vars with names starting with '!'
for(variables_map::const_iterator ivars = variables_.begin();
ivars != variables_.end(); ++ivars){
if(!boost::starts_with(ivars->first, "!")){
/*std::cout << " -- " << base_ << " -- " << ivars->first << " -- " << std::endl;
for (unsigned i = 0; i < condition_->valid_variable_names().size(); i++)
std::cout << condition_->valid_variable_names()[i] << std::endl;*/
condition_->set<Wccl::StrSet>(ivars->first, ivars->second);
//std::cout << " -- egi --" << std::endl;
}
}
// fire up the operator
boost::shared_ptr<const Wccl::Bool> pResult = condition_->apply(sc);
boost::shared_ptr<const Wccl::Bool> pResult = condition_->apply(*sc);
if(pResult->get_value() == false)
return false;
bool found_head = false;
bool head_defined = false;
Wccl::SentenceContext sc2(sc.get_sentence_ptr());
Wccl::SentenceContext sc2(sc->get_sentence_ptr());
// fill up positions
BOOST_FOREACH (const std::string&varname, condition_->valid_variable_names()){
......@@ -78,21 +69,13 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
errmsg += " Offending unit: " + base_;
throw Wccl::WcclError(errmsg);
}
int abs_pos = sc.get_abs_position(pos);
int abs_pos = sc->get_abs_position(pos);
out_position.insert( abs_pos );
if(!found_head){
sc2.set_position(abs_pos);
if(head_cond_->apply(sc2)->get_value()) {
if (!head_defined)
head_pos = abs_pos;
found_head = true;
}
}
}
if(boost::algorithm::starts_with(varname, "Head")) {
Wccl::Position predefined_head_pos = condition_->get<Wccl::Position>(varname);
head_pos = sc.get_abs_position(predefined_head_pos);
head_defined = true;
head_pos = sc->get_abs_position(predefined_head_pos);
found_head = true;
}
}
......@@ -102,7 +85,12 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
//throw Wccl::WcclError(errmsg);
return false;
}
if(out_position.empty()) {
std::string errmsg("MWE found, but positions of MWE elements were "
"not marked - check setvars in MWE dictionary.");
std::cout << errmsg << std::endl;
return false;
}
return true;
}
......@@ -111,17 +99,15 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
FixedLU::FixedLU(const std::string &base,
boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition,
boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond,
std::map<std::string, std::string> variables)
: LexicalUnit(base, condition, head_cond, variables)
: LexicalUnit(base, condition, variables)
{
}
FlexLU::FlexLU(const std::string &base,
boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition,
boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond,
std::map<std::string, std::string> variables)
: LexicalUnit(base, condition, head_cond, variables)
: LexicalUnit(base, condition, variables)
{
}
......
......@@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#define LIBMWEREADER_MWE_H
#include <boost/unordered_map.hpp>
#include <boost/shared_ptr.hpp>
#include <libcorpus2/io/reader.h>
#include <libwccl/ops/operator.h>
......@@ -38,9 +39,7 @@ public:
typedef std::set<std::string> strset;
typedef boost::shared_ptr<Wccl::Operator<Wccl::Bool> > BoolOpPtr;
LexicalUnit(const std::string &base, BoolOpPtr condition,
BoolOpPtr head_cond, strmap variables
);
LexicalUnit(const std::string &base, BoolOpPtr condition, strmap variables);
/**
* \param sc SentenceContext with position set to value which
......@@ -51,8 +50,8 @@ public:
* sentence context
* \returns true if this lexical unit was found here
*/
virtual bool IsHere(const Wccl::SentenceContext& sc,
std::set<int> &out_positions, int &head_pos) const;
virtual bool IsHere(const boost::shared_ptr<Wccl::SentenceContext> sc,
std::set<int> &out_positions, int &head_pos) const;
const std::string & get_base() const{ return base_;}
const variables_map & get_variables() const{ return variables_;}
......@@ -63,7 +62,6 @@ public:
protected:
boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition_;
boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond_;
variables_map variables_;
std::string base_;
......@@ -81,7 +79,6 @@ class FixedLU : public LexicalUnit
public:
FixedLU(const std::string &base,
LexicalUnit::BoolOpPtr condition,
LexicalUnit::BoolOpPtr head_cond,
LexicalUnit::strmap variables
);
};
......@@ -91,7 +88,6 @@ class FlexLU : public LexicalUnit
public:
FlexLU(const std::string &base,
LexicalUnit::BoolOpPtr condition,
LexicalUnit::BoolOpPtr head_cond,
LexicalUnit::strmap variables
);
......
......@@ -35,11 +35,6 @@ namespace Corpus2 {
{
}
/**
 * Returns the boolean operator for the given head-condition string.
 *
 * Delegates to get_condition(), passing the head_conditions_ member
 * together with the condition text.
 *
 * \param headcond textual head condition
 * \returns operator obtained from get_condition()
 */
MWEBuilder::BoolOpPtr MWEBuilder::get_head_condition(
        const std::string & headcond)
{
    MWEBuilder::BoolOpPtr head_operator =
            get_condition(headcond, head_conditions_);
    return head_operator;
}
MWEBuilder::BoolOpPtr MWEBuilder::get_mwe_condition(
const std::string &cond)
{
......@@ -75,8 +70,6 @@ namespace Corpus2 {
{
MWEBuilder::BoolOpPtr main = mwe_builder_->get_mwe_condition(
wccl_operator_);
MWEBuilder::BoolOpPtr head = mwe_builder_->get_head_condition(
head_cond_);
std::vector<std::string> valid_vars = main->valid_variable_names();
for (str_map::iterator it = variables_.begin(); it != variables_.end(); ++it)
......@@ -84,11 +77,13 @@ namespace Corpus2 {
if (std::find(valid_vars.begin(), valid_vars.end(), it->first) != valid_vars.end())
{
if(group_type_ == "fix"){ // group_name_ -> lower case
mwe_index_.add_lexicalunit( LexicalUnit::Ptr(new FixedLU(mwe_base_, main, head,
variables_)));
mwe_index_.add_lexicalunit(
LexicalUnit::Ptr(new FixedLU(mwe_base_, main, variables_))
);
} else if(group_type_ == "flex"){
mwe_index_.add_lexicalunit(LexicalUnit::Ptr(new FlexLU(mwe_base_, main, head,
variables_)));
mwe_index_.add_lexicalunit(
LexicalUnit::Ptr(new FlexLU(mwe_base_, main, variables_))
);
} else {
throw Wccl::WcclError("Unknown type of lexical unit:"
+ group_type_);
......@@ -156,10 +151,6 @@ namespace Corpus2 {
var_name_ = get_attribute(attributes, "name");
grab_characters_ = true;
clear_buf();
} else if(state_ == MWE && name == "head"){
state_ = HEAD;
grab_characters_ = true;
clear_buf();
}
}
......@@ -181,9 +172,6 @@ namespace Corpus2 {
} else if(state_ == VAR && name == "var"){
state_ = MWE;
variables_[var_name_] = finish_get_text();
} else if(state_ == HEAD && name == "head"){
state_ = MWE;
head_cond_ = finish_get_text();
} else{
std::cerr << "Wrong state_:" << state_ << " for name: "
<< name << std::endl;
......@@ -196,7 +184,6 @@ namespace Corpus2 {
BOOST_FOREACH (str_map::value_type &i, variables_)
out << i.first << ": " << i.second << ", ";
out << "\nWarunek głowy: " << head_cond_ << "\n";
if(with_condition){
out << "Grupa jednostek: " << group_name_ << std::endl;
out << "Operator: " << wccl_operator_ << std::endl;
......
......@@ -20,10 +20,16 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include <boost/algorithm/string.hpp>
#include <boost/filesystem.hpp>
#include <boost/unordered_set.hpp>
#include <boost/lexical_cast.hpp>
namespace Corpus2{
typedef boost::shared_ptr<Wccl::SentenceContext> SentenceContextPtr;
typedef boost::shared_ptr<AnnotatedSentence> AnnotatedSentencePtr;
typedef boost::shared_ptr<TokenMetaData> TokenMetaDataPtr;
typedef std::map<std::string, AnnotationChannel> ChanMapT;
bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
"mwereader","inner,mwefile"); // TODO more help?
......@@ -33,7 +39,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
mwes_counter=0;
}
MWEReader::MWEReader(const Tagset &tagset, const std::string &filename, TokenReaderPtr reader)
MWEReader::MWEReader(const Tagset &tagset, const std::string &filename,
TokenReaderPtr reader)
: TokenReader(tagset), inner_filename_(filename)
{
mwes_counter=0;
......@@ -85,6 +92,59 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
return process_sentence(currentSentence);
}
/**
 * Marks a matched MWE in the 'mwe' annotation channel of the sentence held
 * by sentence_ctx.
 *
 * The head token keeps its segment number when it already has a positive
 * one, otherwise it receives annotation_number; its metadata attribute
 * "mwe_base" is set to new_base. Every position listed in all that has no
 * positive segment number yet receives the head's number. Finally the
 * context position is moved forward by all.size(), provided the resulting
 * position is still smaller than the sentence size.
 *
 * \param sentence_ctx context whose sentence is annotated and whose
 *        position may be advanced
 * \param head index of the MWE head token in the sentence
 * \param all positions of all MWE elements
 * \param annotation_number segment number used when the head has none yet
 * \param new_base value stored in the head token's "mwe_base" attribute
 *
 * NOTE(review): head is used as an index into the token vector without a
 * range check - assumes the caller passes a valid position.
 * NOTE(review): presumably wrap_sentence() hands back the very same object
 * when the sentence already is an AnnotatedSentence - confirm; otherwise
 * the channel would be created on a different object than the one the
 * context refers to.
 */
void MWEReader::add_mwe_channel(SentenceContextPtr sentence_ctx,
        int head, const std::set<int>& all,
        int annotation_number, const std::string &new_base) {
    Corpus2::Sentence::Ptr sentence = sentence_ctx->get_sentence_ptr();
    AnnotatedSentencePtr ann_sentence = AnnotatedSentence::wrap_sentence(sentence);
    std::vector<Token*> &tokens = ann_sentence->tokens();

    // create 'mwe' channel if it does not exist yet; the channel map is
    // only queried in place, it is not copied
    if (ann_sentence->all_channels().count("mwe") == 0) {
        ann_sentence->create_channel("mwe");
    }
    AnnotationChannel& channel = ann_sentence->get_channel("mwe");

    // if the head is already annotated, we keep its annotation number
    int head_ann_num = channel.get_segment_at(head);
    // if not, the head gets the new annotation number
    if (head_ann_num <= 0) {
        head_ann_num = annotation_number;
    }
    channel.set_segment_at(head, head_ann_num);

    // create metadata if it does not exist, needed for the 'mwe_base' prop
    if (!tokens[head]->get_metadata()) {
        tokens[head]->create_metadata();
    }
    TokenMetaDataPtr md = tokens[head]->get_metadata();
    md->set_attribute("mwe_base", new_base);

    // annotate MWE elements with the annotation number of the head,
    // leaving numbers of already annotated elements untouched
    for (std::set<int>::const_iterator pos_it = all.begin();
            pos_it != all.end(); ++pos_it) {
        int ann_num = channel.get_segment_at(*pos_it);
        if (ann_num <= 0) {
            ann_num = head_ann_num;
        }
        channel.set_segment_at(*pos_it, ann_num);
    }

    // move context position forward by the number of MWE elements, but
    // only when the new position is still inside the sentence
    const int curr_position = sentence_ctx->get_position();
    if (curr_position + all.size() < sentence->size()) {
        sentence_ctx->set_position(curr_position + all.size());
    }
}
Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence)
{
boost::unordered_set<std::string> available_bases;
......@@ -92,13 +152,15 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
for (unsigned j = 0; j < sentence->at(i)->lexemes().size(); ++j)
if (sentence->at(i)->lexemes()[j].is_disamb())
available_bases.insert(sentence->at(i)->lexemes()[j].lemma_utf8());
// TODO: pass annotated sentence to methods
// AnnotatedSentencePtr ann_sentence = AnnotatedSentence::wrap_sentence(sentence);
int annotation_number = 0;
Wccl::SentenceContext sc(sentence);
for (sc.goto_start(); sc.is_current_inside(); sc.advance())
SentenceContextPtr sc = boost::make_shared<Wccl::SentenceContext>(sentence);
for (sc->goto_start(); sc->is_current_inside(); sc->advance())
{
Corpus2::Token *pToken = sc.current();
Corpus2::Token *pToken = sc->current();
std::vector<Lexeme>& lexemes = pToken->lexemes();
if(lexemes.size() == 0)
continue;
......@@ -124,27 +186,38 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
std::set<int> positions;
int head;
bool is_here = pLU->IsHere(sc, positions, head);
if(is_here)
sc = clone_sentence_add_mwe(sc, head, positions, pLU->get_base());
if(is_here) {
if (annotate) {
add_mwe_channel(
sc, head, positions,
++annotation_number,
pLU->get_base());
}
else {
sc = clone_sentence_add_mwe(
sc, head, positions,
pLU->get_base());
}
}
}
}
}
}
}
return sc.get_sentence_ptr();
return sc->get_sentence_ptr();
}
Wccl::SentenceContext MWEReader::clone_sentence_add_mwe(Wccl::SentenceContext sentence,
int head, const std::set<int>& all,
const std::string &new_base)
SentenceContextPtr MWEReader::clone_sentence_add_mwe(SentenceContextPtr sentence,
int head, const std::set<int>& all, const std::string &new_base)
{
std::string new_orth = get_new_orth_utf8(sentence.get_sentence_ptr(), all);
Sentence::Ptr new_sentence = boost::make_shared<Sentence>();
Wccl::SentenceContext new_context(new_sentence);
new_context.set_position(sentence.get_position());
std::vector<Token*> &tokens = sentence.get_sentence_ptr()->tokens();
std::string new_orth = get_new_orth_utf8(sentence->get_sentence_ptr(), all);
Sentence::Ptr new_sentence = boost::make_shared<AnnotatedSentence>();
new_sentence->set_id(sentence->get_sentence_ptr()->id());
SentenceContextPtr new_context = boost::make_shared<Wccl::SentenceContext>(new_sentence);
new_context->set_position(sentence->get_position());
std::vector<Token*> &tokens = sentence->get_sentence_ptr()->tokens();
for (int i = 0; i < (int)tokens.size(); i++)
{
......@@ -162,8 +235,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
else if( all.find(i) == all.end())
new_sentence->append(tokens[i]->clone());
else if (i < sentence.get_position())
new_context.recede();
else if (i < sentence->get_position())
new_context->recede();
}
return new_context;
}
......@@ -189,6 +262,7 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
return currentChunk;
boost::shared_ptr<Chunk> new_chunk = boost::make_shared<Chunk>();
new_chunk->set_attribute("id", currentChunk->get_attribute("id"));
BOOST_FOREACH (Corpus2::Sentence::Ptr sentence, currentChunk->sentences())
new_chunk->append( process_sentence(sentence) );
......@@ -203,6 +277,9 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
inner_reader_type = option.substr(6);
reset();
}
if(boost::algorithm::starts_with(option, "annotations:")) {
annotate = boost::lexical_cast<bool>(option.substr(12));
}
if(boost::algorithm::starts_with(option, "mwefile:")) {
std::string mwefile = option.substr(8);
boost::algorithm::trim(mwefile);
......
......@@ -35,7 +35,7 @@ public:
MWEReader(const Tagset &tagset, const std::string & filename, TokenReaderPtr reader);
~MWEReader();
/// Allows reuse of the reader for multiple files. This is needed because the reader stores a huge index of MWEs.
void setFile(const std::string & filename);
......@@ -72,14 +72,29 @@ public:
static bool registered;
protected:
/**
* adds 'mwe' annotation channel
*/
void add_mwe_channel(
boost::shared_ptr<Wccl::SentenceContext> sentence,
int head, const std::set<int>& all, int annotation_number,
const std::string &new_base);
/**
* use MWE annotations instead of merging MWE tokens to one token
*/
void use_annotations(bool val) {
annotate = val;
}
Sentence::Ptr process_sentence(Corpus2::Sentence::Ptr sentence);
private:
void load_mwes(const std::string& filename);
Wccl::SentenceContext clone_sentence_add_mwe(Wccl::SentenceContext sentence,
int head, const std::set<int>& all,
const std::string &new_base);
boost::shared_ptr<Wccl::SentenceContext> clone_sentence_add_mwe(
boost::shared_ptr<Wccl::SentenceContext> sentence,
int head, const std::set<int>& all,
const std::string &new_base);
std::string get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
const std::set<int>& all);
......@@ -101,6 +116,8 @@ private:
boost::shared_ptr<Chunk> currentChunk;
/// quantity of loaded mwes files
size_t mwes_counter;
/// use annotations instead of merging the tokens
bool annotate;
};
} // ns Corpus2
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment