Skip to content
Snippets Groups Projects
Commit c2317fdd authored by Bartosz Broda's avatar Bartosz Broda
Browse files

add preliminary recognition of MWEs

parent e70ded36
No related merge requests found
#include "mwe.h" #include "mwe.h"
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
#include <libwccl/values/strset.h>
namespace Corpus2{ namespace Corpus2{
...@@ -9,22 +10,34 @@ LexicalUnit::LexicalUnit(const std::string &base, ...@@ -9,22 +10,34 @@ LexicalUnit::LexicalUnit(const std::string &base,
LexicalUnit::strmap variables) LexicalUnit::strmap variables)
: condition_(condition), : condition_(condition),
head_cond_(head_cond), head_cond_(head_cond),
variables_(variables),
base_(base), base_(base),
nowhere_(Wccl::Position()) nowhere_(Wccl::Position())
{ {
for(strmap::iterator iter = variables_.begin(); for(strmap::iterator iter = variables.begin();
iter != variables_.end(); ++iter) iter != variables.end(); ++iter){
potential_bases_.insert(iter->second); potential_bases_.insert(iter->second);
Wccl::StrSet ss;
ss.insert_utf8(iter->second);
variables_[iter->first] = ss;
}
} }
bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc, bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
std::set<size_t> &out_position) const std::set<size_t> &out_position) const
{ {
// set variables
for(variables_map::const_iterator ivars = variables_.begin();
ivars != variables_.end(); ++ivars){
condition_->set<Wccl::StrSet>(ivars->first, ivars->second);
}
// fire up the operator
boost::shared_ptr<const Wccl::Bool> pResult = condition_->apply(sc); boost::shared_ptr<const Wccl::Bool> pResult = condition_->apply(sc);
if(pResult->get_value() == false) if(pResult->get_value() == false)
return false; return false;
// fill up positions
foreach(const std::string&varname, condition_->valid_variable_names()){ foreach(const std::string&varname, condition_->valid_variable_names()){
if(boost::algorithm::starts_with(varname, "Pos")){ if(boost::algorithm::starts_with(varname, "Pos")){
Wccl::Position pos = condition_->get<Wccl::Position>(varname); Wccl::Position pos = condition_->get<Wccl::Position>(varname);
...@@ -36,6 +49,8 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc, ...@@ -36,6 +49,8 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
out_position.insert( sc.get_abs_position(pos) ); out_position.insert( sc.get_abs_position(pos) );
} }
} }
// TODO head position
return true; return true;
} }
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include <libcorpus2/io/reader.h> #include <libcorpus2/io/reader.h>
#include <libwccl/ops/operator.h> #include <libwccl/ops/operator.h>
#include <libwccl/values/strset.h>
namespace Corpus2 { namespace Corpus2 {
...@@ -14,6 +15,9 @@ class LexicalUnit ...@@ -14,6 +15,9 @@ class LexicalUnit
{ {
public: public:
typedef std::map<std::string, std::string> strmap; typedef std::map<std::string, std::string> strmap;
typedef std::map<std::string, Wccl::StrSet> variables_map;
typedef std::set<std::string> strset; typedef std::set<std::string> strset;
typedef boost::shared_ptr<Wccl::Operator<Wccl::Bool> > BoolOpPtr; typedef boost::shared_ptr<Wccl::Operator<Wccl::Bool> > BoolOpPtr;
...@@ -34,7 +38,7 @@ public: ...@@ -34,7 +38,7 @@ public:
std::set<size_t> &out_positions) const; std::set<size_t> &out_positions) const;
const std::string & get_base() const{ return base_;} const std::string & get_base() const{ return base_;}
const strmap & get_variables() const{ return variables_;} const variables_map & get_variables() const{ return variables_;}
const strset& get_potential_bases() const{ return potential_bases_;} const strset& get_potential_bases() const{ return potential_bases_;}
...@@ -43,7 +47,7 @@ protected: ...@@ -43,7 +47,7 @@ protected:
boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition_; boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition_;
boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond_; boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond_;
strmap variables_; variables_map variables_;
std::string base_; std::string base_;
strset potential_bases_; strset potential_bases_;
...@@ -87,6 +91,8 @@ public: ...@@ -87,6 +91,8 @@ public:
const luvec& get_potential_lu(const std::string& base); const luvec& get_potential_lu(const std::string& base);
// TODO: method for reordering units by "length"
protected: protected:
typedef boost::unordered_map<std::string,luvec> value_type; typedef boost::unordered_map<std::string,luvec> value_type;
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include "mweparser.h" #include "mweparser.h"
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
namespace Corpus2{ namespace Corpus2{
bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
...@@ -28,7 +29,33 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( ...@@ -28,7 +29,33 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
/**
 * Fetches the next sentence from the wrapped reader and scans it for
 * multi-word expressions (MWEs).
 *
 * Every token of the sentence is visited in order. For each disambiguated
 * lexeme of a token, its lemma is looked up in the MWE index and every
 * candidate lexical unit returned for that lemma is tested against the
 * sentence at the current token position.
 *
 * Recognition is preliminary: the sentence itself is returned unmodified and
 * the findings are only written to standard output (token orths, "# " when
 * a lemma has candidate units, "** base** " for every unit that matched,
 * and "ENDL" after the last token).
 *
 * @return the sentence obtained from the inner reader; a null pointer is
 *         passed through untouched.
 */
Sentence::Ptr MWEReader::get_next_sentence()
{
	Sentence::Ptr pSentence = inner_reader_->get_next_sentence();
	// NOTE(review): the inner reader is presumed to signal end of input with
	// a null pointer -- confirm against TokenReader. Without this guard the
	// sentence context below would be built on (and dereference) null.
	if(!pSentence)
		return pSentence;

	Wccl::SentenceContext sc(pSentence);
	for(int i = 0; i < sc.size(); ++i){
		// The lexical unit conditions are evaluated relative to the
		// context's current position, so move it to the inspected token.
		sc.set_position(i);
		Corpus2::Token *pToken = (*pSentence)[i];
		std::cout << pToken->orth_utf8() << " ";

		const std::vector<Lexeme>& lexemes = pToken->lexemes();
		foreach(const Lexeme& lex, lexemes){
			// Only disambiguated readings are used for the index lookup.
			if(!lex.is_disamb())
				continue;

			const std::string base = lex.lemma_utf8();
			const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
			if(!potential.empty())
				std::cout << "# ";

			foreach(LexicalUnit::Ptr pLU, potential){
				// IsHere fills in the matched positions; they are not
				// consumed yet.
				// TODO: use the positions to annotate the sentence
				// instead of only printing the match.
				std::set<size_t> positions;
				if(pLU->IsHere(sc, positions))
					std::cout << "** " << pLU->get_base() << "** ";
			}
		}
	}
	std::cout << "ENDL\n";

	return pSentence;
}
boost::shared_ptr<Chunk> MWEReader::get_next_chunk() boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
...@@ -75,6 +102,7 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( ...@@ -75,6 +102,7 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
{ {
MWEParser parser(mwe_index_); MWEParser parser(mwe_index_);
parser.parse_file(filename); parser.parse_file(filename);
} }
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment