Skip to content
Snippets Groups Projects
Commit c2317fdd authored by Bartosz Broda's avatar Bartosz Broda
Browse files

add preliminary recognition of MWEs

parent e70ded36
Branches
No related tags found
No related merge requests found
#include "mwe.h"
#include <boost/algorithm/string.hpp>
#include <libwccl/values/strset.h>
namespace Corpus2{
......@@ -9,22 +10,34 @@ LexicalUnit::LexicalUnit(const std::string &base,
LexicalUnit::strmap variables)
: condition_(condition),
head_cond_(head_cond),
variables_(variables),
base_(base),
nowhere_(Wccl::Position())
{
for(strmap::iterator iter = variables_.begin();
iter != variables_.end(); ++iter)
for(strmap::iterator iter = variables.begin();
iter != variables.end(); ++iter){
potential_bases_.insert(iter->second);
Wccl::StrSet ss;
ss.insert_utf8(iter->second);
variables_[iter->first] = ss;
}
}
bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
std::set<size_t> &out_position) const
{
// set variables
for(variables_map::const_iterator ivars = variables_.begin();
ivars != variables_.end(); ++ivars){
condition_->set<Wccl::StrSet>(ivars->first, ivars->second);
}
// fire up the operator
boost::shared_ptr<const Wccl::Bool> pResult = condition_->apply(sc);
if(pResult->get_value() == false)
return false;
// fill up positions
foreach(const std::string&varname, condition_->valid_variable_names()){
if(boost::algorithm::starts_with(varname, "Pos")){
Wccl::Position pos = condition_->get<Wccl::Position>(varname);
......@@ -36,6 +49,8 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
out_position.insert( sc.get_abs_position(pos) );
}
}
// TODO head position
return true;
}
......
......@@ -5,6 +5,7 @@
#include <libcorpus2/io/reader.h>
#include <libwccl/ops/operator.h>
#include <libwccl/values/strset.h>
namespace Corpus2 {
......@@ -14,6 +15,9 @@ class LexicalUnit
{
public:
typedef std::map<std::string, std::string> strmap;
typedef std::map<std::string, Wccl::StrSet> variables_map;
typedef std::set<std::string> strset;
typedef boost::shared_ptr<Wccl::Operator<Wccl::Bool> > BoolOpPtr;
......@@ -34,7 +38,7 @@ public:
std::set<size_t> &out_positions) const;
const std::string & get_base() const{ return base_;}
const strmap & get_variables() const{ return variables_;}
const variables_map & get_variables() const{ return variables_;}
const strset& get_potential_bases() const{ return potential_bases_;}
......@@ -43,7 +47,7 @@ protected:
boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition_;
boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond_;
strmap variables_;
variables_map variables_;
std::string base_;
strset potential_bases_;
......@@ -87,6 +91,8 @@ public:
const luvec& get_potential_lu(const std::string& base);
// TODO: method for reordering units by "length"
protected:
typedef boost::unordered_map<std::string,luvec> value_type;
......
......@@ -2,6 +2,7 @@
#include "mweparser.h"
#include <boost/algorithm/string.hpp>
namespace Corpus2{
bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
......@@ -28,7 +29,33 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
/**
 * Fetches the next sentence from the wrapped reader and performs a
 * preliminary multi-word expression (MWE) scan over it.
 *
 * For every token, the orth is printed to stdout; for each disambiguated
 * lexeme of the token, the lexical units registered in the MWE index under
 * that lemma are tested at the token's position via LexicalUnit::IsHere().
 * A "# " marker is printed when candidate units exist and
 * "** <base>** " is printed for every unit that matches. The line is closed
 * with "ENDL".
 *
 * The sentence itself is not modified yet; the matched positions are
 * collected but currently only used for the diagnostic output.
 *
 * @return the sentence obtained from the inner reader (returned unchanged);
 *         an empty pointer is passed through as-is.
 */
Sentence::Ptr MWEReader::get_next_sentence()
{
    Sentence::Ptr pSentence = inner_reader_->get_next_sentence();

    // NOTE(review): the inner reader presumably yields an empty pointer at
    // end of input -- confirm. Without this guard the context below would be
    // built on (and the tokens read through) a null sentence.
    if(!pSentence)
        return pSentence;

    Wccl::SentenceContext sc(pSentence);
    for(int i = 0; i < sc.size(); ++i){
        // the operators are evaluated relative to the current position
        sc.set_position(i);
        Corpus2::Token *pToken = (*pSentence)[i];
        std::cout << pToken->orth_utf8() << " ";

        std::vector<Lexeme>& lexemes = pToken->lexemes();
        foreach(const Lexeme& lex, lexemes){
            // only disambiguated readings are used to look up candidates
            if(lex.is_disamb()){
                std::string base = lex.lemma_utf8();
                const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
                if(potential.size())
                    std::cout << "# ";
                foreach(LexicalUnit::Ptr pLU, potential){
                    // filled by IsHere(); not consumed yet beyond the check
                    std::set<size_t> positions;
                    bool is_here = pLU->IsHere(sc, positions);
                    if(is_here)
                        std::cout << "** " << pLU->get_base() << "** ";
                }
            }
        }
    }
    std::cout << "ENDL\n";

    return pSentence;
}
boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
......@@ -75,6 +102,7 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
{
MWEParser parser(mwe_index_);
parser.parse_file(filename);
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment