Skip to content
Snippets Groups Projects
Commit c2317fdd authored by Bartosz Broda's avatar Bartosz Broda
Browse files

add preliminary recognition of MWEs

parent e70ded36
Branches
No related merge requests found
#include "mwe.h"
#include <boost/algorithm/string.hpp>
#include <libwccl/values/strset.h>
namespace Corpus2{
......@@ -9,22 +10,34 @@ LexicalUnit::LexicalUnit(const std::string &base,
LexicalUnit::strmap variables)
: condition_(condition),
head_cond_(head_cond),
variables_(variables),
base_(base),
nowhere_(Wccl::Position())
{
for(strmap::iterator iter = variables_.begin();
iter != variables_.end(); ++iter)
for(strmap::iterator iter = variables.begin();
iter != variables.end(); ++iter){
potential_bases_.insert(iter->second);
Wccl::StrSet ss;
ss.insert_utf8(iter->second);
variables_[iter->first] = ss;
}
}
/**
 * Checks whether this lexical unit's condition holds for the given
 * sentence context.
 *
 * Every entry of variables_ is first copied into condition_ as a
 * Wccl::StrSet variable, then condition_ is applied to sc.
 *
 * @param sc sentence context the condition is applied to
 * @param out_position receives sc.get_abs_position(pos) for positions read
 *        from condition_ variables whose names start with "Pos"; nothing is
 *        inserted when the condition evaluates to false
 * @return false when the condition evaluates to false, true otherwise
 *
 * NOTE(review): part of the position loop is hidden behind a diff hunk
 * marker in this view, so any extra filtering done there is not described.
 */
bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
std::set<size_t> &out_position) const
{
// set variables: push each stored string set into the operator under its name
for(variables_map::const_iterator ivars = variables_.begin();
ivars != variables_.end(); ++ivars){
condition_->set<Wccl::StrSet>(ivars->first, ivars->second);
}
// fire up the operator: evaluate the condition against the sentence context
boost::shared_ptr<const Wccl::Bool> pResult = condition_->apply(sc);
// no match -> bail out before touching out_position
if(pResult->get_value() == false)
return false;
// fill up positions: only variables named "Pos..." are read as positions
foreach(const std::string&varname, condition_->valid_variable_names()){
if(boost::algorithm::starts_with(varname, "Pos")){
Wccl::Position pos = condition_->get<Wccl::Position>(varname);
......@@ -36,6 +49,8 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
// store the position as an absolute index obtained from the sentence context
out_position.insert( sc.get_abs_position(pos) );
}
}
// TODO head position (head_cond_ is not evaluated in the visible code)
return true;
}
......
......@@ -5,6 +5,7 @@
#include <libcorpus2/io/reader.h>
#include <libwccl/ops/operator.h>
#include <libwccl/values/strset.h>
namespace Corpus2 {
......@@ -14,6 +15,9 @@ class LexicalUnit
{
public:
typedef std::map<std::string, std::string> strmap;
typedef std::map<std::string, Wccl::StrSet> variables_map;
typedef std::set<std::string> strset;
typedef boost::shared_ptr<Wccl::Operator<Wccl::Bool> > BoolOpPtr;
......@@ -34,7 +38,7 @@ public:
std::set<size_t> &out_positions) const;
const std::string & get_base() const{ return base_;}
const strmap & get_variables() const{ return variables_;}
const variables_map & get_variables() const{ return variables_;}
const strset& get_potential_bases() const{ return potential_bases_;}
......@@ -43,7 +47,7 @@ protected:
boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition_;
boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond_;
strmap variables_;
variables_map variables_;
std::string base_;
strset potential_bases_;
......@@ -87,6 +91,8 @@ public:
const luvec& get_potential_lu(const std::string& base);
// TODO: method for reordering units by "length"
protected:
typedef boost::unordered_map<std::string,luvec> value_type;
......
......@@ -2,6 +2,7 @@
#include "mweparser.h"
#include <boost/algorithm/string.hpp>
namespace Corpus2{
bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
......@@ -28,7 +29,33 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
/**
 * Fetches the next sentence from the wrapped reader and runs a preliminary
 * multi-word expression (MWE) recognition pass over it.
 *
 * For every token, the lemma of each disambiguated lexeme is looked up in
 * mwe_index_, and every candidate lexical unit returned for that lemma is
 * tested at the token's position with LexicalUnit::IsHere().
 *
 * The findings are currently only traced on standard output:
 *  - each token's orth followed by a space,
 *  - "# " when the index returns at least one candidate for a lemma,
 *  - "** <base>** " for every candidate whose condition holds,
 *  - "ENDL\n" after the last token.
 * The positions collected by IsHere() are discarded.
 *
 * @return the sentence obtained from the inner reader; when the inner
 *         reader yields a null pointer, it is returned as-is and nothing
 *         is printed
 */
Sentence::Ptr MWEReader::get_next_sentence()
{
	// TODO MWE stuff
	// Read exactly one sentence. The former unconditional
	// "return inner_reader_->get_next_sentence();" that preceded this line
	// made the whole recognition pass below unreachable.
	Sentence::Ptr pSentence = inner_reader_->get_next_sentence();
	if(!pSentence){
		// No sentence to analyse (presumably end of input -- confirm against
		// the TokenReader contract); building a SentenceContext around a
		// null pointer and dereferencing it below would be undefined.
		return pSentence;
	}

	Wccl::SentenceContext sc(pSentence);
	for(int i = 0; i < sc.size(); ++i){
		// the lexical unit conditions are evaluated relative to this position
		sc.set_position(i);
		Corpus2::Token *pToken = (*pSentence)[i];
		std::cout << pToken->orth_utf8() << " ";

		std::vector<Lexeme>& lexemes = pToken->lexemes();
		foreach(const Lexeme& lex, lexemes){
			// only disambiguated readings are used as lookup keys
			if(lex.is_disamb()){
				std::string base = lex.lemma_utf8();
				const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
				if(potential.size())
					std::cout << "# ";

				foreach(LexicalUnit::Ptr pLU, potential){
					std::set<size_t> positions;
					bool is_here = pLU->IsHere(sc, positions);
					if(is_here)
						std::cout << "** " << pLU->get_base() << "** ";
				}
			}
		}
	}
	std::cout << "ENDL\n";

	return pSentence;
}
boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
......@@ -75,6 +102,7 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
{
MWEParser parser(mwe_index_);
parser.parse_file(filename);
}
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment