From c2317fdd77c7ad24738d2da5ad024e2f3778eeb1 Mon Sep 17 00:00:00 2001 From: Bartosz Broda <bartosz.broda@gmail.com> Date: Tue, 14 Jun 2011 08:39:29 +0200 Subject: [PATCH] add preliminary recognition of MWEs --- libmwereader/mwe.cpp | 21 ++++++++++++++++++--- libmwereader/mwe.h | 10 ++++++++-- libmwereader/mwereader.cpp | 30 +++++++++++++++++++++++++++++- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/libmwereader/mwe.cpp b/libmwereader/mwe.cpp index 6bd065a..cc1f537 100644 --- a/libmwereader/mwe.cpp +++ b/libmwereader/mwe.cpp @@ -1,5 +1,6 @@ #include "mwe.h" #include <boost/algorithm/string.hpp> +#include <libwccl/values/strset.h> namespace Corpus2{ @@ -9,22 +10,34 @@ LexicalUnit::LexicalUnit(const std::string &base, LexicalUnit::strmap variables) : condition_(condition), head_cond_(head_cond), - variables_(variables), base_(base), nowhere_(Wccl::Position()) { - for(strmap::iterator iter = variables_.begin(); - iter != variables_.end(); ++iter) + for(strmap::iterator iter = variables.begin(); + iter != variables.end(); ++iter){ potential_bases_.insert(iter->second); + Wccl::StrSet ss; + ss.insert_utf8(iter->second); + variables_[iter->first] = ss; + } + } bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc, std::set<size_t> &out_position) const { + // set variables + for(variables_map::const_iterator ivars = variables_.begin(); + ivars != variables_.end(); ++ivars){ + condition_->set<Wccl::StrSet>(ivars->first, ivars->second); + } + + // fire up the operator boost::shared_ptr<const Wccl::Bool> pResult = condition_->apply(sc); if(pResult->get_value() == false) return false; + // fill up positions foreach(const std::string&varname, condition_->valid_variable_names()){ if(boost::algorithm::starts_with(varname, "Pos")){ Wccl::Position pos = condition_->get<Wccl::Position>(varname); @@ -36,6 +49,8 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc, out_position.insert( sc.get_abs_position(pos) ); } } + + // TODO head position return true; } diff --git a/libmwereader/mwe.h b/libmwereader/mwe.h index e0ee53c..77a38fe 100644 --- a/libmwereader/mwe.h +++ b/libmwereader/mwe.h @@ -5,6 +5,7 @@ #include <libcorpus2/io/reader.h> #include <libwccl/ops/operator.h> +#include <libwccl/values/strset.h> namespace Corpus2 { @@ -14,6 +15,9 @@ class LexicalUnit { public: typedef std::map<std::string, std::string> strmap; + + typedef std::map<std::string, Wccl::StrSet> variables_map; + typedef std::set<std::string> strset; typedef boost::shared_ptr<Wccl::Operator<Wccl::Bool> > BoolOpPtr; @@ -34,7 +38,7 @@ public: std::set<size_t> &out_positions) const; const std::string & get_base() const{ return base_;} - const strmap & get_variables() const{ return variables_;} + const variables_map & get_variables() const{ return variables_;} const strset& get_potential_bases() const{ return potential_bases_;} @@ -43,7 +47,7 @@ protected: boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition_; boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond_; - strmap variables_; + variables_map variables_; std::string base_; strset potential_bases_; @@ -87,6 +91,8 @@ public: const luvec& get_potential_lu(const std::string& base); + // TODO: method for reordering units by "length" + protected: typedef boost::unordered_map<std::string,luvec> value_type; diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp index d7b38c7..26e4c6a 100644 --- a/libmwereader/mwereader.cpp +++ b/libmwereader/mwereader.cpp @@ -2,6 +2,7 @@ #include "mweparser.h" #include <boost/algorithm/string.hpp> + namespace Corpus2{ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( @@ -28,7 +29,33 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( Sentence::Ptr MWEReader::get_next_sentence() { // TODO MWE stuff - return inner_reader_->get_next_sentence(); + Sentence::Ptr pSentence = inner_reader_->get_next_sentence(); + + Wccl::SentenceContext sc(pSentence); + + for(int i = 0; i < sc.size(); ++i){ + sc.set_position(i); + Corpus2::Token *pToken = (*pSentence)[i]; + std::cout << pToken->orth_utf8() << " "; + std::vector<Lexeme>& lexemes = pToken->lexemes(); + foreach(const Lexeme& lex, lexemes){ + if(lex.is_disamb()){ + std::string base = lex.lemma_utf8(); + const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base); + if(potential.size()) + std::cout << "# "; + foreach(LexicalUnit::Ptr pLU, potential){ + std::set<size_t> positions; + bool is_here = pLU->IsHere(sc, positions); + if(is_here) + std::cout << "** " << pLU->get_base() << "** "; + } + } + } + } + + std::cout << "ENDL\n"; + return pSentence; } boost::shared_ptr<Chunk> MWEReader::get_next_chunk() @@ -75,6 +102,7 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( { MWEParser parser(mwe_index_); parser.parse_file(filename); + } -- GitLab