From c2317fdd77c7ad24738d2da5ad024e2f3778eeb1 Mon Sep 17 00:00:00 2001
From: Bartosz Broda <bartosz.broda@gmail.com>
Date: Tue, 14 Jun 2011 08:39:29 +0200
Subject: [PATCH] add preliminary recognition of MWEs

---
 libmwereader/mwe.cpp       | 21 ++++++++++++++++++---
 libmwereader/mwe.h         | 10 ++++++++--
 libmwereader/mwereader.cpp | 30 +++++++++++++++++++++++++++++-
 3 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/libmwereader/mwe.cpp b/libmwereader/mwe.cpp
index 6bd065a..cc1f537 100644
--- a/libmwereader/mwe.cpp
+++ b/libmwereader/mwe.cpp
@@ -1,5 +1,6 @@
 #include "mwe.h"
 #include <boost/algorithm/string.hpp>
+#include <libwccl/values/strset.h>
 
 namespace Corpus2{
 
@@ -9,22 +10,34 @@ LexicalUnit::LexicalUnit(const std::string &base,
 						 LexicalUnit::strmap variables)
 	: condition_(condition),
 	  head_cond_(head_cond),
-	  variables_(variables),
 	  base_(base),
 	  nowhere_(Wccl::Position())
 {
-	for(strmap::iterator iter = variables_.begin();
-		iter != variables_.end(); ++iter)
+	for(strmap::iterator iter = variables.begin();
+		iter != variables.end(); ++iter){
 		potential_bases_.insert(iter->second);
+		Wccl::StrSet ss;
+		ss.insert_utf8(iter->second);
+		variables_[iter->first] = ss;
+	}
+
 }
 
 bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
 					std::set<size_t> &out_position) const
 {
+	// set variables
+	for(variables_map::const_iterator ivars = variables_.begin();
+		ivars != variables_.end(); ++ivars){
+		condition_->set<Wccl::StrSet>(ivars->first, ivars->second);
+	}
+
+	// fire up the operator
 	boost::shared_ptr<const Wccl::Bool> pResult = condition_->apply(sc);
 	if(pResult->get_value() == false)
 		return false;
 
+	// fill up positions
 	foreach(const std::string&varname, condition_->valid_variable_names()){
 		if(boost::algorithm::starts_with(varname, "Pos")){
 			Wccl::Position pos = condition_->get<Wccl::Position>(varname);
@@ -36,6 +49,8 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
 			out_position.insert( sc.get_abs_position(pos) );
 		}
 	}
+
+	// TODO head position
 	return true;
 }
 
diff --git a/libmwereader/mwe.h b/libmwereader/mwe.h
index e0ee53c..77a38fe 100644
--- a/libmwereader/mwe.h
+++ b/libmwereader/mwe.h
@@ -5,6 +5,7 @@
 
 #include <libcorpus2/io/reader.h>
 #include <libwccl/ops/operator.h>
+#include <libwccl/values/strset.h>
 
 namespace Corpus2 {
 
@@ -14,6 +15,9 @@ class LexicalUnit
 {
 public:
 	typedef std::map<std::string, std::string> strmap;
+
+	typedef std::map<std::string, Wccl::StrSet> variables_map;
+
 	typedef std::set<std::string> strset;
 	typedef boost::shared_ptr<Wccl::Operator<Wccl::Bool> > BoolOpPtr;
 
@@ -34,7 +38,7 @@ public:
 						std::set<size_t> &out_positions) const;
 
 	const std::string & get_base() const{ return base_;}
-	const strmap & get_variables() const{ return variables_;}
+	const variables_map & get_variables() const{ return variables_;}
 	const strset& get_potential_bases() const{ return potential_bases_;}
 
 
@@ -43,7 +47,7 @@ protected:
 
 	boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition_;
 	boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond_;
-	strmap variables_;
+	variables_map variables_;
 	std::string base_;
 
 	strset potential_bases_;
@@ -87,6 +91,8 @@ public:
 
 	const luvec& get_potential_lu(const std::string& base);
 
+	// TODO: method for reordering units by "length"
+
 protected:
 
 	typedef boost::unordered_map<std::string,luvec> value_type;
diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp
index d7b38c7..26e4c6a 100644
--- a/libmwereader/mwereader.cpp
+++ b/libmwereader/mwereader.cpp
@@ -2,6 +2,7 @@
 #include "mweparser.h"
 #include <boost/algorithm/string.hpp>
 
+
 namespace Corpus2{
 
 bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
@@ -28,7 +29,33 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 	Sentence::Ptr MWEReader::get_next_sentence()
 	{
 		// TODO MWE stuff
-		return inner_reader_->get_next_sentence();
+		Sentence::Ptr pSentence = inner_reader_->get_next_sentence();
+		// A null sentence (presumably end of input -- confirm against the
+		// TokenReader contract) is passed through, never dereferenced.
+		if(!pSentence)
+			return pSentence;
+		// Preliminary recognition: matches are only reported on stdout.
+		Wccl::SentenceContext sc(pSentence);
+		for(int i = 0; i < sc.size(); ++i){
+			sc.set_position(i);
+			Corpus2::Token *pToken = (*pSentence)[i];
+			std::cout << pToken->orth_utf8() << " ";
+			foreach(const Lexeme& lex, pToken->lexemes()){
+				if(lex.is_disamb()){
+					std::string base = lex.lemma_utf8();
+					const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
+					if(potential.size())
+						std::cout << "# ";
+					foreach(LexicalUnit::Ptr pLU, potential){
+						std::set<size_t> positions;
+						if(pLU->IsHere(sc, positions))
+							std::cout << "** " << pLU->get_base() << "** ";
+					}
+				}
+			}
+		}
+		std::cout << "ENDL\n";
+		return pSentence;
 	}
 
 	boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
@@ -75,6 +102,7 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 	{
 		MWEParser parser(mwe_index_);
 		parser.parse_file(filename);
+
 	}
 
 
-- 
GitLab