From 66d3eb6d3b198f1b520ea1c2b875ca39dec11aa3 Mon Sep 17 00:00:00 2001
From: Bartosz Broda <bartosz.broda@gmail.com>
Date: Tue, 14 Jun 2011 11:44:11 +0200
Subject: [PATCH] first working version of mwereader (for sentences only)

---
 libmwereader/mwe.cpp       |  2 +-
 libmwereader/mwe.h         |  2 +-
 libmwereader/mweparser.cpp |  4 ++-
 libmwereader/mwereader.cpp | 52 ++++++++++++++++++++++++++++++--------
 libmwereader/mwereader.h   |  3 +++
 libmwereader/mwertest.cpp  |  2 ++
 libmwereader/test_mwe.xml  |  9 +++++--
 7 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/libmwereader/mwe.cpp b/libmwereader/mwe.cpp
index 5799d2c..ec4bbe0 100644
--- a/libmwereader/mwe.cpp
+++ b/libmwereader/mwe.cpp
@@ -24,7 +24,7 @@ LexicalUnit::LexicalUnit(const std::string &base,
 }
 
 bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
-					std::set<size_t> &out_position, int &head_pos) const
+					std::set<int> &out_position, int &head_pos) const
 {
 	// set variables
 	for(variables_map::const_iterator ivars = variables_.begin();
diff --git a/libmwereader/mwe.h b/libmwereader/mwe.h
index 439323f..0fec3e0 100644
--- a/libmwereader/mwe.h
+++ b/libmwereader/mwe.h
@@ -35,7 +35,7 @@ public:
 	  * \returns true if this lexical unit was found here
 	  */
 	virtual bool IsHere(const Wccl::SentenceContext& sc,
-						std::set<size_t> &out_positions, int &head_pos) const;
+						std::set<int> &out_positions, int &head_pos) const;
 
 	const std::string & get_base() const{ return base_;}
 	const variables_map & get_variables() const{ return variables_;}
diff --git a/libmwereader/mweparser.cpp b/libmwereader/mweparser.cpp
index cda5674..54e8b27 100644
--- a/libmwereader/mweparser.cpp
+++ b/libmwereader/mweparser.cpp
@@ -42,6 +42,8 @@ namespace Corpus2 {
 		return op;
 	}
 
+	//////////////////////////////////////////////////////////////////////
+
 	MWEParser::MWEParser(MWEIndex &index)
 		: BasicSaxParser(), state_(NONE), mwe_index_(index)
 	{
@@ -71,7 +73,7 @@ namespace Corpus2 {
 									+ group_type_);
 		}
 
-
+		variables_.clear();
 	}
 
 	std::string MWEParser::get_attribute(const AttributeList& attributes,
diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp
index e6328be..6b0c861 100644
--- a/libmwereader/mwereader.cpp
+++ b/libmwereader/mwereader.cpp
@@ -30,33 +30,65 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 	{
 		// TODO MWE stuff
 		Sentence::Ptr pSentence = inner_reader_->get_next_sentence();
-
 		Wccl::SentenceContext sc(pSentence);
+		return process_sentence(sc);
+
+	}
+
+	/// Scans the sentence held by sc and collapses every lexical unit that
+	/// matches (LexicalUnit::IsHere) into its head token; the other tokens of
+	/// the match are deleted. Returns the sentence, which is modified in place.
+	Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc)
+	{

 		for(int i = 0; i < sc.size(); ++i){
 			sc.set_position(i);
-			Corpus2::Token *pToken = (*pSentence)[i];
-			std::cout << pToken->orth_utf8() << " ";
+			Corpus2::Token *pToken = sc.at(i);
+			bool merged = false; // set once a unit has been merged at this position
 			std::vector<Lexeme>& lexemes = pToken->lexemes();
 			foreach(const Lexeme& lex, lexemes){
 				if(lex.is_disamb()){
 					std::string base = lex.lemma_utf8();
 					const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
-					if(potential.size())
-						std::cout << "# ";
 					foreach(LexicalUnit::Ptr pLU, potential){
-						std::set<size_t> positions;
+						std::set<int> positions;
 						int head;
 						bool is_here = pLU->IsHere(sc, positions, head);
-						if(is_here)
-							std::cout << "** " << pLU->get_base() << "** ";
+						if(is_here){
+							std::vector<Token*> &tokens = sc.get_sentence_ptr()->tokens();
+							Token *head_tok = tokens[head];
+							std::string new_orth_utf8; // member orths joined by single spaces
+							foreach(const int &pos, positions){
+								if(!new_orth_utf8.empty())
+									new_orth_utf8 += " ";
+								new_orth_utf8 += tokens[pos]->orth_utf8();
+							}
+							head_tok->set_orth_utf8(new_orth_utf8);
+							foreach(Lexeme& head_lex, head_tok->lexemes())
+								if(head_lex.is_disamb())
+									head_lex.set_lemma_utf8(pLU->get_base());
+							// Erase back to front so the indices still to be visited stay valid.
+							std::set<int>::reverse_iterator rit = positions.rbegin();
+							for(; rit != positions.rend(); ++rit){
+								if(*rit == head)
+									continue;
+								delete tokens[*rit];
+								tokens.erase(tokens.begin() + *rit);
+								if(*rit <= i) // a token at or before i is gone: shift the scan
+									--i;
+							}
+							// pToken/lexemes may be freed or changed now: stop scanning here.
+							merged = true;
+							break;
+						}
 					}
+					if(merged)
+						break;
 				}
 			}
 		}

-		std::cout << "ENDL\n";
-		return pSentence;
+		return sc.get_sentence_ptr();
 	}
 
 	boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
diff --git a/libmwereader/mwereader.h b/libmwereader/mwereader.h
index 4c65c2e..93d1719 100644
--- a/libmwereader/mwereader.h
+++ b/libmwereader/mwereader.h
@@ -46,6 +46,9 @@ public:
 
 	static bool registered;
 
+protected:
+	Sentence::Ptr process_sentence(Wccl::SentenceContext & sc);
+
 private:
 	void load_mwes(const std::string& filename);
 
diff --git a/libmwereader/mwertest.cpp b/libmwereader/mwertest.cpp
index c5b9ceb..3265fad 100644
--- a/libmwereader/mwertest.cpp
+++ b/libmwereader/mwertest.cpp
@@ -11,4 +11,6 @@ int main(int ac, char**av)
 
 	MWEParser parser(temp_index);
 	parser.parse_file(av[1]);
+
+
 }
diff --git a/libmwereader/test_mwe.xml b/libmwereader/test_mwe.xml
index 020213f..85368ac 100644
--- a/libmwereader/test_mwe.xml
+++ b/libmwereader/test_mwe.xml
@@ -9,8 +9,8 @@
 				setvar($Pos2, 1),
 				
 				inter(class[0],{subst,ger,depr}),
-				inter(class[1],{subst,ger,depr}),
-				inter(cas[0], cas[1])
+				inter(class[1],{subst,ger,depr})//,
+				//inter(cas[0], cas[1]) - silly restriction, e.g. "Debatowali nad ceną netto"
 			)
 		</condition>
 		<instances>
@@ -24,6 +24,11 @@
 				<var name="Subst2">netto</var>
 				<head>inter(base[0], "waga")</head>
 			</MWE>
+			<MWE base="ratyfikacja traktatu">
+				<var name="Subst1">ratyfikacja</var>
+				<var name="Subst2">traktat</var>
+				<head>inter(base[0], "traktat")</head>
+			</MWE>
 		</instances>
 	</mwegroup>
 	<mwegroup name="AdjSubstFix" type="fix" class="subst">
-- 
GitLab