merged with mwe_fix

7c4939c7 · mateuszg · f28ca379 · 7c4939c7 · 7c4939c7 · 7c4939c7
Commit 7c4939c7 authored Apr 18, 2019 by mateuszg
--- a/libmwereader/mwe.cpp
+++ b/libmwereader/mwe.cpp
@@ -24,10 +24,8 @@ namespace Corpus2{
 LexicalUnit::LexicalUnit(const std::string &base,
 						 LexicalUnit::BoolOpPtr condition,
-						 LexicalUnit::BoolOpPtr head_cond,
 						 LexicalUnit::strmap variables)
 	: condition_(condition),
-	  head_cond_(head_cond),
 	  base_(base),
 	  nowhere_(Wccl::Position())
 {
@@ -42,32 +40,25 @@ LexicalUnit::LexicalUnit(const std::string &base,
 }
-bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
+bool LexicalUnit::IsHere(const boost::shared_ptr<Wccl::SentenceContext> sc,
 					std::set<int> &out_position, int &head_pos) const
 {
 	// set variables, skip vars with names starting with '!'
 	for(variables_map::const_iterator ivars = variables_.begin();
 		ivars != variables_.end(); ++ivars){			
 			if(!boost::starts_with(ivars->first, "!")){
-				/*std::cout << " -- " << base_ << " -- " << ivars->first << " -- " << std::endl;
-				for (unsigned i = 0; i < condition_->valid_variable_names().size(); i++)
-					std::cout << condition_->valid_variable_names()[i] << std::endl;*/
 				condition_->set<Wccl::StrSet>(ivars->first, ivars->second);
-				//std::cout << " -- egi --" << std::endl;
 			}
 		}
 	// fire up the operator
-	boost::shared_ptr<const Wccl::Bool> pResult = condition_->apply(sc);
+	boost::shared_ptr<const Wccl::Bool> pResult = condition_->apply(*sc);
 	if(pResult->get_value() == false)
 		return false;
 	bool found_head = false;
-	bool head_defined = false;
-	Wccl::SentenceContext sc2(sc.get_sentence_ptr());
+	Wccl::SentenceContext sc2(sc->get_sentence_ptr());
 	// fill up positions
 	BOOST_FOREACH (const std::string&varname, condition_->valid_variable_names()){
@@ -78,21 +69,13 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
 				errmsg += " Offending unit: " + base_;
 				throw Wccl::WcclError(errmsg);
 			}
-			int abs_pos = sc.get_abs_position(pos);
+			int abs_pos = sc->get_abs_position(pos);
 			out_position.insert( abs_pos );
-			if(!found_head){
-				sc2.set_position(abs_pos);
-				if(head_cond_->apply(sc2)->get_value()) {
-					if (!head_defined)
-						head_pos = abs_pos;
-					found_head = true;
-				}
-			}
 		}
 		if(boost::algorithm::starts_with(varname, "Head")) {
 			Wccl::Position predefined_head_pos = condition_->get<Wccl::Position>(varname);
-			head_pos = sc.get_abs_position(predefined_head_pos);
+			head_pos = sc->get_abs_position(predefined_head_pos);
-			head_defined = true;
+			found_head = true;
 		}
 	}
@@ -102,7 +85,12 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
 		//throw Wccl::WcclError(errmsg);
 		return false;
 	}
+	if(out_position.empty()) {
+		std::string errmsg("MWE found, but positions of MWE elements were " 
+				"not marked - check setvars in MWE dictionary.");
+		std::cout << errmsg << std::endl;
+		return false;
+	}
 	return true;
 }
@@ -111,17 +99,15 @@ bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
 FixedLU::FixedLU(const std::string &base,
 				boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition,
-				boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond,
 				std::map<std::string, std::string> variables)
-		: LexicalUnit(base, condition, head_cond, variables)
+		: LexicalUnit(base, condition, variables)
 {
 }
 FlexLU::FlexLU(const std::string &base,
 				boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition,
-				boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond,
 				std::map<std::string, std::string> variables)
-		: LexicalUnit(base, condition, head_cond, variables)
+		: LexicalUnit(base, condition, variables)
 {
 }

--- a/libmwereader/mwe.h
+++ b/libmwereader/mwe.h
@@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 #define LIBMWEREADER_MWE_H
 #include <boost/unordered_map.hpp>
+#include <boost/shared_ptr.hpp>
 #include <libcorpus2/io/reader.h>
 #include <libwccl/ops/operator.h>
@@ -38,9 +39,7 @@ public:
 	typedef std::set<std::string> strset;
 	typedef boost::shared_ptr<Wccl::Operator<Wccl::Bool> > BoolOpPtr;
-	LexicalUnit(const std::string &base, BoolOpPtr condition,
+	LexicalUnit(const std::string &base, BoolOpPtr condition, strmap variables);
-				BoolOpPtr head_cond, strmap variables
-				);
 	/**
 	  * \param sc SentenceContext with position set to value which
@@ -51,7 +50,7 @@ public:
 	  * sentence context
 	  * \returns true if this lexical unit was found here
 	  */
-	virtual bool IsHere(const Wccl::SentenceContext& sc,
+	virtual bool IsHere(const boost::shared_ptr<Wccl::SentenceContext> sc,
 			std::set<int> &out_positions, int &head_pos) const;
 	const std::string & get_base() const{ return base_;}
@@ -63,7 +62,6 @@ public:
 protected:
 	boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition_;
-	boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond_;
 	variables_map variables_;
 	std::string base_;
@@ -81,7 +79,6 @@ class FixedLU : public LexicalUnit
 public:
 	FixedLU(const std::string &base,
 				LexicalUnit::BoolOpPtr condition,
-				LexicalUnit::BoolOpPtr head_cond,
 				LexicalUnit::strmap variables
 				);
 };
@@ -91,7 +88,6 @@ class FlexLU : public LexicalUnit
 public:
 	FlexLU(const std::string &base,
 				LexicalUnit::BoolOpPtr condition,
-				LexicalUnit::BoolOpPtr head_cond,
 				LexicalUnit::strmap variables
 				);

--- a/libmwereader/mweparser.cpp
+++ b/libmwereader/mweparser.cpp
@@ -35,11 +35,6 @@ namespace Corpus2 {
 	{
 	}
-	MWEBuilder::BoolOpPtr MWEBuilder::get_head_condition(
-		const std::string & headcond)
-	{
-		return get_condition(headcond, head_conditions_);
-	}
 	MWEBuilder::BoolOpPtr MWEBuilder::get_mwe_condition(
 		const std::string &cond)
 	{
@@ -75,8 +70,6 @@ namespace Corpus2 {
 	{
 		MWEBuilder::BoolOpPtr main = mwe_builder_->get_mwe_condition(
 					wccl_operator_);
-		MWEBuilder::BoolOpPtr head = mwe_builder_->get_head_condition(
-					head_cond_);
 		std::vector<std::string> valid_vars = main->valid_variable_names();
 		for (str_map::iterator it = variables_.begin(); it != variables_.end(); ++it)
@@ -84,11 +77,13 @@ namespace Corpus2 {
 			if (std::find(valid_vars.begin(), valid_vars.end(), it->first) != valid_vars.end())
 			{
 				if(group_type_ == "fix"){ // group_name_  -> lower case
-					mwe_index_.add_lexicalunit( LexicalUnit::Ptr(new FixedLU(mwe_base_, main, head,
+					mwe_index_.add_lexicalunit( 
-													  variables_)));
+							LexicalUnit::Ptr(new FixedLU(mwe_base_, main, variables_))
+					);
 				} else if(group_type_ == "flex"){
-					mwe_index_.add_lexicalunit(LexicalUnit::Ptr(new FlexLU(mwe_base_, main, head,
+					mwe_index_.add_lexicalunit(
-													variables_)));
+							LexicalUnit::Ptr(new FlexLU(mwe_base_, main, variables_))
+					);
 				} else {
 					throw Wccl::WcclError("Unknown type of lexical unit:"
 											+ group_type_);
@@ -156,10 +151,6 @@ namespace Corpus2 {
 			var_name_ = get_attribute(attributes, "name");
 			grab_characters_ = true;
 			clear_buf();
-		} else if(state_ == MWE && name == "head"){
-			state_ = HEAD;
-			grab_characters_ = true;
-			clear_buf();
 		}
 	}
@@ -181,9 +172,6 @@ namespace Corpus2 {
 		} else if(state_ == VAR && name == "var"){
 			state_ = MWE;
 			variables_[var_name_] = finish_get_text();
-		} else if(state_ == HEAD && name == "head"){
-			state_ = MWE;
-			head_cond_ = finish_get_text();
 		} else{
 			std::cerr << "Wrong state_:" << state_ << " for name: "
 					<< name << std::endl;
@@ -196,7 +184,6 @@ namespace Corpus2 {
 		BOOST_FOREACH (str_map::value_type &i, variables_)
 			out << i.first << ": " << i.second << ", ";
-		out << "\nWarunek głowy: " << head_cond_ << "\n";
 		if(with_condition){
 			out << "Grupa jednostek: " << group_name_ << std::endl;
 			out << "Operator: " << wccl_operator_ << std::endl;

--- a/libmwereader/mwereader.cpp
+++ b/libmwereader/mwereader.cpp
@@ -20,10 +20,16 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 #include <boost/algorithm/string.hpp>
 #include <boost/filesystem.hpp>
 #include <boost/unordered_set.hpp>
+#include <boost/lexical_cast.hpp>
 namespace Corpus2{
+typedef boost::shared_ptr<Wccl::SentenceContext> SentenceContextPtr;
+typedef boost::shared_ptr<AnnotatedSentence> AnnotatedSentencePtr;
+typedef boost::shared_ptr<TokenMetaData> TokenMetaDataPtr;
+typedef std::map<std::string, AnnotationChannel> ChanMapT;
 bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 	"mwereader","inner,mwefile"); // TODO more help?
@@ -33,7 +39,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 		mwes_counter=0;
 	}
-	MWEReader::MWEReader(const Tagset &tagset, const std::string &filename, TokenReaderPtr reader)
+	MWEReader::MWEReader(const Tagset &tagset, const std::string &filename, 
+			TokenReaderPtr reader)
 		: TokenReader(tagset), inner_filename_(filename)
 	{
 		mwes_counter=0;
@@ -85,6 +92,59 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 		return process_sentence(currentSentence);
 	}
+	void MWEReader::add_mwe_channel(SentenceContextPtr sentence_ctx,
+			int head, const std::set<int>& all,
+			int annotation_number, const std::string &new_base) {
+		Corpus2::Sentence::Ptr sentence = sentence_ctx->get_sentence_ptr();	
+		AnnotatedSentencePtr ann_sentence = AnnotatedSentence::wrap_sentence(sentence);
+		std::string new_orth = get_new_orth_utf8(sentence, all);
+		std::vector<Token*> &tokens = ann_sentence->tokens();
+		// create the 'mwe' channel if it does not exist yet
+		ChanMapT chan_map = ann_sentence->all_channels();
+		if (chan_map.find("mwe") == chan_map.end()) {
+			ann_sentence->create_channel("mwe");
+		}
+		AnnotationChannel& channel = ann_sentence->get_channel("mwe");
+		// if the head token already has a positive annotation number, keep it
+		int head_ann_num = channel.get_segment_at(head);
+		// if not, use the new annotation number (the MWE base is set on the head below)
+		if (head_ann_num <= 0) {
+			head_ann_num = annotation_number;
+		}
+		channel.set_segment_at(head, head_ann_num);
+		// create metadata if it does not exist, for the 'mwe_base' prop
+		if (!tokens[head]->get_metadata()) {
+			tokens[head]->create_metadata();
+		}
+		TokenMetaDataPtr md = tokens[head]->get_metadata();
+		md->set_attribute("mwe_base", new_base);
+		// annotate mwe elements with annotation_number of head
+		std::set<int>::iterator pos_it;
+		int ann_num;
+		for (pos_it = all.begin(); pos_it != all.end(); ++pos_it) {
+			ann_num = channel.get_segment_at(*pos_it);
+			if (ann_num <= 0) {
+				ann_num = head_ann_num; 
+			}
+			channel.set_segment_at(*pos_it, ann_num);
+		}
+		// move context position to next token after MWE elements
+		int curr_position = sentence_ctx->get_position();
+		if (curr_position + all.size() < sentence->size()) {
+			sentence_ctx->set_position(sentence_ctx->get_position() + all.size());
+		}
+	}
 	Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence)
 	{
 		boost::unordered_set<std::string> available_bases;
@@ -93,12 +153,14 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 				if (sentence->at(i)->lexemes()[j].is_disamb())
 					available_bases.insert(sentence->at(i)->lexemes()[j].lemma_utf8());
+		// TODO: pass annotated sentence to methods
+		// AnnotatedSentencePtr ann_sentence = AnnotatedSentence::wrap_sentence(sentence);
+		int annotation_number = 0;
+		SentenceContextPtr sc = boost::make_shared<Wccl::SentenceContext>(sentence);
-		Wccl::SentenceContext sc(sentence);
+		for (sc->goto_start(); sc->is_current_inside(); sc->advance())
-		for (sc.goto_start(); sc.is_current_inside(); sc.advance())
 		{
-			Corpus2::Token *pToken = sc.current();
+			Corpus2::Token *pToken = sc->current();
 			std::vector<Lexeme>& lexemes = pToken->lexemes();
 			if(lexemes.size() == 0)
 				continue;
@@ -124,27 +186,38 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 							std::set<int> positions;
 							int head;
 							bool is_here = pLU->IsHere(sc, positions, head);
-							if(is_here)
+							if(is_here) {
-								sc = clone_sentence_add_mwe(sc, head, positions, pLU->get_base());
+								if (annotate) {
+									add_mwe_channel(
+										sc, head, positions,
+										++annotation_number,
+										pLU->get_base());
+								}
+								else {
+								      sc = clone_sentence_add_mwe(
+								      	sc, head, positions, 
+								      	pLU->get_base());
+								}
 							}
 						}
 					}
 				}
 			}
-		return sc.get_sentence_ptr();
+		}
+		return sc->get_sentence_ptr();
 	}
-	Wccl::SentenceContext MWEReader::clone_sentence_add_mwe(Wccl::SentenceContext sentence,
+	SentenceContextPtr MWEReader::clone_sentence_add_mwe(SentenceContextPtr sentence,
-										  int head, const std::set<int>& all,
+			int head, const std::set<int>& all, const std::string &new_base)
-										  const std::string &new_base)
 	{
-		std::string new_orth = get_new_orth_utf8(sentence.get_sentence_ptr(), all);
+		std::string new_orth = get_new_orth_utf8(sentence->get_sentence_ptr(), all);
-		Sentence::Ptr new_sentence = boost::make_shared<Sentence>();
+		Sentence::Ptr new_sentence = boost::make_shared<AnnotatedSentence>();
+		new_sentence->set_id(sentence->get_sentence_ptr()->id());
-		Wccl::SentenceContext new_context(new_sentence);
+		SentenceContextPtr new_context = boost::make_shared<Wccl::SentenceContext>(new_sentence);
-		new_context.set_position(sentence.get_position());
+		new_context->set_position(sentence->get_position());
-		std::vector<Token*> &tokens = sentence.get_sentence_ptr()->tokens();
+		std::vector<Token*> &tokens = sentence->get_sentence_ptr()->tokens();
 		for (int i = 0; i < (int)tokens.size(); i++)
 		{
@@ -162,8 +235,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 			else if( all.find(i) == all.end())
 				new_sentence->append(tokens[i]->clone());
-			else if (i < sentence.get_position())
+			else if (i < sentence->get_position())
-				new_context.recede();
+				new_context->recede();
 		}
 		return new_context;
 	}
@@ -189,6 +262,7 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 			return currentChunk;
 		 boost::shared_ptr<Chunk> new_chunk = boost::make_shared<Chunk>();
+		 new_chunk->set_attribute("id", currentChunk->get_attribute("id"));
 		 BOOST_FOREACH (Corpus2::Sentence::Ptr sentence, currentChunk->sentences())
 			 new_chunk->append( process_sentence(sentence) );
@@ -203,6 +277,9 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 			inner_reader_type = option.substr(6);
 			reset();
 		}
+		if(boost::algorithm::starts_with(option, "annotations:")) {
+			annotate = boost::lexical_cast<bool>(option.substr(12));
+		}
 		if(boost::algorithm::starts_with(option, "mwefile:")) {
 			std::string mwefile = option.substr(8);
 			boost::algorithm::trim(mwefile);

--- a/libmwereader/mwereader.h
+++ b/libmwereader/mwereader.h
@@ -72,12 +72,27 @@ public:
 	static bool registered;
 protected:
+	/**
+	 * adds 'mwe' annotation channel
+	 */
+	void add_mwe_channel(
+			boost::shared_ptr<Wccl::SentenceContext> sentence,
+			int head, const std::set<int>& all, int annotation_number,
+			const std::string &new_base);
+	/**
+	 * use MWE annotations instead of merging MWE tokens into one token
+	 */
+	void use_annotations(bool val) {
+		annotate = val;
+	}
 	Sentence::Ptr process_sentence(Corpus2::Sentence::Ptr sentence);
 private:
 	void load_mwes(const std::string& filename);
-	Wccl::SentenceContext clone_sentence_add_mwe(Wccl::SentenceContext sentence,
+	boost::shared_ptr<Wccl::SentenceContext> clone_sentence_add_mwe(
+			boost::shared_ptr<Wccl::SentenceContext> sentence,
 			int head, const std::set<int>& all,
 			const std::string &new_base);
 	std::string get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
@@ -101,6 +116,8 @@ private:
 	boost::shared_ptr<Chunk> currentChunk;
 	/// quantity of loaded mwes files
 	size_t mwes_counter;
+	/// use annotations instead of merging the tokens
+	bool annotate;
 };
 } // ns Corpus2