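// mwereader.cpp (5.46 KiB)
// NOTE: this listing appears to have been copied from a repository web viewer.
// The interleaved "<name>'s avatar" / "<name> committed" lines are blame
// annotations from that page (first entry: Bartosz Broda), not C++ source.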
#include "mwereader.h"
Bartosz Broda's avatar
Bartosz Broda committed
#include "mweparser.h"
#include <boost/algorithm/string.hpp>
Lukasz Bilenkij's avatar
Lukasz Bilenkij committed
#include <boost/filesystem.hpp>
Bartosz Broda's avatar
Bartosz Broda committed
namespace Corpus2{

// Static registration of MWEReader as a path reader under the name
// "mwereader". The second argument is presumably the help text listing the
// accepted options -- TODO confirm against register_path_reader.
bool MWEReader::registered =
	TokenReader::register_path_reader<MWEReader>(
		"mwereader",
		"inner,mwefile"); // TODO more help?
Bartosz Broda's avatar
Bartosz Broda committed

	/**
	 * Constructs a reader for the given tagset.
	 *
	 * @param tagset   tagset passed on to the TokenReader base class
	 * @param filename stored as inner_filename_; it is handed to the inner
	 *                 reader when the "inner:" option is set
	 *
	 * The inner reader itself is not created here (see set_option), so the
	 * object is not usable for reading until it has been configured.
	 */
	MWEReader::MWEReader(const Tagset &tagset, const std::string &filename)
		: TokenReader(tagset), inner_filename_(filename)
	{
		mwes_counter=0;
		// Start from a defined read position; previously token_index stayed
		// uninitialised until the "inner:" option was processed.
		token_index=0;
	}

	/**
	 * Destructor. The body is currently empty: no explicit cleanup is
	 * performed here.
	 */
	MWEReader::~MWEReader()
	{
		// TODO implementation
	}

	/**
	 * Returns the next token of the processed stream, or NULL when the inner
	 * reader has no more sentences.
	 *
	 * Tokens are served one by one from currentSentence; when it is exhausted
	 * (or empty, or not set) the next sentence is fetched through
	 * get_next_sentence(), which also resets token_index. Empty sentences are
	 * skipped instead of being indexed.
	 *
	 * NOTE(review): the returned pointer refers to a token that is still
	 * stored in currentSentence -- confirm the intended ownership with the
	 * TokenReader contract.
	 */
	Token* MWEReader::get_next_token()
	{
		// Loop rather than fetch once: the freshly read sentence may itself be
		// empty, and the end of input leaves currentSentence null.
		while(!currentSentence
			|| token_index >= currentSentence->tokens().size())
		{
			currentSentence = get_next_sentence();
			if(!currentSentence)
				return NULL;
		}

		// Access by reference; the token vector is not copied per call.
		std::vector<Token*>& tokens = currentSentence->tokens();
		return tokens[token_index++];
	}

	/**
	 * Reads the next sentence from the inner reader, stores it in
	 * currentSentence and runs process_sentence() on it.
	 *
	 * @return the result of process_sentence(), or the null pointer obtained
	 *         from the inner reader when it has no more sentences
	 */
	Sentence::Ptr MWEReader::get_next_sentence()
	{
		currentSentence = inner_reader_->get_next_sentence();
		if(!currentSentence) {
			// Nothing left to read: hand the null pointer back unchanged.
			return currentSentence;
		}

		Wccl::SentenceContext context(currentSentence);
		token_index = 0; // token-wise reading restarts at the new sentence
		return process_sentence(context);
	}

	/**
	 * Scans the sentence wrapped by sc for lexical units from mwe_index_
	 * (presumably multi-word expressions, judging by the class name) and
	 * rewrites each match in place.
	 *
	 * For every token position the context is moved there; for each
	 * disambiguated lexeme of that token the candidate lexical units are
	 * looked up by lemma, and each candidate is tested with IsHere(). On a
	 * match the orths of the matched positions are joined with spaces, the
	 * non-head tokens are deleted, and the head token receives the joined
	 * orth plus the unit's base form as the lemma of its disambiguated
	 * lexemes.
	 *
	 * @param sc sentence context; the sentence it points to is modified
	 * @return sc.get_sentence_ptr()
	 *
	 * NOTE(review): this listing looks truncated. The braces opened by the
	 * loop over positions and by `if(pos != head)` are never closed before
	 * the orth is trimmed, and several closing braces are missing before the
	 * final return. The lines between statements that are not C++ are
	 * residue of the web page the listing was copied from.
	 */
	Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc)
	{
Bartosz Broda's avatar
Bartosz Broda committed
		// NOTE(review): sssize is only ever written in the visible code,
		// never read.
		size_t sssize = sc.size();
		for(int i = 0; i < sc.size() ;++i){
			sc.set_position(i);
			Corpus2::Token *pToken = sc.at(i);
Bartosz Broda's avatar
Bartosz Broda committed
			//std::cout << "AAAAAAAAAAA" << i << " ---- " << sc.size()  << std::endl;
			//if(!pToken)
				//continue;
			//std::cout << pToken->orth_utf8() << " ";
			std::vector<Lexeme>& lexemes = pToken->lexemes();
Bartosz Broda's avatar
Bartosz Broda committed
			// Tokens without lexemes cannot start a match; skip them.
			if(!lexemes.size()){
				sssize = sc.size();
				continue;
			}
			foreach(const Lexeme& lex, lexemes){
				// Only disambiguated lexemes are used as lookup keys.
				if(lex.is_disamb()){
					std::string base = lex.lemma_utf8();
					const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
Bartosz Broda's avatar
Bartosz Broda committed
					//std::cout << "potential " << potential.size() << std::endl;
					foreach(LexicalUnit::Ptr pLU, potential){
						// Filled in by IsHere(): matched token positions and
						// the position of the head token.
						std::set<int> positions;
						int head;
						//std::cout << " is " << std::endl;
						bool is_here = pLU->IsHere(sc, positions, head);
						//std::cout << " is out" << std::endl;
						if(is_here){
							std::string new_orth_utf8;
							Corpus2::Sentence::Ptr sent = sc.get_sentence_ptr();

							std::vector<Token*> &tokens = sent->tokens();
Bartosz Broda's avatar
Bartosz Broda committed
							int orig_i = i;
							// Build the joined orth and drop every matched
							// token except the head.
							foreach(const int &pos, positions){
								Token* tok = tokens [pos];
								new_orth_utf8 += tok->orth_utf8() + " ";
								if(pos != head){
									// Slot is nulled here and erased from
									// the vector further down.
									delete tok;
									tokens[pos] = NULL;
Bartosz Broda's avatar
Bartosz Broda committed
									//std::cout << "BBBB " << pos  << " " << i << std::endl;

									// Leftover debug output: a removed token
									// lies before the current position.
									if(orig_i > pos)
									{
Bartosz Broda's avatar
Bartosz Broda committed
										std::cout << "\nTUTUXXXXXX\n";
									}
									//std::cout << "XBBBB " << pos  << " " << i << std::endl;
omekr's avatar
omekr committed
							// Remove the trailing space added by the loop above.
							new_orth_utf8.erase(new_orth_utf8.size()-1, 1);
							Corpus2::Token *tok = (*sent)[head];
							tok->set_orth_utf8(new_orth_utf8);
ilor's avatar
ilor committed
							// The head token takes the unit's base form as lemma.
							foreach(Lexeme& lex, tok->lexemes()){
omekr's avatar
omekr committed
								if(lex.is_disamb()){
									lex.set_lemma_utf8(pLU->get_base());
omekr's avatar
omekr committed
								}
							}
ilor's avatar
ilor committed
							// Compact the vector: erase the slots nulled above.
							tokens.erase(std::remove(tokens.begin(), tokens.end(), (Token*)NULL), tokens.end());
Bartosz Broda's avatar
Bartosz Broda committed
			sssize = sc.size();

		return sc.get_sentence_ptr();
Bartosz Broda's avatar
Bartosz Broda committed
	}

	/**
	 * Reads the next chunk from the inner reader and runs process_sentence()
	 * on each of its sentences.
	 *
	 * The first sentence of the chunk becomes currentSentence and token_index
	 * is reset to 0.
	 *
	 * @return the chunk obtained from the inner reader; a null pointer is
	 *         passed through untouched
	 */
	boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
	{
		typedef std::vector< boost::shared_ptr<Corpus2::Sentence> > SentenceVec;

		currentChunk = inner_reader_->get_next_chunk();
		if(currentChunk == NULL) {
			return currentChunk;
		}

		// Copy of the pointer vector; the sentences themselves are shared, so
		// processing through the copy modifies the chunk's sentences.
		SentenceVec sentences = currentChunk->sentences();
		for(SentenceVec::size_type idx = 0; idx < sentences.size(); ++idx)
		{
			if(idx == 0) {
				currentSentence = sentences[idx];
			}
			Wccl::SentenceContext context(sentences[idx]);
			process_sentence(context);
		}

		token_index = 0;
		return currentChunk;
	}

	void MWEReader::set_option(const std::string& option)
	{
Bartosz Broda's avatar
Bartosz Broda committed
		if(boost::algorithm::starts_with(option, "inner:")) {
			std::string inner = option.substr(6);
			inner_reader_ = create_path_reader(inner, this->tagset(),
Bartosz Broda's avatar
Bartosz Broda committed
											inner_filename_);
			token_index=0;
			currentSentence= boost::make_shared<Sentence>();
Bartosz Broda's avatar
Bartosz Broda committed
		}
		if(boost::algorithm::starts_with(option, "mwefile:")) {
Bartosz Broda's avatar
Bartosz Broda committed
			std::string mwefile = option.substr(8);
Lukasz Bilenkij's avatar
Lukasz Bilenkij committed
			if(boost::filesystem::exists(mwefile))
				load_mwes(mwefile);
			else
				throw std::runtime_error("File "+ mwefile + " does not exists");
		}
		if(boost::algorithm::starts_with(option, "mwefile-list:")) {
			std::string mwefile = option.substr(13);
			size_t found=mwefile.find(" ",0);
			while(found!=std::string::npos)
			{
				std::string file = mwefile.substr(0,found);
				if(boost::filesystem::exists(file))
					load_mwes(file);
				else
					throw std::runtime_error("File "+ mwefile +" does not exists");
				mwefile=mwefile.substr(found+1);
				found=mwefile.find(" ",0);
			}
			if(boost::filesystem::exists(mwefile))
				load_mwes(mwefile);
			else
				throw std::runtime_error("File "+ mwefile +" does not exists");
Bartosz Broda's avatar
Bartosz Broda committed

Bartosz Broda's avatar
Bartosz Broda committed
		// TODO more MWE stuff
Bartosz Broda's avatar
Bartosz Broda committed
	}

	void MWEReader::validate()
	{
		if(inner_reader_ == NULL)
			throw Corpus2Error("Inner reader not initialised.");
Lukasz Bilenkij's avatar
Lukasz Bilenkij committed
		if(mwes_counter==0)
			throw Corpus2Error("MWE files were not loaded");
		// TODO MWE stuff
Bartosz Broda's avatar
Bartosz Broda committed
	}

	/**
	 * Returns the value of an option.
	 *
	 * An "inner:" option is echoed back as given; every other option is
	 * delegated to the inner reader.
	 *
	 * @param option option string to query
	 * @return the option value, or an empty string when no inner reader has
	 *         been created yet (previously this case dereferenced a null
	 *         inner reader)
	 */
	std::string MWEReader::get_option(const std::string& option) const
	{
		// Nothing can be answered or delegated without an inner reader.
		if(inner_reader_ == NULL)
			return std::string();
		if(boost::algorithm::starts_with(option, "inner:"))
			return option;
		// TODO options for MWE
		return inner_reader_->get_option(option);
	}

	void MWEReader::load_mwes(const std::string &filename)
	{
Bartosz Broda's avatar
Bartosz Broda committed
		MWEParser parser(mwe_index_);
Bartosz Broda's avatar
Bartosz Broda committed
		parser.parse_file(filename);
Lukasz Bilenkij's avatar
Lukasz Bilenkij committed
		mwes_counter++;
Bartosz Broda's avatar
Bartosz Broda committed

}// ns Corpus2