Skip to content
Snippets Groups Projects
mwereader.cpp 6.17 KiB
Newer Older
/*
    Copyright (C) 2011 Adam Wardyński, Tomasz Śniatowski, Paweł Kędzia,
    Adam Radziszewski, Bartosz Broda
    Part of the WCCL project

    This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.

    This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. 

    See the LICENSE and COPYING files for more details.
*/

Bartosz Broda's avatar
Bartosz Broda committed
#include "mwereader.h"
Bartosz Broda's avatar
Bartosz Broda committed
#include "mweparser.h"
#include <boost/algorithm/string.hpp>
Lukasz Bilenkij's avatar
Lukasz Bilenkij committed
#include <boost/filesystem.hpp>
Bartosz Broda's avatar
Bartosz Broda committed
namespace Corpus2{

// Static flag initialised with the result of registering MWEReader through
// TokenReader::register_path_reader under the name "mwereader".
// The second string lists the option names "inner" and "mwefile"
// (presumably shown as help text, going by the TODO below -- confirm against
// register_path_reader).
bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
Bartosz Broda's avatar
Bartosz Broda committed
	"mwereader","inner,mwefile"); // TODO more help?
Bartosz Broda's avatar
Bartosz Broda committed

	// Two-argument constructor: passes the tagset on to TokenReader, stores
	// `filename` in inner_filename_ and sets the count of loaded MWE files to 0.
	// No inner reader is assigned here; set_option("inner:...") creates one
	// from inner_filename_.
	MWEReader::MWEReader(const Tagset &tagset, const std::string &filename)
		: TokenReader(tagset), inner_filename_(filename)
Bartosz Broda's avatar
Bartosz Broda committed
	{
Lukasz Bilenkij's avatar
Lukasz Bilenkij committed
		mwes_counter=0;
        // Three-argument constructor: wraps an already constructed inner reader.
        //
        // tagset   - tagset passed on to the TokenReader base.
        // filename - stored in inner_filename_.
        // reader   - reader that get_next_sentence()/get_next_chunk() pull from.
        //
        // The token iteration state is set up here exactly as
        // set_option("inner:...") does it, because get_next_token() reads
        // token_index and currentSentence and this constructor is the only
        // other place where an inner reader gets assigned.
        MWEReader::MWEReader(const Tagset &tagset, const std::string &filename, TokenReaderPtr reader)
            : TokenReader(tagset), inner_filename_(filename)
        {
                mwes_counter=0;
                inner_reader_ = reader;
                token_index=0;
                currentSentence = boost::make_shared<Sentence>();
        }

Bartosz Broda's avatar
Bartosz Broda committed
	// Destructor; currently performs no explicit clean-up.
	MWEReader::~MWEReader()
	{
		// TODO implementation
	}

	// Returns the next token of the MWE-processed stream, or NULL when the
	// inner reader has no more sentences.
	//
	// Tokens are handed out one by one from currentSentence; token_index is
	// the position of the next token to return.  When the current sentence is
	// missing or exhausted, the next one is fetched with get_next_sentence().
	// Sentences that contain no tokens are skipped, and a NULL sentence ends
	// the iteration without being dereferenced, so the call may be repeated
	// safely after the end of input has been reached.
	//
	// NOTE(review): the returned pointer is taken straight from
	// currentSentence->tokens() and is not cloned -- confirm who is expected
	// to delete it.
	Token* MWEReader::get_next_token()
	{
		while(!currentSentence || !(token_index < currentSentence->tokens().size()))
		{
			currentSentence = get_next_sentence();
			token_index = 0;
			if(!currentSentence)
			{
				return NULL;
			}
		}
		// Index through the reference returned by tokens() instead of
		// copying the whole vector on every call.
		return currentSentence->tokens().at(token_index++);
	}

	// Reads the next sentence from inner_reader_ into currentSentence.
	// A null sentence (compared against 0 below) is returned unchanged;
	// any other sentence is returned after passing through process_sentence().
	Sentence::Ptr MWEReader::get_next_sentence()
	{
		currentSentence = inner_reader_->get_next_sentence();
		if(currentSentence==0)
			return currentSentence;
		return process_sentence(currentSentence);
	// Looks for a multi-word expression in `sentence`.
	// For each position i of the sentence context `sc`, the lemma of every
	// disambiguated lexeme of the token at i is looked up in mwe_index_, and
	// each candidate lexical unit is tested with IsHere().  On the first
	// match the sentence is rebuilt with clone_sentence_add_mwe() and the
	// rebuilt sentence is passed back into process_sentence().
	// NOTE(review): the declaration of `sc`, the condition guarding the
	// `continue`, and the return taken when nothing matches are not visible
	// in this part of the file -- verify against the full source.
	Sentence::Ptr MWEReader::process_sentence(Corpus2::Sentence::Ptr sentence)
Bartosz Broda's avatar
Bartosz Broda committed
		for(int i = 0; i < sc.size() ;++i){
			sc.set_position(i);
			Corpus2::Token *pToken = sc.at(i);
			std::vector<Lexeme>& lexemes = pToken->lexemes();
Bartosz Broda's avatar
Bartosz Broda committed
				continue;
			foreach(const Lexeme& lex, lexemes){
				if(lex.is_disamb()){
					std::string base = lex.lemma_utf8();
					const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
					foreach(LexicalUnit::Ptr pLU, potential){
						std::set<int> positions;
						int head;
						bool is_here = pLU->IsHere(sc, positions, head);
						if(is_here)
							return  process_sentence(clone_sentence_add_mwe(sentence, head, positions, pLU->get_base()));
	// Produces a new sentence in which the tokens at the positions listed in
	// `all` are replaced by a single token located at position `head`.
	//
	// sentence - source sentence; its tokens are cloned, never modified.
	// head     - index of the token that stays and represents the expression.
	// all      - indices of every token belonging to the expression.
	// new_base - lemma written into each disambiguated lexeme of the head.
	//
	// The head clone gets the orth returned by get_new_orth_utf8(); tokens
	// listed in `all` other than the head are left out; every remaining
	// token is cloned as is.
	Sentence::Ptr  MWEReader::clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence,
										  int head, const std::set<int>& all,
										  const std::string &new_base)
	{
		std::string merged_orth = get_new_orth_utf8(sentence, all);
		Sentence::Ptr result = boost::make_shared<Sentence>();
		std::vector<Token*> &source_tokens = sentence->tokens();
		const int token_count = static_cast<int>(source_tokens.size());

		for(int idx = 0; idx < token_count; ++idx){
			if(idx != head){
				// Members of the expression other than the head are skipped.
				if(all.count(idx) == 0)
					result->append(source_tokens[idx]->clone());
				continue;
			}

			Corpus2::Token * merged = source_tokens[idx]->clone();
			merged->set_orth_utf8(merged_orth);
			foreach(Lexeme& lexeme, merged->lexemes()){
				if(lexeme.is_disamb())
					lexeme.set_lemma_utf8(new_base);
			}
			result->append(merged);
		}
		return result;
	}
	// Builds the orth of a merged token: the orths of the tokens at the
	// positions in `all`, in ascending position order (the iteration order of
	// std::set), joined with single spaces.
	//
	// sentence - sentence whose tokens are indexed by the positions in `all`.
	// all      - token positions to join.
	//
	// Returns the joined string; an empty `all` gives an empty string.
	std::string MWEReader::get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
								  const std::set<int>& all)
	{
		std::string new_orth;
		std::vector<Token*> &tokens = sentence->tokens();
		foreach(const int &pos, all){
			Token* tok = tokens [pos];
			new_orth += tok->orth_utf8();
			new_orth += " ";
		}
		// Drop the trailing separator.  The guard matters for an empty set:
		// size()-1 would wrap around and erase() would throw std::out_of_range.
		if(!new_orth.empty())
			new_orth.erase(new_orth.size()-1, 1);

		return new_orth;
	}

	// Reads the next chunk from the inner reader and returns a newly created
	// chunk holding the result of process_sentence() for each of its
	// sentences, in the same order.  When the inner reader yields a null
	// chunk, that null pointer is returned as is.
	boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
	{
		currentChunk = inner_reader_->get_next_chunk();
		if(!currentChunk)
			return currentChunk;

		boost::shared_ptr<Chunk> processed = boost::make_shared<Chunk>();
		foreach(Corpus2::Sentence::Ptr inner_sentence, currentChunk->sentences()){
			processed->append(process_sentence(inner_sentence));
		}
		return processed;
	}

	// Applies one option string of the form "name:value".
	//  - "inner:<class>"       creates inner_reader_ with create_path_reader()
	//                          for inner_filename_, then resets token_index
	//                          and currentSentence.
	//  - "mwefile:<path>"      trims the path and loads it with load_mwes().
	//  - "mwefile-list:<list>" splits the value on single spaces and loads
	//                          every piece with load_mwes().
	// A path for which boost::filesystem::exists() is false raises
	// std::runtime_error.
	// NOTE(review): inside the mwefile-list loop the exception text is built
	// from `mwefile` (the whole remaining list), not from `file`, the path
	// that actually failed the check.
	// NOTE(review): unlike "mwefile:", entries of the list are not trimmed.
	void MWEReader::set_option(const std::string& option)
	{
Bartosz Broda's avatar
Bartosz Broda committed
		if(boost::algorithm::starts_with(option, "inner:")) {
			std::string inner = option.substr(6);
			inner_reader_ = create_path_reader(inner, this->tagset(),
Bartosz Broda's avatar
Bartosz Broda committed
											inner_filename_);
			token_index=0;
			currentSentence= boost::make_shared<Sentence>();
Bartosz Broda's avatar
Bartosz Broda committed
		}
		if(boost::algorithm::starts_with(option, "mwefile:")) {
			std::string mwefile = option.substr(8);
			boost::algorithm::trim(mwefile);
			if(boost::filesystem::exists(mwefile)){
Lukasz Bilenkij's avatar
Lukasz Bilenkij committed
				load_mwes(mwefile);
Lukasz Bilenkij's avatar
Lukasz Bilenkij committed
			else
				throw std::runtime_error("File "+ mwefile + " does not exists");
		}
		if(boost::algorithm::starts_with(option, "mwefile-list:")) {
			std::string mwefile = option.substr(13);
			size_t found=mwefile.find(" ",0);
			while(found!=std::string::npos)
			{
				std::string file = mwefile.substr(0,found);
				if(boost::filesystem::exists(file))
					load_mwes(file);
				else
					throw std::runtime_error("File "+ mwefile +" does not exists");
				mwefile=mwefile.substr(found+1);
				found=mwefile.find(" ",0);
			}
			if(boost::filesystem::exists(mwefile))
				load_mwes(mwefile);
			else
				throw std::runtime_error("File "+ mwefile +" does not exists");
Bartosz Broda's avatar
Bartosz Broda committed

Bartosz Broda's avatar
Bartosz Broda committed
		// TODO more MWE stuff
Bartosz Broda's avatar
Bartosz Broda committed
	}

	void MWEReader::validate()
	{
		if(inner_reader_ == NULL)
			throw Corpus2Error("Inner reader not initialised.");
Lukasz Bilenkij's avatar
Lukasz Bilenkij committed
		if(mwes_counter==0)
			throw Corpus2Error("MWE files were not loaded");
		// TODO MWE stuff
Bartosz Broda's avatar
Bartosz Broda committed
	}

	// Answers an option query.  A query starting with "inner:" is returned
	// unchanged when an inner reader is set; every other query is forwarded
	// to inner_reader_->get_option().
	// NOTE(review): the forwarding call is also reached when inner_reader_
	// is NULL, since nothing guards it -- confirm callers always configure the
	// inner reader first.
	std::string MWEReader::get_option(const std::string& option) const
	{
		if(boost::algorithm::starts_with(option, "inner:")
			&& inner_reader_ != NULL)
			return option;
		// TODO options for MWE
		return inner_reader_->get_option(option);
Bartosz Broda's avatar
Bartosz Broda committed
	// Parses the MWE definition file `filename` with an MWEParser constructed
	// on mwe_index_ (presumably filling that index -- confirm in MWEParser)
	// and increments mwes_counter.
	// Throws std::runtime_error when the name of the tagset reported by the
	// parser differs from the name of the reader tagset.
	// NOTE(review): parse_file() has already run and the counter has already
	// been incremented when the tagset comparison is made.
	void MWEReader::load_mwes(const std::string &filename)
	{
Bartosz Broda's avatar
Bartosz Broda committed
		MWEParser parser(mwe_index_);
Bartosz Broda's avatar
Bartosz Broda committed
		parser.parse_file(filename);
Lukasz Bilenkij's avatar
Lukasz Bilenkij committed
		mwes_counter++;
Bartosz Broda's avatar
Bartosz Broda committed
		if(parser.get_tagset().name() != tagset().name())
			throw std::runtime_error( "Tagset in mwe file does not match reader tagset!" );
Bartosz Broda's avatar
Bartosz Broda committed

}// ns Corpus2