Skip to content
Snippets Groups Projects
mwereader.cpp 2.59 KiB
Newer Older
Bartosz Broda's avatar
Bartosz Broda committed
#include "mwereader.h"

#include <iostream>
#include <set>
#include <string>
#include <vector>

#include <boost/algorithm/string.hpp>

#include "mweparser.h"

namespace Corpus2{

// Static registration of MWEReader as a path reader under the class
// identifier "mwereader". The second argument names the options this reader
// handles in set_option ("inner" and "mwefile"); presumably it is shown as
// help text -- confirm against TokenReader::register_path_reader.
bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
	"mwereader", "inner,mwefile"); // TODO more help?

	// Constructs a reader for the given tagset. The file name is only stored
	// here; it is handed to the inner reader when set_option() receives an
	// "inner:<reader>" option and creates that reader.
	MWEReader::MWEReader(const Tagset &tagset, const std::string &filename)
		: TokenReader(tagset), inner_filename_(filename)
	{
	}

	// Destructor. Performs no explicit cleanup at the moment.
	MWEReader::~MWEReader()
	{
		// TODO implementation
	}

	// Returns the next token exactly as delivered by the inner reader; no MWE
	// processing is applied on this path yet.
	// inner_reader_ is dereferenced unchecked, so it must have been created
	// via set_option("inner:...") beforehand (validate() checks this).
	Token* MWEReader::get_next_token()
	{
		// TODO MWE stuff
		// get whole sentence -> process it -> return token by token
		return inner_reader_->get_next_token();
	}

	// Reads the next sentence from the inner reader and scans it for
	// multi-word expressions. For every disambiguated lexeme of every token
	// the MWE index is asked for candidate lexical units keyed by the lemma,
	// and each candidate is checked against the sentence at the current
	// position. Matches are currently only traced to std::cout; the sentence
	// itself is returned unmodified.
	//
	// Returns the sentence obtained from the inner reader. An empty pointer
	// is passed through untouched.
	Sentence::Ptr MWEReader::get_next_sentence()
	{
		// TODO MWE stuff
		Sentence::Ptr pSentence = inner_reader_->get_next_sentence();

		// An empty pointer must not reach SentenceContext or be indexed below.
		// NOTE(review): presumably inner readers return an empty pointer at
		// end of input -- confirm against the TokenReader contract.
		if(!pSentence)
			return pSentence;

		Wccl::SentenceContext sc(pSentence);

		for(int i = 0; i < sc.size(); ++i){
			// IsHere() below evaluates relative to the context's position.
			sc.set_position(i);
			Corpus2::Token *pToken = (*pSentence)[i];
			std::cout << pToken->orth_utf8() << " ";
			const std::vector<Lexeme>& lexemes = pToken->lexemes();
			foreach(const Lexeme& lex, lexemes){
				// Only disambiguated lexemes are used as lookup keys.
				if(!lex.is_disamb())
					continue;

				std::string base = lex.lemma_utf8();
				const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
				if(potential.size())
					std::cout << "# ";
				foreach(LexicalUnit::Ptr pLU, potential){
					// Out-parameters of IsHere(); not used further yet.
					std::set<size_t> positions;
					int head = 0;
					bool is_here = pLU->IsHere(sc, positions, head);
					if(is_here)
						std::cout << "** " << pLU->get_base() << "** ";
				}
			}
		}

		std::cout << "ENDL\n";
		return pSentence;
	}

	// Returns the next chunk exactly as delivered by the inner reader; the
	// sentences inside it are not MWE-processed yet.
	// inner_reader_ is dereferenced unchecked, so it must have been created
	// via set_option("inner:...") beforehand (validate() checks this).
	boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
	{
		// TODO MWE stuff
		// get whole chunk -> process sentences -> return processed chunk
		return inner_reader_->get_next_chunk();
	}

	void MWEReader::set_option(const std::string& option)
	{
Bartosz Broda's avatar
Bartosz Broda committed
		if(boost::algorithm::starts_with(option, "inner:")) {
			std::string inner = option.substr(6);
			inner_reader_ = create_path_reader(inner, this->tagset(),
Bartosz Broda's avatar
Bartosz Broda committed
											inner_filename_);
		}
		if(boost::algorithm::starts_with(option, "mwefile:")) {
			std::string mwefile = option.substr(8);
			load_mwes(mwefile);
Bartosz Broda's avatar
Bartosz Broda committed

Bartosz Broda's avatar
Bartosz Broda committed
		// TODO more MWE stuff
Bartosz Broda's avatar
Bartosz Broda committed
	}

	// Checks that the reader is usable: an inner reader must have been
	// created (see the "inner:" option in set_option).
	// Throws Corpus2Error when no inner reader is set.
	void MWEReader::validate()
	{
		if(inner_reader_ == NULL)
			throw Corpus2Error("Inner reader not initialised.");
		// TODO MWE stuff
	}

	// Queries an option.
	//   - With no inner reader set, returns an empty string instead of
	//     dereferencing a null reader.
	//     NOTE(review): empty string is assumed to mean "option not set" --
	//     confirm against the TokenReader::get_option convention.
	//   - An "inner:..." option is echoed back unchanged once an inner reader
	//     exists.
	//   - Anything else is forwarded to the inner reader.
	std::string MWEReader::get_option(const std::string& option) const
	{
		if(inner_reader_ == NULL)
			return std::string();

		if(boost::algorithm::starts_with(option, "inner:"))
			return option;

		// TODO options for MWE
		return inner_reader_->get_option(option);
	}

	void MWEReader::load_mwes(const std::string &filename)
	{
Bartosz Broda's avatar
Bartosz Broda committed
		MWEParser parser(mwe_index_);
Bartosz Broda's avatar
Bartosz Broda committed
		parser.parse_file(filename);
Bartosz Broda's avatar
Bartosz Broda committed

}// ns Corpus2