// mwereader.h
/*
    Copyright (C) 2011 Adam Wardyński, Tomasz Śniatowski, Paweł Kędzia,
    Adam Radziszewski, Bartosz Broda
    Part of the WCCL project

    This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.

    This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. 

    See the LICENSE and COPYING files for more details.
*/

#ifndef LIBMWEREADER_MWEREADER_H
#define LIBMWEREADER_MWEREADER_H

#include <libcorpus2/io/reader.h>

#include "mwe.h"

namespace Corpus2 {


class MWEReader: public TokenReader
{
public:
	/**
	  * \param filename corpus filename (MWE file is given in options)
	  */
Bartosz Broda's avatar
Bartosz Broda committed
	MWEReader(const Tagset& tagset, const std::string& filename);
        MWEReader(const Tagset &tagset, const std::string &filename, TokenReaderPtr reader);
Bartosz Broda's avatar
Bartosz Broda committed

	~MWEReader();

Bartosz Broda's avatar
Bartosz Broda committed
	/// retrieves whole sentence, finds MWEs, and return tokens
Bartosz Broda's avatar
Bartosz Broda committed
	Token* get_next_token();

Bartosz Broda's avatar
Bartosz Broda committed
	/// the prefered mode for this reader
Bartosz Broda's avatar
Bartosz Broda committed
	Sentence::Ptr get_next_sentence();

Bartosz Broda's avatar
Bartosz Broda committed
	/**
	  * retrieves chunk with inner reader and then searches for MWEs within
	  * sentences.
	  */
Bartosz Broda's avatar
Bartosz Broda committed
	boost::shared_ptr<Chunk> get_next_chunk();
omekr's avatar
omekr committed
	/**
	  * setting an "inner:..." option is equal to an immediate creation of an inner reader.
	  * If a filename set in ctor is not valid, setting the "inner" option results in error.
	  *
	 **/
Bartosz Broda's avatar
Bartosz Broda committed
	void set_option(const std::string& option);

	/**
	 * Option inspector. Should echo the option if it is set, return
	 * an empty string otheriwse, and "unknown" if the option is invalid.
	 */
	std::string get_option(const std::string& option) const;

	/**
	 * Check if the reader is valid, should throw if not. Called after
	 * all set_options during factory reader creation.
	 */
	virtual void validate();

	static bool registered;
Bartosz Broda's avatar
Bartosz Broda committed

	Sentence::Ptr process_sentence(Corpus2::Sentence::Ptr sentence);
Bartosz Broda's avatar
Bartosz Broda committed
	void load_mwes(const std::string& filename);

	Sentence::Ptr  clone_sentence_add_mwe(Corpus2::Sentence::Ptr sentence,
										  int head, const std::set<int>& all,
										  const std::string &new_base);
	std::string get_new_orth_utf8(Corpus2::Sentence::Ptr sentence,
								  const std::set<int>& all);

Bartosz Broda's avatar
Bartosz Broda committed
	MWEIndex mwe_index_;
	/// ptr to inner reader doing the real work of reading a corpus
	TokenReaderPtr inner_reader_;
	/// path for inner reader
	std::string inner_filename_;
	/// inner reader option
Lukasz Bilenkij's avatar
Lukasz Bilenkij committed
	/// contains last processed sentence
	Sentence::Ptr currentSentence;
Lukasz Bilenkij's avatar
Lukasz Bilenkij committed
	/// contains last processed chunk
	boost::shared_ptr<Chunk> currentChunk;
Lukasz Bilenkij's avatar
Lukasz Bilenkij committed
	/// quantity of loaded mwes files
	size_t mwes_counter;
Bartosz Broda's avatar
Bartosz Broda committed
};

} // ns Corpus2

#endif // LIBMWEREADER_MWEREADER_H