// mwereader.h
#ifndef LIBMWEREADER_MWEREADER_H
#define LIBMWEREADER_MWEREADER_H

#include <libcorpus2/io/reader.h>

#include "mwe.h"

namespace Corpus2 {


class MWEReader: public TokenReader
{
public:
	/**
	  * \param filename corpus filename (MWE file is given in options)
	  */
Bartosz Broda's avatar
Bartosz Broda committed
	MWEReader(const Tagset& tagset, const std::string& filename);

	~MWEReader();

Bartosz Broda's avatar
Bartosz Broda committed
	/// retrieves whole sentence, finds MWEs, and return tokens
Bartosz Broda's avatar
Bartosz Broda committed
	Token* get_next_token();

Bartosz Broda's avatar
Bartosz Broda committed
	/// the prefered mode for this reader
Bartosz Broda's avatar
Bartosz Broda committed
	Sentence::Ptr get_next_sentence();

Bartosz Broda's avatar
Bartosz Broda committed
	/**
	  * retrieves chunk with inner reader and then searches for MWEs within
	  * sentences.
	  */
Bartosz Broda's avatar
Bartosz Broda committed
	boost::shared_ptr<Chunk> get_next_chunk();
omekr's avatar
omekr committed
	/**
	  * setting an "inner:..." option is equal to an immediate creation of an inner reader.
	  * If a filename set in ctor is not valid, setting the "inner" option results in error.
	  *
	 **/
Bartosz Broda's avatar
Bartosz Broda committed
	void set_option(const std::string& option);

	/**
	 * Option inspector. Should echo the option if it is set, return
	 * an empty string otheriwse, and "unknown" if the option is invalid.
	 */
	std::string get_option(const std::string& option) const;

	/**
	 * Check if the reader is valid, should throw if not. Called after
	 * all set_options during factory reader creation.
	 */
	virtual void validate();

	static bool registered;
Bartosz Broda's avatar
Bartosz Broda committed

protected:
	Sentence::Ptr process_sentence(Wccl::SentenceContext & sc);

Bartosz Broda's avatar
Bartosz Broda committed
	void load_mwes(const std::string& filename);

Bartosz Broda's avatar
Bartosz Broda committed
	MWEIndex mwe_index_;
	/// ptr to inner reader doing the real work of reading a corpus
	TokenReaderPtr inner_reader_;
	/// path for inner reader
	std::string inner_filename_;
	/// inner reader option
	size_t token_index;
	Sentence::Ptr currentSentence;
	boost::shared_ptr<Chunk> currentChunk;
Bartosz Broda's avatar
Bartosz Broda committed
};

} // ns Corpus2

#endif // LIBMWEREADER_MWEREADER_H