#ifndef LIBMWEREADER_MWEREADER_H #define LIBMWEREADER_MWEREADER_H #include <libcorpus2/io/reader.h> #include "mwe.h" namespace Corpus2 { class MWEReader: public TokenReader { public: /** * \param filename corpus filename (MWE file is given in options) */ MWEReader(const Tagset& tagset, const std::string& filename); ~MWEReader(); /// retrieves whole sentence, finds MWEs, and return tokens Token* get_next_token(); /// the prefered mode for this reader Sentence::Ptr get_next_sentence(); /** * retrieves chunk with inner reader and then searches for MWEs within * sentences. */ boost::shared_ptr<Chunk> get_next_chunk(); /** * setting an "inner:..." option is equal to an immediate creation of an inner reader. * If a filename set in ctor is not valid, setting the "inner" option results in error. * **/ void set_option(const std::string& option); /** * Option inspector. Should echo the option if it is set, return * an empty string otheriwse, and "unknown" if the option is invalid. */ std::string get_option(const std::string& option) const; /** * Check if the reader is valid, should throw if not. Called after * all set_options during factory reader creation. */ virtual void validate(); static bool registered; protected: Sentence::Ptr process_sentence(Wccl::SentenceContext & sc); private: void load_mwes(const std::string& filename); MWEIndex mwe_index_; /// ptr to inner reader doing the real work of reading a corpus TokenReaderPtr inner_reader_; /// path for inner reader std::string inner_filename_; /// inner reader option size_t token_index; Sentence::Ptr currentSentence; boost::shared_ptr<Chunk> currentChunk; }; } // ns Corpus2 #endif // LIBMWEREADER_MWEREADER_H