/* Copyright (C) 2010 Tomasz Ĺšniatowski, Adam Radziszewski Part of the libcorpus2 project This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE and COPYING files for more details. */ #ifndef LIBCORPUS2_IO_READER_H #define LIBCORPUS2_IO_READER_H #include <libcorpus2/chunk.h> #include <libcorpus2/tokensource.h> #include <deque> #include <loki/Factory.h> #include <loki/Singleton.h> namespace Corpus2 { /** * Base class for readers of Corpus2 tokens. Tokens are always read from a * source and always within a given tagset. * * Tokens can be read on a token-by-token basis, or in entire senteces, or * in entire chunks (paragraphs). Behavior is not defined if there are * mixed calls to differend forms of getting data. */ class TokenReader : public TokenSource { public: /// Constructor --- only a Tagset is needed explicit TokenReader(const Tagset& tagset); /** * Reader creation from a class identifier (possibly with comma-separated * parameters / options that are passed to set_option), with a tagset and * a path to a file or some other resource that the reader will open. * * Any files open will be closed by the reader. */ static boost::shared_ptr<TokenReader> create_path_reader( const std::string& class_id, const Tagset& tagset, const std::string& path); /** * Reader creation as in create_path_reader, only readng form a stream * that is managed by the caller (so e.g. std::cin can be used). Generally * all stream readers are path readers, but not all path readers are * stream readers (a path reader might look and open more that one file, * which is beyond what this interface allows). Attempting to create a * reader that can not read a stream will result in an exception. */ static boost::shared_ptr<TokenReader> create_stream_reader( const std::string& class_id, const Tagset& tagset, std::istream& stream); /// Destructor virtual ~TokenReader(); /** * Interface for getting a token from the reader. Note that the caller * must dispose of the Token it receives. A null value returned indicates * end of processing. * * There is no information about sentence boundaries in this mode. */ virtual Token* get_next_token() = 0; /** * Interface for getting entire senteces from the reader. * * There is no information about chunk boundaries in that mode. */ virtual Sentence::Ptr get_next_sentence() = 0; /** * Interface for getting entire chunks from the reader. */ virtual boost::shared_ptr<Chunk> get_next_chunk() = 0; /** * General option setter. */ virtual void set_option(const std::string& option); /** * Option inspector. Should echo the option if it is set, return * an empty string otheriwse, and "unknown" if the option is invalid. */ virtual std::string get_option(const std::string& option) const; /** * Tagset accesor */ const Tagset& tagset() const { return tagset_; } /** * Function to get a vector of available reader type strings. */ static std::vector<std::string> available_reader_types(); /** * Function to get the help string for a reader */ static std::string reader_help(const std::string& class_id); /** * Function to get a vector of available reader type strings with help * strings appended */ static std::vector<std::string> available_reader_types_help(); /** * Convenience template for registering TokenReader derived classes. */ template <typename T> static bool register_reader(const std::string& class_id, const std::string& help = ""); /** * Convenience template for registering TokenReader derived classes. * Path-only verison. */ template <typename T> static bool register_path_reader(const std::string& class_id, const std::string& help = ""); /// Convenience function to parse a tag string with options of this reader Tag parse_tag(const std::string& tag_string) const { return tagset().parse_simple_tag(tag_string, tag_parse_mode_); } /// tag parse mode getter Tagset::ParseMode tag_parse_mode() const { return tag_parse_mode_; } /// tag parse mode setter void set_tag_parse_mode(Tagset::ParseMode mode) { tag_parse_mode_ = mode; } boost::shared_ptr<Sentence> make_sentence() const; private: /// Tagset used by the Reader const Tagset& tagset_; /// Tag parse mode Tagset::ParseMode tag_parse_mode_; /// Flag to force creation of sentences as AnnotatedSentences bool use_annotated_sentences_; }; namespace detail { typedef Loki::Factory< TokenReader, // The base class for objects created in the factory std::string, // Identifier type Loki::TL::MakeTypelist< const Tagset& /*tagset*/, std::istream& /*input*/, const string_range_vector& /*params*/ >::Result > StreamTokenReaderFactoryType; typedef Loki::Factory< TokenReader, // The base class for objects created in the factory std::string, // Identifier type Loki::TL::MakeTypelist< const Tagset& /*tagset*/, const std::string& /*path*/, const string_range_vector& /*params*/ >::Result > PathTokenReaderFactoryType; struct TokenReaderFactory { StreamTokenReaderFactoryType stream_factory; PathTokenReaderFactoryType path_factory; std::map<std::string, std::string> help; }; /** * Declaration of the TokenWriter factory as a singleton Loki object * factory. The factory instance can be accessed as * TokenLayerFactory::Instance(). It is assumed that all derived classes * have the same constructor signature. */ typedef Loki::SingletonHolder< TokenReaderFactory, Loki::CreateUsingNew, // default, needed to change the item below Loki::LongevityLifetime::DieAsSmallObjectChild // per libloki docs > TokenReaderFactorySingleton; /** * Templated TokenReader creation function, stream variant */ template <typename T> inline T* stream_reader_creator(const Tagset& tagset, std::istream& is, const string_range_vector& params) { T* reader = new T(tagset, is); foreach (const string_range& sr, params) { reader->set_option(boost::copy_range<std::string>(sr)); } return reader; } /** * Templated TokenReader creation function, stream variant */ template <typename T> inline T* path_reader_creator(const Tagset& tagset, const std::string& path, const string_range_vector& params) { T* reader = new T(tagset, path); foreach (const string_range& sr, params) { reader->set_option(boost::copy_range<std::string>(sr)); } return reader; } /** * Convenience typedef for the exception type the factory throws */ typedef Loki::DefaultFactoryError< std::string, TokenReader >::Exception TokenReaderFactoryException; } /* end ns detail */ template <typename T> bool TokenReader::register_reader(const std::string& class_id, const std::string& help) { bool ret = detail::TokenReaderFactorySingleton::Instance().path_factory.Register( class_id, detail::path_reader_creator<T>); bool ret2 = detail::TokenReaderFactorySingleton::Instance().stream_factory.Register( class_id, detail::stream_reader_creator<T>); if (ret || ret2) { detail::TokenReaderFactorySingleton::Instance().help[class_id] = help; } return ret; } template <typename T> bool TokenReader::register_path_reader(const std::string& class_id, const std::string& help) { bool ret = detail::TokenReaderFactorySingleton::Instance().path_factory.Register( class_id, detail::path_reader_creator<T>); if (ret) { detail::TokenReaderFactorySingleton::Instance().help[class_id] = help; } return ret; } /** * Convenience class for readers that keep a buffer of chunks. Sentence * and token accessors are based upon the chunk buffer. * * A dervied class only neds to override ensure_more with a function that * fills the chunk buffer. */ class BufferedChunkReader : public TokenReader { public: BufferedChunkReader(const Tagset& tagset); ~BufferedChunkReader(); Token* get_next_token(); Sentence::Ptr get_next_sentence(); boost::shared_ptr<Chunk> get_next_chunk(); void set_option(const std::string& option) { TokenReader::set_option(option); } std::string get_option(const std::string& option) const { return TokenReader::get_option(option); } protected: virtual void ensure_more() = 0; std::deque< boost::shared_ptr<Chunk> > chunk_buf_; std::deque< Sentence::Ptr > sentence_buf_; std::deque<Token*> token_buf_; }; /** * Convenience class for readers that internally read sentences. The token * accessor is based on the sentence buffer, so is the chunk accessor. * * A dervied class only neds to override actual_next_sentence with a * function that returns a new sentence. * * Note that the chunk accessor might well read the entire input and return * one huge chunk with all the sentences. */ class BufferedSentenceReader : public TokenReader { public: BufferedSentenceReader(const Tagset& tagset); Token* get_next_token(); Sentence::Ptr get_next_sentence(); boost::shared_ptr<Chunk> get_next_chunk(); void set_option(const std::string& option) { TokenReader::set_option(option); } std::string get_option(const std::string& option) const { return TokenReader::get_option(option); } protected: virtual Sentence::Ptr actual_next_sentence() = 0; bool chunkify_; Sentence::Ptr sentence_buf_; std::deque<Token*> token_buf_; }; } /* end ns Corpus2 */ #endif // LIBCORPUS2_IO_READER_H