From 1b0a59daecc4523b2035cf3d45ac347fba76b855 Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Mon, 11 Apr 2011 14:47:17 +0200 Subject: [PATCH] Reader factory. There is some code duplication with the writer factory as well as among the individual Readers, but overall it is now easier to support more input formats. --- libcorpus2/io/cclreader.cpp | 17 +++ libcorpus2/io/cclreader.h | 4 + libcorpus2/io/fastxces.cpp | 8 ++ libcorpus2/io/fastxces.h | 4 + libcorpus2/io/reader.cpp | 79 ++++++++++++++ libcorpus2/io/reader.h | 201 ++++++++++++++++++++++++++++++++++- libcorpus2/io/rft.cpp | 41 ++++++- libcorpus2/io/rft.h | 18 +++- libcorpus2/io/xcesreader.cpp | 21 ++++ libcorpus2/io/xcesreader.h | 4 + libcorpus2/io/xmlreader.h | 25 ++--- tests/io.cpp | 21 ++++ 12 files changed, 422 insertions(+), 21 deletions(-) diff --git a/libcorpus2/io/cclreader.cpp b/libcorpus2/io/cclreader.cpp index d0d5d59..46edaa9 100644 --- a/libcorpus2/io/cclreader.cpp +++ b/libcorpus2/io/cclreader.cpp @@ -26,6 +26,9 @@ or FITNESS FOR A PARTICULAR PURPOSE. namespace Corpus2 { +bool CclReader::registered = TokenReader::register_reader<CclReader>("ccl", + "disamb_only,loose,strict,no_warn_inconsistent"); + class CclReaderImpl : public XmlReader { public: @@ -214,7 +217,21 @@ void CclReader::set_option(const std::string& option) impl_->set_loose_tag_parsing(false); } else if (option == "no_warn_inconsistent") { impl_->set_warn_on_inconsistent(false); + } else if (option == "disamb_only") { + impl_->set_disamb_only(true); + } +} + +std::string CclReader::get_option(const std::string& option) +{ /* Echo the option name when it is in effect, "" when it is not; any other name goes to the base class. */ + if (option == "disamb_only") { + return impl_->get_disamb_only() ? option : ""; + } else if (option == "loose" || option == "strict") { /* two names for the single loose-tag-parsing flag */ + return impl_->get_loose_tag_parsing() == (option == "loose") ? option : ""; + } else if (option == "no_warn_inconsistent") { /* set_option accepts it, so it must be queryable too */ + return !impl_->get_warn_on_inconsistent() ?
option : ""; } + return BufferedChunkReader::get_option(option); } } /* end ns Corpus2 */ diff --git a/libcorpus2/io/cclreader.h b/libcorpus2/io/cclreader.h index 7fe1f98..8909c4d 100644 --- a/libcorpus2/io/cclreader.h +++ b/libcorpus2/io/cclreader.h @@ -44,6 +44,10 @@ public: void set_option(const std::string& option); + std::string get_option(const std::string& option); + + static bool registered; + protected: void ensure_more(); diff --git a/libcorpus2/io/fastxces.cpp b/libcorpus2/io/fastxces.cpp index f809c7b..2fa1105 100644 --- a/libcorpus2/io/fastxces.cpp +++ b/libcorpus2/io/fastxces.cpp @@ -5,6 +5,9 @@ namespace Corpus2 { +bool FastXcesReader::registered = TokenReader::register_reader<FastXcesReader>("xces-fast", + ""); + class FastXcesReaderImpl { public: @@ -161,6 +164,11 @@ void FastXcesReader::set_option(const std::string& /*option*/) { } +std::string FastXcesReader::get_option(const std::string& option) +{ + return BufferedChunkReader::get_option(option); +} + FastXcesReader::FastXcesReader(const Tagset &tagset, const std::string &filename) : BufferedChunkReader(tagset), impl_(new FastXcesReaderImpl(tagset, chunk_buf_)) diff --git a/libcorpus2/io/reader.cpp b/libcorpus2/io/reader.cpp index d0f1223..8bd39a0 100644 --- a/libcorpus2/io/reader.cpp +++ b/libcorpus2/io/reader.cpp @@ -16,6 +16,8 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include <libcorpus2/io/reader.h> #include <boost/make_shared.hpp> +#include <boost/algorithm/string.hpp> +#include <sstream> namespace Corpus2 { @@ -28,6 +30,83 @@ TokenReader::~TokenReader() { } +boost::shared_ptr<TokenReader> TokenReader::create_path_reader( + const std::string& class_id_params, + const Tagset& tagset, + const std::string& path) +{ /* class_id_params is "id[,option...]": the first comma-separated field picks the reader class, the rest are passed on as options */ + string_range_vector params; + boost::algorithm::split(params, class_id_params, + boost::is_any_of(std::string(","))); + std::string class_id = boost::copy_range<std::string>(params[0]); + params.erase(params.begin(), params.begin() + 1); /* drop the id so that only options remain */ + try { + return boost::shared_ptr<TokenReader>( + detail::TokenReaderFactorySingleton::Instance().path_factory.CreateObject( + class_id, tagset, path, params)); + } catch (const detail::TokenReaderFactoryException&) { /* translated into the library's own error type */ + throw Corpus2Error("Reader class not found: " + class_id); + } +} + +boost::shared_ptr<TokenReader> TokenReader::create_stream_reader( + const std::string& class_id_params, + const Tagset& tagset, + std::istream& stream) +{ /* same "id[,option...]" format as create_path_reader, but the caller keeps ownership of the stream */ + string_range_vector params; + boost::algorithm::split(params, class_id_params, + boost::is_any_of(std::string(","))); + std::string class_id = boost::copy_range<std::string>(params[0]); + params.erase(params.begin(), params.begin() + 1); /* drop the id so that only options remain */ + try { + return boost::shared_ptr<TokenReader>( + detail::TokenReaderFactorySingleton::Instance().stream_factory.CreateObject( + class_id, tagset, stream, params)); + } catch (const detail::TokenReaderFactoryException&) { /* look the id up among path readers to tell "no such reader" from "no stream mode" */ + std::vector<std::string> ids; + ids = detail::TokenReaderFactorySingleton::Instance().path_factory.RegisteredIds(); + if (std::find(ids.begin(), ids.end(), class_id) == ids.end()) { + throw Corpus2Error("Reader class not found: " + class_id); + } else { + throw Corpus2Error("This reader does not support stream mode: " + class_id); + } + } +} + +std::vector<std::string> TokenReader::available_reader_types() +{ /* reports the ids known to the path factory */ + return detail::TokenReaderFactorySingleton::Instance().path_factory.RegisteredIds(); +}
+ +std::string TokenReader::reader_help(const std::string& class_id) +{ /* returns the help string recorded for class_id, or "" when the help map has no entry for it */ + std::map<std::string, std::string>::const_iterator c; + c = detail::TokenReaderFactorySingleton::Instance().help.find(class_id); + if (c != detail::TokenReaderFactorySingleton::Instance().help.end()) { + return c->second; + } else { + return ""; + } +} + +std::vector<std::string> TokenReader::available_reader_types_help() +{ /* rewrites every available id in place as id[help]; an id without an entry in the help map ends up as an empty string */ + std::vector<std::string> v = available_reader_types(); + foreach (std::string& id, v) { + std::stringstream ss; + std::map<std::string, std::string>::const_iterator c; + c = detail::TokenReaderFactorySingleton::Instance().help.find(id); + if (c != detail::TokenReaderFactorySingleton::Instance().help.end()) { + ss << id << "["; + ss << c->second; + ss << "]"; + } + id = ss.str(); + } + return v; +} + BufferedChunkReader::BufferedChunkReader(const Tagset& tagset) : TokenReader(tagset) { diff --git a/libcorpus2/io/reader.h b/libcorpus2/io/reader.h index c18a029..3bf2c41 100644 --- a/libcorpus2/io/reader.h +++ b/libcorpus2/io/reader.h @@ -36,27 +36,226 @@ namespace Corpus2 { class TokenReader : public TokenSource { public: - TokenReader(const Tagset& tagset); + /// Constructor --- only a Tagset is needed + explicit TokenReader(const Tagset& tagset); + /** + * Reader creation from a class identifier (possibly with comma-separated + * parameters / options that are passed to set_option), with a tagset and + * a path to a file or some other resource that the reader will open. + * + * Any files opened will be closed by the reader. + */ + static boost::shared_ptr<TokenReader> create_path_reader( + const std::string& class_id, + const Tagset& tagset, + const std::string& path); + + /** + * Reader creation as in create_path_reader, only reading from a stream + * that is managed by the caller (so e.g. std::cin can be used).
Generally + * all stream readers are path readers, but not all path readers are + * stream readers (a path reader might look for and open more than one file, + * which is beyond what this interface allows). Attempting to create a + * reader that cannot read a stream will result in an exception. + */ + static boost::shared_ptr<TokenReader> create_stream_reader( + const std::string& class_id, + const Tagset& tagset, + std::istream& stream); + + /// Destructor virtual ~TokenReader(); + /** + * Interface for getting a token from the reader. Note that the caller + * must dispose of the Token it receives. A null value returned indicates + * end of processing. + * + * There is no information about sentence boundaries in this mode. + */ virtual Token* get_next_token() = 0; + /** + * Interface for getting entire sentences from the reader. + * + * There is no information about chunk boundaries in that mode. + */ virtual Sentence::Ptr get_next_sentence() = 0; + /** + * Interface for getting entire chunks from the reader. + */ virtual boost::shared_ptr<Chunk> get_next_chunk() = 0; + /** + * General option setter. + */ virtual void set_option(const std::string& /*option*/) { } + /** + * Option inspector. Should echo the option if it is set, return + * an empty string otherwise, and "unknown" if the option is invalid. + */ + virtual std::string get_option(const std::string& /*option*/) { + return "unknown"; + } + + /** + * Tagset accessor + */ const Tagset& tagset() { return tagset_; } + /** + * Function to get a vector of available reader type strings. + */ + static std::vector<std::string> available_reader_types(); + + /** + * Function to get the help string for a reader + */ + static std::string reader_help(const std::string& class_id); + + /** + * Function to get a vector of available reader type strings with help + * strings appended + */ + static std::vector<std::string> available_reader_types_help(); + + /** + * Convenience template for registering TokenReader derived classes.
+ */ + template <typename T> + static bool register_reader(const std::string& class_id, + const std::string& help = ""); + + /** + * Convenience template for registering TokenReader derived classes. + * Path-only version. + */ + template <typename T> + static bool register_path_reader(const std::string& class_id, + const std::string& help = ""); + private: + /// Tagset used by the Reader const Tagset& tagset_; }; +namespace detail { + +typedef Loki::Factory< + TokenReader, // The base class for objects created in the factory + std::string, // Identifier type + Loki::TL::MakeTypelist< + const Tagset& /*tagset*/, + std::istream& /*input*/, + const string_range_vector& /*params*/ + >::Result +> StreamTokenReaderFactoryType; + +typedef Loki::Factory< + TokenReader, // The base class for objects created in the factory + std::string, // Identifier type + Loki::TL::MakeTypelist< + const Tagset& /*tagset*/, + const std::string& /*path*/, + const string_range_vector& /*params*/ + >::Result +> PathTokenReaderFactoryType; + +struct TokenReaderFactory +{ + StreamTokenReaderFactoryType stream_factory; + PathTokenReaderFactoryType path_factory; + std::map<std::string, std::string> help; +}; + +/** + * Declaration of the TokenReader factory as a singleton Loki object + * factory. The factory instance can be accessed as + * TokenReaderFactorySingleton::Instance(). It is assumed that all derived classes + * have the same constructor signature.
+ */ +typedef Loki::SingletonHolder< + TokenReaderFactory, + Loki::CreateUsingNew, // default, needed to change the item below + Loki::LongevityLifetime::DieAsSmallObjectChild // per libloki docs +> +TokenReaderFactorySingleton; + +/** + * Templated TokenReader creation function, stream variant + */ +template <typename T> +inline +T* stream_reader_creator(const Tagset& tagset, std::istream& is, + const string_range_vector& params) +{ + T* reader = new T(tagset, is); + foreach (const string_range& sr, params) { + reader->set_option(boost::copy_range<std::string>(sr)); + } + return reader; +} + +/** + * Templated TokenReader creation function, path variant + */ +template <typename T> +inline +T* path_reader_creator(const Tagset& tagset, const std::string& path, + const string_range_vector& params) +{ + T* reader = new T(tagset, path); + foreach (const string_range& sr, params) { + reader->set_option(boost::copy_range<std::string>(sr)); + } + return reader; +} + +/** + * Convenience typedef for the exception type the factory throws + */ +typedef Loki::DefaultFactoryError< + std::string, TokenReader +>::Exception +TokenReaderFactoryException; + +} /* end ns detail */ + + + +template <typename T> +bool TokenReader::register_reader(const std::string& class_id, + const std::string& help) +{ + bool ret = detail::TokenReaderFactorySingleton::Instance().path_factory.Register( + class_id, detail::path_reader_creator<T>); + bool ret2 = detail::TokenReaderFactorySingleton::Instance().stream_factory.Register( + class_id, detail::stream_reader_creator<T>); + if (ret || ret2) { + detail::TokenReaderFactorySingleton::Instance().help[class_id] = help; + } + return ret; +} + +template <typename T> +bool TokenReader::register_path_reader(const std::string& class_id, + const std::string& help) +{ + bool ret = detail::TokenReaderFactorySingleton::Instance().path_factory.Register( + class_id, detail::path_reader_creator<T>); + if (ret) { +
detail::TokenReaderFactorySingleton::Instance().help[class_id] = help; + } + return ret; +} + + /** * Convenience class for readers that keep a buffer of chunks. Sentence * and token accessors are based upon the chunk buffer. diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp index 9a566fc..2595b37 100644 --- a/libcorpus2/io/rft.cpp +++ b/libcorpus2/io/rft.cpp @@ -20,11 +20,12 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <boost/algorithm/string.hpp> #include <boost/make_shared.hpp> +#include <fstream> namespace Corpus2 { bool RftWriter::registered = TokenWriter::register_writer<RftWriter>( - "rft", "mbt"); + "rft", "mbt,nowarn"); RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, const string_range_vector& params) @@ -77,11 +78,29 @@ void RftWriter::write_chunk(const Chunk& c) } } +bool RftReader::registered = TokenReader::register_reader<RftReader>("rft", + "set_disamb,mbt"); + + RftReader::RftReader(const Tagset& tagset, std::istream& is, bool disamb, bool mbt_dialect) - : BufferedSentenceReader(tagset), is_(is), disamb_(disamb) + : BufferedSentenceReader(tagset), is_(&is), disamb_(disamb) + , mbt_dialect_(mbt_dialect) +{ +} + +RftReader::RftReader(const Tagset& tagset, const std::string& filename, bool disamb, + bool mbt_dialect) + : BufferedSentenceReader(tagset), is_(), disamb_(disamb) , mbt_dialect_(mbt_dialect) { + is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in)); /* the reader owns this stream and releases it on destruction */ + if (this->is_owned_->fail()) { /* a failed open sets failbit, which bad() does not report */ + throw Corpus2Error("File not found!"); + } + else { + this->is_ = is_owned_.get(); + } } Sentence::Ptr RftReader::actual_next_sentence() @@ -122,5 +141,23 @@ Sentence::Ptr RftReader::actual_next_sentence() return s; } +void RftReader::set_option(const std::string &option) +{ + if (option == "mbt") { + mbt_dialect_ = true; + } else if (option == "set_disamb") { + disamb_ = true; + } +} + +std::string RftReader::get_option(const std::string &option) +{ + if (option == "mbt") { + return mbt_dialect_ ?
option : ""; + } else if (option == "set_disamb") { + return disamb_ ? option : ""; + } + return BufferedSentenceReader::get_option(option); +} } /* end ns Corpus2 */ diff --git a/libcorpus2/io/rft.h b/libcorpus2/io/rft.h index 057a2b4..dfc3c3c 100644 --- a/libcorpus2/io/rft.h +++ b/libcorpus2/io/rft.h @@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <libcorpus2/io/reader.h> #include <libcorpus2/io/writer.h> +#include <boost/scoped_ptr.hpp> namespace Corpus2 { @@ -55,18 +56,29 @@ private: class RftReader : public BufferedSentenceReader { public: - RftReader(const Tagset& tagset, std::istream& is, bool disamb, + RftReader(const Tagset& tagset, std::istream& is, bool disamb = false, + bool mbt_dialect = false); // TODO move to some sort of params + + RftReader(const Tagset& tagset, const std::string& filename, bool disamb = false, bool mbt_dialect = false); // TODO move to some sort of params std::istream& is() { - return is_; + return *is_; } + void set_option(const std::string& option); + + std::string get_option(const std::string& option); + + static bool registered; + protected: /// BufferedSentenceReader override Sentence::Ptr actual_next_sentence(); - std::istream& is_; + std::istream* is_; + boost::scoped_ptr<std::istream> is_owned_; + bool disamb_; /// Whether using TiMBL/MBT variant (slightly different than RFT per se). diff --git a/libcorpus2/io/xcesreader.cpp b/libcorpus2/io/xcesreader.cpp index f9e8ce3..1af1543 100644 --- a/libcorpus2/io/xcesreader.cpp +++ b/libcorpus2/io/xcesreader.cpp @@ -24,6 +24,9 @@ or FITNESS FOR A PARTICULAR PURPOSE.
namespace Corpus2 { +bool XcesReader::registered = TokenReader::register_reader<XcesReader>("xces", + "disamb_only,sh,loose,strict,no_warn_inconsistent"); + class XcesReaderImpl : public XmlReader { public: @@ -97,7 +100,25 @@ void XcesReader::set_option(const std::string& option) impl_->set_loose_tag_parsing(false); } else if (option == "no_warn_inconsistent") { impl_->set_warn_on_inconsistent(false); + } else if (option == "sh") { + impl_->set_disamb_sh(true); + } else if (option == "disamb_only") { + impl_->set_disamb_only(true); + } +} + +std::string XcesReader::get_option(const std::string& option) +{ /* Echo the option name when it is in effect, "" when it is not; any other name goes to the base class. */ + if (option == "sh") { + return impl_->get_disamb_sh() ? option : ""; + } else if (option == "disamb_only") { + return impl_->get_disamb_only() ? option : ""; + } else if (option == "loose" || option == "strict") { /* two names for the single loose-tag-parsing flag */ + return impl_->get_loose_tag_parsing() == (option == "loose") ? option : ""; + } else if (option == "no_warn_inconsistent") { /* set_option accepts it, so it must be queryable too */ + return !impl_->get_warn_on_inconsistent() ? option : ""; } + return BufferedChunkReader::get_option(option); } } /* end ns Corpus2 */ diff --git a/libcorpus2/io/xcesreader.h b/libcorpus2/io/xcesreader.h index 3260693..56c11f1 100644 --- a/libcorpus2/io/xcesreader.h +++ b/libcorpus2/io/xcesreader.h @@ -44,6 +44,10 @@ public: void set_option(const std::string& option); + std::string get_option(const std::string& option); + + static bool registered; + protected: void ensure_more(); diff --git a/libcorpus2/io/xmlreader.h b/libcorpus2/io/xmlreader.h index f2b9b14..ce1e42b 100644 --- a/libcorpus2/io/xmlreader.h +++ b/libcorpus2/io/xmlreader.h @@ -35,25 +35,20 @@ public: virtual ~XmlReader(); - void set_disamb_only(bool v) { - disamb_only_ = v; - } + bool get_disamb_only() const { return disamb_only_; } + void set_disamb_only(bool v) { disamb_only_ = v; } - void set_disamb_sh(bool v) { - disamb_sh_ = v; - } + bool get_disamb_sh() const { return disamb_sh_; } + void set_disamb_sh(bool v) { disamb_sh_ = v; } - void set_warn_on_inconsistent(bool v) { - warn_on_inconsistent_ = v; - } + bool
get_warn_on_unexpected() const { return warn_on_unexpected_; } + void set_warn_on_unexpected(bool v) { warn_on_unexpected_ = v; } - void set_warn_on_unexpected(bool v) { - warn_on_unexpected_ = v; - } + bool get_warn_on_inconsistent() const { return warn_on_inconsistent_; } + void set_warn_on_inconsistent(bool v) { warn_on_inconsistent_ = v; } - void set_loose_tag_parsing(bool v) { - loose_tag_parsing_ = v; - } + bool get_loose_tag_parsing() const { return loose_tag_parsing_; } + void set_loose_tag_parsing(bool v) { loose_tag_parsing_ = v; } protected: std::string get_type_from_attributes(const AttributeList& attributes) const; diff --git a/tests/io.cpp b/tests/io.cpp index 3a65f0d..1048a4f 100644 --- a/tests/io.cpp +++ b/tests/io.cpp @@ -179,5 +179,26 @@ BOOST_AUTO_TEST_CASE( io_oo ) BOOST_CHECK(chunk); } +BOOST_AUTO_TEST_CASE( create_reader ) +{ + const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi"); + boost::shared_ptr<Corpus2::TokenReader> r; + std::stringstream ss; + ss << swiatopoglad; + r = Corpus2::TokenReader::create_stream_reader("xces,disamb_only,sh", tagset, ss); + boost::shared_ptr<Corpus2::XcesReader> xr; + xr = boost::dynamic_pointer_cast<Corpus2::XcesReader>(r); + BOOST_REQUIRE(xr); + BOOST_CHECK_EQUAL(xr->get_option("disamb_only"), "disamb_only"); + BOOST_CHECK_EQUAL(xr->get_option("sh"), "sh"); + r = Corpus2::TokenReader::create_stream_reader("xces,disamb_only,strict", tagset, ss); + xr = boost::dynamic_pointer_cast<Corpus2::XcesReader>(r); + BOOST_REQUIRE(xr); + BOOST_CHECK_EQUAL(xr->get_option("disamb_only"), "disamb_only"); + BOOST_CHECK_EQUAL(xr->get_option("sh"), ""); + BOOST_CHECK_EQUAL(xr->get_option("strict"), "strict"); +} + + BOOST_AUTO_TEST_SUITE_END(); -- GitLab