diff --git a/libcorpus2/io/cclreader.cpp b/libcorpus2/io/cclreader.cpp index 46edaa9ebcfa14a4743a6e33ea3ebb300823293e..14d2b993f541b386270d307f3e25227f0e56a432 100644 --- a/libcorpus2/io/cclreader.cpp +++ b/libcorpus2/io/cclreader.cpp @@ -27,12 +27,12 @@ or FITNESS FOR A PARTICULAR PURPOSE. namespace Corpus2 { bool CclReader::registered = TokenReader::register_reader<CclReader>("ccl", - "disamb_only,loose,strict,no_warn_inconsistent"); + "ign,loose,strict,disamb_only,no_warn_inconsistent"); class CclReaderImpl : public XmlReader { public: - CclReaderImpl(const Tagset& tagset, + CclReaderImpl(const TokenReader& base_reader, std::deque< boost::shared_ptr<Chunk> >& obuf, bool disamb_only, bool disamb_sh); @@ -71,14 +71,14 @@ protected: CclReader::CclReader(const Tagset& tagset, std::istream& is, bool disamb_only, bool disamb_sh) : BufferedChunkReader(tagset), - impl_(new CclReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh)) + impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh)) { this->is_ = &is; } CclReader::CclReader(const Tagset& tagset, const std::string& filename, bool disamb_only, bool disamb_sh) : BufferedChunkReader(tagset), - impl_(new CclReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh)) + impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh)) { this->is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in)); @@ -107,10 +107,10 @@ void CclReader::ensure_more() } } -CclReaderImpl::CclReaderImpl(const Tagset& tagset, +CclReaderImpl::CclReaderImpl(const TokenReader& base_reader, std::deque< boost::shared_ptr<Chunk> >& obuf, bool disamb_only, bool disamb_sh) - : XmlReader(tagset, obuf) + : XmlReader(base_reader, obuf) { XmlReader::set_disamb_only(disamb_only); XmlReader::set_disamb_sh(disamb_sh); @@ -211,14 +211,12 @@ void CclReaderImpl::finish_token() void CclReader::set_option(const std::string& option) { - if (option == "loose") { - impl_->set_loose_tag_parsing(true); - } else if (option == "strict") { - 
impl_->set_loose_tag_parsing(false); - } else if (option == "no_warn_inconsistent") { + if (option == "no_warn_inconsistent") { impl_->set_warn_on_inconsistent(false); } else if (option == "disamb_only") { impl_->set_disamb_only(true); + } else { + BufferedChunkReader::set_option(option); } } @@ -226,10 +224,8 @@ std::string CclReader::get_option(const std::string& option) { if (option == "disamb_only") { return impl_->get_disamb_only() ? option : ""; - } else if (option == "loose") { - return impl_->get_loose_tag_parsing() ? option : ""; - } else if (option == "strict") { - return !impl_->get_loose_tag_parsing() ? option : ""; + } else if (option == "no_warn_inconsistent") { + return !impl_->get_warn_on_inconsistent() ? option : ""; } return BufferedChunkReader::get_option(option); } diff --git a/libcorpus2/io/fastxces.cpp b/libcorpus2/io/fastxces.cpp index 2fa1105ad2092476d5921bdc4d68bffdcec5fa91..0d52af9a37574294e16c3448c7fbefee7c367455 100644 --- a/libcorpus2/io/fastxces.cpp +++ b/libcorpus2/io/fastxces.cpp @@ -6,14 +6,14 @@ namespace Corpus2 { bool FastXcesReader::registered = TokenReader::register_reader<FastXcesReader>("xces-fast", - ""); + "ign,loose,strict"); class FastXcesReaderImpl { public: - FastXcesReaderImpl(const Tagset& tagset, + FastXcesReaderImpl(const TokenReader& base_reader, std::deque< boost::shared_ptr<Chunk> >& obuf) - : tagset_(tagset), obuf_(obuf) + : base_reader_(base_reader), obuf_(obuf) { tok_ = new Token(); sent_ = boost::make_shared<Sentence>(); @@ -85,7 +85,7 @@ public: finish_all(); } private: - const Tagset& tagset_; + const TokenReader& base_reader_; /// Token being constructed Token* tok_; @@ -135,7 +135,7 @@ private: } void ctag(const std::string& base, const std::string& ctag, bool disamb) { - Tag tag = tagset_.parse_simple_tag(ctag); + Tag tag = base_reader_.parse_tag(ctag); Lexeme lex(UnicodeString::fromUTF8(base), tag); lex.set_disamb(disamb); tok_->add_lexeme(lex); @@ -150,7 +150,7 @@ private: 
FastXcesReader::FastXcesReader(const Tagset &tagset, std::istream &is) : BufferedChunkReader(tagset), - impl_(new FastXcesReaderImpl(tagset, chunk_buf_)) + impl_(new FastXcesReaderImpl(*this, chunk_buf_)) { this->is_ = &is; } @@ -171,7 +171,7 @@ std::string FastXcesReader::get_option(const std::string& option) FastXcesReader::FastXcesReader(const Tagset &tagset, const std::string &filename) : BufferedChunkReader(tagset), - impl_(new FastXcesReaderImpl(tagset, chunk_buf_)) + impl_(new FastXcesReaderImpl(*this, chunk_buf_)) { this->is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in)); diff --git a/libcorpus2/io/reader.cpp b/libcorpus2/io/reader.cpp index 8bd39a048b5320d771fa76da61b8d16d1b88063e..837b78ae7df8c93371f0ad8f0314a3e44ed94406 100644 --- a/libcorpus2/io/reader.cpp +++ b/libcorpus2/io/reader.cpp @@ -22,7 +22,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. namespace Corpus2 { TokenReader::TokenReader(const Tagset& tagset) - : tagset_(tagset) + : tagset_(tagset), tag_parse_mode_(Tagset::ParseDefault) { } @@ -30,6 +30,37 @@ TokenReader::~TokenReader() { } +void TokenReader::set_option(const std::string &option) +{ + if (option == "ign") { + tag_parse_mode_ = static_cast<Tagset::ParseMode>( + tag_parse_mode_ | Tagset::ParseFailWithIgn); + } else if (option == "loose") { + tag_parse_mode_ = static_cast<Tagset::ParseMode>( + Tagset::ParseLoose | (tag_parse_mode_ & Tagset::ParseFailWithIgn)); + } else if (option == "strict") { + tag_parse_mode_ = static_cast<Tagset::ParseMode>( + Tagset::ParseDefault | (tag_parse_mode_ & Tagset::ParseFailWithIgn)); + } else { + throw Corpus2Error("Unknown option passed to reader: " + option); + } +} + +std::string TokenReader::get_option(const std::string &option) +{ + if (option == "ign") { + return tag_parse_mode_ & Tagset::ParseFailWithIgn ? option : ""; + } else if (option == "loose") { + return (tag_parse_mode_ & ~Tagset::ParseFailWithIgn) + == Tagset::ParseLoose ? 
option : ""; + } else if (option == "strict") { + return (tag_parse_mode_ & ~Tagset::ParseFailWithIgn) + == Tagset::ParseDefault ? option : ""; + } else { + return "unknown"; + } +} + boost::shared_ptr<TokenReader> TokenReader::create_path_reader( const std::string& class_id_params, const Tagset& tagset, diff --git a/libcorpus2/io/reader.h b/libcorpus2/io/reader.h index 3bf2c4179996b54ea86cafeac1b1a41c939e48b0..ee4dc271491d62e7a80fad2e2e204833b9105f43 100644 --- a/libcorpus2/io/reader.h +++ b/libcorpus2/io/reader.h @@ -91,21 +91,18 @@ public: /** * General option setter. */ - virtual void set_option(const std::string& /*option*/) { - } + virtual void set_option(const std::string& option); /** * Option inspector. Should echo the option if it is set, return * an empty string otheriwse, and "unknown" if the option is invalid. */ - virtual std::string get_option(const std::string& /*option*/) { - return "unknown"; - } + virtual std::string get_option(const std::string& option); /** * Tagset accesor */ - const Tagset& tagset() { + const Tagset& tagset() const { return tagset_; } @@ -140,9 +137,27 @@ public: static bool register_path_reader(const std::string& class_id, const std::string& help = ""); + /// Convenience function to parse a tag string with options of this reader + Tag parse_tag(const std::string& tag_string) const { + return tagset().parse_simple_tag(tag_string, tag_parse_mode_); + } + + /// tag parse mode getter + Tagset::ParseMode tag_parse_mode() const { + return tag_parse_mode_; + } + + /// tag parse mode setter + void set_tag_parse_mode(Tagset::ParseMode mode) { + tag_parse_mode_ = mode; + } + private: /// Tagset used by the Reader const Tagset& tagset_; + + /// Tag parse mode + Tagset::ParseMode tag_parse_mode_; }; namespace detail { diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp index 2595b37ec5e50b1dc38a7f4ccbe3a61956c70af3..5918b118ecf93c58158529612286f1c9c8d94cab 100644 --- a/libcorpus2/io/rft.cpp +++ b/libcorpus2/io/rft.cpp @@ -36,10 
+36,10 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, std::string p = boost::copy_range<std::string>(param); if (p == "nowarn") { warn_on_no_lexemes_ = false; - } - else if (p == "mbt") { + } else if (p == "mbt") { mbt_dialect_ = true; } + } } @@ -79,7 +79,7 @@ void RftWriter::write_chunk(const Chunk& c) } bool RftReader::registered = TokenReader::register_reader<RftReader>("rft", - "set_disamb,mbt"); + "ign,loose,strict,set_disamb,mbt"); RftReader::RftReader(const Tagset& tagset, std::istream& is, bool disamb, @@ -122,7 +122,7 @@ Sentence::Ptr RftReader::actual_next_sentence() if (!mbt_dialect_) { boost::algorithm::replace_all(tag_string, ".", ":"); } - Tag tag = tagset().parse_simple_tag(tag_string); + Tag tag = parse_tag(tag_string); Token* t = new Token(); t->set_orth(UnicodeString::fromUTF8(orth)); t->set_wa(PwrNlp::Whitespace::Space); @@ -147,6 +147,8 @@ void RftReader::set_option(const std::string &option) mbt_dialect_ = true; } else if (option == "set_disamb") { disamb_ = true; + } else { + BufferedSentenceReader::set_option(option); } } diff --git a/libcorpus2/io/rft.h b/libcorpus2/io/rft.h index dfc3c3c165344db8eeb37391f10e433f99a65af9..7ae19a299dd7dbdaf200c2ce82ffdfe5c5362f9c 100644 --- a/libcorpus2/io/rft.h +++ b/libcorpus2/io/rft.h @@ -79,11 +79,11 @@ protected: std::istream* is_; boost::scoped_ptr<std::istream> is_owned_; - + /// Whether to mark all incoming tags as disambiguated bool disamb_; + /// Whether using TiMBL/MBT variant (slightly different than RFT per se). bool mbt_dialect_; - }; } /* end ns Corpus2 */ diff --git a/libcorpus2/io/xcesreader.cpp b/libcorpus2/io/xcesreader.cpp index 1af1543f99b8d49e09cb089c76fe4133818a2341..9378fcdbd626f241fe8b5c690474c364da62420e 100644 --- a/libcorpus2/io/xcesreader.cpp +++ b/libcorpus2/io/xcesreader.cpp @@ -25,12 +25,12 @@ or FITNESS FOR A PARTICULAR PURPOSE. 
namespace Corpus2 { bool XcesReader::registered = TokenReader::register_reader<XcesReader>("xces", - "disamb_only,sh,loose,strict,no_warn_inconsistent"); + "ign,loose,strict,disamb_only,sh,no_warn_inconsistent"); class XcesReaderImpl : public XmlReader { public: - XcesReaderImpl(const Tagset& tagset, + XcesReaderImpl(const TokenReader& base_reader, std::deque< boost::shared_ptr<Chunk> >& obuf, bool disamb_only, bool disamb_sh); @@ -42,14 +42,14 @@ protected: XcesReader::XcesReader(const Tagset& tagset, std::istream& is, bool disamb_only, bool disamb_sh) : BufferedChunkReader(tagset), - impl_(new XcesReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh)) + impl_(new XcesReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh)) { this->is_ = &is; } XcesReader::XcesReader(const Tagset& tagset, const std::string& filename, bool disamb_only, bool disamb_sh) : BufferedChunkReader(tagset), - impl_(new XcesReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh)) + impl_(new XcesReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh)) { this->is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in)); @@ -78,10 +78,10 @@ void XcesReader::ensure_more() } } -XcesReaderImpl::XcesReaderImpl(const Tagset& tagset, +XcesReaderImpl::XcesReaderImpl(const TokenReader& base_reader, std::deque< boost::shared_ptr<Chunk> >& obuf, bool disamb_only, bool disamb_sh) - : XmlReader(tagset, obuf) + : XmlReader(base_reader, obuf) { XmlReader::set_disamb_only(disamb_only); XmlReader::set_disamb_sh(disamb_sh); @@ -94,16 +94,14 @@ XcesReaderImpl::~XcesReaderImpl() void XcesReader::set_option(const std::string& option) { - if (option == "loose") { - impl_->set_loose_tag_parsing(true); - } else if (option == "strict") { - impl_->set_loose_tag_parsing(false); - } else if (option == "no_warn_inconsistent") { + if (option == "no_warn_inconsistent") { impl_->set_warn_on_inconsistent(false); } else if (option == "sh") { impl_->set_disamb_sh(true); } else if (option == "disamb_only") { 
impl_->set_disamb_only(true); + } else { + BufferedChunkReader::set_option(option); } } @@ -113,10 +111,8 @@ std::string XcesReader::get_option(const std::string& option) return impl_->get_disamb_sh() ? option : ""; } else if (option == "disamb_only") { return impl_->get_disamb_only() ? option : ""; - } else if (option == "loose") { - return impl_->get_loose_tag_parsing() ? option : ""; - } else if (option == "strict") { - return !impl_->get_loose_tag_parsing() ? option : ""; + } else if (option == "no_warn_inconsistent") { + return !impl_->get_warn_on_inconsistent() ? option : ""; } return BufferedChunkReader::get_option(option); } diff --git a/libcorpus2/io/xmlreader.cpp b/libcorpus2/io/xmlreader.cpp index a4207224ae3f5f0c98d0ede4bc21c8d0b1dccd6e..85ba6e99dc20a9a0b05c268bc47e2d09a332dac8 100644 --- a/libcorpus2/io/xmlreader.cpp +++ b/libcorpus2/io/xmlreader.cpp @@ -24,16 +24,15 @@ or FITNESS FOR A PARTICULAR PURPOSE. namespace Corpus2 { -XmlReader::XmlReader(const Tagset& tagset, +XmlReader::XmlReader(const TokenReader& base_reader, std::deque< boost::shared_ptr<Chunk> >& obuf) : BasicSaxParser() - , tagset_(tagset), state_(STATE_NONE) + , base_reader_(base_reader), state_(STATE_NONE) , chunkless_(false), out_of_chunk_(false) , wa_(PwrNlp::Whitespace::Newline) , sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf) , disamb_only_(false), disamb_sh_(false) , warn_on_inconsistent_(true), warn_on_unexpected_(true) - , loose_tag_parsing_(false) { } @@ -222,8 +221,7 @@ void XmlReader::on_end_element(const Glib::ustring &name) grab_characters_ = false; state_ = STATE_LEX; } else if (state_ == STATE_TAG && name == "ctag") { - Tag tag = tagset_.parse_simple_tag(get_buf(), - loose_tag_parsing_ ? 
Tagset::ParseLoose : Tagset::ParseDefault); + Tag tag = base_reader_.parse_tag(get_buf()); tok_->lexemes().back().set_tag(tag); grab_characters_ = false; state_ = STATE_LEX; diff --git a/libcorpus2/io/xmlreader.h b/libcorpus2/io/xmlreader.h index ce1e42bc1c7e02c1882ee936c4656c6af57a80c9..7b875710ef8e0043461447c7b94e101ffddb9c10 100644 --- a/libcorpus2/io/xmlreader.h +++ b/libcorpus2/io/xmlreader.h @@ -30,7 +30,7 @@ namespace Corpus2 { class XmlReader : public BasicSaxParser { public: - XmlReader(const Tagset& tagset, + XmlReader(const TokenReader& base_reader, std::deque< boost::shared_ptr<Chunk> >& obuf); virtual ~XmlReader(); @@ -47,9 +47,6 @@ public: bool get_warn_on_inconsistent() const { return warn_on_inconsistent_; } void set_warn_on_inconsistent(bool v) { warn_on_inconsistent_ = v; } - bool get_loose_tag_parsing() const { return loose_tag_parsing_; } - void set_loose_tag_parsing(bool v) { loose_tag_parsing_ = v; } - protected: std::string get_type_from_attributes(const AttributeList& attributes) const; @@ -77,7 +74,7 @@ protected: virtual void finish_token(); - const Tagset& tagset_; + const TokenReader& base_reader_; static const int STATE_NONE = 0; static const int STATE_CHUNK = 1; @@ -124,14 +121,11 @@ protected: /// Flag to control warning messages on state errors bool warn_on_inconsistent_; - /// Floag to control warning messages on unknown tags + /// Flag to control warning messages on unknown XML tags bool warn_on_unexpected_; /// Tag name for sentence objects, customized in child class ctors std::string sentence_tag_name_; - - /// Flag to disable strict tag correctness checking - bool loose_tag_parsing_; }; } /* end ns Corpus2 */