/* Copyright (C) 2010 Tomasz Ĺšniatowski, Adam Radziszewski Part of the libcorpus2 project This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE and COPYING files for more details. */ #ifndef LIBCORPUS2_IO_XMLREADER_H #define LIBCORPUS2_IO_XMLREADER_H #include <libcorpus2/io/reader.h> #include <libcorpus2/io/xces.h> #include <libcorpus2/chunk.h> #include <deque> #include <boost/scoped_ptr.hpp> #include <libcorpus2/io/sax.h> #include <libpwrutils/foreach.h> namespace Corpus2 { class XmlReader : public BasicSaxParser { public: XmlReader(const TokenReader& base_reader, std::deque< boost::shared_ptr<Chunk> >& obuf); virtual ~XmlReader(); bool get_disamb_only() const { return disamb_only_; } void set_disamb_only(bool v) { disamb_only_ = v; } bool get_disamb_sh() const { return disamb_sh_; } void set_disamb_sh(bool v) { disamb_sh_ = v; } bool get_warn_on_unexpected() const { return warn_on_unexpected_; } void set_warn_on_unexpected(bool v) { warn_on_unexpected_ = v; } bool get_warn_on_inconsistent() const { return warn_on_inconsistent_; } void set_warn_on_inconsistent(bool v) { warn_on_inconsistent_ = v; } protected: std::string get_type_from_attributes(const AttributeList& attributes) const; void on_start_element(const Glib::ustring & name, const AttributeList& attributes); void on_end_element(const Glib::ustring & name); virtual bool process_start_element(const Glib::ustring & name, const AttributeList& attributes); virtual bool process_end_element(const Glib::ustring & name); virtual void start_chunk(const AttributeList& attributes); virtual void start_sentence(const AttributeList& attributes); virtual void start_token(const AttributeList& attributes); void start_lexeme(const AttributeList& attributes); virtual void finish_chunk(); virtual void finish_sentence(); virtual void finish_token(); const TokenReader& base_reader_; static const int STATE_NONE = 0; static const int STATE_CHUNK = 1; static const int STATE_SENTENCE = 2; static const int STATE_TOK = 3; static const int STATE_ORTH = 4; static const int STATE_LEX = 5; static const int STATE_LEMMA = 6; static const int STATE_TAG = 7; static const int STATE_LEX_SKIP = 8; /// The state of the parser int state_; /// Flag signyfying there was a sentence outside of a chunk bool chunkless_; /// Flag signyfying there was a token outside of a sentence/chunk bool out_of_chunk_; /// Whitespace for the next token PwrNlp::Whitespace::Enum wa_; /// Character data buffer Glib::ustring sbuf_; /// Token being constructed Token* tok_; /// Sentence being constructed Sentence::Ptr sent_; /// Chunk being constructed boost::shared_ptr<Chunk> chunk_; /// Output chunk buffer std::deque< boost::shared_ptr<Chunk> >& obuf_; /// Flag to only read disamb tags bool disamb_only_; /// Read Pantera-like disamb_sh diamb tag markings bool disamb_sh_; /// Flag to control warning messages on state errors bool warn_on_inconsistent_; /// Floag to control warning messages on unknown XML tags bool warn_on_unexpected_; /// Tag name for sentence objects, customized in child class ctors std::string sentence_tag_name_; }; } /* end ns Corpus2 */ #endif // LIBCORPUS2_IO_XMLREADER_H