diff --git a/libcorpus2/io/relreader.cpp b/libcorpus2/io/relreader.cpp
index db62c55dca4a25c9647365bdaae839ee21d915f4..abfc59dca1f5acea4cc267bfd5130c6e986f2d64 100644
--- a/libcorpus2/io/relreader.cpp
+++ b/libcorpus2/io/relreader.cpp
@@ -14,7 +14,12 @@ or FITNESS FOR A PARTICULAR PURPOSE.
     See the LICENSE and COPYING files for more details.
 */
 
+#include <libpwrutils/foreach.h>
+#include <libcorpus2/exception.h>
 #include <libcorpus2/io/relreader.h>
+
+#include <fstream>
+#include <sstream>
 #include <boost/make_shared.hpp>
 
 namespace Corpus2 {
@@ -22,12 +27,202 @@ RelationReader::RelationReader(const std::string &rela_path)
 	: rela_path_(rela_path)
 {
 	readed_ = false;
+	in_relation_ = false;
+	in_relations_ = false;
+	in_direction_ = false;
+
+	file_.reset(new std::ifstream(rela_path.c_str(), std::ifstream::in));
+
+	if (!file_->good()) {
+		file_.reset();
+		throw Corpus2Error("File not found!");
+	}
 }
 
 void RelationReader::read()
 {
+	static const int BUFSIZE = 1024;
+
+	// Defensive: the constructor already throws if the file cannot be opened.
+	if (!file_) {
+		throw Corpus2Error("File not found!");
+	}
+	std::istream& is = *file_;
+
+	// Feed the document to the SAX parser chunk by chunk.
+	while (is.good()) {
+		unsigned char buf[BUFSIZE];
+		is.read(reinterpret_cast<char*>(buf), BUFSIZE);
+
+		parse_chunk_raw(buf, is.gcount());
+		if (is.eof()) {
+			finish_chunk_parsing();
+		}
+	}
+
+	// Report a read failure instead of returning a truncated relation list.
+	if (is.bad()) {
+		throw Corpus2Error("Error while reading file " + rela_path_);
+	}
+
 	// mark that document has been readed
 	readed_ = true;
 }
 
+/**
+ * SAX callback for an opening tag. Tracks whether the parser is inside
+ * the relations list / a single relation and starts collecting data for
+ * a relation or for one of its end points.
+ */
+void RelationReader::on_start_element(const Glib::ustring& name,
+		const AttributeList& attributes) {
+	if (name == RELATIONS_TAG) {
+		in_relations_ = true;
+	}
+	else if (in_relations_ && name == RELATION_TAG) {
+		in_relation_ = true;
+		parse_relation_name(attributes);
+	}
+	else if (in_relation_ && name == RELATION_DIRECT_FROM) {
+		parse_direction_from(attributes);
+	}
+	else if (in_relation_ && name == RELATION_DIRECT_TO) {
+		parse_direction_to(attributes);
+	}
+	// any other element is ignored
+}
+
+/**
+ * SAX callback for a closing tag. An end point is built when its own
+ * element closes, because only then has all of its character data been
+ * delivered; a relation is validated and stored when it closes.
+ */
+void RelationReader::on_end_element(const Glib::ustring& name) {
+	if (name == RELATIONS_TAG) {
+		in_relations_ = false;
+	}
+	else if (in_relations_ && name == RELATION_TAG) {
+		in_relation_ = false;
+		// validate() throws Corpus2Error when the relation is incomplete
+		validate();
+		add_current_relation();
+	}
+	else if (in_direction_ && name == RELATION_DIRECT_FROM) {
+		finish_direction(rel_from_);
+	}
+	else if (in_direction_ && name == RELATION_DIRECT_TO) {
+		finish_direction(rel_to_);
+	}
+	// any other element is ignored
+}
+
+/**
+ * SAX callback for character data. Text inside a "from"/"to" element is
+ * appended, since the parser may report it in several pieces.
+ */
+void RelationReader::on_characters(const Glib::ustring &text)
+{
+	if (in_direction_) {
+		ann_number_ += text.raw();
+	}
+}
+
+/**
+ * Checks that the relation being closed has a name, both end points and
+ * a non-empty annotation number text (that of the last end point read).
+ * @throws Corpus2Error describing the first missing part
+ */
+void RelationReader::validate()
+{
+	if (rel_name_.empty()) {
+		throw Corpus2Error("Relation name is empty!");
+	}
+	if (!rel_from_) {
+		throw Corpus2Error("Cannot find \"from\" in relation " + rel_name_);
+	}
+	if (!rel_to_) {
+		throw Corpus2Error("Cannot find \"to\" in relation " + rel_name_);
+	}
+	if (ann_number_.empty()) {
+		throw Corpus2Error("Cannot find annotation number in relation " + rel_name_);
+	}
+}
+
+/// Appends a Relation built from the current temporaries to relations_
+void RelationReader::add_current_relation()
+{
+	relations_.push_back(
+			boost::make_shared<Relation>(rel_name_, rel_from_, rel_to_));
+}
+
+/// Starts a new relation: stores its name and clears the previous state
+void RelationReader::parse_relation_name(const AttributeList& attributes)
+{
+	rel_name_ = get_attribute_value(attributes, RELATION_NAME);
+	ann_number_.clear();
+	rel_from_.reset();
+	rel_to_.reset();
+}
+
+void RelationReader::parse_direction_from(const AttributeList& attributes)
+{
+	parse_direction(attributes, rel_from_);
+}
+
+void RelationReader::parse_direction_to(const AttributeList& attributes)
+{
+	parse_direction(attributes, rel_to_);
+}
+
+/**
+ * Begins an end point: remembers its attributes and clears the text
+ * buffer. The point itself is created by finish_direction(), because
+ * the element's text has not been reported yet when its tag opens.
+ */
+void RelationReader::parse_direction(const AttributeList& attributes,
+		boost::shared_ptr<DirectionPoint>& direct)
+{
+	direct_sentence_id_ = get_attribute_value(attributes, RELATION_SENTENCE_ID);
+	direct_channel_name_ = get_attribute_value(attributes, RELATION_CHANNEL_NAME);
+
+	direct.reset();
+	ann_number_.clear();
+	in_direction_ = true;
+}
+
+/**
+ * Completes an end point once its element has closed. The annotation
+ * number is parsed from the accumulated text; if that text does not
+ * start with a number the placeholder value 99999999 is kept.
+ */
+void RelationReader::finish_direction(
+		boost::shared_ptr<DirectionPoint>& direct)
+{
+	int annotation_number = 99999999;
+	int parsed = 0;
+
+	// Extract into a scratch variable: a failed extraction may overwrite
+	// its target, which would destroy the placeholder.
+	std::istringstream number_stream(ann_number_);
+	if (number_stream >> parsed) {
+		annotation_number = parsed;
+	}
+
+	direct = boost::make_shared<DirectionPoint>(
+			direct_sentence_id_, direct_channel_name_, annotation_number);
+	in_direction_ = false;
+}
+
+/// Returns the value of the attribute called name, or "" if it is absent
+std::string RelationReader::get_attribute_value(
+		const AttributeList& attributes, const std::string& name)
+{
+	foreach (const Attribute& a, attributes) {
+		if (a.name == name) {
+			return a.value;
+		}
+	}
+	return "";
+}
+
 } /* end ns Corpus2 */
diff --git a/libcorpus2/io/relreader.h b/libcorpus2/io/relreader.h
index d9e0a6b63291a0db32a9d0e633e02f39ecf4c2e3..990fca09baa663f5a77203a6a99f8ea12bff1044 100644
--- a/libcorpus2/io/relreader.h
+++ b/libcorpus2/io/relreader.h
@@ -19,15 +19,28 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 
 #include <vector>
 #include <boost/shared_ptr.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <libxml++/parsers/saxparser.h>
+
 #include <libcorpus2/relation.h>
 
+#include <iostream>
+
 namespace Corpus2 {
+	const static std::string RELATION_TAG = "rel";
+	const static std::string RELATIONS_TAG = "relations";
+	const static std::string RELATION_DIRECT_FROM = "from";
+	const static std::string RELATION_DIRECT_TO = "to";
+
+	const static std::string RELATION_NAME = "name";
+	const static std::string RELATION_SENTENCE_ID = "sent";
+	const static std::string RELATION_CHANNEL_NAME = "chan";
 
 /**
  * A reader for realtion documents. Note that document is read into memory
  * before any processing may take place.
  */
-class RelationReader {
+class RelationReader : public xmlpp::SaxParser {
 public:
 	/**
 	 * Reads a document with relations
@@ -37,8 +50,8 @@ public:
 	RelationReader(const std::string &rela_path);
 
 	/**
-	 * Relations accessor. If relations are not readed then read relations
-	 * and returns list of them.
+	 * Lazy relations accessor.
+	 * If the relations have not been read yet, reads them and returns the list.
 	 * @return List of readed relations
 	 */
 	const std::vector< boost::shared_ptr<Relation> >& relations() {
@@ -49,17 +62,66 @@ public:
 		return relations_;
 	}
 
+protected:
+	// implementations of the SAX parser callbacks
+	void on_start_element(const Glib::ustring& name,
+			const AttributeList& attributes);
+	void on_end_element(const Glib::ustring& name);
+	void on_characters(const Glib::ustring &text);
+
 private:
+	/// Reads the document using the LibXML++ SAX parser
 	void read();
 
+	/// Validates the relation that is being closed
+	void validate();
+
+	// Handlers for the opening tags of a relation and of its end points
+	void parse_relation_name(const AttributeList& attributes);
+	void parse_direction_from(const AttributeList& attributes);
+	void parse_direction_to(const AttributeList& attributes);
+	void parse_direction(const AttributeList& attributes,
+			boost::shared_ptr<DirectionPoint>& direct);
+
+	/// Builds an end point when its element closes
+	void finish_direction(boost::shared_ptr<DirectionPoint>& direct);
+
+	/// Adds the relation that was just read to the relations list
+	void add_current_relation();
+
+	/**
+	 * Gets attribute from list of attributes
+	 * @param attributes List of the attributes
+	 * @param name Name of attribute
+	 * @return Attribute value or empty string if attribute name not found
+	 */
+	std::string get_attribute_value(const AttributeList& attributes,
+			const std::string& name);
+
+	// -------------------------------------------------------------------------
 	/// List of the relations in given relation file
 	std::vector< boost::shared_ptr<Relation> > relations_;
 
 	/// Path to file with relations
 	const std::string rela_path_;
 
-	///
+	/// Markers:
 	bool readed_;
+	bool in_relation_;
+	bool in_relations_;
+	bool in_direction_;
+
+	/// File pointer
+	boost::scoped_ptr<std::istream> file_;
+
+	// -------------------------------------------------------------------------
+	// Temporary information about the relation currently being parsed
+	std::string rel_name_;
+	std::string ann_number_;
+	std::string direct_sentence_id_;
+	std::string direct_channel_name_;
+	boost::shared_ptr<DirectionPoint> rel_from_;
+	boost::shared_ptr<DirectionPoint> rel_to_;
 };
 
 } /* end ns Corpus2 */