/*
    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
    Part of the libcorpus2 project

    This program is free software; you can redistribute it and/or modify it
    under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation; either version 3 of the License, or (at your
    option) any later version.

    This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE.

    See the LICENSE.CORPUS2, LICENSE.POLIQARP, COPYING.LESSER and COPYING files
    for more details.
*/

#include <libcorpus2/io/cclreader.h>
#include <libcorpus2/io/xmlreader.h>
#include <boost/foreach.hpp>
#include <libxml++/libxml++.h>
#include <libxml2/libxml/parser.h>
#include <boost/make_shared.hpp>
#include <boost/algorithm/string.hpp>
#include <libcorpus2/ann/annotatedsentence.h>
#include <cstdlib>
#include <deque>
#include <fstream>
#include <map>
#include <set>
#include <sstream>
#include <string>

namespace Corpus2 {

/// Registers this reader under the "ccl" name together with the list of
/// option names passed to TokenReader::register_reader.
bool CclReader::registered = TokenReader::register_reader<CclReader>("ccl",
	"ign,loose,strict,disamb_only,no_warn_inconsistent,no_warn_unexpected_xml,autogen_chunk_id,autogen_sent_id");

/**
 * XmlReader subclass used by CclReader.
 *
 * On top of what XmlReader provides, it handles <ann> and <prop> elements
 * encountered while inside a token, and builds AnnotatedSentence objects
 * for sentences.
 */
class CclReaderImpl : public XmlReader
{
public:
	/**
	 * @param base_reader passed through to XmlReader
	 * @param obuf        chunk buffer passed through to XmlReader
	 * @param disamb_only forwarded to XmlReader::set_disamb_only
	 * @param disamb_sh   forwarded to XmlReader::set_disamb_sh
	 */
	CclReaderImpl(const TokenReader& base_reader,
		std::deque< boost::shared_ptr<Chunk> >& obuf,
		bool disamb_only, bool disamb_sh);

	~CclReaderImpl();

protected:
	/// Handles opening <ann> / <prop> tags seen in STATE_TOK.
	/// @return true if the element was consumed here, false otherwise
	bool process_start_element(const Glib::ustring & name,
		const AttributeList& attributes);

	/// Handles closing </ann> / </prop> tags.
	/// @return true if the element was consumed here, false otherwise
	bool process_end_element(const Glib::ustring& name);

	void start_chunk(const AttributeList &attributes);

	void start_sentence(const AttributeList &attributes);

	void start_token(const AttributeList &attributes);

	void finish_token();

	/// Parser states specific to this reader
	static const int STATE_ANN = 901;
	static const int STATE_REL = 902; // currently unused
	static const int STATE_PROP = 910;

	/// Sentence currently being built (also assigned to sent_)
	boost::shared_ptr<AnnotatedSentence> ann_sent_;

	/// Value of the "chan" attribute of the <ann> being processed
	std::string ann_chan_;

	/// Value of the "key" attribute of the <prop> being processed
	std::string prop_key_;

	/// Whether the <ann> being processed carried head="1"
	bool ann_head_;

	/// channel name -> segment id, collected for the current token
	typedef std::map<std::string, int> token_ann_t;

	token_ann_t token_anns_;

	/// Channels in which the current token was marked as head
	std::set<std::string> token_ann_heads_;
};

/**
 * Creates a reader working on a caller-supplied stream. Only the address of
 * the stream is stored, so the stream has to outlive the reader.
 */
CclReader::CclReader(const Tagset& tagset, std::istream& is,
		bool disamb_only, bool disamb_sh)
	: BufferedChunkReader(tagset),
	impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh))
{
	this->is_ = &is;
}

/**
 * Creates a reader working on a file opened by the reader itself; the
 * stream object is kept in is_owned_.
 *
 * @throws Corpus2Error if the stream is not good() after opening
 */
CclReader::CclReader(const Tagset& tagset, const std::string& filename,
		bool disamb_only, bool disamb_sh)
	: BufferedChunkReader(tagset),
	impl_(new CclReaderImpl(*this, chunk_buf_, disamb_only, disamb_sh))
{
	this->is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));

	if (!this->is_owned_->good()) {
		throw Corpus2Error("File not found!");
	}
	else {
		this->is_ = is_owned_.get();
	}
}

CclReader::~CclReader()
{
}

/**
 * Feeds the input to the parser in blocks of up to BUFSIZE bytes until the
 * chunk buffer is non-empty or the stream stops being good(). When the end
 * of the stream is reached, the parser is told to finish.
 */
void CclReader::ensure_more()
{
	static const int BUFSIZE=1024;
	while (chunk_buf_.empty() && is().good()) {
		unsigned char buf[BUFSIZE+1];
		is().read(reinterpret_cast<char*>(buf), BUFSIZE);
		// gcount() is the number of bytes actually read, which may be
		// less than BUFSIZE on the last block
		impl_->parse_chunk_raw(buf, is().gcount());
		if (is().eof()) {
			impl_->finish_chunk_parsing();
		}
	}
}

CclReaderImpl::CclReaderImpl(const TokenReader& base_reader,
		std::deque< boost::shared_ptr<Chunk> >& obuf,
		bool disamb_only, bool disamb_sh)
	: XmlReader(base_reader, obuf), ann_head_(false)
{
	XmlReader::set_disamb_only(disamb_only);
	XmlReader::set_disamb_sh(disamb_sh);
	sentence_tag_name_ = "sentence";
}

CclReaderImpl::~CclReaderImpl()
{
}

/**
 * Begins a new Chunk and copies all XML attributes onto it.
 *
 * @throws XcesError if the chunk has type "s", which is taken to mean that
 *         the input is XCES rather than CCL
 */
void CclReaderImpl::start_chunk(const AttributeList& attributes)
{
	std::string id = get_id_from_attributes(attributes);
	chunk_ = boost::make_shared<Chunk>();
	chunk_->set_attribute("id", id);
	std::string type = get_type_from_attributes(attributes);
	if (type == "s") {
		throw XcesError("Trying to parse XCES as CCL (<chunk type=\"s\">)");
	}
	BOOST_FOREACH(const Attribute& a, attributes) {
		chunk_->set_attribute(a.name, a.value);
	}
	state_ = STATE_CHUNK;
}

/**
 * Begins a new AnnotatedSentence, using the value of the first "id"
 * attribute (or an empty string if there is none) as its identifier.
 */
void CclReaderImpl::start_sentence(const AttributeList &attributes)
{
	// find sentence id
	std::string id = "";
	BOOST_FOREACH(const Attribute& a, attributes) {
		if (a.name == "id") {
			id = a.value;
			break;
		}
	}

	ann_sent_ = boost::make_shared<AnnotatedSentence>(id);
	sent_ = ann_sent_;
	state_ = STATE_SENTENCE;
}

/**
 * Begins a token via XmlReader and forgets the annotations gathered for
 * the previous token.
 */
void CclReaderImpl::start_token(const AttributeList& attributes)
{
	XmlReader::start_token(attributes);
	token_anns_.clear();
	token_ann_heads_.clear();
}

bool CclReaderImpl::process_start_element(const Glib::ustring & name,
	const AttributeList& attributes)
{
	if (state_ == STATE_TOK && name == "ann") {
		state_ = STATE_ANN;
		// the element text (segment number) is collected into the buffer
		grab_characters_ = true;
		clear_buf();
		ann_chan_ = "";
		ann_head_ = false;
		BOOST_FOREACH(const Attribute& a, attributes) {
			if (a.name == "chan") {
				ann_chan_ = a.value;
			} else if (a.name == "head" && a.value == "1") {
				ann_head_ = true;
			}
		}
		if (ann_chan_.empty()) {
			throw XcesError("<ann> with no channel name");
		}
		return true;
	} else if (state_ == STATE_TOK && name == "prop") {
		state_ = STATE_PROP;
		// the element text (property value) is collected into the buffer
		grab_characters_ = true;
		clear_buf();
		prop_key_ = "";
		BOOST_FOREACH(const Attribute& a, attributes) {
			if (a.name == "key") {
				prop_key_ = a.value;
			}
		}
		return true;
	} else {
		return false;
	}
}

bool CclReaderImpl::process_end_element(const Glib::ustring & name)
{
	if (state_ == STATE_ANN && name == "ann") {
		std::string buf = get_buf();
		grab_characters_ = false;
		// atoi yields 0 when the text is not a number, so such text is
		// treated like an explicit 0 below
		int segid = atoi(buf.c_str());
		// the channel is created even when this token gets no segment
		if (!ann_sent_->has_channel(ann_chan_)) {
			ann_sent_->create_channel(ann_chan_);
		}
		// only positive segment ids are recorded for the token
		if (segid > 0) {
			token_anns_.insert(std::make_pair(ann_chan_, segid));
			if (ann_head_) {
				token_ann_heads_.insert(ann_chan_);
			}
		}
		state_ = STATE_TOK;
		return true;
	} else if (state_ == STATE_PROP && name == "prop") {
		std::string prop_val = get_buf();
		boost::algorithm::trim(prop_val);
		grab_characters_ = false;
		// token metadata is created on first use
		if (!tok_->get_metadata()) {
			tok_->create_metadata();
		}
		tok_->get_metadata()->set_attribute(prop_key_, prop_val);
		state_ = STATE_TOK;
		return true;
	} else {
		return false;
	}
}

/**
 * Finishes the token via XmlReader and then writes the annotations gathered
 * for it into the channels of the current sentence.
 */
void CclReaderImpl::finish_token()
{
	XmlReader::finish_token();

	// Index of the last token of the sentence; presumably the token just
	// finished (assumes XmlReader::finish_token() appends it to sent_).
	const std::size_t tok_idx = sent_->size() - 1;

	BOOST_FOREACH(const token_ann_t::value_type& v, token_anns_) {
		ann_sent_->get_channel(v.first).set_segment_at(tok_idx, v.second);
		if (token_ann_heads_.find(v.first) != token_ann_heads_.end()) {
			ann_sent_->get_channel(v.first).set_head_at(tok_idx, true);
		}
	}
}

/**
 * Applies a reader option. Options not recognised here are passed on to
 * BufferedChunkReader::set_option.
 */
void CclReader::set_option(const std::string& option)
{
	if (option == "no_warn_inconsistent") {
		impl_->set_warn_on_inconsistent(false);
	} else if (option == "disamb_only") {
		impl_->set_disamb_only(true);
	} else if (option == "autogen_sent_id") {
		// no action, left for backwards compatibility
	} else if (option == "autogen_chunk_id") {
		// no action, left for backwards compatibility
	} else if (option == "no_warn_unexpected_xml") {
		impl_->set_warn_on_unexpected(false);
	} else {
		BufferedChunkReader::set_option(option);
	}
}

/**
 * Queries a reader option.
 *
 * @return the option name itself when the option is in effect, an empty
 *         string when it is not; options not recognised here are passed on
 *         to BufferedChunkReader::get_option
 */
std::string CclReader::get_option(const std::string& option) const
{
	if (option == "disamb_only") {
		return impl_->get_disamb_only() ? option : "";
	} else if (option == "no_warn_inconsistent") {
		return impl_->get_warn_on_inconsistent() ? "" : option;
	} else if (option == "autogen_sent_id") {
		return option; // left for backward compatibility
	} else if (option == "autogen_chunk_id") {
		return option; // left for backward compatibility
	} else if (option == "no_warn_unexpected_xml") {
		return impl_->get_warn_on_unexpected() ? "" : option;
	}
	return BufferedChunkReader::get_option(option);
}

} /* end ns Corpus2 */