Skip to content
Snippets Groups Projects
Select Git revision
  • c03647e835c3f6ee62ddc065872bd411b1fa8bcf
  • master default protected
  • develop protected
  • feat_remove_attr
  • python2.7
  • python3.8
6 results

xcesreader.cpp

Blame
  • xcesreader.cpp 2.31 KiB
    /*
        Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
        Part of the libcorpus2 project
    
        This program is free software; you can redistribute it and/or modify it
    under the terms of the GNU General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option)
    any later version.
    
        This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE. 
    
        See the LICENSE and COPYING files for more details.
    */
    
    #include <libcorpus2/io/xcesreader.h>
    #include <libcorpus2/io/xmlreader.h>
    #include <libpwrutils/foreach.h>
    #include <libxml++/libxml++.h>
    #include <libxml2/libxml/parser.h>
    #include <boost/make_shared.hpp>
    #include <fstream>
    
    namespace Corpus2 {
    
    class XcesReaderImpl : public XmlReader
    {
    public:
    	XcesReaderImpl(const Tagset& tagset,
    		std::deque< boost::shared_ptr<Chunk> >& obuf,
    		bool disamb_only, bool disamb_sh);
    
    	~XcesReaderImpl();
    
    protected:
    };
    
    XcesReader::XcesReader(const Tagset& tagset, std::istream& is,
    		bool disamb_only, bool disamb_sh)
    	: BufferedChunkReader(tagset),
    	impl_(new XcesReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh))
    {
    	this->is_ = &is;
    }
    
    XcesReader::XcesReader(const Tagset& tagset, const std::string& filename, bool disamb_only, bool disamb_sh)
    	: BufferedChunkReader(tagset),
    	impl_(new XcesReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh))
    {
    	this->is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));
    
    	if (this->is_owned_->bad()) {
    		throw Corpus2Error("File not found!");
    	}
    	else {
    		this->is_ = is_owned_.get();
    	}
    }
    
    XcesReader::~XcesReader()
    {
    }
    
    void XcesReader::ensure_more()
    {
    	static const int BUFSIZE=1024;
    	while (chunk_buf_.empty() && is().good()) {
    		unsigned char buf[BUFSIZE+1];
    		is().read(reinterpret_cast<char*>(buf), BUFSIZE);
    		impl_->parse_chunk_raw(buf, is().gcount());
    		if (is().eof()) {
    			impl_->finish_chunk_parsing();
    		}
    	}
    }
    
    XcesReaderImpl::XcesReaderImpl(const Tagset& tagset,
    		std::deque< boost::shared_ptr<Chunk> >& obuf,
    		bool disamb_only, bool disamb_sh)
    	: XmlReader(tagset, obuf)
    {
    	XmlReader::set_disamb_only(disamb_only);
    	XmlReader::set_disamb_sh(disamb_sh);
    	sentence_tag_name_ = "chunk";
    }
    
    XcesReaderImpl::~XcesReaderImpl()
    {
    }
    
    } /* end ns Corpus2 */