Skip to content
Snippets Groups Projects
Select Git revision
  • 90e4ea15ffaac89643678278c5e3ac57a8eeae9e
  • master default protected
  • develop protected
  • feat_remove_attr
  • python2.7
  • python3.8
6 results

fastxces.cpp

Blame
  • fastxces.cpp 4.52 KiB
    #include <libcorpus2/io/fastxces.h>
    #include <boost/make_shared.hpp>
    #include <boost/regex.hpp>
    #include <cstring>
    #include <fstream>
    #include <vector>
    
    namespace Corpus2 {
    
    // Registers FastXcesReader with TokenReader when this translation unit's
    // statics are initialised. The two strings are presumably the reader's
    // class id and its comma-separated option names -- confirm against
    // TokenReader::register_reader.
    bool FastXcesReader::registered =
    	TokenReader::register_reader<FastXcesReader>(
    		"xces-fast",
    		"ign,loose,strict");
    
    class FastXcesReaderImpl
    {
    public:
    	FastXcesReaderImpl(const TokenReader& base_reader,
    		std::deque< boost::shared_ptr<Chunk> >& obuf)
    		: base_reader_(base_reader), obuf_(obuf)
    	{
    		tok_ = new Token();
    		sent_ = boost::make_shared<Sentence>();
    		chunk_ = boost::make_shared<Chunk>();
    	}
    
    	~FastXcesReaderImpl()
    	{
    		delete tok_;
    	}
    
    	void parse_stream(std::istream& is)
    	{
    		boost::regex parsing_regex = boost::regex(
    				"(<ns\\>)|"
    				"(?:<chunk[^>]*\\<type=[\"']?([a-zA-Z]*)[\"']?[^>]*>)|"
    				"(?:<orth>\\s*(.*?)\\s*</orth>)|"
    				"(?:<lex\\>([^>]*\\<disamb=[\"']?1[\"']?)?[^>]*>\\s*"
    				"<base>\\s*(.*?)\\s*</base>\\s*<ctag>\\s*(.*?)\\s*</ctag>"
    				"\\s*</lex>)|"
    				"(</tok>)");
    		enum {
    			MATCH_NS = 1,
    			MATCH_CHUNK_START = 2,
    			MATCH_ORTH = 3,
    			MATCH_DISAMB = 4,
    			MATCH_BASE = 5,
    			MATCH_CTAG = 6,
    			MATCH_ETOK = 7
    		};
    		// This code is heavily based on example from Boost.Regex
    		// (http://www.boost.org/doc/libs/1_41_0/libs/regex/doc/html/boost_regex/partial_matches.html)
    		char buf[4096];
    		const char* next_pos = buf + sizeof(buf);
    		while (!is.eof()) {
    			std::streamsize leftover = (buf + sizeof(buf)) - next_pos;
    			std::streamsize size = next_pos - buf;
    			memcpy(buf, next_pos, leftover);
    			is.read(buf + leftover, size);
    			std::streamsize read = is.gcount();
    			next_pos = buf + sizeof(buf);
    
    			boost::cregex_iterator i(buf, buf + read + leftover, parsing_regex,
    					boost::match_default | boost::match_partial);
    			boost::cregex_iterator end;
    			for (; i != end; ++i) {
    				if ((*i)[0].matched == false) {
    					// Partial match, save position and break:
    					next_pos = (*i)[0].first;
    					break;
    				}
    				if ((*i)[MATCH_ORTH].matched) {
    					orth(i->str(MATCH_ORTH));
    				} else if ((*i)[MATCH_CTAG].matched) {
    					ctag(i->str(MATCH_BASE),
    						i->str(MATCH_CTAG),
    						(*i)[MATCH_DISAMB].matched);
    				} else if ((*i)[MATCH_ETOK].matched) {
    					token_end();
    				} else if ((*i)[MATCH_NS].matched) {
    					no_space();
    				} else if ((*i)[MATCH_CHUNK_START].matched) {
    					chunk_start(i->str(MATCH_CHUNK_START));
    				}
    			}
    		}
    
    		// Finally close all remaining chunks.
    		finish_all();
    	}
    private:
    	const TokenReader& base_reader_;
    
    	/// Token being constructed
    	Token* tok_;
    
    	/// Sentence being constructed
    	Sentence::Ptr sent_;
    
    	/// Chunk being constructed
    	boost::shared_ptr<Chunk> chunk_;
    
    	/// Output chunk buffer
    	std::deque< boost::shared_ptr<Chunk> >& obuf_;
    
    	void chunk_start(const std::string& type) {
    		if (type == "s") {
    			if (!sent_->empty()) {
    				chunk_->append(sent_);
    			}
    			sent_ = base_reader_.make_sentence();
    			tok_->set_wa(PwrNlp::Whitespace::Newline);
    		} else {
    			if (!chunk_->empty()) {
    				obuf_.push_back(chunk_);
    				chunk_ = boost::make_shared<Chunk>();
    			}
    			return;
    		}
    	}
    
    	void finish_all() {
    		if (!sent_->empty()) {
    			chunk_->append(sent_);
    			sent_ = boost::make_shared<Sentence>();
    		}
    		if (!chunk_->empty()) {
    			obuf_.push_back(chunk_);
    			chunk_ = boost::make_shared<Chunk>();
    		}
    	}
    
    	void no_space() {
    		tok_->set_wa(PwrNlp::Whitespace::None);
    	}
    
    	void orth(const std::string& orth) {
    		tok_->set_orth(UnicodeString::fromUTF8(orth));
    	}
    
    	void ctag(const std::string& base, const std::string& ctag, bool disamb) {
    		Tag tag = base_reader_.parse_tag(ctag);
    		Lexeme lex(UnicodeString::fromUTF8(base), tag);
    		lex.set_disamb(disamb);
    		tok_->add_lexeme(lex);
    	}
    
    	void token_end() {
    		sent_->append(tok_);
    		tok_ = new Token();
    		tok_->set_wa(PwrNlp::Whitespace::Space);
    	}
    };
    
    /**
     * Builds a reader on top of a stream supplied by the caller.
     *
     * Only the address of the stream is stored, so the stream has to stay
     * alive for as long as the reader may read from it.
     *
     * @param tagset tagset forwarded to BufferedChunkReader
     * @param is     input stream to parse
     */
    FastXcesReader::FastXcesReader(const Tagset &tagset, std::istream &is)
    	: BufferedChunkReader(tagset)
    	, impl_(new FastXcesReaderImpl(*this, chunk_buf_))
    {
    	is_ = &is;
    }
    
    /// Nothing to release by hand; members clean up after themselves.
    /// NOTE(review): presumably defined out of line so that impl_ is destroyed
    /// where FastXcesReaderImpl is a complete type -- confirm against the header.
    FastXcesReader::~FastXcesReader() {}
    
    
    /// Accepts any option string and does nothing with it: this reader has no
    /// configurable behaviour of its own.
    void FastXcesReader::set_option(const std::string&)
    {
    	// Intentionally empty.
    }
    
    /// Looks the option up in the base class; this reader adds none of its own.
    std::string FastXcesReader::get_option(const std::string& option) const
    {
    	std::string value = BufferedChunkReader::get_option(option);
    	return value;
    }
    
    /**
     * Builds a reader that opens and owns its input file.
     *
     * @param tagset   tagset forwarded to BufferedChunkReader
     * @param filename path of the file to open for reading
     * @throws Corpus2Error when the freshly opened stream is not in a good state
     */
    FastXcesReader::FastXcesReader(const Tagset &tagset, const std::string &filename)
    	: BufferedChunkReader(tagset)
    	, impl_(new FastXcesReaderImpl(*this, chunk_buf_))
    {
    	is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));
    
    	// Bail out before publishing the stream pointer if the open failed.
    	if (!is_owned_->good()) {
    		throw Corpus2Error("File not found!");
    	}
    
    	is_ = is_owned_.get();
    }
    
    /// Runs the scanner over the input stream, provided the stream is still in a
    /// good state; otherwise does nothing.
    void FastXcesReader::ensure_more()
    {
    	if (!is_->good()) {
    		return;
    	}
    	impl_->parse_stream(*is_);
    }
    
    
    
    } /* end ns Corpus2 */