fastxces.cpp

#include <libcorpus2/io/fastxces.h>
#include <boost/make_shared.hpp>
#include <boost/regex.hpp>
#include <cstring>
#include <fstream>
#include <vector>

namespace Corpus2 {

// Static-initialisation hook: registers FastXcesReader with the TokenReader
// registry under the name "xces-fast" and stores the returned flag.
// The second argument is passed through verbatim; presumably it is the
// comma-separated list of option names advertised for this reader --
// TODO confirm against TokenReader::register_reader.
bool FastXcesReader::registered = TokenReader::register_reader<FastXcesReader>(
	"xces-fast","ign,loose,strict");

class FastXcesReaderImpl
{
public:
	FastXcesReaderImpl(const TokenReader& base_reader,
		std::deque< boost::shared_ptr<Chunk> >& obuf)
		: base_reader_(base_reader), obuf_(obuf)
	{
		tok_ = new Token();
		sent_ = boost::make_shared<Sentence>();
		chunk_ = boost::make_shared<Chunk>();
	}

	~FastXcesReaderImpl()
	{
		delete tok_;
	}

	void parse_stream(std::istream& is)
	{
		boost::regex parsing_regex = boost::regex(
				"(<ns\\>)|"
				"(?:<chunk[^>]*\\<type=[\"']?([a-zA-Z]*)[\"']?[^>]*>)|"
				"(?:<orth>\\s*(.*?)\\s*</orth>)|"
				"(?:<lex\\>([^>]*\\<disamb=[\"']?1[\"']?)?[^>]*>\\s*"
				"<base>\\s*(.*?)\\s*</base>\\s*<ctag>\\s*(.*?)\\s*</ctag>"
				"\\s*</lex>)|"
				"(</tok>)");
		enum {
			MATCH_NS = 1,
			MATCH_CHUNK_START = 2,
			MATCH_ORTH = 3,
			MATCH_DISAMB = 4,
			MATCH_BASE = 5,
			MATCH_CTAG = 6,
			MATCH_ETOK = 7
		};
		// This code is heavily based on example from Boost.Regex
		// (http://www.boost.org/doc/libs/1_41_0/libs/regex/doc/html/boost_regex/partial_matches.html)
		char buf[4096];
		const char* next_pos = buf + sizeof(buf);
		while (!is.eof()) {
			std::streamsize leftover = (buf + sizeof(buf)) - next_pos;
			std::streamsize size = next_pos - buf;
			memcpy(buf, next_pos, leftover);
			is.read(buf + leftover, size);
			std::streamsize read = is.gcount();
			next_pos = buf + sizeof(buf);

			boost::cregex_iterator i(buf, buf + read + leftover, parsing_regex,
					boost::match_default | boost::match_partial);
			boost::cregex_iterator end;
			for (; i != end; ++i) {
				if ((*i)[0].matched == false) {
					// Partial match, save position and break:
					next_pos = (*i)[0].first;
					break;
				}
				if ((*i)[MATCH_ORTH].matched) {
					orth(i->str(MATCH_ORTH));
				} else if ((*i)[MATCH_CTAG].matched) {
					ctag(i->str(MATCH_BASE),
						i->str(MATCH_CTAG),
						(*i)[MATCH_DISAMB].matched);
				} else if ((*i)[MATCH_ETOK].matched) {
					token_end();
				} else if ((*i)[MATCH_NS].matched) {
					no_space();
				} else if ((*i)[MATCH_CHUNK_START].matched) {
					chunk_start(i->str(MATCH_CHUNK_START));
				}
			}
		}

		// Finally close all remaining chunks.
		finish_all();
	}
private:
	const TokenReader& base_reader_;

	/// Token being constructed
	Token* tok_;

	/// Sentence being constructed
	Sentence::Ptr sent_;

	/// Chunk being constructed
	boost::shared_ptr<Chunk> chunk_;

	/// Output chunk buffer
	std::deque< boost::shared_ptr<Chunk> >& obuf_;

	void chunk_start(const std::string& type) {
		if (type == "s") {
			if (!sent_->empty()) {
				chunk_->append(sent_);
			}
			sent_ = base_reader_.make_sentence();
			tok_->set_wa(PwrNlp::Whitespace::Newline);
		} else {
			if (!chunk_->empty()) {
				obuf_.push_back(chunk_);
				chunk_ = boost::make_shared<Chunk>();
			}
			return;
		}
	}

	void finish_all() {
		if (!sent_->empty()) {
			chunk_->append(sent_);
			sent_ = boost::make_shared<Sentence>();
		}
		if (!chunk_->empty()) {
			obuf_.push_back(chunk_);
			chunk_ = boost::make_shared<Chunk>();
		}
	}

	void no_space() {
		tok_->set_wa(PwrNlp::Whitespace::None);
	}

	void orth(const std::string& orth) {
		tok_->set_orth(UnicodeString::fromUTF8(orth));
	}

	void ctag(const std::string& base, const std::string& ctag, bool disamb) {
		Tag tag = base_reader_.parse_tag(ctag);
		Lexeme lex(UnicodeString::fromUTF8(base), tag);
		lex.set_disamb(disamb);
		tok_->add_lexeme(lex);
	}

	void token_end() {
		sent_->append(tok_);
		tok_ = new Token();
		tok_->set_wa(PwrNlp::Whitespace::Space);
	}
};

/**
 * Creates a reader over a caller-supplied stream.
 *
 * Only the address of @p is is stored, so the stream has to stay alive
 * for as long as this reader is used.
 *
 * @param tagset tagset forwarded to BufferedChunkReader
 * @param is     input stream to read from
 */
FastXcesReader::FastXcesReader(const Tagset &tagset, std::istream &is)
	: BufferedChunkReader(tagset)
	, impl_(new FastXcesReaderImpl(*this, chunk_buf_))
{
	is_ = &is;
}

// Destructor with no explicit work of its own.
// NOTE(review): presumably defined out of line so that members such as impl_
// are destroyed in this translation unit, where FastXcesReaderImpl is a
// complete type -- confirm against the header.
FastXcesReader::~FastXcesReader()
{
}


// Accepts any option string and discards it: the parameter is unnamed and
// the body is empty, so no option changes this reader's behaviour here.
// NOTE(review): get_option forwards to BufferedChunkReader while this
// function does not forward anything -- confirm that the asymmetry is
// intentional.
void FastXcesReader::set_option(const std::string& /*option*/)
{
}

/**
 * Looks up an option value; this reader adds nothing of its own and simply
 * defers to BufferedChunkReader.
 *
 * @param option option name to query
 * @return whatever BufferedChunkReader::get_option returns for @p option
 */
std::string FastXcesReader::get_option(const std::string& option) const
{
	std::string value(BufferedChunkReader::get_option(option));
	return value;
}

/**
 * Creates a reader over a file that is opened and owned by the reader.
 *
 * @param tagset   tagset forwarded to BufferedChunkReader
 * @param filename path of the file to open for reading
 * @throws Corpus2Error when the freshly opened stream is not in a good state
 */
FastXcesReader::FastXcesReader(const Tagset &tagset, const std::string &filename)
	: BufferedChunkReader(tagset)
	, impl_(new FastXcesReaderImpl(*this, chunk_buf_))
{
	is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));

	// Bail out before publishing the stream if it could not be opened.
	if (!is_owned_->good()) {
		throw Corpus2Error("File not found!");
	}

	// Read through the stream we own.
	is_ = is_owned_.get();
}

/**
 * Refills the chunk buffer by handing the input stream to the parser.
 * Does nothing when the stream is no longer in a good state (for example
 * after a previous call has already consumed it).
 */
void FastXcesReader::ensure_more()
{
	if (!is_->good()) {
		return;
	}
	impl_->parse_stream(*is_);
}


} /* end ns Corpus2 */