From 0e3d0347054199e56a7da6aa315f978ce4ee1544 Mon Sep 17 00:00:00 2001
From: ilor <ilor@bauer.(none)>
Date: Mon, 24 Jan 2011 11:00:44 +0100
Subject: [PATCH] rescue out-of-chunk tokens in xcesreader, log such cases to
 stderr

---
 libcorpus2/io/xcesreader.cpp | 47 +++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/libcorpus2/io/xcesreader.cpp b/libcorpus2/io/xcesreader.cpp
index d0e18af..a0861e2 100644
--- a/libcorpus2/io/xcesreader.cpp
+++ b/libcorpus2/io/xcesreader.cpp
@@ -18,6 +18,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 #include <libcorpus2/io/sax.h>
 #include <libpwrutils/foreach.h>
 #include <libxml++/libxml++.h>
+#include <libxml2/libxml/parser.h>
 #include <boost/make_shared.hpp>
 #include <fstream>
 
@@ -37,6 +38,8 @@ protected:
 			const AttributeList& attributes);
 	void on_end_element(const Glib::ustring & name);
 
+	void finish_sentence();
+
 	const Tagset& tagset_;
 
 	enum state_t { XS_NONE, XS_CHUNK, XS_SENTENCE, XS_TOK, XS_ORTH, XS_LEX,
@@ -45,6 +48,8 @@ protected:
 
 	bool chunkless_;
 
+	bool out_of_chunk_;
+
 	PwrNlp::Whitespace::Enum wa_;
 
 	Glib::ustring sbuf_;
@@ -105,7 +110,7 @@ XcesReaderImpl::XcesReaderImpl(const Tagset& tagset,
 		std::deque< boost::shared_ptr<Chunk> >& obuf,
 		bool disamb_only, bool disamb_sh)
 	: BasicSaxParser()
-	, tagset_(tagset), state_(XS_NONE), chunkless_(false)
+	, tagset_(tagset), state_(XS_NONE), chunkless_(false), out_of_chunk_(false)
 	, wa_(PwrNlp::Whitespace::Newline)
 	, sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf)
 	, disamb_only_(disamb_only), disamb_sh_(disamb_sh)
@@ -127,6 +132,10 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name,
 				type = a.value;
 			}
 		}
+		if (out_of_chunk_) {
+			finish_sentence();
+			out_of_chunk_ = false;
+		}
 		if (state_ == XS_NONE) {
 			if (type == "s") {
 				//throw XcesError("Top level <chunk> is type=\"s\"");
@@ -191,6 +200,31 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name,
 		clear_buf();
 	} else if (name == "ns") {
 		wa_ = PwrNlp::Whitespace::None;
+	} else if (name == "tok" && state_ == XS_NONE) {
+		std::cerr << "Warning: out-of-chunk token, assuming sentence start on line ";
+		std::cerr << this->context_->input->line << "\n";
+		chunkless_ = true;
+		out_of_chunk_ = true;
+		chunk_ = boost::make_shared<Chunk>();
+		sent_ = boost::make_shared<Sentence>();
+		state_ = XS_TOK;
+		tok_ = new Token();
+		tok_->set_wa(wa_);
+		wa_ = PwrNlp::Whitespace::Space;
+	}
+}
+
+void XcesReaderImpl::finish_sentence() // Close the current sentence: attach sent_ to chunk_ and update the parser state.
+{
+	chunk_->append(sent_); // hand the accumulated sentence over to the current chunk
+	sent_.reset(); // release this reader's reference to the sentence
+	if (chunkless_) { // set e.g. when a <tok> was seen outside any chunk and chunk_ was created on the fly
+		obuf_.push_back(chunk_); // presumably no enclosing chunk end tag will flush chunk_, so queue it for output now
+		chunk_.reset(); // release this reader's reference to the queued chunk
+		state_ = XS_NONE; // back to the state the parser is constructed in
+		chunkless_ = false; // the implicit chunk has been emitted; clear the flag
+	} else {
+		state_ = XS_CHUNK; // chunk_ stays open; on_end_element queues it when "chunk" ends in XS_CHUNK
 	}
 }
 
@@ -216,16 +250,7 @@ void XcesReaderImpl::on_end_element(const Glib::ustring &name)
 		tok_ = NULL;
 		state_ = XS_SENTENCE;
 	} else if (state_ == XS_SENTENCE && name == "chunk") {
-		chunk_->append(sent_);
-		sent_.reset();
-		if (chunkless_) {
-			obuf_.push_back(chunk_);
-			chunk_.reset();
-			state_ = XS_NONE;
-			chunkless_ = false;
-		} else {
-			state_ = XS_CHUNK;
-		}
+		finish_sentence();
 	} else if (state_ == XS_CHUNK && name == "chunk") {
 		obuf_.push_back(chunk_);
 		chunk_.reset();
-- 
GitLab