From 49bfae807c57d9ac4fbbda443ac51ebd953a7bd2 Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Mon, 11 Oct 2010 12:24:12 +0200 Subject: [PATCH] Add support for 'chunkless' sentences in xces reader. In case the xces document contains sentences with no chunk containing them, a fake chunk will be created to hold the sentences. Previously this situation threw an exception. --- libcorpus2/io/xcesreader.cpp | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/libcorpus2/io/xcesreader.cpp b/libcorpus2/io/xcesreader.cpp index af32af0..cb7ec43 100644 --- a/libcorpus2/io/xcesreader.cpp +++ b/libcorpus2/io/xcesreader.cpp @@ -24,6 +24,8 @@ protected: XS_LEMMA, XS_TAG }; state_t state_; + bool chunkless_; + PwrNlp::Whitespace::Enum wa_; Glib::ustring sbuf_; @@ -68,7 +70,8 @@ void XcesReader::ensure_more() XcesReaderImpl::XcesReaderImpl(const Tagset& tagset, std::deque<Chunk*>& obuf, bool disamb_only, bool disamb_sh) : BasicSaxParser() - , tagset_(tagset), state_(XS_NONE), wa_(PwrNlp::Whitespace::Newline) + , tagset_(tagset), state_(XS_NONE), chunkless_(false) + , wa_(PwrNlp::Whitespace::Newline) , sbuf_(), tok_(NULL), sent_(NULL), chunk_(NULL), obuf_(obuf) , disamb_only_(disamb_only), disamb_sh_(disamb_sh) { @@ -93,12 +96,17 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name, } if (state_ == XS_NONE) { if (type == "s") { - throw XcesError("Top level <chunk> is type=\"s\""); - } - state_ = XS_CHUNK; - chunk_ = new Chunk; - foreach (const Attribute& a, attributes) { - chunk_->set_attribute(a.name, a.value); + //throw XcesError("Top level <chunk> is type=\"s\""); + state_ = XS_SENTENCE; + chunkless_ = true; + chunk_ = new Chunk; + sent_ = new Sentence; + } else { + chunk_ = new Chunk; + state_ = XS_CHUNK; + foreach (const Attribute& a, attributes) { + chunk_->set_attribute(a.name, a.value); + } } } else if (state_ == XS_CHUNK) { if (type != "s") { @@ -177,7 +185,14 @@ void XcesReaderImpl::on_end_element(const Glib::ustring &name) } else if (state_ == XS_SENTENCE && name == "chunk") { chunk_->append(sent_); sent_ = NULL; - state_ = XS_CHUNK; + if (chunkless_) { + obuf_.push_back(chunk_); + chunk_ = NULL; + state_ = XS_NONE; + chunkless_ = false; + } else { + state_ = XS_CHUNK; + } } else if (state_ == XS_CHUNK && name == "chunk") { obuf_.push_back(chunk_); chunk_ = NULL; -- GitLab