From 0e3d0347054199e56a7da6aa315f978ce4ee1544 Mon Sep 17 00:00:00 2001 From: ilor <ilor@bauer.(none)> Date: Mon, 24 Jan 2011 11:00:44 +0100 Subject: [PATCH] rescue out-of-chunk tokens in xcesreader, log such cases to stderr --- libcorpus2/io/xcesreader.cpp | 47 +++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/libcorpus2/io/xcesreader.cpp b/libcorpus2/io/xcesreader.cpp index d0e18af..a0861e2 100644 --- a/libcorpus2/io/xcesreader.cpp +++ b/libcorpus2/io/xcesreader.cpp @@ -18,6 +18,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <libcorpus2/io/sax.h> #include <libpwrutils/foreach.h> #include <libxml++/libxml++.h> +#include <libxml2/libxml/parser.h> #include <boost/make_shared.hpp> #include <fstream> @@ -37,6 +38,8 @@ protected: const AttributeList& attributes); void on_end_element(const Glib::ustring & name); + void finish_sentence(); + const Tagset& tagset_; enum state_t { XS_NONE, XS_CHUNK, XS_SENTENCE, XS_TOK, XS_ORTH, XS_LEX, @@ -45,6 +48,8 @@ protected: bool chunkless_; + bool out_of_chunk_; + PwrNlp::Whitespace::Enum wa_; Glib::ustring sbuf_; @@ -105,7 +110,7 @@ XcesReaderImpl::XcesReaderImpl(const Tagset& tagset, std::deque< boost::shared_ptr<Chunk> >& obuf, bool disamb_only, bool disamb_sh) : BasicSaxParser() - , tagset_(tagset), state_(XS_NONE), chunkless_(false) + , tagset_(tagset), state_(XS_NONE), chunkless_(false), out_of_chunk_(false) , wa_(PwrNlp::Whitespace::Newline) , sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf) , disamb_only_(disamb_only), disamb_sh_(disamb_sh) @@ -127,6 +132,10 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name, type = a.value; } } + if (out_of_chunk_) { + finish_sentence(); + out_of_chunk_ = false; + } if (state_ == XS_NONE) { if (type == "s") { //throw XcesError("Top level <chunk> is type=\"s\""); @@ -191,6 +200,31 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name, clear_buf(); } else if (name == "ns") { wa_ = PwrNlp::Whitespace::None; + 
} else if (name == "tok" && state_ == XS_NONE) { /* A <tok> met while state_ is XS_NONE is not dropped: warn on stderr, then open a fresh Chunk and Sentence and start reading the token. */ + std::cerr << "Warning: out-of-chunk token, assuming sentence start on line "; + std::cerr << this->context_->input->line << "\n"; + chunkless_ = true; /* makes finish_sentence() push the synthetic chunk to obuf_ */ + out_of_chunk_ = true; /* tested in on_start_element, which then calls finish_sentence() and clears the flag */ + chunk_ = boost::make_shared<Chunk>(); + sent_ = boost::make_shared<Sentence>(); + state_ = XS_TOK; + tok_ = new Token(); + tok_->set_wa(wa_); + wa_ = PwrNlp::Whitespace::Space; + } +} + +void XcesReaderImpl::finish_sentence() /* Closes the current sentence: appends sent_ to chunk_ and resets sent_. If chunkless_ is set, the chunk is also pushed to obuf_ and reset, state_ returns to XS_NONE and chunkless_ is cleared; otherwise state_ becomes XS_CHUNK. */ +{ + chunk_->append(sent_); + sent_.reset(); + if (chunkless_) { + obuf_.push_back(chunk_); + chunk_.reset(); + state_ = XS_NONE; + chunkless_ = false; + } else { + state_ = XS_CHUNK; } } @@ -216,16 +250,7 @@ void XcesReaderImpl::on_end_element(const Glib::ustring &name) tok_ = NULL; state_ = XS_SENTENCE; } else if (state_ == XS_SENTENCE && name == "chunk") { - chunk_->append(sent_); - sent_.reset(); - if (chunkless_) { - obuf_.push_back(chunk_); - chunk_.reset(); - state_ = XS_NONE; - chunkless_ = false; - } else { - state_ = XS_CHUNK; - } + finish_sentence(); } else if (state_ == XS_CHUNK && name == "chunk") { obuf_.push_back(chunk_); chunk_.reset(); -- GitLab