Skip to content
Snippets Groups Projects
Commit 49bfae80 authored by ilor's avatar ilor
Browse files

Add support for 'chunkless' sentences in xces reader.

If the XCES document contains a sentence that is not enclosed in any chunk, a fake chunk is created to hold that sentence.
Previously this situation caused an exception to be thrown.
parent b000d45b
No related branches found
No related tags found
No related merge requests found
...@@ -24,6 +24,8 @@ protected: ...@@ -24,6 +24,8 @@ protected:
XS_LEMMA, XS_TAG }; XS_LEMMA, XS_TAG };
state_t state_; state_t state_;
bool chunkless_;
PwrNlp::Whitespace::Enum wa_; PwrNlp::Whitespace::Enum wa_;
Glib::ustring sbuf_; Glib::ustring sbuf_;
...@@ -68,7 +70,8 @@ void XcesReader::ensure_more() ...@@ -68,7 +70,8 @@ void XcesReader::ensure_more()
XcesReaderImpl::XcesReaderImpl(const Tagset& tagset, XcesReaderImpl::XcesReaderImpl(const Tagset& tagset,
std::deque<Chunk*>& obuf, bool disamb_only, bool disamb_sh) std::deque<Chunk*>& obuf, bool disamb_only, bool disamb_sh)
: BasicSaxParser() : BasicSaxParser()
, tagset_(tagset), state_(XS_NONE), wa_(PwrNlp::Whitespace::Newline) , tagset_(tagset), state_(XS_NONE), chunkless_(false)
, wa_(PwrNlp::Whitespace::Newline)
, sbuf_(), tok_(NULL), sent_(NULL), chunk_(NULL), obuf_(obuf) , sbuf_(), tok_(NULL), sent_(NULL), chunk_(NULL), obuf_(obuf)
, disamb_only_(disamb_only), disamb_sh_(disamb_sh) , disamb_only_(disamb_only), disamb_sh_(disamb_sh)
{ {
...@@ -93,13 +96,18 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name, ...@@ -93,13 +96,18 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name,
} }
if (state_ == XS_NONE) { if (state_ == XS_NONE) {
if (type == "s") { if (type == "s") {
throw XcesError("Top level <chunk> is type=\"s\""); //throw XcesError("Top level <chunk> is type=\"s\"");
} state_ = XS_SENTENCE;
state_ = XS_CHUNK; chunkless_ = true;
chunk_ = new Chunk;
sent_ = new Sentence;
} else {
chunk_ = new Chunk; chunk_ = new Chunk;
state_ = XS_CHUNK;
foreach (const Attribute& a, attributes) { foreach (const Attribute& a, attributes) {
chunk_->set_attribute(a.name, a.value); chunk_->set_attribute(a.name, a.value);
} }
}
} else if (state_ == XS_CHUNK) { } else if (state_ == XS_CHUNK) {
if (type != "s") { if (type != "s") {
throw XcesError("Sub level <chunk> not type=\"s\""); throw XcesError("Sub level <chunk> not type=\"s\"");
...@@ -177,7 +185,14 @@ void XcesReaderImpl::on_end_element(const Glib::ustring &name) ...@@ -177,7 +185,14 @@ void XcesReaderImpl::on_end_element(const Glib::ustring &name)
} else if (state_ == XS_SENTENCE && name == "chunk") { } else if (state_ == XS_SENTENCE && name == "chunk") {
chunk_->append(sent_); chunk_->append(sent_);
sent_ = NULL; sent_ = NULL;
if (chunkless_) {
obuf_.push_back(chunk_);
chunk_ = NULL;
state_ = XS_NONE;
chunkless_ = false;
} else {
state_ = XS_CHUNK; state_ = XS_CHUNK;
}
} else if (state_ == XS_CHUNK && name == "chunk") { } else if (state_ == XS_CHUNK && name == "chunk") {
obuf_.push_back(chunk_); obuf_.push_back(chunk_);
chunk_ = NULL; chunk_ = NULL;
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment