Skip to content
Snippets Groups Projects
Commit 49bfae80 authored by ilor's avatar ilor
Browse files

Add support for 'chunkless' sentences in the XCES reader.

If an XCES document contains a sentence that is not enclosed in any chunk, a placeholder chunk is now created to hold that sentence.
Previously this situation caused an XcesError to be thrown.
parent b000d45b
Branches
No related merge requests found
...@@ -24,6 +24,8 @@ protected: ...@@ -24,6 +24,8 @@ protected:
XS_LEMMA, XS_TAG }; XS_LEMMA, XS_TAG };
state_t state_; state_t state_;
bool chunkless_;
PwrNlp::Whitespace::Enum wa_; PwrNlp::Whitespace::Enum wa_;
Glib::ustring sbuf_; Glib::ustring sbuf_;
...@@ -68,7 +70,8 @@ void XcesReader::ensure_more() ...@@ -68,7 +70,8 @@ void XcesReader::ensure_more()
XcesReaderImpl::XcesReaderImpl(const Tagset& tagset, XcesReaderImpl::XcesReaderImpl(const Tagset& tagset,
std::deque<Chunk*>& obuf, bool disamb_only, bool disamb_sh) std::deque<Chunk*>& obuf, bool disamb_only, bool disamb_sh)
: BasicSaxParser() : BasicSaxParser()
, tagset_(tagset), state_(XS_NONE), wa_(PwrNlp::Whitespace::Newline) , tagset_(tagset), state_(XS_NONE), chunkless_(false)
, wa_(PwrNlp::Whitespace::Newline)
, sbuf_(), tok_(NULL), sent_(NULL), chunk_(NULL), obuf_(obuf) , sbuf_(), tok_(NULL), sent_(NULL), chunk_(NULL), obuf_(obuf)
, disamb_only_(disamb_only), disamb_sh_(disamb_sh) , disamb_only_(disamb_only), disamb_sh_(disamb_sh)
{ {
...@@ -93,12 +96,17 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name, ...@@ -93,12 +96,17 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name,
} }
if (state_ == XS_NONE) { if (state_ == XS_NONE) {
if (type == "s") { if (type == "s") {
throw XcesError("Top level <chunk> is type=\"s\""); //throw XcesError("Top level <chunk> is type=\"s\"");
} state_ = XS_SENTENCE;
state_ = XS_CHUNK; chunkless_ = true;
chunk_ = new Chunk; chunk_ = new Chunk;
foreach (const Attribute& a, attributes) { sent_ = new Sentence;
chunk_->set_attribute(a.name, a.value); } else {
chunk_ = new Chunk;
state_ = XS_CHUNK;
foreach (const Attribute& a, attributes) {
chunk_->set_attribute(a.name, a.value);
}
} }
} else if (state_ == XS_CHUNK) { } else if (state_ == XS_CHUNK) {
if (type != "s") { if (type != "s") {
...@@ -177,7 +185,14 @@ void XcesReaderImpl::on_end_element(const Glib::ustring &name) ...@@ -177,7 +185,14 @@ void XcesReaderImpl::on_end_element(const Glib::ustring &name)
} else if (state_ == XS_SENTENCE && name == "chunk") { } else if (state_ == XS_SENTENCE && name == "chunk") {
chunk_->append(sent_); chunk_->append(sent_);
sent_ = NULL; sent_ = NULL;
state_ = XS_CHUNK; if (chunkless_) {
obuf_.push_back(chunk_);
chunk_ = NULL;
state_ = XS_NONE;
chunkless_ = false;
} else {
state_ = XS_CHUNK;
}
} else if (state_ == XS_CHUNK && name == "chunk") { } else if (state_ == XS_CHUNK && name == "chunk") {
obuf_.push_back(chunk_); obuf_.push_back(chunk_);
chunk_ = NULL; chunk_ = NULL;
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment