Skip to content
Snippets Groups Projects
Commit 49bfae80 authored by ilor's avatar ilor
Browse files

Add support for 'chunkless' sentences in xces reader.

In case the xces document contains sentences with no chunk containing them, a fake chunk will be created to hold the sentences.
Previously this situation threw an exception.
parent b000d45b
No related branches found
No related tags found
No related merge requests found
......@@ -24,6 +24,8 @@ protected:
XS_LEMMA, XS_TAG };
state_t state_;
bool chunkless_;
PwrNlp::Whitespace::Enum wa_;
Glib::ustring sbuf_;
......@@ -68,7 +70,8 @@ void XcesReader::ensure_more()
XcesReaderImpl::XcesReaderImpl(const Tagset& tagset,
std::deque<Chunk*>& obuf, bool disamb_only, bool disamb_sh)
: BasicSaxParser()
, tagset_(tagset), state_(XS_NONE), wa_(PwrNlp::Whitespace::Newline)
, tagset_(tagset), state_(XS_NONE), chunkless_(false)
, wa_(PwrNlp::Whitespace::Newline)
, sbuf_(), tok_(NULL), sent_(NULL), chunk_(NULL), obuf_(obuf)
, disamb_only_(disamb_only), disamb_sh_(disamb_sh)
{
......@@ -93,13 +96,18 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name,
}
if (state_ == XS_NONE) {
if (type == "s") {
throw XcesError("Top level <chunk> is type=\"s\"");
}
state_ = XS_CHUNK;
//throw XcesError("Top level <chunk> is type=\"s\"");
state_ = XS_SENTENCE;
chunkless_ = true;
chunk_ = new Chunk;
sent_ = new Sentence;
} else {
chunk_ = new Chunk;
state_ = XS_CHUNK;
foreach (const Attribute& a, attributes) {
chunk_->set_attribute(a.name, a.value);
}
}
} else if (state_ == XS_CHUNK) {
if (type != "s") {
throw XcesError("Sub level <chunk> not type=\"s\"");
......@@ -177,7 +185,14 @@ void XcesReaderImpl::on_end_element(const Glib::ustring &name)
} else if (state_ == XS_SENTENCE && name == "chunk") {
chunk_->append(sent_);
sent_ = NULL;
if (chunkless_) {
obuf_.push_back(chunk_);
chunk_ = NULL;
state_ = XS_NONE;
chunkless_ = false;
} else {
state_ = XS_CHUNK;
}
} else if (state_ == XS_CHUNK && name == "chunk") {
obuf_.push_back(chunk_);
chunk_ = NULL;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment