// ilor authored b000d45b
// xcesreader.cpp 4.50 KiB
#include <libcorpus2/io/xcesreader.h>
#include <libcorpus2/io/sax.h>
#include <libpwrutils/foreach.h>
#include <libxml++/libxml++.h>
namespace Corpus2 {
/**
 * SAX event handler used by XcesReader.
 *
 * Element start/end callbacks drive a small state machine (state_t) that
 * assembles a Token, a Sentence and a Chunk in turn. Each finished
 * top-level Chunk is pushed onto the output deque passed to the
 * constructor.
 */
class XcesReaderImpl : public BasicSaxParser
{
public:
    /**
     * @param tagset      tagset used to parse <ctag> contents
     * @param obuf        deque that receives completed chunks
     * @param disamb_only when true, <lex> elements not judged disamb are
     *                    skipped
     * @param disamb_sh   selects which attribute decides disamb status:
     *                    "disamb_sh" when true, "disamb" when false
     */
    XcesReaderImpl(const Tagset& tagset, std::deque<Chunk*>& obuf,
            bool disamb_only, bool disamb_sh);

    /// Deletes whatever token / sentence / chunk is still held.
    ~XcesReaderImpl();

protected:
    /// Callback for an opening tag; advances the state machine.
    void on_start_element(const Glib::ustring & name,
            const AttributeList& attributes);

    /// Callback for a closing tag; commits the finished piece upwards.
    void on_end_element(const Glib::ustring & name);

    /// Tagset used when parsing tag strings.
    const Tagset& tagset_;

    /// Position of the parser within the document structure.
    enum state_t {
        XS_NONE,
        XS_CHUNK,
        XS_SENTENCE,
        XS_TOK,
        XS_ORTH,
        XS_LEX,
        XS_LEMMA,
        XS_TAG
    };

    /// Current state.
    state_t state_;

    /// Whitespace value that will be assigned to the next token.
    PwrNlp::Whitespace::Enum wa_;

    /// String buffer member (not referenced by the methods defined in this file).
    Glib::ustring sbuf_;

    /// Token under construction, or NULL.
    Token* tok_;

    /// Sentence under construction, or NULL.
    Sentence* sent_;

    /// Chunk under construction, or NULL.
    Chunk* chunk_;

    /// Destination for completed chunks.
    std::deque<Chunk*>& obuf_;

    /// Skip lexemes that are not disamb.
    bool disamb_only_;

    /// Use the "disamb_sh" attribute rather than "disamb".
    bool disamb_sh_;
};
/**
 * Creates a reader over the given input stream.
 *
 * The SAX implementation object is constructed with chunk_buf_ as its
 * output deque, so chunks it completes land directly in that buffer.
 */
XcesReader::XcesReader(const Tagset& tagset, std::istream& is,
        bool disamb_only, bool disamb_sh)
    : BufferedChunkReader(tagset),
      is_(is),
      impl_(new XcesReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh))
{
}
/**
 * Destructor; performs no explicit work of its own.
 * NOTE(review): presumably defined out of line so that members are
 * destroyed where XcesReaderImpl is a complete type -- confirm against
 * the header.
 */
XcesReader::~XcesReader()
{
}
/**
 * Pulls raw data from the input stream and feeds it to the SAX parser
 * until the chunk buffer holds at least one chunk or the stream is no
 * longer good(). When a read reaches end of file, the parser is told to
 * finish.
 */
void XcesReader::ensure_more()
{
    static const int BLOCK_LEN = 1024;
    unsigned char block[BLOCK_LEN + 1];

    for (;;) {
        // Stop as soon as there is something to hand out or nothing
        // more can be read.
        if (!chunk_buf_.empty() || !is().good()) {
            break;
        }

        is().read(reinterpret_cast<char*>(block), BLOCK_LEN);
        // Only the bytes actually read by the last call are parsed.
        impl_->parse_chunk_raw(block, is().gcount());

        if (is().eof()) {
            impl_->finish_chunk_parsing();
        }
    }
}
/**
 * Sets up the handler in the XS_NONE state with no token, sentence or
 * chunk in progress. The whitespace value for the first token starts as
 * PwrNlp::Whitespace::Newline.
 */
XcesReaderImpl::XcesReaderImpl(const Tagset& tagset,
        std::deque<Chunk*>& obuf, bool disamb_only, bool disamb_sh)
    : BasicSaxParser(),
      tagset_(tagset),
      state_(XS_NONE),
      wa_(PwrNlp::Whitespace::Newline),
      sbuf_(),
      tok_(NULL),
      sent_(NULL),
      chunk_(NULL),
      obuf_(obuf),
      disamb_only_(disamb_only),
      disamb_sh_(disamb_sh)
{
}
/**
 * Releases the objects still under construction. Each pointer is reset
 * to NULL by on_end_element once its object has been handed on, so only
 * unfinished pieces are deleted here.
 */
XcesReaderImpl::~XcesReaderImpl()
{
    delete tok_;    // token not yet appended to a sentence
    delete sent_;   // sentence not yet appended to a chunk
    delete chunk_;  // chunk not yet pushed to the output buffer
}
/**
 * Opening-tag callback.
 *
 * - <chunk>: opens a top-level chunk (state XS_NONE) or a sentence
 *   (state XS_CHUNK, requires type="s"); throws XcesError otherwise.
 * - <ns>: the next token gets Whitespace::None.
 * - <tok>, <orth>, <lex>, <base>, <ctag>: honoured only in the state
 *   where they are expected; silently ignored elsewhere.
 *
 * @throws XcesError on a misplaced or mistyped <chunk>
 */
void XcesReaderImpl::on_start_element(const Glib::ustring &name,
        const AttributeList& attributes)
{
    if (name == "chunk") {
        // If "type" occurs more than once, the last occurrence wins.
        std::string chunk_type;
        foreach (const Attribute& attr, attributes) {
            if (attr.name == "type") {
                chunk_type = attr.value;
            }
        }
        const bool is_sentence = (chunk_type == "s");

        switch (state_) {
        case XS_NONE: {
            if (is_sentence) {
                throw XcesError("Top level <chunk> is type=\"s\"");
            }
            state_ = XS_CHUNK;
            chunk_ = new Chunk;
            // Copy every XML attribute onto the new chunk.
            foreach (const Attribute& attr, attributes) {
                chunk_->set_attribute(attr.name, attr.value);
            }
            break;
        }
        case XS_CHUNK: {
            if (!is_sentence) {
                throw XcesError("Sub level <chunk> not type=\"s\"");
            }
            state_ = XS_SENTENCE;
            sent_ = new Sentence;
            break;
        }
        default:
            throw XcesError("Unexpected <chunk>");
        }
        return;
    }

    if (name == "ns") {
        // "No space" marker: applies to the next token started.
        wa_ = PwrNlp::Whitespace::None;
        return;
    }

    switch (state_) {
    case XS_SENTENCE: {
        if (name == "tok") {
            state_ = XS_TOK;
            tok_ = new Token();
            tok_->set_wa(wa_);
            // Following tokens default to a space unless <ns> intervenes.
            wa_ = PwrNlp::Whitespace::Space;
        }
        break;
    }
    case XS_TOK: {
        if (name == "orth") {
            state_ = XS_ORTH;
            grab_characters_ = true;
            clear_buf();
        } else if (name == "lex") {
            assert(tok_ != NULL);
            // Default is "not disamb" in plain mode and "disamb" in
            // disamb_sh mode; a matching attribute flips the default.
            bool disamb = disamb_sh_;
            foreach (const Attribute& attr, attributes) {
                if (disamb_sh_) {
                    if (attr.name == "disamb_sh" && attr.value == "0") {
                        disamb = false;
                    }
                } else {
                    if (attr.name == "disamb" && attr.value == "1") {
                        disamb = true;
                    }
                }
            }
            // A skipped <lex> leaves the state at XS_TOK, so its
            // children are ignored as well.
            if (disamb || !disamb_only_) {
                tok_->add_lexeme(Lexeme());
                tok_->lexemes().back().set_disamb(disamb);
                state_ = XS_LEX;
            }
        }
        break;
    }
    case XS_LEX: {
        if (name == "base") {
            state_ = XS_LEMMA;
            grab_characters_ = true;
            clear_buf();
        } else if (name == "ctag") {
            state_ = XS_TAG;
            grab_characters_ = true;
            clear_buf();
        }
        break;
    }
    default:
        break;
    }
}
/**
 * Closing-tag callback.
 *
 * Each state reacts only to the closing tag of the element that opened
 * it: collected text is stored on the token or its newest lexeme, and
 * the finished token, sentence or chunk is passed to its container (the
 * output deque in the case of a top-level chunk). Any other closing tag
 * is ignored.
 */
void XcesReaderImpl::on_end_element(const Glib::ustring &name)
{
    switch (state_) {
    case XS_ORTH:
        if (name == "orth") {
            tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
            grab_characters_ = false;
            state_ = XS_TOK;
        }
        break;

    case XS_LEMMA:
        if (name == "base") {
            tok_->lexemes().back().set_lemma(
                    UnicodeString::fromUTF8(get_buf()));
            grab_characters_ = false;
            state_ = XS_LEX;
        }
        break;

    case XS_TAG:
        if (name == "ctag") {
            Tag parsed = tagset_.parse_simple_tag(get_buf(), true);
            tok_->lexemes().back().set_tag(parsed);
            grab_characters_ = false;
            state_ = XS_LEX;
        }
        break;

    case XS_LEX:
        if (name == "lex") {
            state_ = XS_TOK;
        }
        break;

    case XS_TOK:
        if (name == "tok") {
            // The token pointer is handed to the sentence and cleared
            // here so the destructor will not delete it.
            sent_->append(tok_);
            tok_ = NULL;
            state_ = XS_SENTENCE;
        }
        break;

    case XS_SENTENCE:
        if (name == "chunk") {
            chunk_->append(sent_);
            sent_ = NULL;
            state_ = XS_CHUNK;
        }
        break;

    case XS_CHUNK:
        if (name == "chunk") {
            // Completed top-level chunk becomes available to the reader.
            obuf_.push_back(chunk_);
            chunk_ = NULL;
            state_ = XS_NONE;
        }
        break;

    case XS_NONE:
        break;
    }
}
} /* end ns Corpus2 */