// xmlreader.cpp
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libcorpus2 project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#include <libcorpus2/io/xmlreader.h>
#include <libpwrutils/foreach.h>
#include <libxml++/libxml++.h>
#include <libxml2/libxml/parser.h>
#include <boost/make_shared.hpp>
#include <fstream>
namespace Corpus2 {
/**
 * Create a reader in its initial state: outside of any chunk
 * (STATE_NONE), with no token, sentence or chunk under construction.
 *
 * @param base_reader reader used by this class to create sentences
 *                    (make_sentence) and to parse tag strings (parse_tag)
 * @param obuf        buffer used to initialise obuf_, onto which finished
 *                    chunks are pushed
 *
 * The initializers below are kept in their original order; the actual
 * initialization order is determined by the class declaration.
 */
XmlReader::XmlReader(const TokenReader& base_reader,
        std::deque< boost::shared_ptr<Chunk> >& obuf)
    : BasicSaxParser()
    , base_reader_(base_reader)
    , state_(STATE_NONE)
    , chunkless_(false)
    , out_of_chunk_(false)
    , wa_(PwrNlp::Whitespace::Newline)
    , sbuf_()
    , tok_(NULL)
    , sent_()
    , chunk_()
    , obuf_(obuf)
    , disamb_only_(false)
    , disamb_sh_(false)
    , warn_on_inconsistent_(true)
    , warn_on_unexpected_(true)
{
}
/**
 * Destroy the reader, deleting the token under construction, if any.
 * tok_ is NULL unless a token has been started and not yet appended to
 * a sentence (finish_token() clears the pointer).
 */
XmlReader::~XmlReader()
{
    delete tok_;
}
/**
 * Look up the value of the "type" attribute.
 *
 * @param attributes attribute list of an element
 * @return value of the "type" attribute (the last one, should it be listed
 *         more than once), or an empty string if there is no such attribute
 */
std::string XmlReader::get_type_from_attributes(const AttributeList& attributes) const
{
    std::string found;
    foreach (const Attribute& attr, attributes) {
        if (!(attr.name == "type")) {
            continue;
        }
        // no early exit: a later "type" overrides an earlier one
        found = attr.value;
    }
    return found;
}
/**
 * SAX callback invoked for every opening tag.
 *
 * Dispatches on the current parser state (state_) and the tag name. The
 * branches are tried in order and the first matching one wins, so their
 * order is significant. Tags that match no branch are reported on
 * std::cerr when warn_on_unexpected_ is set and are otherwise ignored.
 *
 * @param name       name of the element being opened
 * @param attributes attributes of that element
 */
void XmlReader::on_start_element(const Glib::ustring &name,
const AttributeList& attributes)
{
//std::cerr << name << state_ << "\n";
if (state_ == STATE_NONE && name == "chunk") {
// top-level chunk (may itself turn out to be a sentence, see start_chunk)
start_chunk(attributes);
} else if (state_ == STATE_CHUNK && name == sentence_tag_name_) {
start_sentence(attributes);
} else if (state_ == STATE_SENTENCE && name == "tok") {
start_token(attributes);
} else if (state_ == STATE_TOK && name == "orth") {
// orth text: empty the buffer and set grab_characters_; the text is
// read back with get_buf() when the closing tag arrives
state_ = STATE_ORTH;
grab_characters_ = true;
clear_buf();
} else if (state_ == STATE_TOK && name == "lex") {
start_lexeme(attributes);
} else if (state_ == STATE_LEX && name == "base") {
// lemma text, collected the same way as orth
state_ = STATE_LEMMA;
grab_characters_ = true;
clear_buf();
} else if (state_ == STATE_LEX && name == "ctag") {
// tag text, collected the same way as orth
state_ = STATE_TAG;
grab_characters_ = true;
clear_buf();
} else if (state_ == STATE_LEX_SKIP &&
(name == "lex" || name == "base" || name == "ctag")) {
// inside a skipped lexeme: its contents are silently ignored
//nop
} else if (name == "ns") {
// no-space marker, accepted in any state not handled above: the next
// token started will get Whitespace::None (see start_token)
wa_ = PwrNlp::Whitespace::None;
} else if (state_ == STATE_NONE && name == "tok") {
// token outside of any chunk: open an implicit chunk and an implicit
// sentence (type="s") around it, then start the token as usual
if (warn_on_inconsistent_) {
std::cerr << "Warning: out-of-chunk token, assuming sentence start on line ";
std::cerr << this->context_->input->line << "\n";
}
AttributeList fake;
// no attributes, so start_chunk creates a plain chunk (STATE_CHUNK)
start_chunk(fake);
fake.push_back(Attribute("type", "s"));
start_sentence(fake);
// chunkless_ makes finish_sentence() emit the implicit chunk as well;
// out_of_chunk_ lets a following <chunk> close the implicit sentence
chunkless_ = true;
out_of_chunk_ = true;
start_token(attributes);
} else if (state_ == STATE_NONE && name == "cesAna") {
// known wrapper element, nothing to do
//nop
} else if (state_ == STATE_NONE && name == "chunkList") {
// known wrapper element, nothing to do
//nop
} else if (process_start_element(name, attributes)) {
// extension hook handled the tag (the implementation in this file
// always returns false)
//nop
} else if (out_of_chunk_ && state_ == STATE_SENTENCE && name == "chunk") {
// a real chunk begins while the implicit sentence is still open:
// close the implicit sentence first, then start the chunk
finish_sentence();
out_of_chunk_ = false;
start_chunk(attributes);
} else if (warn_on_unexpected_) {
std::cerr << "Unexpected tag <" << name << "> on line ";
std::cerr << this->context_->input->line << " (" << state_ << ")\n";
}
}
/**
 * Extension hook consulted by on_start_element() for opening tags that
 * none of its earlier branches handled.
 *
 * This implementation handles nothing.
 *
 * @return true if the tag was handled; always false here
 */
bool XmlReader::process_start_element(const Glib::ustring& /*name*/,
        const AttributeList& /*attributes*/)
{
    return false;
}
/**
 * Extension hook called by on_end_element() for closing tags it does not
 * handle itself.
 *
 * This implementation handles nothing.
 *
 * @return true if the tag was handled; always false here
 */
bool XmlReader::process_end_element(const Glib::ustring& /*name*/)
{
    return false;
}
void XmlReader::start_chunk(const AttributeList& attributes)
{
std::string type = get_type_from_attributes(attributes);
chunk_ = boost::make_shared<Chunk>();
if (type == "s") {
// top-level chunk is a sentence
start_sentence(attributes);
chunkless_ = true;
} else {
foreach (const Attribute& a, attributes) {
chunk_->set_attribute(a.name, a.value);
}
state_ = STATE_CHUNK;
}
}
/**
 * Begin a new sentence and enter STATE_SENTENCE.
 *
 * The sentence object is obtained from base_reader_.make_sentence().
 *
 * @param attributes attributes of the opening element; "type" must be "s"
 * @throws XcesError if the "type" attribute is missing or is not "s"
 */
void XmlReader::start_sentence(const AttributeList &attributes)
{
    const bool marked_as_sentence =
            (get_type_from_attributes(attributes) == "s");
    if (!marked_as_sentence) {
        throw XcesError("Sub level <chunk> not type=\"s\"");
    }
    sent_ = base_reader_.make_sentence();
    state_ = STATE_SENTENCE;
}
/**
 * Begin a new token and enter STATE_TOK.
 *
 * The token receives the whitespace value currently held in wa_; wa_ is
 * then reset to Whitespace::Space for the token after it (an <ns> tag
 * seen in the meantime changes it to Whitespace::None).
 *
 * The attributes of the opening element are not used.
 */
void XmlReader::start_token(const AttributeList& /*attributes*/)
{
    state_ = STATE_TOK;
    tok_ = new Token();
    tok_->set_wa(wa_);
    wa_ = PwrNlp::Whitespace::Space;
}
/**
 * Begin a lexeme of the current token.
 *
 * The disamb flag is decided as follows:
 *  - when disamb_sh_ is off, the lexeme is disamb only if it carries
 *    disamb="1";
 *  - when disamb_sh_ is on, the lexeme is disamb unless it carries
 *    disamb_sh="0".
 *
 * If disamb_only_ is set and the lexeme is not disamb, no lexeme is added
 * and the parser enters STATE_LEX_SKIP. Otherwise an empty lexeme with
 * the computed flag is appended to the token and the parser enters
 * STATE_LEX.
 *
 * @param attributes attributes of the opening element
 */
void XmlReader::start_lexeme(const AttributeList &attributes)
{
    assert(tok_ != NULL);

    // the default equals the mode flag: disamb by default only in the
    // disamb_sh mode
    bool is_disamb = disamb_sh_;
    foreach (const Attribute& attr, attributes) {
        if (disamb_sh_) {
            if (attr.name == "disamb_sh" && attr.value == "0") {
                is_disamb = false;
            }
        } else if (attr.name == "disamb" && attr.value == "1") {
            is_disamb = true;
        }
    }

    if (disamb_only_ && !is_disamb) {
        state_ = STATE_LEX_SKIP;
        return;
    }

    tok_->add_lexeme(Lexeme());
    tok_->lexemes().back().set_disamb(is_disamb);
    state_ = STATE_LEX;
}
/**
 * Close the current chunk: push it onto the output buffer, drop the
 * reader's own handle to it and return to STATE_NONE.
 */
void XmlReader::finish_chunk()
{
    assert(chunk_);
    obuf_.push_back(chunk_);
    chunk_.reset();
    state_ = STATE_NONE;
}
/**
 * Close the current sentence and append it to the current chunk.
 *
 * In the regular case the parser goes back to STATE_CHUNK. When
 * chunkless_ is set (the sentence was its own top-level chunk, or was
 * opened implicitly for an out-of-chunk token) the chunk is emitted to the
 * output buffer as well, chunkless_ is cleared and the parser returns to
 * STATE_NONE.
 */
void XmlReader::finish_sentence()
{
    assert(chunk_);
    chunk_->append(sent_);
    sent_.reset();

    if (!chunkless_) {
        state_ = STATE_CHUNK;
        return;
    }

    // no enclosing chunk element will close this chunk, so emit it now
    obuf_.push_back(chunk_);
    chunk_.reset();
    state_ = STATE_NONE;
    chunkless_ = false;
}
/**
 * Close the current token: append it to the current sentence and go back
 * to STATE_SENTENCE.
 *
 * tok_ is cleared afterwards, so the destructor will not delete a token
 * that has been handed to the sentence.
 */
void XmlReader::finish_token()
{
    assert(sent_);
    sent_->append(tok_);
    tok_ = NULL;
    state_ = STATE_SENTENCE;
}
/**
 * SAX callback invoked for every closing tag.
 *
 * Each state reacts only to the closing tag it expects; the cases are
 * tried in order and the first match is the only one executed. Anything
 * else is passed to process_end_element(), whose result is ignored.
 *
 * @param name name of the element being closed
 */
void XmlReader::on_end_element(const Glib::ustring &name)
{
    //std::cerr << "/" << name << state_ << "\n";

    // </orth>: the collected text becomes the token's orth
    if (state_ == STATE_ORTH && name == "orth") {
        tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
        grab_characters_ = false;
        state_ = STATE_TOK;
        return;
    }

    // </base>: the collected text becomes the lemma of the newest lexeme
    if (state_ == STATE_LEMMA && name == "base") {
        tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
        grab_characters_ = false;
        state_ = STATE_LEX;
        return;
    }

    // </ctag>: the collected text is parsed into the tag of the newest lexeme
    if (state_ == STATE_TAG && name == "ctag") {
        Tag parsed = base_reader_.parse_tag(get_buf());
        tok_->lexemes().back().set_tag(parsed);
        grab_characters_ = false;
        state_ = STATE_LEX;
        return;
    }

    // </lex>: ends both a regular and a skipped lexeme
    const bool inside_lexeme =
            (state_ == STATE_LEX || state_ == STATE_LEX_SKIP);
    if (inside_lexeme && name == "lex") {
        state_ = STATE_TOK;
        return;
    }

    if (state_ == STATE_TOK && name == "tok") {
        finish_token();
        return;
    }

    if (state_ == STATE_SENTENCE && name == sentence_tag_name_) {
        finish_sentence();
        return;
    }

    if (state_ == STATE_CHUNK && name == "chunk") {
        finish_chunk();
        return;
    }

    // not handled here: offer the tag to the extension hook
    process_end_element(name);
}
} /* end ns Corpus2 */