Skip to content
Snippets Groups Projects
Commit c03647e8 authored by ilor's avatar ilor
Browse files

part 1 of the xml readers refactoring -- extract xmlreader from xces and ccl readers

parent 3061dc24
Branches
No related merge requests found
......@@ -68,6 +68,7 @@ SET(libcorpus2_STAT_SRC
io/xcesreader.cpp
io/xcesvalidate.cpp
io/xceswriter.cpp
io/xmlreader.cpp
util/settings.cpp
util/symboldictionary.cpp
util/tokentimer.cpp
......
......@@ -15,7 +15,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
*/
#include <libcorpus2/io/cclreader.h>
#include <libcorpus2/io/sax.h>
#include <libcorpus2/io/xmlreader.h>
#include <libpwrutils/foreach.h>
#include <libxml++/libxml++.h>
#include <libxml2/libxml/parser.h>
......@@ -26,7 +26,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
namespace Corpus2 {
class CclReaderImpl : public BasicSaxParser
class CclReaderImpl : public XmlReader
{
public:
CclReaderImpl(const Tagset& tagset,
......@@ -36,29 +36,23 @@ public:
~CclReaderImpl();
protected:
void on_start_element(const Glib::ustring & name,
const AttributeList& attributes);
void on_end_element(const Glib::ustring & name);
bool process_start_element(const Glib::ustring & name,
const AttributeList& attributes);
void finish_sentence();
bool process_end_element(const Glib::ustring& name);
const Tagset& tagset_;
void start_chunk(const AttributeList &attributes);
enum state_t { XS_NONE, XS_CHUNK, XS_SENTENCE, XS_TOK, XS_ANN, XS_ORTH, XS_LEX,
XS_LEMMA, XS_TAG, XS_REL };
state_t state_;
void start_sentence(const AttributeList &attributes);
bool chunkless_;
void start_token(const AttributeList &attributes);
bool out_of_chunk_;
void finish_token();
PwrNlp::Whitespace::Enum wa_;
static const int STATE_ANN = 901;
static const int STATE_REL = 902;
Glib::ustring sbuf_;
Token* tok_;
boost::shared_ptr<AnnotatedSentence> sent_;
boost::shared_ptr<AnnotatedSentence> ann_sent_;
std::string ann_chan_;
......@@ -69,14 +63,6 @@ protected:
token_ann_t token_anns_;
std::set<std::string> token_ann_heads_;
boost::shared_ptr<Chunk> chunk_;
std::deque< boost::shared_ptr<Chunk> >& obuf_;
bool disamb_only_;
bool disamb_sh_;
};
CclReader::CclReader(const Tagset& tagset, std::istream& is,
......@@ -121,58 +107,53 @@ void CclReader::ensure_more()
CclReaderImpl::CclReaderImpl(const Tagset& tagset,
std::deque< boost::shared_ptr<Chunk> >& obuf,
bool disamb_only, bool disamb_sh)
: BasicSaxParser()
, tagset_(tagset), state_(XS_NONE), chunkless_(false), out_of_chunk_(false)
, wa_(PwrNlp::Whitespace::Newline)
, sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf)
, disamb_only_(disamb_only), disamb_sh_(disamb_sh)
: XmlReader(tagset, obuf)
{
XmlReader::set_disamb_only(disamb_only);
XmlReader::set_disamb_sh(disamb_sh);
sentence_tag_name_ = "sentence";
}
CclReaderImpl::~CclReaderImpl()
{
delete tok_;
}
void CclReaderImpl::on_start_element(const Glib::ustring &name,
const AttributeList& attributes)
void CclReaderImpl::start_chunk(const AttributeList& attributes)
{
if (name == "chunk") {
std::string type;
foreach (const Attribute& a, attributes) {
if (a.name == "type") {
type = a.value;
}
}
if (type == "s") {
throw XcesError("Trying to parse XCES as CCL (<chunk type=\"s\">)");
} else if (state_ == XS_NONE) {
chunk_ = boost::make_shared<Chunk>();
state_ = XS_CHUNK;
foreach (const Attribute& a, attributes) {
chunk_->set_attribute(a.name, a.value);
}
} else if (state_ == XS_CHUNK) {
throw XcesError("Nested <chunk>");
} else {
throw XcesError("Unexpected <chunk>");
}
} else if (state_ == XS_CHUNK && name == "sentence") {
state_ = XS_SENTENCE;
sent_ = boost::make_shared<AnnotatedSentence>();
} else if (state_ == XS_SENTENCE && name == "tok") {
state_ = XS_TOK;
tok_ = new Token();
tok_->set_wa(wa_);
wa_ = PwrNlp::Whitespace::Space;
token_anns_.clear();
token_ann_heads_.clear();
} else if (state_ == XS_TOK && name == "orth") {
state_ = XS_ORTH;
grab_characters_ = true;
clear_buf();
} else if (state_ == XS_TOK && name == "ann") {
state_ = XS_ANN;
chunk_ = boost::make_shared<Chunk>();
std::string type = get_type_from_attributes(attributes);
if (type == "s") {
throw XcesError("Trying to parse XCES as CCL (<chunk type=\"s\">)");
}
foreach (const Attribute& a, attributes) {
chunk_->set_attribute(a.name, a.value);
}
state_ = STATE_CHUNK;
std::cerr << "Chunk";
}
void CclReaderImpl::start_sentence(const AttributeList& /*attributes*/)
{
ann_sent_ = boost::make_shared<AnnotatedSentence>();
sent_ = ann_sent_;
state_ = STATE_SENTENCE;
}
void CclReaderImpl::start_token(const AttributeList& attributes)
{
XmlReader::start_token(attributes);
token_anns_.clear();
token_ann_heads_.clear();
}
bool CclReaderImpl::process_start_element(const Glib::ustring & name,
const AttributeList& attributes)
{
if (state_ == STATE_TOK && name == "ann") {
state_ = STATE_ANN;
grab_characters_ = true;
clear_buf();
ann_chan_ = "";
......@@ -187,102 +168,40 @@ void CclReaderImpl::on_start_element(const Glib::ustring &name,
if (ann_chan_.empty()) {
throw XcesError("<ann> with no channel name");
}
} else if (state_ == XS_TOK && name == "lex") {
assert(tok_ != NULL);
bool is_disamb = false;
foreach (const Attribute& a, attributes) {
if (a.name == "disamb" && a.value == "1") {
is_disamb = true;
}
}
if (!disamb_only_ || is_disamb) {
tok_->add_lexeme(Lexeme());
tok_->lexemes().back().set_disamb(is_disamb);
state_ = XS_LEX;
}
} else if (state_ == XS_LEX && name == "base") {
state_ = XS_LEMMA;
grab_characters_ = true;
clear_buf();
} else if (state_ == XS_LEX && name == "ctag") {
state_ = XS_TAG;
grab_characters_ = true;
clear_buf();
} else if (name == "ns") {
wa_ = PwrNlp::Whitespace::None;
} else if (name == "tok" && state_ == XS_NONE) {
std::cerr << "Warning: out-of-chunk token, assuming sentence start on line ";
std::cerr << this->context_->input->line << "\n";
chunkless_ = true;
out_of_chunk_ = true;
chunk_ = boost::make_shared<Chunk>();
sent_ = boost::make_shared<AnnotatedSentence>();
state_ = XS_TOK;
tok_ = new Token();
tok_->set_wa(wa_);
wa_ = PwrNlp::Whitespace::Space;
}
}
void CclReaderImpl::finish_sentence()
{
chunk_->append(sent_);
sent_.reset();
if (chunkless_) {
obuf_.push_back(chunk_);
chunk_.reset();
state_ = XS_NONE;
chunkless_ = false;
return true;
} else {
state_ = XS_CHUNK;
return false;
}
}
void CclReaderImpl::on_end_element(const Glib::ustring &name)
bool CclReaderImpl::process_end_element(const Glib::ustring & name)
{
if (state_ == XS_ORTH && name == "orth") {
tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
grab_characters_ = false;
state_ = XS_TOK;
} else if (state_ == XS_ANN && name == "ann") {
if (state_ == STATE_ANN && name == "ann") {
std::string buf = get_buf();
grab_characters_ = false;
int segid = atoi(buf.c_str());
if (!sent_->has_channel(ann_chan_)) {
sent_->create_channel(ann_chan_);
if (!ann_sent_->has_channel(ann_chan_)) {
ann_sent_->create_channel(ann_chan_);
}
if (segid > 0) {
token_anns_.insert(std::make_pair(ann_chan_, segid));
token_ann_heads_.insert(ann_chan_);
}
state_ = XS_TOK;
} else if (state_ == XS_LEMMA && name == "base") {
tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
grab_characters_ = false;
state_ = XS_LEX;
} else if (state_ == XS_TAG && name == "ctag") {
Tag tag = tagset_.parse_simple_tag(get_buf(), true);
tok_->lexemes().back().set_tag(tag);
grab_characters_ = false;
state_ = XS_LEX;
} else if (state_ == XS_LEX && name == "lex") {
state_ = XS_TOK;
} else if (state_ == XS_TOK && name == "tok") {
sent_->append(tok_);
tok_ = NULL;
state_ = XS_SENTENCE;
foreach (const token_ann_t::value_type& v, token_anns_) {
sent_->get_channel(v.first).set_segment_at(sent_->size() - 1, v.second);
if (token_ann_heads_.find(v.first) != token_ann_heads_.end()) {
sent_->get_channel(v.first).set_head_at(sent_->size() - 1, true);
}
state_ = STATE_TOK;
return true;
} else {
return false;
}
}
void CclReaderImpl::finish_token()
{
XmlReader::finish_token();
foreach (const token_ann_t::value_type& v, token_anns_) {
ann_sent_->get_channel(v.first).set_segment_at(sent_->size() - 1, v.second);
if (token_ann_heads_.find(v.first) != token_ann_heads_.end()) {
ann_sent_->get_channel(v.first).set_head_at(sent_->size() - 1, true);
}
} else if (state_ == XS_SENTENCE && name == "sentence") {
finish_sentence();
} else if (state_ == XS_CHUNK && name == "chunk") {
obuf_.push_back(chunk_);
chunk_.reset();
state_ = XS_NONE;
}
}
......
......@@ -15,7 +15,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
*/
#include <libcorpus2/io/xcesreader.h>
#include <libcorpus2/io/sax.h>
#include <libcorpus2/io/xmlreader.h>
#include <libpwrutils/foreach.h>
#include <libxml++/libxml++.h>
#include <libxml2/libxml/parser.h>
......@@ -24,7 +24,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
namespace Corpus2 {
class XcesReaderImpl : public BasicSaxParser
class XcesReaderImpl : public XmlReader
{
public:
XcesReaderImpl(const Tagset& tagset,
......@@ -34,37 +34,6 @@ public:
~XcesReaderImpl();
protected:
void on_start_element(const Glib::ustring & name,
const AttributeList& attributes);
void on_end_element(const Glib::ustring & name);
void finish_sentence();
const Tagset& tagset_;
enum state_t { XS_NONE, XS_CHUNK, XS_SENTENCE, XS_TOK, XS_ORTH, XS_LEX,
XS_LEMMA, XS_TAG };
state_t state_;
bool chunkless_;
bool out_of_chunk_;
PwrNlp::Whitespace::Enum wa_;
Glib::ustring sbuf_;
Token* tok_;
Sentence::Ptr sent_;
boost::shared_ptr<Chunk> chunk_;
std::deque< boost::shared_ptr<Chunk> >& obuf_;
bool disamb_only_;
bool disamb_sh_;
};
XcesReader::XcesReader(const Tagset& tagset, std::istream& is,
......@@ -109,153 +78,15 @@ void XcesReader::ensure_more()
XcesReaderImpl::XcesReaderImpl(const Tagset& tagset,
std::deque< boost::shared_ptr<Chunk> >& obuf,
bool disamb_only, bool disamb_sh)
: BasicSaxParser()
, tagset_(tagset), state_(XS_NONE), chunkless_(false), out_of_chunk_(false)
, wa_(PwrNlp::Whitespace::Newline)
, sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf)
, disamb_only_(disamb_only), disamb_sh_(disamb_sh)
: XmlReader(tagset, obuf)
{
XmlReader::set_disamb_only(disamb_only);
XmlReader::set_disamb_sh(disamb_sh);
sentence_tag_name_ = "chunk";
}
XcesReaderImpl::~XcesReaderImpl()
{
delete tok_;
}
void XcesReaderImpl::on_start_element(const Glib::ustring &name,
const AttributeList& attributes)
{
if (name == "chunk") {
std::string type;
foreach (const Attribute& a, attributes) {
if (a.name == "type") {
type = a.value;
}
}
if (out_of_chunk_) {
finish_sentence();
out_of_chunk_ = false;
}
if (state_ == XS_NONE) {
if (type == "s") {
//throw XcesError("Top level <chunk> is type=\"s\"");
state_ = XS_SENTENCE;
chunkless_ = true;
chunk_ = boost::make_shared<Chunk>();
sent_ = boost::make_shared<Sentence>();
} else {
chunk_ = boost::make_shared<Chunk>();
state_ = XS_CHUNK;
foreach (const Attribute& a, attributes) {
chunk_->set_attribute(a.name, a.value);
}
}
} else if (state_ == XS_CHUNK) {
if (type != "s") {
throw XcesError("Sub level <chunk> not type=\"s\"");
}
state_ = XS_SENTENCE;
sent_ = boost::make_shared<Sentence>();
} else {
throw XcesError("Unexpected <chunk>");
}
} else if (state_ == XS_SENTENCE && name == "tok") {
state_ = XS_TOK;
tok_ = new Token();
tok_->set_wa(wa_);
wa_ = PwrNlp::Whitespace::Space;
} else if (state_ == XS_TOK && name == "orth") {
state_ = XS_ORTH;
grab_characters_ = true;
clear_buf();
} else if (state_ == XS_TOK && name == "lex") {
assert(tok_ != NULL);
bool is_disamb = false;
if (!disamb_sh_) {
foreach (const Attribute& a, attributes) {
if (a.name == "disamb" && a.value == "1") {
is_disamb = true;
}
}
} else {
is_disamb = true;
foreach (const Attribute& a, attributes) {
if (a.name == "disamb_sh" && a.value == "0") {
is_disamb = false;
}
}
}
if (!disamb_only_ || is_disamb) {
tok_->add_lexeme(Lexeme());
tok_->lexemes().back().set_disamb(is_disamb);
state_ = XS_LEX;
}
} else if (state_ == XS_LEX && name == "base") {
state_ = XS_LEMMA;
grab_characters_ = true;
clear_buf();
} else if (state_ == XS_LEX && name == "ctag") {
state_ = XS_TAG;
grab_characters_ = true;
clear_buf();
} else if (name == "ns") {
wa_ = PwrNlp::Whitespace::None;
} else if (name == "tok" && state_ == XS_NONE) {
std::cerr << "Warning: out-of-chunk token, assuming sentence start on line ";
std::cerr << this->context_->input->line << "\n";
chunkless_ = true;
out_of_chunk_ = true;
chunk_ = boost::make_shared<Chunk>();
sent_ = boost::make_shared<Sentence>();
state_ = XS_TOK;
tok_ = new Token();
tok_->set_wa(wa_);
wa_ = PwrNlp::Whitespace::Space;
}
}
void XcesReaderImpl::finish_sentence()
{
chunk_->append(sent_);
sent_.reset();
if (chunkless_) {
obuf_.push_back(chunk_);
chunk_.reset();
state_ = XS_NONE;
chunkless_ = false;
} else {
state_ = XS_CHUNK;
}
}
void XcesReaderImpl::on_end_element(const Glib::ustring &name)
{
if (state_ == XS_ORTH && name == "orth") {
tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
grab_characters_ = false;
state_ = XS_TOK;
} else if (state_ == XS_LEMMA && name == "base") {
tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
grab_characters_ = false;
state_ = XS_LEX;
} else if (state_ == XS_TAG && name == "ctag") {
Tag tag = tagset_.parse_simple_tag(get_buf(), true);
tok_->lexemes().back().set_tag(tag);
grab_characters_ = false;
state_ = XS_LEX;
} else if (state_ == XS_LEX && name == "lex") {
state_ = XS_TOK;
} else if (state_ == XS_TOK && name == "tok") {
sent_->append(tok_);
tok_ = NULL;
state_ = XS_SENTENCE;
} else if (state_ == XS_SENTENCE && name == "chunk") {
finish_sentence();
} else if (state_ == XS_CHUNK && name == "chunk") {
obuf_.push_back(chunk_);
chunk_.reset();
state_ = XS_NONE;
}
}
} /* end ns Corpus2 */
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libcorpus2 project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#include <libcorpus2/io/xmlreader.h>
#include <libpwrutils/foreach.h>
#include <libxml++/libxml++.h>
#include <libxml2/libxml/parser.h>
#include <boost/make_shared.hpp>
#include <fstream>
namespace Corpus2 {
/**
 * Create a reader that parses tags with the given tagset and appends
 * finished chunks to obuf.
 *
 * The reader starts in STATE_NONE with no token, sentence or chunk under
 * construction. The whitespace for the first token is Newline. Both disamb
 * flags start off and both warning flags start on. sentence_tag_name_ is
 * left empty here, so it must be assigned before a sentence tag can be
 * recognised.
 *
 * @param tagset tagset used to parse ctag contents
 * @param obuf   output buffer; only a reference is stored
 */
XmlReader::XmlReader(const Tagset& tagset,
std::deque< boost::shared_ptr<Chunk> >& obuf)
: BasicSaxParser()
, tagset_(tagset), state_(STATE_NONE)
, chunkless_(false), out_of_chunk_(false)
, wa_(PwrNlp::Whitespace::Newline)
, sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf)
, disamb_only_(false), disamb_sh_(false)
, warn_on_inconsistent_(true), warn_on_unexpected_(true)
{
}
/**
 * Delete the token under construction, if any. tok_ is a raw pointer that
 * is set to NULL once finish_token() has passed the token to the sentence,
 * so a non-NULL value here means parsing stopped in the middle of a token.
 */
XmlReader::~XmlReader()
{
delete tok_;
}
std::string XmlReader::get_type_from_attributes(const AttributeList& attributes) const
{
std::string type;
foreach (const Attribute& a, attributes) {
if (a.name == "type") {
type = a.value;
}
}
return type;
}
/**
 * SAX callback for an opening tag; drives the reader state machine.
 *
 * Structural tags (chunk, sentence tag, tok, lex) are dispatched to the
 * virtual start_* hooks. Text-bearing tags (orth, base, ctag) switch on
 * character grabbing with a cleared buffer. <ns> marks the next token as
 * having no preceding whitespace. Tags not handled here are offered to
 * process_start_element(); if that declines and warn_on_unexpected_ is
 * set, a warning is written to stderr.
 *
 * @param name       tag name
 * @param attributes attributes of the tag
 */
void XmlReader::on_start_element(const Glib::ustring &name,
		const AttributeList& attributes)
{
	if (state_ == STATE_NONE && name == "chunk") {
		start_chunk(attributes);
	} else if (state_ == STATE_CHUNK && name == sentence_tag_name_) {
		start_sentence(attributes);
	} else if (state_ == STATE_SENTENCE && name == "tok") {
		start_token(attributes);
	} else if (state_ == STATE_TOK && name == "orth") {
		state_ = STATE_ORTH;
		grab_characters_ = true;
		clear_buf();
	} else if (state_ == STATE_TOK && name == "lex") {
		start_lexeme(attributes);
	} else if (state_ == STATE_LEX && name == "base") {
		state_ = STATE_LEMMA;
		grab_characters_ = true;
		clear_buf();
	} else if (state_ == STATE_LEX && name == "ctag") {
		state_ = STATE_TAG;
		grab_characters_ = true;
		clear_buf();
	} else if (name == "ns") {
		wa_ = PwrNlp::Whitespace::None;
	} else if (state_ == STATE_NONE && name == "tok") {
		if (warn_on_inconsistent_) {
			std::cerr << "Warning: out-of-chunk token, assuming sentence start on line ";
			std::cerr << this->context_->input->line << "\n";
		}
		// Synthesize the chunk and sentence that should have enclosed this
		// token. The chunk gets no attributes; the sentence gets type="s"
		// because the default start_sentence() rejects anything else.
		AttributeList no_attributes;
		start_chunk(no_attributes);
		AttributeList sentence_attributes;
		sentence_attributes.push_back(Attribute("type", "s"));
		start_sentence(sentence_attributes);
		// Raise the flags only after start_chunk(): the default
		// start_chunk() reacts to out_of_chunk_ by finishing a sentence,
		// and there is no chunk or sentence to finish yet at this point.
		chunkless_ = true;
		out_of_chunk_ = true;
		start_token(attributes);
		// NOTE(review): a <chunk> arriving after out-of-chunk tokens finds
		// the reader in STATE_SENTENCE and is not routed to start_chunk();
		// confirm whether that case needs dedicated handling.
	} else if (state_ == STATE_NONE && name == "cesAna") {
		//nop
	} else if (state_ == STATE_NONE && name == "chunkList") {
		//nop
	} else if (process_start_element(name, attributes)) {
		//nop
	} else if (warn_on_unexpected_) {
		std::cerr << "Unexpected tag <" << name << "> on line ";
		std::cerr << this->context_->input->line << " (" << state_ << ")\n";
	}
}
/**
 * Hook for opening tags that on_start_element() does not handle itself.
 * The default implementation handles nothing.
 *
 * @return true if the tag was consumed; false makes on_start_element()
 *         treat the tag as unexpected
 */
bool XmlReader::process_start_element(const Glib::ustring &/*name*/,
const AttributeList &/*attributes*/)
{
return false;
}
/**
 * Hook for closing tags that on_end_element() does not handle itself.
 * The default implementation handles nothing and returns false; the
 * return value is currently ignored by on_end_element().
 */
bool XmlReader::process_end_element(const Glib::ustring & /*name*/)
{
return false;
}
void XmlReader::start_chunk(const AttributeList& attributes)
{
if (out_of_chunk_) {
finish_sentence();
out_of_chunk_ = false;
}
std::string type = get_type_from_attributes(attributes);
chunk_ = boost::make_shared<Chunk>();
if (type == "s") {
// top-level chunk is a sentence
start_sentence(attributes);
chunkless_ = true;
} else {
foreach (const Attribute& a, attributes) {
chunk_->set_attribute(a.name, a.value);
}
state_ = STATE_CHUNK;
}
}
void XmlReader::start_sentence(const AttributeList &attributes)
{
std::string type = get_type_from_attributes(attributes);
if (type != "s") {
throw XcesError("Sub level <chunk> not type=\"s\"");
}
sent_ = boost::make_shared<Corpus2::Sentence>();
state_ = STATE_SENTENCE;
}
/**
 * Begin a new token and move to STATE_TOK.
 *
 * The token receives the whitespace recorded in wa_; wa_ is then reset to
 * Space so the following token defaults to a single preceding space
 * unless an <ns> tag changes it in between. The attributes are unused.
 */
void XmlReader::start_token(const AttributeList &/*attributes*/)
{
state_ = STATE_TOK;
tok_ = new Token();
tok_->set_wa(wa_);
wa_ = PwrNlp::Whitespace::Space;
}
void XmlReader::start_lexeme(const AttributeList &attributes)
{
assert(tok_ != NULL);
bool is_disamb = false;
if (!disamb_sh_) {
foreach (const Attribute& a, attributes) {
if (a.name == "disamb" && a.value == "1") {
is_disamb = true;
}
}
} else {
is_disamb = true;
foreach (const Attribute& a, attributes) {
if (a.name == "disamb_sh" && a.value == "0") {
is_disamb = false;
}
}
}
if (!disamb_only_ || is_disamb) {
tok_->add_lexeme(Lexeme());
tok_->lexemes().back().set_disamb(is_disamb);
state_ = STATE_LEX;
}
}
/**
 * Close the current chunk: append it to the output buffer, drop the
 * reader's reference to it and return to STATE_NONE.
 */
void XmlReader::finish_chunk()
{
	assert(chunk_);
	obuf_.push_back(chunk_);
	chunk_.reset();
	state_ = STATE_NONE;
}
/**
 * Close the current sentence and append it to the current chunk.
 *
 * Inside a regular chunk the reader returns to STATE_CHUNK. In chunkless
 * mode the chunk holding the sentence is pushed to the output buffer
 * straight away, the chunkless flag is cleared and the reader returns to
 * STATE_NONE.
 */
void XmlReader::finish_sentence()
{
	assert(chunk_);
	chunk_->append(sent_);
	sent_.reset();

	if (!chunkless_) {
		state_ = STATE_CHUNK;
		return;
	}

	// no enclosing chunk tag will close this chunk, so emit it here
	obuf_.push_back(chunk_);
	chunk_.reset();
	state_ = STATE_NONE;
	chunkless_ = false;
}
/**
 * Close the current token: append it to the sentence under construction,
 * clear tok_ so the destructor will not delete it, and return to
 * STATE_SENTENCE.
 */
void XmlReader::finish_token()
{
assert(sent_);
sent_->append(tok_);
tok_ = NULL;
state_ = STATE_SENTENCE;
}
/**
 * SAX callback for a closing tag; drives the reader state machine.
 *
 * Closing orth, base and ctag store the grabbed character data on the
 * current token or its last lexeme and stop character grabbing. Closing
 * lex, tok, the sentence tag and chunk step the state back out, using the
 * virtual finish_* hooks for tok, sentence and chunk. Any other tag/state
 * combination is offered to process_end_element().
 *
 * @param name tag name
 */
void XmlReader::on_end_element(const Glib::ustring &name)
{
	if (state_ == STATE_ORTH && name == "orth") {
		tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
		grab_characters_ = false;
		state_ = STATE_TOK;
	} else if (state_ == STATE_LEMMA && name == "base") {
		tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
		grab_characters_ = false;
		state_ = STATE_LEX;
	} else if (state_ == STATE_TAG && name == "ctag") {
		Tag tag = tagset_.parse_simple_tag(get_buf(), true);
		tok_->lexemes().back().set_tag(tag);
		grab_characters_ = false;
		state_ = STATE_LEX;
	} else if (state_ == STATE_LEX && name == "lex") {
		state_ = STATE_TOK;
	} else if (state_ == STATE_TOK && name == "tok") {
		finish_token();
	} else if (state_ == STATE_SENTENCE && name == sentence_tag_name_) {
		finish_sentence();
	} else if (state_ == STATE_CHUNK && name == "chunk") {
		finish_chunk();
	} else {
		// Unknown closing tags are not reported, so the hook's result is
		// deliberately unused.
		process_end_element(name);
	}
}
} /* end ns Corpus2 */
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libcorpus2 project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#ifndef LIBCORPUS2_IO_XMLREADER_H
#define LIBCORPUS2_IO_XMLREADER_H
#include <libcorpus2/io/reader.h>
#include <libcorpus2/io/xces.h>
#include <libcorpus2/chunk.h>
#include <deque>
#include <boost/scoped_ptr.hpp>
#include <libcorpus2/io/sax.h>
#include <libpwrutils/foreach.h>
namespace Corpus2 {
/**
 * Common SAX-based state machine for reading chunk / sentence / token /
 * lexeme XML into Chunk objects, which are appended to an output buffer
 * supplied by the caller.
 *
 * Derived readers customise behaviour by setting sentence_tag_name_ and
 * by overriding the virtual start_*, finish_* and process_* hooks.
 */
class XmlReader : public BasicSaxParser
{
public:
/// Create a reader using the given tagset and writing chunks to obuf;
/// obuf is stored by reference
XmlReader(const Tagset& tagset,
std::deque< boost::shared_ptr<Chunk> >& obuf);
/// Deletes the token under construction, if any
virtual ~XmlReader();
/// When set, lexemes that are not marked disamb are skipped
void set_disamb_only(bool v) {
disamb_only_ = v;
}
/// When set, disamb status is read from the disamb_sh attribute
/// instead of the disamb attribute
void set_disamb_sh(bool v) {
disamb_sh_ = v;
}
protected:
/// Value of the "type" attribute, or an empty string if there is none
std::string get_type_from_attributes(const AttributeList& attributes) const;
/// SAX callback for an opening tag
void on_start_element(const Glib::ustring & name,
const AttributeList& attributes);
/// SAX callback for a closing tag
void on_end_element(const Glib::ustring & name);
/// Hook for opening tags not handled by on_start_element();
/// return true if the tag was consumed
virtual bool process_start_element(const Glib::ustring & name,
const AttributeList& attributes);
/// Hook for closing tags not handled by on_end_element()
virtual bool process_end_element(const Glib::ustring & name);
/// Begin a chunk from a top-level chunk tag
virtual void start_chunk(const AttributeList& attributes);
/// Begin a sentence
virtual void start_sentence(const AttributeList& attributes);
/// Begin a token
virtual void start_token(const AttributeList& attributes);
/// Begin a lexeme within the current token
void start_lexeme(const AttributeList& attributes);
/// Push the current chunk to the output buffer
virtual void finish_chunk();
/// Append the current sentence to the current chunk
virtual void finish_sentence();
/// Append the current token to the current sentence
virtual void finish_token();
/// Tagset used to parse ctag contents
const Tagset& tagset_;
// Parser states. These are plain int constants and state_ is an int.
// NOTE(review): presumably so that derived readers can add their own
// state values -- confirm against the subclasses.
static const int STATE_NONE = 0;
static const int STATE_CHUNK = 1;
static const int STATE_SENTENCE = 2;
static const int STATE_TOK = 3;
static const int STATE_ORTH = 4;
static const int STATE_LEX = 5;
static const int STATE_LEMMA = 6;
static const int STATE_TAG = 7;
/// Current parser state, one of the STATE_* values
int state_;
/// Set when the current chunk was synthesised around a sentence rather
/// than opened by its own chunk tag; finish_sentence() then emits it
bool chunkless_;
/// Set when a token was encountered outside of any chunk
bool out_of_chunk_;
/// Whitespace for the next token
PwrNlp::Whitespace::Enum wa_;
/// Character data buffer
Glib::ustring sbuf_;
/// Token being constructed
Token* tok_;
/// Sentence being constructed
Sentence::Ptr sent_;
/// Chunk being constructed
boost::shared_ptr<Chunk> chunk_;
/// Output chunk buffer
std::deque< boost::shared_ptr<Chunk> >& obuf_;
/// Flag to only read disamb tags
bool disamb_only_;
/// Read Pantera-like disamb_sh disamb tag markings
bool disamb_sh_;
/// Warn on stderr about tokens found outside of any chunk
bool warn_on_inconsistent_;
/// Warn on stderr about tags that nothing handled
bool warn_on_unexpected_;
/// Name of the tag that opens and closes a sentence inside a chunk
std::string sentence_tag_name_;
};
} /* end ns Corpus2 */
#endif // LIBCORPUS2_IO_XMLREADER_H
......@@ -99,6 +99,7 @@ BOOST_AUTO_TEST_CASE( iobase )
ssin << swiatopoglad_ann;
Corpus2::CclReader xr(tagset, ssin);
boost::shared_ptr<Corpus2::Chunk> chunk = xr.get_next_chunk();
BOOST_REQUIRE(chunk);
std::stringstream ss;
boost::shared_ptr<Corpus2::TokenWriter> w(Corpus2::TokenWriter::create("xces,flat", ss, tagset));
w->write_chunk(*chunk);
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment