diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index 87b97583e4e0f405fca0a43b51df8130ec508147..6dcd0017dc514687dda6b2ff82e14b472d3235b0 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -68,6 +68,7 @@ SET(libcorpus2_STAT_SRC
 	io/xcesreader.cpp
 	io/xcesvalidate.cpp
 	io/xceswriter.cpp
+	io/xmlreader.cpp
 	util/settings.cpp
 	util/symboldictionary.cpp
 	util/tokentimer.cpp
diff --git a/libcorpus2/io/cclreader.cpp b/libcorpus2/io/cclreader.cpp
index 23179f9237e892f277d6ec5a057bef8da9357013..71ef4a17b94024f13933e6deaced4f56b827466a 100644
--- a/libcorpus2/io/cclreader.cpp
+++ b/libcorpus2/io/cclreader.cpp
@@ -15,7 +15,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 */
 
 #include <libcorpus2/io/cclreader.h>
-#include <libcorpus2/io/sax.h>
+#include <libcorpus2/io/xmlreader.h>
 #include <libpwrutils/foreach.h>
 #include <libxml++/libxml++.h>
 #include <libxml2/libxml/parser.h>
@@ -26,7 +26,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 
 namespace Corpus2 {
 
-class CclReaderImpl : public BasicSaxParser
+class CclReaderImpl : public XmlReader
 {
 public:
 	CclReaderImpl(const Tagset& tagset,
@@ -36,29 +36,23 @@ public:
 	~CclReaderImpl();
 
 protected:
-	void on_start_element(const Glib::ustring & name,
-			const AttributeList& attributes);
-	void on_end_element(const Glib::ustring & name);
+	bool process_start_element(const Glib::ustring & name,
+		const AttributeList& attributes);
 
-	void finish_sentence();
+	bool process_end_element(const Glib::ustring& name);
 
-	const Tagset& tagset_;
+	void start_chunk(const AttributeList &attributes);
 
-	enum state_t { XS_NONE, XS_CHUNK, XS_SENTENCE, XS_TOK, XS_ANN, XS_ORTH, XS_LEX,
-			XS_LEMMA, XS_TAG, XS_REL };
-	state_t state_;
+	void start_sentence(const AttributeList &attributes);
 
-	bool chunkless_;
+	void start_token(const AttributeList &attributes);
 
-	bool out_of_chunk_;
+	void finish_token();
 
-	PwrNlp::Whitespace::Enum wa_;
+	static const int STATE_ANN = 901;
+	static const int STATE_REL = 902;
 
-	Glib::ustring sbuf_;
-
-	Token* tok_;
-
-	boost::shared_ptr<AnnotatedSentence> sent_;
+	boost::shared_ptr<AnnotatedSentence> ann_sent_;
 
 	std::string ann_chan_;
 
@@ -69,14 +63,6 @@ protected:
 	token_ann_t token_anns_;
 
 	std::set<std::string> token_ann_heads_;
-
-	boost::shared_ptr<Chunk> chunk_;
-
-	std::deque< boost::shared_ptr<Chunk> >& obuf_;
-
-	bool disamb_only_;
-
-	bool disamb_sh_;
 };
 
 CclReader::CclReader(const Tagset& tagset, std::istream& is,
@@ -121,58 +107,52 @@ void CclReader::ensure_more()
 CclReaderImpl::CclReaderImpl(const Tagset& tagset,
 		std::deque< boost::shared_ptr<Chunk> >& obuf,
 		bool disamb_only, bool disamb_sh)
-	: BasicSaxParser()
-	, tagset_(tagset), state_(XS_NONE), chunkless_(false), out_of_chunk_(false)
-	, wa_(PwrNlp::Whitespace::Newline)
-	, sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf)
-	, disamb_only_(disamb_only), disamb_sh_(disamb_sh)
+	: XmlReader(tagset, obuf)
 {
+	XmlReader::set_disamb_only(disamb_only);
+	XmlReader::set_disamb_sh(disamb_sh);
+	sentence_tag_name_ = "sentence";
 }
 
 CclReaderImpl::~CclReaderImpl()
 {
-	delete tok_;
 }
 
-void CclReaderImpl::on_start_element(const Glib::ustring &name,
-		const AttributeList& attributes)
+void CclReaderImpl::start_chunk(const AttributeList& attributes)
 {
-	if (name == "chunk") {
-		std::string type;
-		foreach (const Attribute& a, attributes) {
-			if (a.name == "type") {
-				type = a.value;
-			}
-		}
-		if (type == "s") {
-			throw XcesError("Trying to parse XCES as CCL (<chunk type=\"s\">)");
-		} else if (state_ == XS_NONE) {
-			chunk_ = boost::make_shared<Chunk>();
-			state_ = XS_CHUNK;
-			foreach (const Attribute& a, attributes) {
-				chunk_->set_attribute(a.name, a.value);
-			}
-		} else if (state_ == XS_CHUNK) {
-			throw XcesError("Nested <chunk>");
-		} else {
-			throw XcesError("Unexpected <chunk>");
-		}
-	} else if (state_ == XS_CHUNK && name == "sentence") {
-		state_ = XS_SENTENCE;
-		sent_ = boost::make_shared<AnnotatedSentence>();
-	} else if (state_ == XS_SENTENCE && name == "tok") {
-		state_ = XS_TOK;
-		tok_ = new Token();
-		tok_->set_wa(wa_);
-		wa_ = PwrNlp::Whitespace::Space;
-		token_anns_.clear();
-		token_ann_heads_.clear();
-	} else if (state_ == XS_TOK && name == "orth") {
-		state_ = XS_ORTH;
-		grab_characters_ = true;
-		clear_buf();
-	} else if (state_ == XS_TOK && name == "ann") {
-		state_ = XS_ANN;
+	chunk_ = boost::make_shared<Chunk>();
+	std::string type = get_type_from_attributes(attributes);
+	if (type == "s") {
+		throw XcesError("Trying to parse XCES as CCL (<chunk type=\"s\">)");
+	}
+	foreach (const Attribute& a, attributes) {
+		chunk_->set_attribute(a.name, a.value);
+	}
+	state_ = STATE_CHUNK;
+}
+
+
+
+void CclReaderImpl::start_sentence(const AttributeList& /*attributes*/)
+{
+	ann_sent_ = boost::make_shared<AnnotatedSentence>();
+	sent_ = ann_sent_;
+	state_ = STATE_SENTENCE;
+}
+
+
+void CclReaderImpl::start_token(const AttributeList& attributes)
+{
+	XmlReader::start_token(attributes);
+	token_anns_.clear();
+	token_ann_heads_.clear();
+}
+
+bool CclReaderImpl::process_start_element(const Glib::ustring & name,
+		const AttributeList& attributes)
+{
+	if (state_ == STATE_TOK && name == "ann") {
+		state_ = STATE_ANN;
 		grab_characters_ = true;
 		clear_buf();
 		ann_chan_ = "";
@@ -187,102 +167,40 @@ void CclReaderImpl::on_start_element(const Glib::ustring &name,
 		if (ann_chan_.empty()) {
 			throw XcesError("<ann> with no channel name");
 		}
-	} else if (state_ == XS_TOK && name == "lex") {
-		assert(tok_ != NULL);
-		bool is_disamb = false;
-		foreach (const Attribute& a, attributes) {
-			if (a.name == "disamb" && a.value == "1") {
-				is_disamb = true;
-			}
-		}
-		if (!disamb_only_ || is_disamb) {
-			tok_->add_lexeme(Lexeme());
-			tok_->lexemes().back().set_disamb(is_disamb);
-			state_ = XS_LEX;
-		}
-	} else if (state_ == XS_LEX && name == "base") {
-		state_ = XS_LEMMA;
-		grab_characters_ = true;
-		clear_buf();
-	} else if (state_ == XS_LEX && name == "ctag") {
-		state_ = XS_TAG;
-		grab_characters_ = true;
-		clear_buf();
-	} else if (name == "ns") {
-		wa_ = PwrNlp::Whitespace::None;
-	} else if (name == "tok" && state_ == XS_NONE) {
-		std::cerr << "Warning: out-of-chunk token, assuming sentence start on line ";
-		std::cerr << this->context_->input->line << "\n";
-		chunkless_ = true;
-		out_of_chunk_ = true;
-		chunk_ = boost::make_shared<Chunk>();
-		sent_ = boost::make_shared<AnnotatedSentence>();
-		state_ = XS_TOK;
-		tok_ = new Token();
-		tok_->set_wa(wa_);
-		wa_ = PwrNlp::Whitespace::Space;
-	}
-}
-
-void CclReaderImpl::finish_sentence()
-{
-	chunk_->append(sent_);
-	sent_.reset();
-	if (chunkless_) {
-		obuf_.push_back(chunk_);
-		chunk_.reset();
-		state_ = XS_NONE;
-		chunkless_ = false;
+		return true;
 	} else {
-		state_ = XS_CHUNK;
+		return false;
 	}
 }
 
-void CclReaderImpl::on_end_element(const Glib::ustring &name)
+bool CclReaderImpl::process_end_element(const Glib::ustring & name)
 {
-	if (state_ == XS_ORTH && name == "orth") {
-		tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
-		grab_characters_ = false;
-		state_ = XS_TOK;
-	} else if (state_ == XS_ANN && name == "ann") {
+	if (state_ == STATE_ANN && name == "ann") {
 		std::string buf = get_buf();
 		grab_characters_ = false;
 		int segid = atoi(buf.c_str());
-		if (!sent_->has_channel(ann_chan_)) {
-			sent_->create_channel(ann_chan_);
+		if (!ann_sent_->has_channel(ann_chan_)) {
+			ann_sent_->create_channel(ann_chan_);
 		}
 		if (segid > 0) {
 			token_anns_.insert(std::make_pair(ann_chan_, segid));
 			token_ann_heads_.insert(ann_chan_);
 		}
-		state_ = XS_TOK;
-	} else if (state_ == XS_LEMMA && name == "base") {
-		tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
-		grab_characters_ = false;
-		state_ = XS_LEX;
-	} else if (state_ == XS_TAG && name == "ctag") {
-		Tag tag = tagset_.parse_simple_tag(get_buf(), true);
-		tok_->lexemes().back().set_tag(tag);
-		grab_characters_ = false;
-		state_ = XS_LEX;
-	} else if (state_ == XS_LEX && name == "lex") {
-		state_ = XS_TOK;
-	} else if (state_ == XS_TOK && name == "tok") {
-		sent_->append(tok_);
-		tok_ = NULL;
-		state_ = XS_SENTENCE;
-		foreach (const token_ann_t::value_type& v, token_anns_) {
-			sent_->get_channel(v.first).set_segment_at(sent_->size() - 1, v.second);
-			if (token_ann_heads_.find(v.first) != token_ann_heads_.end()) {
-				sent_->get_channel(v.first).set_head_at(sent_->size() - 1, true);
-			}
+		state_ = STATE_TOK;
+		return true;
+	} else {
+		return false;
+	}
+}
+
+void CclReaderImpl::finish_token()
+{
+	XmlReader::finish_token();
+	foreach (const token_ann_t::value_type& v, token_anns_) {
+		ann_sent_->get_channel(v.first).set_segment_at(sent_->size() - 1, v.second);
+		if (token_ann_heads_.find(v.first) != token_ann_heads_.end()) {
+			ann_sent_->get_channel(v.first).set_head_at(sent_->size() - 1, true);
 		}
-	} else if (state_ == XS_SENTENCE && name == "sentence") {
-		finish_sentence();
-	} else if (state_ == XS_CHUNK && name == "chunk") {
-		obuf_.push_back(chunk_);
-		chunk_.reset();
-		state_ = XS_NONE;
 	}
 }
 
diff --git a/libcorpus2/io/xcesreader.cpp b/libcorpus2/io/xcesreader.cpp
index 87fbd05c22e853d48cc32ae99439e010465dd6cc..a33aeb46bdcaed9f6c11e5ca5d7d2c08fc909f7f 100644
--- a/libcorpus2/io/xcesreader.cpp
+++ b/libcorpus2/io/xcesreader.cpp
@@ -15,7 +15,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 */
 
 #include <libcorpus2/io/xcesreader.h>
-#include <libcorpus2/io/sax.h>
+#include <libcorpus2/io/xmlreader.h>
 #include <libpwrutils/foreach.h>
 #include <libxml++/libxml++.h>
 #include <libxml2/libxml/parser.h>
@@ -24,7 +24,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 
 namespace Corpus2 {
 
-class XcesReaderImpl : public BasicSaxParser
+class XcesReaderImpl : public XmlReader
 {
 public:
 	XcesReaderImpl(const Tagset& tagset,
@@ -34,37 +34,6 @@ public:
 	~XcesReaderImpl();
 
 protected:
-	void on_start_element(const Glib::ustring & name,
-			const AttributeList& attributes);
-	void on_end_element(const Glib::ustring & name);
-
-	void finish_sentence();
-
-	const Tagset& tagset_;
-
-	enum state_t { XS_NONE, XS_CHUNK, XS_SENTENCE, XS_TOK, XS_ORTH, XS_LEX,
-			XS_LEMMA, XS_TAG };
-	state_t state_;
-
-	bool chunkless_;
-
-	bool out_of_chunk_;
-
-	PwrNlp::Whitespace::Enum wa_;
-
-	Glib::ustring sbuf_;
-
-	Token* tok_;
-
-	Sentence::Ptr sent_;
-
-	boost::shared_ptr<Chunk> chunk_;
-
-	std::deque< boost::shared_ptr<Chunk> >& obuf_;
-
-	bool disamb_only_;
-
-	bool disamb_sh_;
 };
 
 XcesReader::XcesReader(const Tagset& tagset, std::istream& is,
@@ -109,153 +78,15 @@ void XcesReader::ensure_more()
 XcesReaderImpl::XcesReaderImpl(const Tagset& tagset,
 		std::deque< boost::shared_ptr<Chunk> >& obuf,
 		bool disamb_only, bool disamb_sh)
-	: BasicSaxParser()
-	, tagset_(tagset), state_(XS_NONE), chunkless_(false), out_of_chunk_(false)
-	, wa_(PwrNlp::Whitespace::Newline)
-	, sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf)
-	, disamb_only_(disamb_only), disamb_sh_(disamb_sh)
+	: XmlReader(tagset, obuf)
 {
+	XmlReader::set_disamb_only(disamb_only);
+	XmlReader::set_disamb_sh(disamb_sh);
+	sentence_tag_name_ = "chunk";
 }
 
 XcesReaderImpl::~XcesReaderImpl()
 {
-	delete tok_;
-}
-
-void XcesReaderImpl::on_start_element(const Glib::ustring &name,
-		const AttributeList& attributes)
-{
-	if (name == "chunk") {
-		std::string type;
-		foreach (const Attribute& a, attributes) {
-			if (a.name == "type") {
-				type = a.value;
-			}
-		}
-		if (out_of_chunk_) {
-			finish_sentence();
-			out_of_chunk_ = false;
-		}
-		if (state_ == XS_NONE) {
-			if (type == "s") {
-				//throw XcesError("Top level <chunk> is type=\"s\"");
-				state_ = XS_SENTENCE;
-				chunkless_ = true;
-				chunk_ = boost::make_shared<Chunk>();
-				sent_ = boost::make_shared<Sentence>();
-			} else {
-				chunk_ = boost::make_shared<Chunk>();
-				state_ = XS_CHUNK;
-				foreach (const Attribute& a, attributes) {
-					chunk_->set_attribute(a.name, a.value);
-				}
-			}
-		} else if (state_ == XS_CHUNK) {
-			if (type != "s") {
-				throw XcesError("Sub level <chunk> not type=\"s\"");
-			}
-			state_ = XS_SENTENCE;
-			sent_ = boost::make_shared<Sentence>();
-		} else {
-			throw XcesError("Unexpected <chunk>");
-		}
-	} else if (state_ == XS_SENTENCE && name == "tok") {
-		state_ = XS_TOK;
-		tok_ = new Token();
-		tok_->set_wa(wa_);
-		wa_ = PwrNlp::Whitespace::Space;
-	} else if (state_ == XS_TOK && name == "orth") {
-		state_ = XS_ORTH;
-		grab_characters_ = true;
-		clear_buf();
-	} else if (state_ == XS_TOK && name == "lex") {
-		assert(tok_ != NULL);
-		bool is_disamb = false;
-		if (!disamb_sh_) {
-			foreach (const Attribute& a, attributes) {
-				if (a.name == "disamb" && a.value == "1") {
-					is_disamb = true;
-				}
-			}
-		} else {
-			is_disamb = true;
-			foreach (const Attribute& a, attributes) {
-				if (a.name == "disamb_sh" && a.value == "0") {
-					is_disamb = false;
-				}
-			}
-		}
-		if (!disamb_only_ || is_disamb) {
-			tok_->add_lexeme(Lexeme());
-			tok_->lexemes().back().set_disamb(is_disamb);
-			state_ = XS_LEX;
-		}
-	} else if (state_ == XS_LEX && name == "base") {
-		state_ = XS_LEMMA;
-		grab_characters_ = true;
-		clear_buf();
-	} else if (state_ == XS_LEX && name == "ctag") {
-		state_ = XS_TAG;
-		grab_characters_ = true;
-		clear_buf();
-	} else if (name == "ns") {
-		wa_ = PwrNlp::Whitespace::None;
-	} else if (name == "tok" && state_ == XS_NONE) {
-		std::cerr << "Warning: out-of-chunk token, assuming sentence start on line ";
-		std::cerr << this->context_->input->line << "\n";
-		chunkless_ = true;
-		out_of_chunk_ = true;
-		chunk_ = boost::make_shared<Chunk>();
-		sent_ = boost::make_shared<Sentence>();
-		state_ = XS_TOK;
-		tok_ = new Token();
-		tok_->set_wa(wa_);
-		wa_ = PwrNlp::Whitespace::Space;
-	}
-}
-
-void XcesReaderImpl::finish_sentence()
-{
-	chunk_->append(sent_);
-	sent_.reset();
-	if (chunkless_) {
-		obuf_.push_back(chunk_);
-		chunk_.reset();
-		state_ = XS_NONE;
-		chunkless_ = false;
-	} else {
-		state_ = XS_CHUNK;
-	}
-}
-
-void XcesReaderImpl::on_end_element(const Glib::ustring &name)
-{
-	if (state_ == XS_ORTH && name == "orth") {
-		tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
-		grab_characters_ = false;
-		state_ = XS_TOK;
-	} else if (state_ == XS_LEMMA && name == "base") {
-		tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
-		grab_characters_ = false;
-		state_ = XS_LEX;
-	} else if (state_ == XS_TAG && name == "ctag") {
-		Tag tag = tagset_.parse_simple_tag(get_buf(), true);
-		tok_->lexemes().back().set_tag(tag);
-		grab_characters_ = false;
-		state_ = XS_LEX;
-	} else if (state_ == XS_LEX && name == "lex") {
-		state_ = XS_TOK;
-	} else if (state_ == XS_TOK && name == "tok") {
-		sent_->append(tok_);
-		tok_ = NULL;
-		state_ = XS_SENTENCE;
-	} else if (state_ == XS_SENTENCE && name == "chunk") {
-		finish_sentence();
-	} else if (state_ == XS_CHUNK && name == "chunk") {
-		obuf_.push_back(chunk_);
-		chunk_.reset();
-		state_ = XS_NONE;
-	}
 }
 
 } /* end ns Corpus2 */
diff --git a/libcorpus2/io/xmlreader.cpp b/libcorpus2/io/xmlreader.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d028e6db28cb83c47f7190846721ebf669e6c7b4
--- /dev/null
+++ b/libcorpus2/io/xmlreader.cpp
@@ -0,0 +1,251 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#include <libcorpus2/io/xmlreader.h>
+#include <libpwrutils/foreach.h>
+#include <libxml++/libxml++.h>
+#include <libxml2/libxml/parser.h>
+#include <boost/make_shared.hpp>
+#include <cassert>
+#include <fstream>
+#include <iostream>
+
+namespace Corpus2 {
+
+
+XmlReader::XmlReader(const Tagset& tagset,
+		std::deque< boost::shared_ptr<Chunk> >& obuf)
+	: BasicSaxParser()
+	, tagset_(tagset), state_(STATE_NONE)
+	, chunkless_(false), out_of_chunk_(false)
+	, wa_(PwrNlp::Whitespace::Newline)
+	, sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf)
+	, disamb_only_(false), disamb_sh_(false)
+	, warn_on_inconsistent_(true), warn_on_unexpected_(true)
+{
+}
+
+XmlReader::~XmlReader()
+{
+	delete tok_;
+}
+
+std::string XmlReader::get_type_from_attributes(const AttributeList& attributes) const
+{
+	std::string type;
+	foreach (const Attribute& a, attributes) {
+		if (a.name == "type") {
+			type = a.value;
+		}
+	}
+	return type;
+}
+
+
+void XmlReader::on_start_element(const Glib::ustring &name,
+		const AttributeList& attributes)
+{
+	if (out_of_chunk_ && state_ == STATE_SENTENCE && name == "chunk") {
+		// A real <chunk> closes the implicit sentence that was opened for
+		// out-of-chunk tokens. This has to happen before the dispatch below,
+		// which only accepts <chunk> in STATE_NONE.
+		finish_sentence();
+		out_of_chunk_ = false;
+	}
+	if (state_ == STATE_NONE && name == "chunk") {
+		start_chunk(attributes);
+	} else if (state_ == STATE_CHUNK && name == sentence_tag_name_) {
+		start_sentence(attributes);
+	} else if (state_ == STATE_SENTENCE && name == "tok") {
+		start_token(attributes);
+	} else if (state_ == STATE_TOK && name == "orth") {
+		state_ = STATE_ORTH;
+		grab_characters_ = true;
+		clear_buf();
+	} else if (state_ == STATE_TOK && name == "lex") {
+		start_lexeme(attributes);
+	} else if (state_ == STATE_LEX && name == "base") {
+		state_ = STATE_LEMMA;
+		grab_characters_ = true;
+		clear_buf();
+	} else if (state_ == STATE_LEX && name == "ctag") {
+		state_ = STATE_TAG;
+		grab_characters_ = true;
+		clear_buf();
+	} else if (name == "ns") {
+		wa_ = PwrNlp::Whitespace::None;
+	} else if (state_ == STATE_NONE && name == "tok") {
+		if (warn_on_inconsistent_) {
+			std::cerr << "Warning: out-of-chunk token, assuming sentence start on line ";
+			std::cerr << this->context_->input->line << "\n";
+		}
+		// Open an implicit chunk and sentence around the token. The sentence
+		// attributes carry type="s" because the default start_sentence()
+		// rejects anything else.
+		AttributeList no_attributes;
+		AttributeList sentence_attributes;
+		sentence_attributes.push_back(Attribute("type", "s"));
+		start_chunk(no_attributes);
+		start_sentence(sentence_attributes);
+		start_token(attributes);
+		// Raise the flags only once the implicit chunk exists, so that
+		// finish_sentence() always has a chunk to append to.
+		chunkless_ = true;
+		out_of_chunk_ = true;
+	} else if (state_ == STATE_NONE && name == "cesAna") {
+		//nop
+	} else if (state_ == STATE_NONE && name == "chunkList") {
+		//nop
+	} else if (process_start_element(name, attributes)) {
+		//nop
+	} else if (warn_on_unexpected_) {
+		std::cerr << "Unexpected tag <" << name << "> on line ";
+		std::cerr << this->context_->input->line << " (" << state_ << ")\n";
+	}
+}
+
+bool XmlReader::process_start_element(const Glib::ustring &/*name*/,
+		const AttributeList &/*attributes*/)
+{
+	return false;
+}
+
+bool XmlReader::process_end_element(const Glib::ustring & /*name*/)
+{
+	return false;
+}
+
+
+void XmlReader::start_chunk(const AttributeList& attributes)
+{
+	std::string type = get_type_from_attributes(attributes);
+	chunk_ = boost::make_shared<Chunk>();
+
+	if (type == "s") {
+		// top-level chunk is a sentence
+		start_sentence(attributes);
+		chunkless_ = true;
+	} else {
+		foreach (const Attribute& a, attributes) {
+			chunk_->set_attribute(a.name, a.value);
+		}
+		state_ = STATE_CHUNK;
+	}
+}
+
+void XmlReader::start_sentence(const AttributeList &attributes)
+{
+	std::string type = get_type_from_attributes(attributes);
+	if (type != "s") {
+		throw XcesError("Sub level <chunk> not type=\"s\"");
+	}
+	sent_ = boost::make_shared<Corpus2::Sentence>();
+	state_ = STATE_SENTENCE;
+}
+
+void XmlReader::start_token(const AttributeList &/*attributes*/)
+{
+	state_ = STATE_TOK;
+	tok_ = new Token();
+	tok_->set_wa(wa_);
+	wa_ = PwrNlp::Whitespace::Space;
+}
+
+void XmlReader::start_lexeme(const AttributeList &attributes)
+{
+	assert(tok_ != NULL);
+	bool is_disamb = false;
+	if (!disamb_sh_) {
+		foreach (const Attribute& a, attributes) {
+			if (a.name == "disamb" && a.value == "1") {
+				is_disamb = true;
+			}
+		}
+	} else {
+		is_disamb = true;
+		foreach (const Attribute& a, attributes) {
+			if (a.name == "disamb_sh" && a.value == "0") {
+				is_disamb = false;
+			}
+		}
+	}
+	if (!disamb_only_ || is_disamb) {
+		tok_->add_lexeme(Lexeme());
+		tok_->lexemes().back().set_disamb(is_disamb);
+		state_ = STATE_LEX;
+	}
+}
+
+void XmlReader::finish_chunk()
+{
+	assert(chunk_);
+	obuf_.push_back(chunk_);
+	chunk_.reset();
+	state_ = STATE_NONE;
+}
+
+void XmlReader::finish_sentence()
+{
+	assert(chunk_);
+	chunk_->append(sent_);
+	sent_.reset();
+	if (chunkless_) {
+		obuf_.push_back(chunk_);
+		chunk_.reset();
+		state_ = STATE_NONE;
+		chunkless_ = false;
+	} else {
+		state_ = STATE_CHUNK;
+	}
+}
+
+void XmlReader::finish_token()
+{
+	assert(sent_);
+	sent_->append(tok_);
+	tok_ = NULL;
+	state_ = STATE_SENTENCE;
+}
+
+void XmlReader::on_end_element(const Glib::ustring &name)
+{
+	if (state_ == STATE_ORTH && name == "orth") {
+		tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
+		grab_characters_ = false;
+		state_ = STATE_TOK;
+	} else if (state_ == STATE_LEMMA && name == "base") {
+		tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
+		grab_characters_ = false;
+		state_ = STATE_LEX;
+	} else if (state_ == STATE_TAG && name == "ctag") {
+		Tag tag = tagset_.parse_simple_tag(get_buf(), true);
+		tok_->lexemes().back().set_tag(tag);
+		grab_characters_ = false;
+		state_ = STATE_LEX;
+	} else if (state_ == STATE_LEX && name == "lex") {
+		state_ = STATE_TOK;
+	} else if (state_ == STATE_TOK && name == "tok") {
+		finish_token();
+	} else if (state_ == STATE_SENTENCE && name == sentence_tag_name_) {
+		finish_sentence();
+	} else if (state_ == STATE_CHUNK && name == "chunk") {
+		finish_chunk();
+	} else {
+		process_end_element(name);
+	}
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/xmlreader.h b/libcorpus2/io/xmlreader.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f484752a4ee5b5feba62aa9d9abb4fe9915ec82
--- /dev/null
+++ b/libcorpus2/io/xmlreader.h
@@ -0,0 +1,155 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#ifndef LIBCORPUS2_IO_XMLREADER_H
+#define LIBCORPUS2_IO_XMLREADER_H
+
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/xces.h>
+#include <libcorpus2/chunk.h>
+#include <deque>
+#include <boost/scoped_ptr.hpp>
+#include <libcorpus2/io/sax.h>
+#include <libpwrutils/foreach.h>
+
+namespace Corpus2 {
+
+/**
+ * Common SAX-based reader for XCES-like corpus formats. Handles the shared
+ * chunk / sentence / tok / orth / lex / base / ctag structure and pushes
+ * finished chunks to the output buffer; format-specific readers override
+ * the virtual hooks.
+ */
+class XmlReader : public BasicSaxParser
+{
+public:
+	/// Create a reader using the given tagset; finished chunks are appended
+	/// to obuf, which is held by reference
+	XmlReader(const Tagset& tagset,
+		std::deque< boost::shared_ptr<Chunk> >& obuf);
+
+	virtual ~XmlReader();
+
+	/// Only keep lexemes marked as disamb
+	void set_disamb_only(bool v) {
+		disamb_only_ = v;
+	}
+
+	/// Decide disamb from disamb_sh="0" markings instead of disamb="1"
+	void set_disamb_sh(bool v) {
+		disamb_sh_ = v;
+	}
+
+protected:
+	/// Value of the "type" attribute, or an empty string if there is none
+	std::string get_type_from_attributes(const AttributeList& attributes) const;
+
+	/// SAX callback: dispatch an opening tag according to the parser state
+	void on_start_element(const Glib::ustring & name,
+		const AttributeList& attributes);
+
+	/// SAX callback: dispatch a closing tag according to the parser state
+	void on_end_element(const Glib::ustring & name);
+
+	/// Hook for opening tags this class does not handle itself.
+	/// @return true if the tag was consumed (the default returns false)
+	virtual bool process_start_element(const Glib::ustring & name,
+		const AttributeList& attributes);
+
+	/// Hook for closing tags this class does not handle itself.
+	/// @return true if the tag was consumed (the default returns false)
+	virtual bool process_end_element(const Glib::ustring & name);
+
+	/// Begin a chunk; by default type="s" means the chunk is a lone sentence
+	virtual void start_chunk(const AttributeList& attributes);
+
+	/// Begin a sentence; by default throws XcesError unless type="s"
+	virtual void start_sentence(const AttributeList& attributes);
+
+	/// Begin a token, giving it the pending whitespace amount
+	virtual void start_token(const AttributeList& attributes);
+
+	/// Begin a lexeme, honouring the disamb_only / disamb_sh settings
+	void start_lexeme(const AttributeList& attributes);
+
+	/// Push the current chunk to the output buffer
+	virtual void finish_chunk();
+
+	/// Append the current sentence to the current chunk, emitting the
+	/// chunk as well if the sentence is chunkless
+	virtual void finish_sentence();
+
+	/// Append the current token to the current sentence
+	virtual void finish_token();
+
+	/// Tagset used to parse ctag contents
+	const Tagset& tagset_;
+
+	static const int STATE_NONE = 0;
+	static const int STATE_CHUNK = 1;
+	static const int STATE_SENTENCE = 2;
+	static const int STATE_TOK = 3;
+	static const int STATE_ORTH = 4;
+	static const int STATE_LEX = 5;
+	static const int STATE_LEMMA = 6;
+	static const int STATE_TAG = 7;
+
+	/// Current parser state: one of the STATE_ constants
+	int state_;
+
+	/// True if the sentence being read has no enclosing chunk in the input
+	bool chunkless_;
+
+	/// True while tokens found outside of any chunk are being collected
+	bool out_of_chunk_;
+
+	/// Whitespace for the next token
+	PwrNlp::Whitespace::Enum wa_;
+
+	/// Character data buffer
+	Glib::ustring sbuf_;
+
+	/// Token being constructed
+	Token* tok_;
+
+	/// Sentence being constructed
+	Sentence::Ptr sent_;
+
+	/// Chunk being constructed
+	boost::shared_ptr<Chunk> chunk_;
+
+	/// Output chunk buffer
+	std::deque< boost::shared_ptr<Chunk> >& obuf_;
+
+	/// Flag to only read disamb tags
+	bool disamb_only_;
+
+	/// Read Pantera-like disamb_sh disamb tag markings
+	bool disamb_sh_;
+
+	/// Warn on stderr about tokens found outside of any chunk
+	bool warn_on_inconsistent_;
+
+	/// Warn on stderr about tags that no handler consumed
+	bool warn_on_unexpected_;
+
+	/// Name of the tag that delimits a sentence; set by subclasses
+	std::string sentence_tag_name_;
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_XMLREADER_H
diff --git a/tests/ioann.cpp b/tests/ioann.cpp
index 8334bf32effb64027a96cf93eb1604ae23484147..e17dc87f26706d74e28a0bddfafa9885af51b757 100644
--- a/tests/ioann.cpp
+++ b/tests/ioann.cpp
@@ -99,6 +99,7 @@ BOOST_AUTO_TEST_CASE( iobase )
 	ssin << swiatopoglad_ann;
 	Corpus2::CclReader xr(tagset, ssin);
 	boost::shared_ptr<Corpus2::Chunk> chunk = xr.get_next_chunk();
+	BOOST_REQUIRE(chunk);
 	std::stringstream ss;
 	boost::shared_ptr<Corpus2::TokenWriter> w(Corpus2::TokenWriter::create("xces,flat", ss, tagset));
 	w->write_chunk(*chunk);