From b000d45bd7c08d268495d3974cb3dd8fbdebac94 Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Mon, 11 Oct 2010 11:41:24 +0200 Subject: [PATCH] preliminary disamb_sh support in xces reader, bumps version to 0.0.2 --- libcorpus2/CMakeLists.txt | 2 +- libcorpus2/io/xcesreader.cpp | 27 +++++++++++++++++++-------- libcorpus2/io/xcesreader.h | 2 +- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt index 424099b..3cf1325 100644 --- a/libcorpus2/CMakeLists.txt +++ b/libcorpus2/CMakeLists.txt @@ -3,7 +3,7 @@ PROJECT(corpus2) set(corpus2_ver_major "0") set(corpus2_ver_minor "0") -set(corpus2_ver_patch "1") +set(corpus2_ver_patch "2") if(NOT LIBCORPUS2_SRC_DATA_DIR) diff --git a/libcorpus2/io/xcesreader.cpp b/libcorpus2/io/xcesreader.cpp index 03e8f8c..af32af0 100644 --- a/libcorpus2/io/xcesreader.cpp +++ b/libcorpus2/io/xcesreader.cpp @@ -9,7 +9,7 @@ class XcesReaderImpl : public BasicSaxParser { public: XcesReaderImpl(const Tagset& tagset, std::deque<Chunk*>& obuf, - bool disamb_only); + bool disamb_only, bool disamb_sh); ~XcesReaderImpl(); @@ -37,12 +37,14 @@ protected: std::deque<Chunk*>& obuf_; bool disamb_only_; + + bool disamb_sh_; }; XcesReader::XcesReader(const Tagset& tagset, std::istream& is, - bool disamb_only) + bool disamb_only, bool disamb_sh) : BufferedChunkReader(tagset), is_(is) - , impl_(new XcesReaderImpl(tagset, chunk_buf_, disamb_only)) + , impl_(new XcesReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh)) { } @@ -64,11 +66,11 @@ void XcesReader::ensure_more() } XcesReaderImpl::XcesReaderImpl(const Tagset& tagset, - std::deque<Chunk*>& obuf, bool disamb_only) + std::deque<Chunk*>& obuf, bool disamb_only, bool disamb_sh) : BasicSaxParser() , tagset_(tagset), state_(XS_NONE), wa_(PwrNlp::Whitespace::Newline) , sbuf_(), tok_(NULL), sent_(NULL), chunk_(NULL), obuf_(obuf) - , disamb_only_(disamb_only) + , disamb_only_(disamb_only), disamb_sh_(disamb_sh) { } @@ -119,9 +121,18 
@@ void XcesReaderImpl::on_start_element(const Glib::ustring &name, } else if (state_ == XS_TOK && name == "lex") { assert(tok_ != NULL); bool is_disamb = false; - foreach (const Attribute& a, attributes) { - if (a.name == "disamb" && a.value == "1") { - is_disamb = true; + if (!disamb_sh_) { + foreach (const Attribute& a, attributes) { + if (a.name == "disamb" && a.value == "1") { + is_disamb = true; + } + } + } else { + is_disamb = true; + foreach (const Attribute& a, attributes) { + if (a.name == "disamb_sh" && a.value == "0") { + is_disamb = false; + } } } if (!disamb_only_ || is_disamb) { diff --git a/libcorpus2/io/xcesreader.h b/libcorpus2/io/xcesreader.h index 2af10d7..457a8aa 100644 --- a/libcorpus2/io/xcesreader.h +++ b/libcorpus2/io/xcesreader.h @@ -15,7 +15,7 @@ class XcesReader : public BufferedChunkReader { public: XcesReader(const Tagset& tagset, std::istream& is, - bool disamb_only = false); + bool disamb_only = false, bool disamb_sh = false); ~XcesReader(); -- GitLab