# Patch summary (descriptive preamble placed before the first "diff --git" header):
#
# * libcorpus2/CMakeLists.txt: changes corpus2_ver_patch from "1" to "2".
# * libcorpus2/io/xcesreader.h: the XcesReader constructor gains a trailing
#   parameter `bool disamb_sh = false`; the existing parameters are unchanged.
# * libcorpus2/io/xcesreader.cpp:
#     - XcesReaderImpl gains a `bool disamb_sh_` member, set from a new
#       `disamb_sh` constructor argument that XcesReader forwards to it.
#     - In XcesReaderImpl::on_start_element, for a "lex" element seen in state
#       XS_TOK, the value of `is_disamb` is now computed in one of two ways:
#         disamb_sh_ == false: starts false and becomes true when an attribute
#                              named "disamb" has the value "1" (the same check
#                              as the removed lines).
#         disamb_sh_ == true:  starts true and becomes false when an attribute
#                              named "disamb_sh" has the value "0"; a "lex"
#                              element without such an attribute keeps
#                              is_disamb == true.
#       The result feeds the existing `if (!disamb_only_ || is_disamb)` test,
#       whose body is outside this patch's visible context.
#
# NOTE(review): the line breaks of this patch appear to have been collapsed
# into spaces, so the hunks below are presumably not applicable as-is --
# confirm against the original commit before applying.
diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt index 424099bfd21aad3e38d156d383d3691ea76a4d74..3cf132509924ddcdf060b5915fee5a6f04323cce 100644 --- a/libcorpus2/CMakeLists.txt +++ b/libcorpus2/CMakeLists.txt @@ -3,7 +3,7 @@ PROJECT(corpus2) set(corpus2_ver_major "0") set(corpus2_ver_minor "0") -set(corpus2_ver_patch "1") +set(corpus2_ver_patch "2") if(NOT LIBCORPUS2_SRC_DATA_DIR) diff --git a/libcorpus2/io/xcesreader.cpp b/libcorpus2/io/xcesreader.cpp index 03e8f8c1c78b78e9fe92dfddd54372c45e29b617..af32af0734917da086601fa1e70d9c132494e1ca 100644 --- a/libcorpus2/io/xcesreader.cpp +++ b/libcorpus2/io/xcesreader.cpp @@ -9,7 +9,7 @@ class XcesReaderImpl : public BasicSaxParser { public: XcesReaderImpl(const Tagset& tagset, std::deque<Chunk*>& obuf, - bool disamb_only); + bool disamb_only, bool disamb_sh); ~XcesReaderImpl(); @@ -37,12 +37,14 @@ protected: std::deque<Chunk*>& obuf_; bool disamb_only_; + + bool disamb_sh_; }; XcesReader::XcesReader(const Tagset& tagset, std::istream& is, - bool disamb_only) + bool disamb_only, bool disamb_sh) : BufferedChunkReader(tagset), is_(is) - , impl_(new XcesReaderImpl(tagset, chunk_buf_, disamb_only)) + , impl_(new XcesReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh)) { } @@ -64,11 +66,11 @@ void XcesReader::ensure_more() } XcesReaderImpl::XcesReaderImpl(const Tagset& tagset, - std::deque<Chunk*>& obuf, bool disamb_only) + std::deque<Chunk*>& obuf, bool disamb_only, bool disamb_sh) : BasicSaxParser() , tagset_(tagset), state_(XS_NONE), wa_(PwrNlp::Whitespace::Newline) , sbuf_(), tok_(NULL), sent_(NULL), chunk_(NULL), obuf_(obuf) - , disamb_only_(disamb_only) + , disamb_only_(disamb_only), disamb_sh_(disamb_sh) { } @@ -119,9 +121,18 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name, } else if (state_ == XS_TOK && name == "lex") { assert(tok_ != NULL); bool is_disamb = false; - foreach (const Attribute& a, attributes) { - if (a.name == "disamb" && a.value == "1") { - is_disamb = true; + if 
(!disamb_sh_) { + foreach (const Attribute& a, attributes) { + if (a.name == "disamb" && a.value == "1") { + is_disamb = true; + } + } + } else { + is_disamb = true; + foreach (const Attribute& a, attributes) { + if (a.name == "disamb_sh" && a.value == "0") { + is_disamb = false; + } } } if (!disamb_only_ || is_disamb) { diff --git a/libcorpus2/io/xcesreader.h b/libcorpus2/io/xcesreader.h index 2af10d786735a58c77b58b0cec7a3dd8d41cb63f..457a8aadcda20560a3a1886a9b71cd07edf7b5e8 100644 --- a/libcorpus2/io/xcesreader.h +++ b/libcorpus2/io/xcesreader.h @@ -15,7 +15,7 @@ class XcesReader : public BufferedChunkReader { public: XcesReader(const Tagset& tagset, std::istream& is, - bool disamb_only = false); + bool disamb_only = false, bool disamb_sh = false); ~XcesReader();