diff --git a/libcorpus2/io/xcesreader.cpp b/libcorpus2/io/xcesreader.cpp index d0e18afa133b06ca82352cfcba848fd64da6c620..a0861e2a1bc234f08220e065213425b71167b59e 100644 --- a/libcorpus2/io/xcesreader.cpp +++ b/libcorpus2/io/xcesreader.cpp @@ -18,6 +18,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <libcorpus2/io/sax.h> #include <libpwrutils/foreach.h> #include <libxml++/libxml++.h> +#include <libxml2/libxml/parser.h> #include <boost/make_shared.hpp> #include <fstream> @@ -37,6 +38,8 @@ protected: const AttributeList& attributes); void on_end_element(const Glib::ustring & name); + void finish_sentence(); + const Tagset& tagset_; enum state_t { XS_NONE, XS_CHUNK, XS_SENTENCE, XS_TOK, XS_ORTH, XS_LEX, @@ -45,6 +48,8 @@ protected: bool chunkless_; + bool out_of_chunk_; + PwrNlp::Whitespace::Enum wa_; Glib::ustring sbuf_; @@ -105,7 +110,7 @@ XcesReaderImpl::XcesReaderImpl(const Tagset& tagset, std::deque< boost::shared_ptr<Chunk> >& obuf, bool disamb_only, bool disamb_sh) : BasicSaxParser() - , tagset_(tagset), state_(XS_NONE), chunkless_(false) + , tagset_(tagset), state_(XS_NONE), chunkless_(false), out_of_chunk_(false) , wa_(PwrNlp::Whitespace::Newline) , sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf) , disamb_only_(disamb_only), disamb_sh_(disamb_sh) @@ -127,6 +132,10 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name, type = a.value; } } + if (out_of_chunk_) { + finish_sentence(); + out_of_chunk_ = false; + } if (state_ == XS_NONE) { if (type == "s") { //throw XcesError("Top level <chunk> is type=\"s\""); @@ -191,6 +200,31 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name, clear_buf(); } else if (name == "ns") { wa_ = PwrNlp::Whitespace::None; + } else if (name == "tok" && state_ == XS_NONE) { + std::cerr << "Warning: out-of-chunk token, assuming sentence start on line "; + std::cerr << this->context_->input->line << "\n"; + chunkless_ = true; + out_of_chunk_ = true; + chunk_ = boost::make_shared<Chunk>(); + sent_ = boost::make_shared<Sentence>(); + state_ = XS_TOK; + tok_ = new Token(); + tok_->set_wa(wa_); + wa_ = PwrNlp::Whitespace::Space; + } +} + +void XcesReaderImpl::finish_sentence() +{ + chunk_->append(sent_); + sent_.reset(); + if (chunkless_) { + obuf_.push_back(chunk_); + chunk_.reset(); + state_ = XS_NONE; + chunkless_ = false; + } else { + state_ = XS_CHUNK; } } @@ -216,16 +250,7 @@ void XcesReaderImpl::on_end_element(const Glib::ustring &name) tok_ = NULL; state_ = XS_SENTENCE; } else if (state_ == XS_SENTENCE && name == "chunk") { - chunk_->append(sent_); - sent_.reset(); - if (chunkless_) { - obuf_.push_back(chunk_); - chunk_.reset(); - state_ = XS_NONE; - chunkless_ = false; - } else { - state_ = XS_CHUNK; - } + finish_sentence(); } else if (state_ == XS_CHUNK && name == "chunk") { obuf_.push_back(chunk_); chunk_.reset(); diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp index 00f31f4ed289bd5d9ee371563fc73c85cf7e3e15..b732690e713d9138195fc069e13da361605377ca 100644 --- a/libcorpus2/tagset.cpp +++ b/libcorpus2/tagset.cpp @@ -141,6 +141,7 @@ namespace { std::vector< mask_t > & current, const std::vector<mask_t> & to_add, mask_t to_add_attr) { + if (to_add.empty()) return; size_t current_size = current.size(); for (size_t ai = 1; ai < to_add.size(); ++ai) { for (size_t oi = 0; oi < current_size; ++oi) { @@ -176,6 +177,7 @@ void Tagset::parse_tag(const string_range_vector &fields, bool allow_extra, std::vector<mask_t> values; mask_t amask; foreach (string_range& dot, dots) { + if (dot.empty()) continue; mask_t v = get_value_mask(boost::copy_range<std::string>(dot)); mask_t curr = get_attribute_mask(get_value_attribute(v)); @@ -281,6 +283,16 @@ Tag Tagset::make_tag(idx_t pos_idx, mask_t values, bool allow_extra) const // << " of " << pos_required_attributes_idx_[pos_idx].size() << "\n"; size_t has_req = PwrNlp::count_bits_set(required_values & values); if (has_req != pos_required_attributes_idx_[pos_idx].size()) { + foreach (idx_t a, get_pos_attributes(pos_idx)) { + if (pos_requires_attribute(pos_idx, a)) { + mask_t amask = get_attribute_mask(a); + if ((values & amask).none()) { + throw TagParseError("Required attribute missing", + tag_to_string(Tag(get_pos_mask(pos_idx), values)), + get_attribute_name(a), id_string()); + } + } + } throw TagParseError("Required attribute missing", tag_to_string(Tag(get_pos_mask(pos_idx), values)), get_pos_name(pos_idx), id_string()); diff --git a/tagset-tool/main.cpp b/tagset-tool/main.cpp index 791d5ccd603d483697ee6135a14c5adb28b64542..c51a991152e9dfe8358a8a9b1a26f4be1d5d3f9b 100644 --- a/tagset-tool/main.cpp +++ b/tagset-tool/main.cpp @@ -92,11 +92,11 @@ void tagset_info(const Corpus2::Tagset& tagset) std::cerr << "Corpus2::Tagset loaded: " << tagset.pos_count() << " POSes, " << tagset.attribute_count() << " attributes, " - << tagset.value_count() << " values ["; + << tagset.value_count() << " values [ "; for (Corpus2::idx_t a = 0; a < tagset.attribute_count(); ++a) { std::cerr << tagset.get_attribute_values(a).size() << " "; } - std::cerr << "\n"; + std::cerr << "]\n"; std::cerr << "Size is " << tagset.size() << " (extra size is " << tagset.size_extra() << ")\n"; std::cerr << "POSes: ";