diff --git a/libcorpus2/io/xmlreader.cpp b/libcorpus2/io/xmlreader.cpp
index 85ba6e99dc20a9a0b05c268bc47e2d09a332dac8..7d5ed6ac0845ad51a970686c96d8aa8791a994c1 100644
--- a/libcorpus2/io/xmlreader.cpp
+++ b/libcorpus2/io/xmlreader.cpp
@@ -77,6 +77,8 @@ void XmlReader::on_start_element(const Glib::ustring &name,
 		state_ = STATE_TAG;
 		grab_characters_ = true;
 		clear_buf();
+	} else if (state_ == STATE_LEX_SKIP && (name == "lex" || name == "base" || name == "ctag")) {
+		// nop: elements nested in a <lex> that start_lexeme() chose to skip are ignored
 	} else if (name == "ns") {
 		wa_ = PwrNlp::Whitespace::None;
 	} else if (state_ == STATE_NONE && name == "tok") {
@@ -175,6 +177,8 @@ void XmlReader::start_lexeme(const AttributeList &attributes)
 		tok_->add_lexeme(Lexeme());
 		tok_->lexemes().back().set_disamb(is_disamb);
 		state_ = STATE_LEX;
+	} else {
+		state_ = STATE_LEX_SKIP;
 	}
 }
 
@@ -225,7 +229,7 @@ void XmlReader::on_end_element(const Glib::ustring &name)
 		tok_->lexemes().back().set_tag(tag);
 		grab_characters_ = false;
 		state_ = STATE_LEX;
-	} else if (state_ == STATE_LEX && name == "lex") {
+	} else if ((state_ == STATE_LEX || state_ == STATE_LEX_SKIP) && name == "lex") {
 		state_ = STATE_TOK;
 	} else if (state_ == STATE_TOK && name == "tok") {
 		finish_token();
diff --git a/libcorpus2/io/xmlreader.h b/libcorpus2/io/xmlreader.h
index 7b875710ef8e0043461447c7b94e101ffddb9c10..d9234d4c38fd8e437f15bd8bebf7ab17dea8ad16 100644
--- a/libcorpus2/io/xmlreader.h
+++ b/libcorpus2/io/xmlreader.h
@@ -84,6 +84,8 @@ protected:
 	static const int STATE_LEX = 5;
 	static const int STATE_LEMMA = 6;
 	static const int STATE_TAG = 7;
+	static const int STATE_LEX_SKIP = 8; ///< inside a <lex> whose lexeme is not added to the token
+
 	/// The state of the parser
 	int state_;
 