Skip to content
Snippets Groups Projects
Select Git revision
  • 90e4ea15ffaac89643678278c5e3ac57a8eeae9e
  • master default protected
  • develop protected
  • feat_remove_attr
  • python2.7
  • python3.8
6 results

xmlreader.cpp

Blame
  • xmlreader.cpp 6.38 KiB
    /*
        Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
        Part of the libcorpus2 project
    
        This program is free software; you can redistribute it and/or modify it
    under the terms of the GNU General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option)
    any later version.
    
        This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE. 
    
        See the LICENSE and COPYING files for more details.
    */
    
    #include <libcorpus2/io/xmlreader.h>
    #include <libpwrutils/foreach.h>
    #include <libxml++/libxml++.h>
    #include <libxml2/libxml/parser.h>
    #include <boost/make_shared.hpp>
    #include <fstream>
    
    namespace Corpus2 {
    
    
    /**
     * Set up a reader in its initial state (STATE_NONE, no token, sentence or
     * chunk under construction).
     *
     * @param base_reader  reader used later to create sentences
     *                     (make_sentence) and to parse tag strings (parse_tag)
     * @param obuf         output deque; finished chunks are pushed to its back
     *                     by finish_chunk() / finish_sentence()
     *
     * The first token gets Newline as its preceding whitespace. Both
     * disamb-related switches start off and both warning switches start on.
     */
    XmlReader::XmlReader(const TokenReader& base_reader,
    		std::deque< boost::shared_ptr<Chunk> >& obuf)
    	: BasicSaxParser()
    	, base_reader_(base_reader)
    	, state_(STATE_NONE)
    	, chunkless_(false)
    	, out_of_chunk_(false)
    	, wa_(PwrNlp::Whitespace::Newline)
    	, sbuf_()
    	, tok_(NULL)
    	, sent_()
    	, chunk_()
    	, obuf_(obuf)
    	, disamb_only_(false)
    	, disamb_sh_(false)
    	, warn_on_inconsistent_(true)
    	, warn_on_unexpected_(true)
    {
    }
    
    /**
     * Dispose of a token that was started but never finished.
     *
     * tok_ is allocated in start_token() and set back to NULL in
     * finish_token() once it has been appended to the sentence, so it is
     * non-NULL here only if parsing stopped in the middle of a token.
     */
    XmlReader::~XmlReader()
    {
    	// Deleting a NULL pointer is a no-op, so no check is needed.
    	delete tok_;
    }
    
    std::string XmlReader::get_type_from_attributes(const AttributeList& attributes) const
    {
    	std::string type;
    	foreach (const Attribute& a, attributes) {
    		if (a.name == "type") {
    			type = a.value;
    		}
    	}
    	return type;
    }
    
    
    /**
     * Handler for an opening XML tag (presumably the SAX callback inherited
     * through BasicSaxParser -- confirm against the header).
     *
     * Dispatches on the pair (current state_, tag name) and advances the
     * reader's state machine. The branches are tried strictly in the order
     * written; the first one whose condition holds is taken, so the order
     * matters and must not be changed casually.
     *
     * @param name        name of the element being opened
     * @param attributes  attributes of that element
     */
    void XmlReader::on_start_element(const Glib::ustring &name,
    		const AttributeList& attributes)
    {
    	//std::cerr << name << state_ << "\n";
    	if (state_ == STATE_NONE && name == "chunk") {
    		// top-level <chunk>: may itself be a sentence, see start_chunk()
    		start_chunk(attributes);
    	} else if (state_ == STATE_CHUNK && name == sentence_tag_name_) {
    		// sentence element nested in a chunk
    		start_sentence(attributes);
    	} else if (state_ == STATE_SENTENCE && name == "tok") {
    		start_token(attributes);
    	} else if (state_ == STATE_TOK && name == "orth") {
    		// collect character data until </orth>; consumed in on_end_element()
    		state_ = STATE_ORTH;
    		grab_characters_ = true;
    		clear_buf();
    	} else if (state_ == STATE_TOK && name == "lex") {
    		// may enter STATE_LEX or STATE_LEX_SKIP, see start_lexeme()
    		start_lexeme(attributes);
    	} else if (state_ == STATE_LEX && name == "base") {
    		// collect the lemma text until </base>
    		state_ = STATE_LEMMA;
    		grab_characters_ = true;
    		clear_buf();
    	} else if (state_ == STATE_LEX && name == "ctag") {
    		// collect the tag string until </ctag>
    		state_ = STATE_TAG;
    		grab_characters_ = true;
    		clear_buf();
    	} else if (state_ == STATE_LEX_SKIP &&
    		(name == "lex" || name == "base" || name == "ctag")) {
    		// inside a lexeme that is being skipped: ignore its children
    		//nop
    	} else if (name == "ns") {
    		// "no space" marker, accepted in any state: the next token started
    		// by start_token() will receive Whitespace::None as its wa
    		wa_ = PwrNlp::Whitespace::None;
    	} else if (state_ == STATE_NONE && name == "tok") {
    		// token outside of any chunk: synthesise an enclosing chunk and
    		// sentence so the token has somewhere to go
    		if (warn_on_inconsistent_) {
    			std::cerr << "Warning: out-of-chunk token, assuming sentence start on line ";
    			std::cerr << this->context_->input->line << "\n";
    		}
    		AttributeList fake;
    		// empty attribute list => no type => start_chunk() opens a plain
    		// chunk (STATE_CHUNK) rather than a sentence
    		start_chunk(fake);
    		// start_sentence() insists on type="s", so supply it
    		fake.push_back(Attribute("type", "s"));
    		start_sentence(fake);
    		// chunkless_ makes finish_sentence() emit the chunk as well;
    		// out_of_chunk_ lets a later <chunk> close this implicit sentence
    		chunkless_ = true;
    		out_of_chunk_ = true;
    		start_token(attributes);
    	} else if (state_ == STATE_NONE && name == "cesAna") {
    		// document wrapper element, nothing to do
    		//nop
    	} else if (state_ == STATE_NONE && name == "chunkList") {
    		// document wrapper element, nothing to do
    		//nop
    	} else if (process_start_element(name, attributes)) {
    		// extension hook claimed the tag (the implementation in this file
    		// always returns false)
    		//nop
    	} else if (out_of_chunk_ && state_ == STATE_SENTENCE && name == "chunk") {
    		// a real <chunk> appears while an implicit out-of-chunk sentence is
    		// open: close the implicit sentence first, then open the chunk
    		finish_sentence();
    		out_of_chunk_ = false;
    		start_chunk(attributes);
    	} else if (warn_on_unexpected_) {
    		// nothing matched: report the tag, the input line and the state
    		std::cerr << "Unexpected tag <" << name << "> on line ";
    		std::cerr << this->context_->input->line << " (" << state_ << ")\n";
    	}
    }
    
    /**
     * Extension hook consulted by on_start_element() for opening tags that
     * none of its built-in state/name rules matched (presumably meant to be
     * overridden in subclasses -- confirm in the header).
     *
     * @return true if the tag was handled, in which case on_start_element()
     *         does nothing more; this implementation handles nothing and
     *         always returns false. Both parameters are unused here.
     */
    bool XmlReader::process_start_element(const Glib::ustring & /*name*/,
    		const AttributeList & /*attributes*/)
    {
    	return false;
    }
    
    /**
     * Extension hook called by on_end_element() for closing tags that none of
     * its built-in state/name rules matched (presumably meant to be overridden
     * in subclasses -- confirm in the header).
     *
     * @return this implementation always returns false; note that
     *         on_end_element() in this file ignores the returned value.
     *         The parameter is unused here.
     */
    bool XmlReader::process_end_element(const Glib::ustring & /*name*/)
    {
    	return false;
    }
    
    
    void XmlReader::start_chunk(const AttributeList& attributes)
    {
    	std::string type = get_type_from_attributes(attributes);
    	chunk_ = boost::make_shared<Chunk>();
    	if (type == "s") {
    		// top-level chunk is a sentence
    		start_sentence(attributes);
    		chunkless_ = true;
    	} else {
    		foreach (const Attribute& a, attributes) {
    			chunk_->set_attribute(a.name, a.value);
    		}
    		state_ = STATE_CHUNK;
    	}
    }
    
    /**
     * Begin a new sentence.
     *
     * @param attributes  attributes of the sentence element; its "type"
     *                    attribute must be exactly "s"
     * @throws XcesError  when the "type" attribute is missing or is not "s"
     *
     * On success a sentence object is obtained from the base reader and the
     * reader moves to STATE_SENTENCE.
     */
    void XmlReader::start_sentence(const AttributeList &attributes)
    {
    	if (get_type_from_attributes(attributes) != "s") {
    		throw XcesError("Sub level <chunk> not type=\"s\"");
    	}
    	sent_ = base_reader_.make_sentence();
    	state_ = STATE_SENTENCE;
    }
    
    /**
     * Begin a new token and move to STATE_TOK.
     *
     * The token receives the currently pending preceding-whitespace value
     * (wa_). Afterwards wa_ falls back to Space, so the following token is
     * preceded by a space unless an <ns> element changes that in between.
     * The element's attributes are not used.
     */
    void XmlReader::start_token(const AttributeList & /*attributes*/)
    {
    	state_ = STATE_TOK;
    	// owned by this reader until finish_token() passes it to the sentence
    	tok_ = new Token();
    	tok_->set_wa(wa_);
    	// default for the next token
    	wa_ = PwrNlp::Whitespace::Space;
    }
    
    /**
     * Begin a lexeme of the current token, or decide to skip it.
     *
     * The disamb flag of the lexeme is derived from the attributes:
     *  - normally (disamb_sh_ off) the lexeme is disamb only if it carries
     *    disamb="1";
     *  - with disamb_sh_ on the lexeme is disamb unless it carries
     *    disamb_sh="0".
     *
     * When disamb_only_ is set and the lexeme is not disamb, nothing is added
     * to the token and the reader enters STATE_LEX_SKIP; otherwise an empty
     * lexeme with the computed flag is appended and the reader enters
     * STATE_LEX.
     *
     * @param attributes  attributes of the lex element
     */
    void XmlReader::start_lexeme(const AttributeList &attributes)
    {
    	assert(tok_ != NULL);
    	// default: not disamb in the normal mode, disamb in the disamb_sh mode
    	bool is_disamb = disamb_sh_;
    	foreach (const Attribute& attr, attributes) {
    		if (disamb_sh_) {
    			if (attr.name == "disamb_sh" && attr.value == "0") {
    				is_disamb = false;
    			}
    		} else {
    			if (attr.name == "disamb" && attr.value == "1") {
    				is_disamb = true;
    			}
    		}
    	}
    	if (disamb_only_ && !is_disamb) {
    		// unwanted lexeme: its children are ignored until </lex>
    		state_ = STATE_LEX_SKIP;
    		return;
    	}
    	tok_->add_lexeme(Lexeme());
    	tok_->lexemes().back().set_disamb(is_disamb);
    	state_ = STATE_LEX;
    }
    
    /**
     * Close the current chunk: push it to the back of the output buffer,
     * release this reader's handle to it and return to STATE_NONE.
     *
     * Expects a chunk to be open (checked by assert).
     */
    void XmlReader::finish_chunk()
    {
    	assert(chunk_);
    	obuf_.push_back(chunk_);
    	// the output buffer keeps its own shared_ptr; drop ours
    	chunk_.reset();
    	state_ = STATE_NONE;
    }
    
    /**
     * Close the current sentence and append it to the current chunk.
     *
     * In the regular case the reader goes back to STATE_CHUNK and waits for
     * more sentences. When chunkless_ is set (the sentence had no separate
     * enclosing chunk element) the chunk is emitted to the output buffer as
     * well, the reader returns to STATE_NONE and chunkless_ is cleared.
     *
     * Expects a chunk to be open (checked by assert).
     */
    void XmlReader::finish_sentence()
    {
    	assert(chunk_);
    	chunk_->append(sent_);
    	sent_.reset();
    	if (!chunkless_) {
    		state_ = STATE_CHUNK;
    		return;
    	}
    	// the sentence was its own chunk: emit the chunk now
    	obuf_.push_back(chunk_);
    	chunk_.reset();
    	state_ = STATE_NONE;
    	chunkless_ = false;
    }
    
    /**
     * Close the current token: append it to the current sentence and return
     * to STATE_SENTENCE.
     *
     * Expects a sentence to be open (checked by assert).
     */
    void XmlReader::finish_token()
    {
    	assert(sent_);
    	sent_->append(tok_);
    	// the pointer has been passed on; make sure the destructor does not
    	// delete it
    	tok_ = NULL;
    	state_ = STATE_SENTENCE;
    }
    
    /**
     * Handler for a closing XML tag.
     *
     * Each rule below matches one (state_, tag name) pair, performs the
     * corresponding action and returns; the rules are tried in order. Closing
     * tags that match no rule are forwarded to process_end_element(), whose
     * result is ignored.
     *
     * @param name  name of the element being closed
     */
    void XmlReader::on_end_element(const Glib::ustring &name)
    {
    	//std::cerr << "/" << name << state_ << "\n";
    	if (state_ == STATE_ORTH && name == "orth") {
    		// the buffered characters are the token's orthographic form
    		tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
    		grab_characters_ = false;
    		state_ = STATE_TOK;
    		return;
    	}
    	if (state_ == STATE_LEMMA && name == "base") {
    		// the buffered characters are the lemma of the newest lexeme
    		tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
    		grab_characters_ = false;
    		state_ = STATE_LEX;
    		return;
    	}
    	if (state_ == STATE_TAG && name == "ctag") {
    		// the buffered characters are a tag string for the base reader
    		Tag parsed_tag = base_reader_.parse_tag(get_buf());
    		tok_->lexemes().back().set_tag(parsed_tag);
    		grab_characters_ = false;
    		state_ = STATE_LEX;
    		return;
    	}
    	if (name == "lex" && (state_ == STATE_LEX || state_ == STATE_LEX_SKIP)) {
    		// end of a lexeme, whether it was kept or skipped
    		state_ = STATE_TOK;
    		return;
    	}
    	if (state_ == STATE_TOK && name == "tok") {
    		finish_token();
    		return;
    	}
    	if (state_ == STATE_SENTENCE && name == sentence_tag_name_) {
    		finish_sentence();
    		return;
    	}
    	if (state_ == STATE_CHUNK && name == "chunk") {
    		finish_chunk();
    		return;
    	}
    	process_end_element(name);
    }
    
    
    } /* end ns Corpus2 */