token.cpp
    /*
        Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
        Part of the libcorpus2 project
    
        This program is free software; you can redistribute it and/or modify it
    under the terms of the GNU General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option)
    any later version.
    
        This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE. 
    
        See the LICENSE and COPYING files for more details.
    */
    
    #include <libcorpus2/token.h>
    #include <libpwrutils/foreach.h>
    #include <libcorpus2/tokenmetadata.h>
    #include <algorithm>
    #include <set>
    #include <sstream>
    #include <boost/bind.hpp>
    
    namespace Corpus2 {
    
    Token::Token()
    	: orth_(), wa_(), lexemes_(), metadata_(NULL)
    {
    }
    
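    // Creates a token with the given orth and the amount of whitespace
    // that preceded it in the input.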
    Token::Token(const UnicodeString &orth, PwrNlp::Whitespace::Enum wa)
    	: orth_(orth), wa_(wa), lexemes_()
    {
    }
    
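    // Factory helper: builds a Token from a UTF-8 encoded orth, converting
    // it to an ICU UnicodeString.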
    Token* Token::create_utf8(const std::string& orth_utf8,
    		PwrNlp::Whitespace::Enum wa /*= PwrNlp::Whitespace::Space*/)
    {
    	return new Token(UnicodeString::fromUTF8(orth_utf8), wa);
    }
    
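    // Creates a deep copy of this token; the metadata, if present, is cloned as well.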
    Token* Token::clone() const
    {
    	Token* t = new Token();
    	t->orth_ = orth_;
    	t->wa_ = wa_;
    	t->lexemes_ = lexemes_;
    	if (metadata_.get()) {
    		t->set_metadata(metadata_->clone());
    	}
    	return t;
    }
    
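    // Returns the preferred lexeme according to the tagset; throws if the
    // token has no lexemes at all.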
    const Lexeme& Token::get_preferred_lexeme(const Tagset& tagset) const
    {
    	size_t idx = get_preferred_lexeme_index(tagset);
    	if (idx < lexemes_.size()) {
    		return lexemes_[idx];
    	} else {
    		throw Corpus2Error("No lexemes but best lexeme requested");
    	}
    }
    
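    // Strict weak ordering used with std::max_element to select the preferred
    // lexeme: disamb-marked lexemes win over unmarked ones, then lexemes whose
    // POS appears earlier in the tagset's original ordering, then the lexeme
    // that compares smaller.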
    struct preferred_lexeme_cmp
    {
    	const Tagset* tagset;
    
    	bool operator()(const Lexeme& l1, const Lexeme& l2) const {
    		return (!l1.is_disamb() && l2.is_disamb())
    				|| (l1.is_disamb() == l2.is_disamb()
    				&& (tagset->get_original_pos_index(l1.tag().get_pos_index()) >
    					tagset->get_original_pos_index(l2.tag().get_pos_index())
    				|| (l1.tag().get_pos() == l2.tag().get_pos()
    				&& l1 > l2)));
    	}
    };
    
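    // Returns the index of the preferred lexeme, or static_cast<size_t>(-1)
    // when the token has no lexemes.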
    size_t Token::get_preferred_lexeme_index(const Tagset& tagset) const
    {
    	if (lexemes_.empty()) {
    		return static_cast<size_t>(-1);
    	}
    	preferred_lexeme_cmp cmp;
    	cmp.tagset = &tagset;
    	std::vector<Lexeme>::const_iterator pref;
    	pref = std::max_element(lexemes_.begin(), lexemes_.end(), cmp);
    	return std::distance(lexemes_.begin(), pref);
    }
    
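    // True if at least one lexeme is marked as disambiguated.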
    bool Token::has_disamb_lexeme() const
    {
    	return std::find_if(lexemes().begin(), lexemes().end(),
    		boost::bind(&Lexeme::is_disamb, _1)) != lexemes().end();
    }
    
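    // Number of lexemes marked as disambiguated.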
    int Token::count_disamb_lexemes() const
    {
    	return std::count_if(lexemes().begin(), lexemes().end(),
    		boost::bind(&Lexeme::is_disamb, _1));
    }
    
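    // Returns a begin/end pair of filter iterators ranging over the
    // disamb-marked lexemes only.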
    std::pair<Token::lexeme_filter_iterator, Token::lexeme_filter_iterator> Token::disamb_lexemes() const
    {
    	lexeme_filter_iterator f1(boost::bind(&Lexeme::is_disamb, _1), lexemes().begin(), lexemes().end());
    	lexeme_filter_iterator f2(boost::bind(&Lexeme::is_disamb, _1), lexemes().end(), lexemes().end());
    	return std::make_pair(f1, f2);
    }
    
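    // Discards all lexemes and replaces them with a single "ign"
    // (unknown word) lexeme.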
    void Token::make_ign(const Tagset& tagset)
    {
    	lexemes_.clear();
    	add_ign(tagset);
    }
    
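    // Appends an "ign" (unknown word) lexeme whose lemma is the token's orth.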
    void Token::add_ign(const Tagset &tagset)
    {
    	Lexeme lex(orth_, tagset.make_ign_tag());
    	lexemes_.push_back(lex);
    }
    
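    // Tokens are equal if the orth, preceding whitespace and lexeme list
    // match; metadata is not compared.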
    bool Token::operator ==(const Token& other) const
    {
    	return orth_ == other.orth_ && wa_ == other.wa_
    			&& lexemes_ == other.lexemes_;
    }
    
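    // True if the lexeme list contains duplicate entries.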
    bool Token::check_duplicate_lexemes() const
    {
    	std::set<Lexeme> s(lexemes_.begin(), lexemes_.end());
    	return s.size() != lexemes_.size();
    }
    
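    // Sorts the lexemes and erases adjacent duplicates as determined by
    // Lexeme::DisamblessComparator (equality that ignores the disamb flag);
    // returns true if any lexeme was removed.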
    bool Token::remove_duplicate_lexemes()
    {
    	size_t old_size = lexemes_.size();
    	std::sort(lexemes_.begin(), lexemes_.end());
    	lexemes_.erase(std::unique(lexemes_.begin(), lexemes_.end(), Lexeme::DisamblessComparator()),
    			lexemes_.end());
    	return old_size != lexemes_.size();
    }
    
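    // Checks the token against an optional orth and an optional POS mask:
    // a non-empty orth must match the token's orth case-insensitively, and a
    // non-empty mask must equal the POS of every lexeme.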
    bool Token::orth_pos_match(mask_t pos, const UnicodeString &orth) const
    {
    	if (orth.length() > 0) {
    		if (orth.caseCompare(orth_, 0) != 0) return false;
    	}
    	if (pos.any()) {
    		foreach (const Lexeme& lex, lexemes_) {
    			if (lex.tag().get_pos() != pos) return false;
    		}
    	}
    	return true;
    }
    
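    // Attaches a fresh, empty TokenMetaData object to the token.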
    void Token::create_metadata()
    {
    	metadata_.reset(new TokenMetaData);
    }
    
    } /* end ns Corpus2 */