Skip to content
Snippets Groups Projects
Select Git revision
  • e862fc89f09089c9fa1215256587393d75041c8f
  • master default protected
  • develop protected
  • feat_remove_attr
  • python2.7
  • python3.8
6 results

symboldictionary.h

Blame
  • user avatar
    ilor authored
    Libcorpus2 now contains the base structures (tokens etc.), their I/O and related classes.
    Libpwrutils (name TBD) contains basic building blocks common to all tagger-related libraries.
    Tagset-tool only needs libcorpus, so it was brought here as well.
    e862fc89
    History
    symboldictionary.h 4.57 KiB
    #ifndef LIBCORPUS2_SYMBOLDICTIONARY_H
    #define LIBCORPUS2_SYMBOLDICTIONARY_H
    
    #include <libcorpus2/typedefs.h>

    #include <boost/range.hpp>

    #include <algorithm>
    #include <cstring>
    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>
    
    namespace Corpus2 {
    
    /**
     * Bidirectional string <-> index dictionary.
     *
     * The dictionary is filled from a set of strings; afterwards every
     * string is identified by its position in the internally kept vector,
     * which is expected to be sorted (lookup by string is a binary search).
     *
     * IndexT is the index type and should be some form of an integer. The
     * number of indices in use should be less than the maximum value of
     * IndexT. The strings should be non-empty and free of duplicates.
     */
    template <typename IndexT>
    class SymbolDictionary
    {
    public:
    	/// Construct a dictionary with no entries.
    	SymbolDictionary();

    	/// Replace the contents with a copy of the given strings, sorting
    	/// the copy.
    	void load_data(const std::vector<std::string>& data);

    	/// Replace the contents with a copy of the given strings without
    	/// sorting them; the caller is trusted to pass sorted input.
    	void load_sorted_data(const std::vector<std::string>& data);

    	/// Tell whether idx refers to an existing entry of this dictionary.
    	bool is_id_valid(IndexT idx) const;

    	/// Number of strings stored in this dictionary.
    	size_t size() const;

    	/**
    	 * Look up the index of a string given as a C string.
    	 *
    	 * Forwards to the range overload; it exists so that passing a
    	 * static char array is not ambiguous or surprising.
    	 */
    	IndexT get_id(const char* c) const;

    	/**
    	 * Look up the index of a string given as a character range.
    	 *
    	 * @returns the index of the matching entry, or an index equivalent
    	 * to -1 when there is no such entry
    	 */
    	IndexT get_id(const string_range& r) const;

    	/// Return the string stored under the given index, or a reference
    	/// to an empty string when the index is not valid.
    	const std::string& get_string(IndexT id) const;

    	/// Container-like typedefs and accessors so that the dictionary can
    	/// be iterated over (e.g. with foreach()).
    	typedef typename std::vector<std::string>::iterator iterator;
    	typedef typename std::vector<std::string>::const_iterator
    			const_iterator;
    	typedef typename std::vector<std::string>::value_type value_type;

    	iterator begin() { return data_.begin(); }

    	const_iterator begin() const { return data_.begin(); }

    	iterator end() { return data_.end(); }

    	const_iterator end() const { return data_.end(); }

    	/**
    	 * Fill map with (index here -> index in other) pairs for every
    	 * string that is present in both dictionaries. Strings missing
    	 * from other get no entry in the map.
    	 */
    	void create_mapping_to(const SymbolDictionary<IndexT>& other,
    			std::map<IndexT, IndexT>& map) const;

    private:
    	/// The stored strings (a sorted vector); an entry's position is
    	/// its index.
    	std::vector<std::string> data_;

    	/// Empty string handed out by get_string for invalid indices.
    	static std::string nullstr;
    };
    
    /// Out-of-class definition of the static empty string that get_string
    /// returns for invalid indices. It is default-constructed, i.e. empty.
    template <typename IndexT>
    std::string SymbolDictionary<IndexT>::nullstr;
    
    /// Default constructor: the dictionary starts out with no strings.
    template <typename IndexT>
    SymbolDictionary<IndexT>::SymbolDictionary() : data_() {}
    
    /// Replace the dictionary contents with a sorted copy of the given
    /// strings. The argument itself is left untouched.
    template <typename IndexT>
    void SymbolDictionary<IndexT>::load_data(
    		const std::vector<std::string> &data)
    {
    	// Sort a private copy first, then install it as the new contents.
    	std::vector<std::string> sorted_copy(data);
    	std::sort(sorted_copy.begin(), sorted_copy.end());
    	data_.swap(sorted_copy);
    }
    
    /// Replace the dictionary contents with a copy of the given strings,
    /// keeping their order. No sorting is done here, so the caller must
    /// pass data that is already sorted.
    template <typename IndexT>
    void SymbolDictionary<IndexT>::load_sorted_data(
    		const std::vector<std::string> &data)
    {
    	data_.assign(data.begin(), data.end());
    }
    
    /// Check whether idx, converted to size_t, is a position inside the
    /// stored vector. The "-1" index returned by a failed get_id converts
    /// to a huge unsigned value and is therefore reported as invalid.
    template <typename IndexT>
    bool SymbolDictionary<IndexT>::is_id_valid(IndexT idx) const
    {
    	const size_t position = static_cast<size_t>(idx);
    	return position < data_.size();
    }
    
    /// Return how many strings the dictionary currently holds.
    template <typename IndexT>
    size_t SymbolDictionary<IndexT>::size() const
    {
    	const size_t entry_count = data_.size();
    	return entry_count;
    }
    
    /**
     * Look up the index of a NUL-terminated C string.
     *
     * The characters up to (not including) the terminator are wrapped in
     * an iterator pair and handed to the range overload of get_id.
     *
     * @param c the string to look up; a null pointer is treated as a
     *          string that is not in the dictionary
     * @returns whatever the range overload returns for the string, or an
     *          index equivalent to -1 when c is null
     */
    template <typename IndexT>
    IndexT SymbolDictionary<IndexT>::get_id(const char *c) const
    {
    	// strlen on a null pointer is undefined behaviour, so answer
    	// "not found" instead of dereferencing it.
    	if (!c) {
    		return static_cast<IndexT>(-1);
    	}
    	return get_id(std::make_pair(c, c + std::strlen(c)));
    }
    
    /**
     * Look up the index of a string given as a character range.
     *
     * Runs std::equal_range over the stored vector, so the vector has to
     * be sorted for the result to be meaningful.
     *
     * @returns the position of the first entry equivalent to r, or an
     *          index equivalent to -1 when there is no such entry
     */
    template <typename IndexT>
    IndexT SymbolDictionary<IndexT>::get_id(const string_range& r) const
    {
    	const std::pair<const_iterator, const_iterator> bounds =
    			std::equal_range(data_.begin(), data_.end(), r);
    	// An empty [first, second) span means nothing matched.
    	if (bounds.first == bounds.second) {
    		return static_cast<IndexT>(-1);
    	}
    	return static_cast<IndexT>(bounds.first - data_.begin());
    }
    
    /**
     * Return the string stored under the given index.
     *
     * @returns a reference to the stored string, or a reference to the
     *          shared empty string when id is not below the number of
     *          stored entries
     */
    template <typename IndexT>
    const std::string& SymbolDictionary<IndexT>::get_string(IndexT id) const
    {
    	// Out-of-range indices (including the "-1" produced by a failed
    	// get_id) yield the empty string instead of an out-of-bounds read.
    	if (!(id < data_.size())) {
    		return nullstr;
    	}
    	return data_[static_cast<size_t>(id)];
    }
    
    /**
     * Build an index translation table from this dictionary to another.
     *
     * For every index i of this dictionary, the string stored under i is
     * looked up in other; when other has it, the pair (i, index in other)
     * is inserted into map. Strings absent from other produce no entry.
     * Entries already present in map are left as they are, because
     * std::map::insert does not overwrite existing keys.
     *
     * @param other the dictionary whose indices are the mapping targets
     * @param map   output map receiving the (this index, other index) pairs
     */
    template <typename IndexT>
    void SymbolDictionary<IndexT>::create_mapping_to(
    		const SymbolDictionary<IndexT> &other,
    		std::map<IndexT, IndexT> &map) const
    {
    	for (IndexT i = static_cast<IndexT>(0); i < size(); ++i) {
    		// get_string returns a reference into this dictionary; bind
    		// it instead of copying the string on every iteration.
    		const std::string& name = get_string(i);
    		const IndexT t = other.get_id(name);
    		if (other.is_id_valid(t)) {
    			map.insert(std::make_pair(i, t));
    		}
    	}
    }
    
    } /* end ns Corpus2 */
    
    #endif // LIBCORPUS2_SYMBOLDICTIONARY_H