STL Associative Containers - hashtable

1. Hash Collisions

.1. Linear probing

  • Hash the element's value and take it modulo the array size to get an initial position H; if H is already occupied, probe the following slots H+1, H+2, ... one by one until a free slot is found.

.2. Quadratic probing

  • F(i) = i*i: if the initial insertion position for a new element is H but H is already occupied, the slots H+i^2 are tried in turn, for i = 1, 2, ..., n (see the sketch after the figure below).

https://lddpicture.oss-cn-beijing.aliyuncs.com/picture/image-20220412131138470.png
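
A minimal sketch of the two probe sequences above; the helper names and the plain modulo wrap-around are illustrative assumptions, not SGI code:

#include <cstddef>

//linear probing: on a collision at H, try H+1, H+2, ... (wrapping around the table)
std::size_t linear_probe(std::size_t h, std::size_t i, std::size_t table_size)
{
    return (h + i) % table_size;
}

//quadratic probing: on a collision at H, try H+1*1, H+2*2, ... (wrapping around the table)
std::size_t quadratic_probe(std::size_t h, std::size_t i, std::size_t table_size)
{
    return (h + i * i) % table_size;
}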

.3. Separate chaining (linked-list buckets)

https://lddpicture.oss-cn-beijing.aliyuncs.com/picture/image-20220412131821725.png

2. Data Structures

.1. hashtable node

template <class _Val>
struct _Hashtable_node
{
  _Hashtable_node* _M_next; //next node in the same bucket's singly linked list
  _Val _M_val;              //the value stored in the node
};

.2. Iterator

  • The iterator has no backward operation (operator--), nor does it define a reverse iterator.
  • Its forward operation first tries to advance one position (node) from the node it currently points to; since the nodes sit in a singly linked list, following the node's next pointer is all it takes.
  • If the current node happens to be the tail of its list, the iterator jumps to the next non-empty bucket and points at the head node of that bucket's list.
template <class _Val, class _Key, class _HashFcn,
class _ExtractKey, class _EqualKey, class _Alloc>
    struct _Hashtable_iterator {
        typedef hashtable<_Val,_Key,_HashFcn,_ExtractKey,_EqualKey,_Alloc>
            _Hashtable;
        typedef _Hashtable_iterator<_Val, _Key, _HashFcn, 
        _ExtractKey, _EqualKey, _Alloc>
            iterator;
        typedef _Hashtable_const_iterator<_Val, _Key, _HashFcn, 
        _ExtractKey, _EqualKey, _Alloc>
            const_iterator;
        typedef _Hashtable_node<_Val> _Node;

        typedef forward_iterator_tag iterator_category;
        typedef _Val value_type;
        typedef ptrdiff_t difference_type;
        typedef size_t size_type;
        typedef _Val& reference;
        typedef _Val* pointer;

        _Node* _M_cur; //the node the iterator currently points to
        _Hashtable* _M_ht;//keep a link to the container itself (we may need to jump from bucket to bucket)

        _Hashtable_iterator(_Node* __n, _Hashtable* __tab) 
            : _M_cur(__n), _M_ht(__tab) {}
        _Hashtable_iterator() {}
        reference operator*() const { return _M_cur->_M_val; }
        #ifndef __SGI_STL_NO_ARROW_OPERATOR
        pointer operator->() const { return &(operator*()); }
        #endif /* __SGI_STL_NO_ARROW_OPERATOR */
        iterator& operator++();
        iterator operator++(int);
        bool operator==(const iterator& __it) const
        { return _M_cur == __it._M_cur; }
        bool operator!=(const iterator& __it) const
        { return _M_cur != __it._M_cur; }
    };
template <class _Val, class _Key, class _HF, class _ExK, class _EqK, 
class _All>
    _Hashtable_iterator<_Val,_Key,_HF,_ExK,_EqK,_All>&
    _Hashtable_iterator<_Val,_Key,_HF,_ExK,_EqK,_All>::operator++()
    {
        const _Node* __old = _M_cur;
        _M_cur = _M_cur->_M_next; //if a next node exists, that is the answer; otherwise fall into the if below
        if (!_M_cur) {
            //use the old node's value to locate the bucket it was in; the head of the next non-empty bucket is our destination
            size_type __bucket = _M_ht->_M_bkt_num(__old->_M_val);
            while (!_M_cur && ++__bucket < _M_ht->_M_buckets.size())//note the prefix ++__bucket
                _M_cur = _M_ht->_M_buckets[__bucket];
        }
        return *this;
    }

template <class _Val, class _Key, class _HF, class _ExK, class _EqK, 
class _All>
    inline _Hashtable_iterator<_Val,_Key,_HF,_ExK,_EqK,_All>
    _Hashtable_iterator<_Val,_Key,_HF,_ExK,_EqK,_All>::operator++(int)
    {
        iterator __tmp = *this;
        ++*this; //调用operator++()
        return __tmp;
    }

.3. hashtable structure

  • _Val: the real value type stored in a node
  • _Key: the key type of a node
  • _HF: the type of the hash function
  • _Ex: how to extract the key from a node (a function or functor)
  • _Eq: how to decide whether two keys are equal (a function or functor)
  • _All: the allocator; std::alloc by default (a hash_set sketch showing how these parameters get filled in follows the class fragment below)
template <class Value, class Key, class HashFcn,
          class ExtractKey, class EqualKey,
          class Alloc>
class hashtable {
public:
  typedef Key key_type;
  typedef Value value_type;
  typedef HashFcn hasher;
  typedef EqualKey key_equal;

  typedef size_t            size_type;
  typedef ptrdiff_t         difference_type;
  typedef value_type*       pointer;
  typedef const value_type* const_pointer;
  typedef value_type&       reference;
  typedef const value_type& const_reference;

  hasher hash_funct() const { return hash; }
  key_equal key_eq() const { return equals; }

private:
  hasher hash;
  key_equal equals;
  ExtractKey get_key;

  typedef __hashtable_node<Value> node;
  typedef simple_alloc<node, Alloc> node_allocator;

  vector<node*,Alloc> buckets;
  size_type num_elements;

public:
  typedef __hashtable_iterator<Value, Key, HashFcn, ExtractKey, EqualKey, 
                               Alloc>
  iterator;

  typedef __hashtable_const_iterator<Value, Key, HashFcn, ExtractKey, EqualKey,
                                     Alloc>
  const_iterator;

  friend struct
  __hashtable_iterator<Value, Key, HashFcn, ExtractKey, EqualKey, Alloc>;
  friend struct
  __hashtable_const_iterator<Value, Key, HashFcn, ExtractKey, EqualKey, Alloc>;

public:
  hashtable(size_type n,
            const HashFcn&    hf,
            const EqualKey&   eql,
            const ExtractKey& ext)
    : hash(hf), equals(eql), get_key(ext), num_elements(0)
  {
    initialize_buckets(n);
  }

  hashtable(size_type n,
            const HashFcn&    hf,
            const EqualKey&   eql)
    : hash(hf), equals(eql), get_key(ExtractKey()), num_elements(0)
  {
    initialize_buckets(n);
  }

  hashtable(const hashtable& ht)
    : hash(ht.hash), equals(ht.equals), get_key(ht.get_key), num_elements(0)
  {
    copy_from(ht);
  }

  hashtable& operator= (const hashtable& ht)
  {
    if (&ht != this) {
      clear();
      hash = ht.hash;
      equals = ht.equals;
      get_key = ht.get_key;
      copy_from(ht);
    }
    return *this;
  }
    ...
};
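
To see how these template parameters are filled in in practice, here is roughly how SGI's hash_set sits on top of hashtable; this is a simplified sketch based on <stl_hash_set.h>, with most members omitted:

template <class Value, class HashFcn = hash<Value>,
          class EqualKey = equal_to<Value>, class Alloc = alloc>
class hash_set {
private:
  //the key of a hash_set element is the element itself, hence identity<Value>
  typedef hashtable<Value, Value, HashFcn, identity<Value>, EqualKey, Alloc> ht;
  ht rep; //the underlying hashtable does all the real work

public:
  typedef typename ht::key_type key_type;
  typedef typename ht::value_type value_type;
  //hash_set exposes only const iterators: keys must never be modified in place
  typedef typename ht::const_iterator iterator;

  hash_set() : rep(100, HashFcn(), EqualKey()) {} //ask for 100 buckets; rounded up to the prime 193

  pair<iterator, bool> insert(const value_type& obj)
  {
    pair<typename ht::iterator, bool> p = rep.insert_unique(obj);
    return pair<iterator, bool>(p.first, p.second);
  }
};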

3. Memory Management

_Node* _M_new_node(const value_type& __obj)
{
    _Node* __n = _M_get_node();
    __n->_M_next = 0;
    __STL_TRY {
      construct(&__n->_M_val, __obj);
      return __n;
    }
    __STL_UNWIND(_M_put_node(__n));
}
 
void _M_delete_node(_Node* __n)
{
    destroy(&__n->_M_val);
    _M_put_node(__n);
}
 
void _M_put_node(_Node* __p) { _M_node_allocator.deallocate(__p, 1); }
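
_M_get_node(), used by _M_new_node above but not shown, simply takes raw space for one node from the node allocator, mirroring _M_put_node; a one-line sketch (the exact allocator member differs slightly between SGI versions, so treat the name as an assumption):

_Node* _M_get_node() { return _M_node_allocator.allocate(1); } //raw, unconstructed memory for exactly one node
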
template <class _Val, class _Key, class _HashFcn,
          class _ExtractKey, class _EqualKey, class _Alloc>
class hashtable {
public:
  hashtable(size_type __n,
            const _HashFcn&    __hf,
            const _EqualKey&   __eql,
            const _ExtractKey& __ext,
            const allocator_type& __a = allocator_type())
    : __HASH_ALLOC_INIT(__a)
      _M_hash(__hf),
      _M_equals(__eql),
      _M_get_key(__ext),
      _M_buckets(__a),
      _M_num_elements(0)
  {
    _M_initialize_buckets(__n);
  }
private:
    void _M_initialize_buckets(size_type __n)
    {
      const size_type __n_buckets = _M_next_size(__n);//ask _M_next_size for the bucket count to use
      //e.g. passing 50 yields 53; reserve space for 53 bucket pointers, then fill them all with null
      _M_buckets.reserve(__n_buckets);
      _M_buckets.insert(_M_buckets.end(), __n_buckets, (_Node*) 0);
      _M_num_elements = 0;
   }
 
   //returns the smallest prime not less than n, by calling __stl_next_prime (a sketch follows this class fragment)
   size_type _M_next_size(size_type __n) const
       { return __stl_next_prime(__n); }
};
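
For reference, __stl_next_prime is just a lower_bound lookup into a fixed table of 28 primes that roughly double each time; a sketch of the SGI code, here using std::lower_bound so it compiles on its own:

#include <algorithm>

enum { __stl_num_primes = 28 };

static const unsigned long __stl_prime_list[__stl_num_primes] =
{
  53ul,         97ul,         193ul,        389ul,        769ul,
  1543ul,       3079ul,       6151ul,       12289ul,      24593ul,
  49157ul,      98317ul,      196613ul,     393241ul,     786433ul,
  1572869ul,    3145739ul,    6291469ul,    12582917ul,   25165843ul,
  50331653ul,   100663319ul,  201326611ul,  402653189ul,  805306457ul,
  1610612741ul, 3221225473ul, 4294967291ul
};

inline unsigned long __stl_next_prime(unsigned long __n)
{
  const unsigned long* __first = __stl_prime_list;
  const unsigned long* __last  = __stl_prime_list + (int)__stl_num_primes;
  const unsigned long* __pos   = std::lower_bound(__first, __last, __n); //first prime >= __n
  return __pos == __last ? *(__last - 1) : *__pos; //past the largest prime: return the largest
}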

4. Insertion

.1. insert_unique()

pair<iterator, bool> insert_unique(const value_type& __obj)
{
    resize(_M_num_elements + 1);          //check whether the table needs to be rebuilt (rehashed)
    return insert_unique_noresize(__obj); //insert the node assuming no rebuild is needed; duplicate keys are not allowed
}
template <class V, class K, class HF, class Ex, class Eq, class A>
void hashtable<V, K, HF, Ex, Eq, A>::resize(size_type num_elements_hint)
{
    const size_type old_n = buckets.size();
    if (num_elements_hint > old_n) {
        const size_type n = next_size(num_elements_hint);
        if (n > old_n) {
            vector<node*, A> tmp(n, (node*) 0);  //set up the new bucket vector
            __STL_TRY {
                //process each old bucket, relinking its nodes into the new bucket vector
                for (size_type bucket = 0; bucket < old_n; ++bucket) {
                    node* first = buckets[bucket];   //points to the head node of this bucket's list
                    //handle every node held in the old bucket
                    while (first) {
                        //work out which new bucket this node falls into
                        size_type new_bucket = bkt_num(first->val, n);
                        //make the old bucket point to the next node in its list
                        buckets[bucket] = first->next;
                        //splice the current node into the new bucket, making it the first node of that list
                        first->next = tmp[new_bucket];
                        tmp[new_bucket] = first;
                        //go back to the list the old bucket still points to and handle its next node
                        first = buckets[bucket];          
                    }
                }
                buckets.swap(tmp);
            }
            #         ifdef __STL_USE_EXCEPTIONS
            catch(...) {
                for (size_type bucket = 0; bucket < tmp.size(); ++bucket) {
                    while (tmp[bucket]) {
                        node* next = tmp[bucket]->next;
                        delete_node(tmp[bucket]);
                        tmp[bucket] = next;
                    }
                }
                throw;
            }
            #         endif /* __STL_USE_EXCEPTIONS */
        }
    }
}

https://lddpicture.oss-cn-beijing.aliyuncs.com/picture/image-20220412135623759.png

.2. insert_unique_noresize

template <class V, class K, class HF, class Ex, class Eq, class A>
pair<typename hashtable<V, K, HF, Ex, Eq, A>::iterator, bool> 
hashtable<V, K, HF, Ex, Eq, A>::insert_unique_noresize(const value_type& obj)
{
  const size_type n = bkt_num(obj); //decide which bucket (#n) obj belongs in
  node* first = buckets[n];   //make first point to the head of that bucket's list

  for (node* cur = first; cur; cur = cur->next) 
    if (equals(get_key(cur->val), get_key(obj)))
        //walk the bucket's list; if some node already holds the same key, do not insert and return at once
      return pair<iterator, bool>(iterator(cur, this), false);

  node* tmp = new_node(obj);
  tmp->next = first;  //make the new node the first node of the list
  buckets[n] = tmp;
  ++num_elements;      //one more node
  return pair<iterator, bool>(iterator(tmp, this), true);
}

.3. insert_equal

iterator insert_equal(const value_type& obj)
{
    resize(num_elements + 1);
    return insert_equal_noresize(obj);
}

.4. insert_equal_noresize

template <class V, class K, class HF, class Ex, class Eq, class A>
typename hashtable<V, K, HF, Ex, Eq, A>::iterator 
hashtable<V, K, HF, Ex, Eq, A>::insert_equal_noresize(const value_type& obj)
{
  const size_type n = bkt_num(obj);
  node* first = buckets[n];

  for (node* cur = first; cur; cur = cur->next) 
    if (equals(get_key(cur->val), get_key(obj))) {
        //walk the bucket's whole list; if some node holds the same key, insert the new node right after it and return an iterator to the new node
      node* tmp = new_node(obj);
      tmp->next = cur->next;
      cur->next = tmp;
      ++num_elements;
      return iterator(tmp, this);
    }

  node* tmp = new_node(obj);
  tmp->next = first;
  buckets[n] = tmp;
  ++num_elements;
  return iterator(tmp, this);
}

5. hash function

  • After an element is inserted we need to know which bucket it falls into. That is nominally the hash function's job, but SGI wraps it in one more layer: the task is first handed to the bkt_num() functions, which in turn call the hash function to obtain a value on which a modulus operation can be performed. Why do this? Because some element types, strings for example, cannot be taken modulo the table size directly, so a conversion is needed first.
  • The hash functions are what actually compute an element's position; bkt_num() calls them to get a value that can be taken modulo the hashtable size. For integral types such as char, int and long, most of the built-in hash functions do nothing at all and simply return the original value; see <stl_hash_fun.h>.
  • hashtable cannot handle element types beyond those listed above. For string, double, float and so on, users must define a hash function themselves (a sketch follows the bkt_num code below).
//version 1: takes a value and the number of buckets
size_type _M_bkt_num(const value_type& __obj, size_t __n) const
{
    return _M_bkt_num_key(_M_get_key(__obj), __n); //call version 4
}
 
//version 2: takes only a value
size_type _M_bkt_num(const value_type& __obj) const
{
    return _M_bkt_num_key(_M_get_key(__obj)); //call version 3
}
 
//version 3: takes only a key
size_type _M_bkt_num_key(const key_type& __key) const
{
    return _M_bkt_num_key(__key, _M_buckets.size()); //call version 4
}
 
//version 4: takes a key and the number of buckets
size_type _M_bkt_num_key(const key_type& __key, size_t __n) const
{
    return _M_hash(__key) % __n; //SGI's built-in hash() functors are defined in <stl_hash_fun.h>
}
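
As an example of the last point above, a user-supplied hash functor for std::string can reuse the same multiply-and-add recurrence that SGI's __stl_hash_string applies to const char*; a minimal sketch (the functor name is made up for illustration):

#include <string>
#include <cstddef>

//hash functor for std::string, in the style of SGI's __stl_hash_string
struct hash_std_string
{
    std::size_t operator()(const std::string& s) const
    {
        unsigned long h = 0;
        for (std::size_t i = 0; i < s.size(); ++i)
            h = 5 * h + s[i]; //same recurrence SGI uses for C strings
        return std::size_t(h);
    }
};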

6. copy_from & clear

template <class V, class K, class HF, class Ex, class Eq, class A>
void hashtable<V, K, HF, Ex, Eq, A>::clear()
{
  for (size_type i = 0; i < buckets.size(); ++i) {
    node* cur = buckets[i];
    //delete every node in the bucket
    while (cur != 0) {
      node* next = cur->next;
      delete_node(cur);
      cur = next;
    }
    buckets[i] = 0;  //leave the bucket holding a null pointer
  }
  num_elements = 0;   //total node count becomes 0
}
template <class V, class K, class HF, class Ex, class Eq, class A>
void hashtable<V, K, HF, Ex, Eq, A>::copy_from(const hashtable& ht)
{
  buckets.clear();
  //if our capacity already covers ht's bucket count, reserve does nothing; otherwise it grows our space
  buckets.reserve(ht.buckets.size());
  //at this point the buckets vector is empty, so its end is also its beginning
  buckets.insert(buckets.end(), ht.buckets.size(), (node*) 0);
  __STL_TRY {
    for (size_type i = 0; i < ht.buckets.size(); ++i) {
        //copy each element of the bucket vector
      if (const node* cur = ht.buckets[i]) {
        node* copy = new_node(cur->val);
        buckets[i] = copy;
		//copy every node in the bucket's list
        for (node* next = cur->next; next; cur = next, next = cur->next) {
          copy->next = new_node(next->val);
          copy = copy->next;
        }
      }
    }
    num_elements = ht.num_elements;
  }
  __STL_UNWIND(clear());
}

7. find & count

const_iterator find(const key_type& key) const
{
    size_type n = bkt_num_key(key);
    const node* first;
    for ( first = buckets[n];
         first && !equals(get_key(first->val), key);
         first = first->next)
    {}
    return const_iterator(first, this);
} 

size_type count(const key_type& key) const
{
    const size_type n = bkt_num_key(key);
    size_type result = 0;

    for (const node* cur = buckets[n]; cur; cur = cur->next)
        if (equals(get_key(cur->val), key))
            ++result;
    return result;
}

8. demo

#include <hash_set> //pulls in <stl_hashtable.h>
#include <iostream>
using namespace std;
 
int main()
{
	hashtable<int, int, hash<int>, identity<int>, equal_to<int>, alloc>
		iht(50, hash<int>(), equal_to<int>());
 
	std::cout << iht.size() << std::endl; //0
	std::cout << iht.bucket_count() << std::endl; //53, the first prime STL supplies
	std::cout << iht.max_bucket_count() << std::endl; //4294967291, the last prime STL supplies
 
	iht.insert_unique(59);
	iht.insert_unique(63);
	iht.insert_unique(108);
	iht.insert_unique(2);
	iht.insert_unique(53);
	iht.insert_unique(55);
	std::cout << iht.size() << std::endl; //6; this is hashtable<T>::num_elements
	
	
	//declare a hashtable iterator
	hashtable<int, int, hash<int>, identity<int>, equal_to<int>, alloc>::iterator
		ite = iht.begin();
		
	//traverse the hashtable
	for (int i = 0; i < iht.size(); ++i, ++ite)
		std::cout << *ite << std::endl; //53 55 2 108 59 63
	std::cout << std::endl;
 
	//walk every bucket; if its node count is not 0, print the count
	for (int i = 0; i < iht.bucket_count(); ++i) {
		int n = iht.elems_in_bucket(i);  //number of elements in this bucket
		if (n != 0)
			std::cout << "bucket[" << i << "] has " << n << " elems." << std::endl;
	}
	//prints the following
	//bucket[0] has 1 elems
	//bucket[2] has 3 elems
	//bucket[6] has 1 elems
	//bucket[10] has 1 elems
 
	/*To verify that the number of buckets (lists) is the size of the buckets vector
	(which follows from hashtable<T>::resize()), we deliberately push the element
	count up to 54 and watch whether a "table rebuild" (rehash) occurs.
	*/
	for (int i = 0; i <= 47; ++i)
		iht.insert_equal(i);
	std::cout << iht.size() << std::endl; //54, the number of elements (nodes)
	std::cout << iht.bucket_count() << std::endl; //97, the number of buckets
 
	//walk every bucket again; if its node count is not 0, print the count
	for (int i = 0; i < iht.bucket_count(); ++i) {
		int n = iht.elems_in_bucket(i);
		if (n != 0)
			std::cout << "bucket[" << i << "] has " << n << " elems." << std::endl;
	}
	//the output shows that bucket[2] and bucket[11] each hold 2 nodes
	//every other bucket among bucket[0]~bucket[47] holds 1 node
	//in addition, bucket[53], [55], [59] and [63] each hold 1 node
 
	//traverse the hashtable with an iterator and print every node's value
	ite = iht.begin();
	for (int i = 0; i < iht.size(); ++i, ++ite)
		std::cout << *ite << " ";
	std::cout << std::endl;
	//0 1 2 2 3 4 5 6 7 8 9 10 11 108 12 13 14 15 16 17 18 19 20 21
	//22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
	//43 44 45 46 47 53 55 59 63
 
	std::cout << *(iht.find(2)) << std::endl;  //2
	std::cout << iht.count(2) << std::endl; //2
 
	return 0;
}
