bitbitbyte
diff --git a/‎07_哈希表/hashtable.py
Lines changed: 50 additions & 52 deletions b/‎07_哈希表/hashtable.py
Lines changed: 50 additions & 52 deletions
diff --git a/‎07_哈希表/hashtable/index.html
Lines changed: 24 additions & 1 deletion b/‎07_哈希表/hashtable/index.html
Lines changed: 24 additions & 1 deletion
diff --git a/‎08_字典/__pycache__/dict_adt.cpython-27-PYTEST.pyc
11.5 KB b/‎08_字典/__pycache__/dict_adt.cpython-27-PYTEST.pyc
11.5 KB
@@ -5,9 +5,9 @@
 
 class Array(object):
 
-    def __init__(self, size=32):
+    def __init__(self, size=32, init=None):
         self._size = size
-        self._items = [None] * size
+        self._items = [init] * size
 
     def __getitem__(self, index):
         return self._items[index]
@@ -41,96 +41,88 @@ def __init__(self, key, value):
 
 class HashTable(object):
 
-    UNUSED = None    # 没被使用过的槽，作为该类变量的一个单例，下边都是is 判断
-    EMPTY = Slot(None, None)     # 使用过但是被删除的槽
+    UNUSED = None  # 没被使用过
+    EMPTY = Slot(None, None)  # 使用却被删除过
 
     def __init__(self):
-        self._table = Array(7)
+        self._table = Array(8, init=HashTable.UNUSED)   # 表长保持为 2 的 i 次方 (2**i)
         self.length = 0
 
     @property
     def _load_factor(self):
-        # load factor 超过 2/3 就重新分配空间
+        # load_factor 超过 0.8 重新分配
         return self.length / float(len(self._table))
 
     def __len__(self):
         return self.length
 
-    def _hash1(self, key):
-        """ 计算key的hash值"""
+    def _hash(self, key):
         return abs(hash(key)) % len(self._table)
 
-    def _find_slot(self, key, for_insert=False):
-        """_find_slot
-
-        :param key:
-        :param for_insert: 是否插入，还是仅仅查询
-        :return:  slot index or None
-        """
-        index = self._hash1(key)
-        base_index = index
-        hash_times = 1
+    def _find_key(self, key):
+        index = self._hash(key)
         _len = len(self._table)
-
-        if not for_insert:  # 查找是否存在 key
-            while self._table[index] is not HashTable.UNUSED:
-                if self._table[index] is HashTable.EMPTY:
-                    index = (base_index + hash_times * hash_times) % _len    # 一个简单的二次方探查
-                    continue
-                elif self._table[index].key == key:
-                    return index
-                index = (base_index + hash_times * hash_times) % _len
-                hash_times += 1
-            return None
-        else:
-            while not self._slot_can_insert(index):  # 循环直到找到一个可以插入的槽
-                index = (base_index + hash_times * hash_times) % _len
-                hash_times += 1
-            return index
+        while self._table[index] is not HashTable.UNUSED:
+            if self._table[index] is HashTable.EMPTY:
+                index = (index*5 + 1) % _len
+                continue
+            elif self._table[index].key == key:
+                return index
+            else:
+                index = (index*5 + 1) % _len
+        return None
+
+    def _find_slot_for_insert(self, key):
+        index = self._hash(key)
+        _len = len(self._table)
+        while not self._slot_can_insert(index):
+            index = (index*5 + 1) % _len
+        return index
 
     def _slot_can_insert(self, index):
         return (self._table[index] is HashTable.EMPTY or self._table[index] is HashTable.UNUSED)
 
-    def __contains__(self, key):   # in operator
-        index = self._find_slot(key, for_insert=False)
+    def __contains__(self, key):  # in operator
+        index = self._find_key(key)
         return index is not None
 
     def add(self, key, value):
-        if key in self:    # key 相同值不一样的时候，用新的值
-            index = self._find_slot(key, for_insert=False)
+        if key in self:
+            index = self._find_key(key)
             self._table[index].value = value
             return False
         else:
-            index = self._find_slot(key, for_insert=True)
+            index = self._find_slot_for_insert(key)
             self._table[index] = Slot(key, value)
             self.length += 1
-            if self._load_factor >= 0.8:    # 注意超过了 阈值 rehashing
+            if self._load_factor >= 0.8:
                 self._rehash()
             return True
 
     def _rehash(self):
         old_table = self._table
-        newsize = len(self._table) * 2 + 1   # 扩大 2*n + 1
-        self._table = Array(newsize)
+        newsize = len(self._table) * 2
+        self._table = Array(newsize, HashTable.UNUSED)
 
         self.length = 0
 
         for slot in old_table:
             if slot is not HashTable.UNUSED and slot is not HashTable.EMPTY:
-                index = self._find_slot(slot.key, for_insert=True)
+                index = self._find_slot_for_insert(slot.key)
                 self._table[index] = slot
                 self.length += 1
 
     def get(self, key, default=None):
-        index = self._find_slot(key, for_insert=False)
+        index = self._find_key(key)
         if index is None:
             return default
         else:
             return self._table[index].value
 
     def remove(self, key):
-        assert key in self, 'keyerror'
-        index = self._find_slot(key, for_insert=False)
+        index = self._find_key(key)
+        if index is None:
+            raise KeyError()
         value = self._table[index].value
         self.length -= 1
         self._table[index] = HashTable.EMPTY
@@ -139,28 +131,34 @@ def remove(self, key):
     def __iter__(self):
         for slot in self._table:
             if slot not in (HashTable.EMPTY, HashTable.UNUSED):
-                yield slot.key   # 和 python dict 一样，默认遍历 key，需要value 的话写个 items() 方法
+                yield slot.key
 
 
 def test_hash_table():
     h = HashTable()
     h.add('a', 0)
     h.add('b', 1)
     h.add('c', 2)
-
     assert len(h) == 3
     assert h.get('a') == 0
     assert h.get('b') == 1
     assert h.get('hehe') is None
 
     h.remove('a')
     assert h.get('a') is None
-
     assert sorted(list(h)) == ['b', 'c']
 
-    # 50 超过了 HashTable 的原始 size，我们测试下是否 reshah 操作能正确工作
-    for i in range(50):
+    n = 50
+    for i in range(n):
         h.add(i, i)
 
-    for i in range(50):
+    for i in range(n):
         assert h.get(i) == i
+
+
+if __name__ == '__main__':
+    print(
+        'beg',
+        test_hash_table(),
+        'end',
+    )
@@ -101,6 +101,9 @@
     <li class="toctree-l2"><a href="#collision">哈希冲突 (collision)</a></li>
 
 
+    <li class="toctree-l2"><a href="#cpython">CPython 如何解决哈希冲突</a></li>
+    
+
     <li class="toctree-l2"><a href="#_3">哈希函数</a></li>
 
 
@@ -278,7 +281,7 @@ <h1 id="collision">哈希冲突 (collision)</h1>
 <li>双重散列(double hashing): 重新计算 hash 结果。 <script type="math/tex"> h(k,i) = (h_1(k) + ih_2(k)) \% m </script>
 </li>
 </ul>
-<p>cpython 使用的是二次探查，这里我们也使用二次探查， 我们选一个简单的二次探查函数 <script type="math/tex"> h(k, i) = (home + i^2) \% m </script>，它的意思是如果
+<p>我们选一个简单的二次探查函数 <script type="math/tex"> h(k, i) = (home + i^2) \% m </script>，它的意思是如果
 遇到了冲突，我们就在原始计算的位置不断加上 i 的平方。我写了段代码来模拟整个计算下标的过程：</p>
 <pre><code class="py">inserted_index_set = set()
 M = 13
@@ -321,6 +324,26 @@ <h1 id="collision">哈希冲突 (collision)</h1>
 <p>遇到冲突之后会重新计算，每个待插入元素最终的下标就是：</p>
 <p><img alt="" src="../quadratic_hash.png" /></p>
 <p><img alt="" src="../quadratic_result.png" /></p>
+<h1 id="cpython">CPython 如何解决哈希冲突</h1>
+<p>如果你对 CPython 解释器的实现感兴趣，可以参考下这个文件 <a href="https://github.com/python/cpython/blob/master/Objects/dictobject.c#L165">dictobject.c</a>。
+不同 CPython 版本实现的探查方式是不同的，后边我们自己实现 HashTable ADT 的时候会模仿这个探查方式来解决冲突。</p>
+<pre><code>The first half of collision resolution is to visit table indices via this
+recurrence:
+
+    j = ((5*j) + 1) mod 2**i
+
+For any initial j in range(2**i), repeating that 2**i times generates each
+int in range(2**i) exactly once (see any text on random-number generation for
+proof).  By itself, this doesn't help much:  like linear probing (setting
+j += 1, or j -= 1, on each loop trip), it scans the table entries in a fixed
+order.  This would be bad, except that's not the only thing we do, and it's
+actually *good* in the common cases where hash keys are consecutive.  In an
+example that's really too small to make this entirely clear, for a table of
+size 2**3 the order of indices is:
+
+    0 -&gt; 1 -&gt; 6 -&gt; 7 -&gt; 4 -&gt; 5 -&gt; 2 -&gt; 3 -&gt; 0 [and here it's repeating]
+</code></pre>
+
 <h1 id="_3">哈希函数</h1>
 <p>到这里你应该明白哈希表插入的工作原理了，不过有个重要的问题之前没提到，就是 hash 函数怎么选？
 当然是散列得到的冲突越少越好啦，也就是说每个 key 都能尽量被等可能地散列到 m 个槽中的任何一个，并且与其他 key 被散列到哪个槽位无关。