Skip to content

Commit 7f7c487

Browse files
committed
Deployed 31796cc with MkDocs version: 0.17.3
1 parent 49f99cc commit 7f7c487

File tree

8 files changed

+203
-176
lines changed

8 files changed

+203
-176
lines changed

07_哈希表/hashtable.py

Lines changed: 50 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55

66
class Array(object):
77

8-
def __init__(self, size=32):
8+
def __init__(self, size=32, init=None):
99
self._size = size
10-
self._items = [None] * size
10+
self._items = [init] * size
1111

1212
def __getitem__(self, index):
1313
return self._items[index]
@@ -41,96 +41,88 @@ def __init__(self, key, value):
4141

4242
class HashTable(object):
4343

44-
UNUSED = None # 没被使用过的槽,作为该类变量的一个单例,下边都是is 判断
45-
EMPTY = Slot(None, None) # 使用过但是被删除的槽
44+
UNUSED = None # 没被使用过
45+
EMPTY = Slot(None, None) # 使用却被删除过
4646

4747
def __init__(self):
48-
self._table = Array(7)
48+
self._table = Array(8, init=HashTable.UNUSED) # 保持大小为 2**i(2 的幂)
4949
self.length = 0
5050

5151
@property
5252
def _load_factor(self):
53-
# load factor 超过 2/3 就重新分配空间
53+
# load_factor 超过 0.8 重新分配
5454
return self.length / float(len(self._table))
5555

5656
def __len__(self):
5757
return self.length
5858

59-
def _hash1(self, key):
60-
""" 计算key的hash值"""
59+
def _hash(self, key):
6160
return abs(hash(key)) % len(self._table)
6261

63-
def _find_slot(self, key, for_insert=False):
64-
"""_find_slot
65-
66-
:param key:
67-
:param for_insert: 是否插入,还是仅仅查询
68-
:return: slot index or None
69-
"""
70-
index = self._hash1(key)
71-
base_index = index
72-
hash_times = 1
62+
def _find_key(self, key):
63+
index = self._hash(key)
7364
_len = len(self._table)
74-
75-
if not for_insert: # 查找是否存在 key
76-
while self._table[index] is not HashTable.UNUSED:
77-
if self._table[index] is HashTable.EMPTY:
78-
index = (base_index + hash_times * hash_times) % _len # 一个简单的二次方探查
79-
continue
80-
elif self._table[index].key == key:
81-
return index
82-
index = (base_index + hash_times * hash_times) % _len
83-
hash_times += 1
84-
return None
85-
else:
86-
while not self._slot_can_insert(index): # 循环直到找到一个可以插入的槽
87-
index = (base_index + hash_times * hash_times) % _len
88-
hash_times += 1
89-
return index
65+
while self._table[index] is not HashTable.UNUSED:
66+
if self._table[index] is HashTable.EMPTY:
67+
index = (index*5 + 1) % _len
68+
continue
69+
elif self._table[index].key == key:
70+
return index
71+
else:
72+
index = (index*5 + 1) % _len
73+
return None
74+
75+
def _find_slot_for_insert(self, key):
76+
index = self._hash(key)
77+
_len = len(self._table)
78+
while not self._slot_can_insert(index):
79+
index = (index*5 + 1) % _len
80+
return index
9081

9182
def _slot_can_insert(self, index):
9283
return (self._table[index] is HashTable.EMPTY or self._table[index] is HashTable.UNUSED)
9384

94-
def __contains__(self, key): # in operator
95-
index = self._find_slot(key, for_insert=False)
85+
def __contains__(self, key): # in operator
86+
index = self._find_key(key)
9687
return index is not None
9788

9889
def add(self, key, value):
99-
if key in self: # key 相同值不一样的时候,用新的值
100-
index = self._find_slot(key, for_insert=False)
90+
if key in self:
91+
index = self._find_key(key)
10192
self._table[index].value = value
10293
return False
10394
else:
104-
index = self._find_slot(key, for_insert=True)
95+
index = self._find_slot_for_insert(key)
10596
self._table[index] = Slot(key, value)
10697
self.length += 1
107-
if self._load_factor >= 0.8: # 注意超过了 阈值 rehashing
98+
if self._load_factor >= 0.8:
10899
self._rehash()
109100
return True
110101

111102
def _rehash(self):
112103
old_table = self._table
113-
newsize = len(self._table) * 2 + 1 # 扩大 2*n + 1
114-
self._table = Array(newsize)
104+
newsize = len(self._table) * 2
105+
self._table = Array(newsize, HashTable.UNUSED)
115106

116107
self.length = 0
117108

118109
for slot in old_table:
119110
if slot is not HashTable.UNUSED and slot is not HashTable.EMPTY:
120-
index = self._find_slot(slot.key, for_insert=True)
111+
index = self._find_slot_for_insert(slot.key)
121112
self._table[index] = slot
122113
self.length += 1
123114

124115
def get(self, key, default=None):
125-
index = self._find_slot(key, for_insert=False)
116+
index = self._find_key(key)
126117
if index is None:
127118
return default
128119
else:
129120
return self._table[index].value
130121

131122
def remove(self, key):
132-
assert key in self, 'keyerror'
133-
index = self._find_slot(key, for_insert=False)
123+
index = self._find_key(key)
124+
if index is None:
125+
raise KeyError()
134126
value = self._table[index].value
135127
self.length -= 1
136128
self._table[index] = HashTable.EMPTY
@@ -139,28 +131,34 @@ def remove(self, key):
139131
def __iter__(self):
140132
for slot in self._table:
141133
if slot not in (HashTable.EMPTY, HashTable.UNUSED):
142-
yield slot.key # 和 python dict 一样,默认遍历 key,需要value 的话写个 items() 方法
134+
yield slot.key
143135

144136

145137
def test_hash_table():
146138
h = HashTable()
147139
h.add('a', 0)
148140
h.add('b', 1)
149141
h.add('c', 2)
150-
151142
assert len(h) == 3
152143
assert h.get('a') == 0
153144
assert h.get('b') == 1
154145
assert h.get('hehe') is None
155146

156147
h.remove('a')
157148
assert h.get('a') is None
158-
159149
assert sorted(list(h)) == ['b', 'c']
160150

161-
# 50 超过了 HashTable 的原始 size,我们测试下是否 reshah 操作能正确工作
162-
for i in range(50):
151+
n = 50
152+
for i in range(n):
163153
h.add(i, i)
164154

165-
for i in range(50):
155+
for i in range(n):
166156
assert h.get(i) == i
157+
158+
159+
if __name__ == '__main__':
160+
print(
161+
'beg',
162+
test_hash_table(),
163+
'end',
164+
)

07_哈希表/hashtable/index.html

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@
101101
<li class="toctree-l2"><a href="#collision">哈希冲突 (collision)</a></li>
102102

103103

104+
<li class="toctree-l2"><a href="#cpython">Cpython 如何解决哈希冲突</a></li>
105+
106+
104107
<li class="toctree-l2"><a href="#_3">哈希函数</a></li>
105108

106109

@@ -278,7 +281,7 @@ <h1 id="collision">哈希冲突 (collision)</h1>
278281
<li>双重散列(double hashing): 重新计算 hash 结果。 <script type="math/tex"> h(k,i) = (h_1(k) + ih_2(k)) \% m </script>
279282
</li>
280283
</ul>
281-
<p>cpython 使用的是二次探查,这里我们也使用二次探查, 我们选一个简单的二次探查函数 <script type="math/tex"> h(k, i) = (home + i^2) \% m </script>,它的意思是如果
284+
<p>我们选一个简单的二次探查函数 <script type="math/tex"> h(k, i) = (home + i^2) \% m </script>,它的意思是如果
282285
遇到了冲突,我们就在原始计算的位置不断加上 i 的平方。我写了段代码来模拟整个计算下标的过程:</p>
283286
<pre><code class="py">inserted_index_set = set()
284287
M = 13
@@ -321,6 +324,26 @@ <h1 id="collision">哈希冲突 (collision)</h1>
321324
<p>遇到冲突之后会重新计算,每个待插入元素最终的下标就是:</p>
322325
<p><img alt="" src="../quadratic_hash.png" /></p>
323326
<p><img alt="" src="../quadratic_result.png" /></p>
327+
<h1 id="cpython">Cpython 如何解决哈希冲突</h1>
328+
<p>如果你对 cpython 解释器的实现感兴趣,可以参考下这个文件 <a href="https://github.com/python/cpython/blob/master/Objects/dictobject.c#L165">dictobject.c</a>。
329+
不同 cpython 版本实现的探查方式是不同的,后边我们自己实现 HashTable ADT 的时候会模仿这个探查方式来解决冲突。</p>
330+
<pre><code>The first half of collision resolution is to visit table indices via this
331+
recurrence:
332+
333+
j = ((5*j) + 1) mod 2**i
334+
335+
For any initial j in range(2**i), repeating that 2**i times generates each
336+
int in range(2**i) exactly once (see any text on random-number generation for
337+
proof). By itself, this doesn't help much: like linear probing (setting
338+
j += 1, or j -= 1, on each loop trip), it scans the table entries in a fixed
339+
order. This would be bad, except that's not the only thing we do, and it's
340+
actually *good* in the common cases where hash keys are consecutive. In an
341+
example that's really too small to make this entirely clear, for a table of
342+
size 2**3 the order of indices is:
343+
344+
0 -&gt; 1 -&gt; 6 -&gt; 7 -&gt; 4 -&gt; 5 -&gt; 2 -&gt; 3 -&gt; 0 [and here it's repeating]
345+
</code></pre>
346+
324347
<h1 id="_3">哈希函数</h1>
325348
<p>到这里你应该明白哈希表插入的工作原理了,不过有个重要的问题之前没提到,就是 hash 函数怎么选?
326349
当然是散列产生的冲突越少越好啦,也就是说每个 key 都能尽量被等可能地散列到 m 个槽中的任何一个,并且与其他 key 被散列到哪个槽位无关。
Binary file not shown.

0 commit comments

Comments
 (0)