Skip to content

Commit 31796cc

Browse files
committed
修正哈希表实现 bug,模仿 cpython 实现解决哈希冲突
1 parent af2b395 commit 31796cc

File tree

4 files changed

+171
-149
lines changed

4 files changed

+171
-149
lines changed

docs/07_哈希表/hashtable.md

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ h(388) = 388 % M = 11
5353
- 二次探查(quadratic probing): 当一个槽被占用,以二次方作为偏移量。 $ h(k, i) = (h^\prime(k) + c_1 i + c_2 i^2) \% m , i=0,1,...,m-1 $
5454
- 双重散列(double hashing): 重新计算 hash 结果。 $ h(k,i) = (h_1(k) + ih_2(k)) \% m $
5555

56-
cpython 使用的是二次探查,这里我们也使用二次探查, 我们选一个简单的二次探查函数 $ h(k, i) = (home + i^2) \% m $,它的意思是如果
56+
我们选一个简单的二次探查函数 $ h(k, i) = (home + i^2) \% m $,它的意思是如果
5757
遇到了冲突,我们就在原始计算的位置不断加上 i 的平方。我写了段代码来模拟整个计算下标的过程:
5858

5959
```py
@@ -103,6 +103,29 @@ h(388) = 388 % M = 1
103103
![](quadratic_result.png)
104104

105105

106+
# Cpython 如何解决哈希冲突
107+
如果你对 cpython 解释器的实现感兴趣,可以参考下这个文件 [dictobject.c](https://github.com/python/cpython/blob/master/Objects/dictobject.c#L165)
108+
不同 cpython 版本实现的探查方式是不同的,后边我们自己实现 HashTable ADT 的时候会模仿这个探查方式来解决冲突。
109+
110+
111+
```
112+
The first half of collision resolution is to visit table indices via this
113+
recurrence:
114+
115+
j = ((5*j) + 1) mod 2**i
116+
117+
For any initial j in range(2**i), repeating that 2**i times generates each
118+
int in range(2**i) exactly once (see any text on random-number generation for
119+
proof). By itself, this doesn't help much: like linear probing (setting
120+
j += 1, or j -= 1, on each loop trip), it scans the table entries in a fixed
121+
order. This would be bad, except that's not the only thing we do, and it's
122+
actually *good* in the common cases where hash keys are consecutive. In an
123+
example that's really too small to make this entirely clear, for a table of
124+
size 2**3 the order of indices is:
125+
126+
0 -> 1 -> 6 -> 7 -> 4 -> 5 -> 2 -> 3 -> 0 [and here it's repeating]
127+
```
128+
106129
# 哈希函数
107130
到这里你应该明白哈希表插入的工作原理了,不过有个重要的问题之前没提到,就是 hash 函数怎么选?
108131
当然是散列得到的冲突越少越好啦,也就是说每个 key 都能尽量被等可能地散列到 m 个槽中的任何一个,并且与其他 key 被散列到哪个槽位无关。

docs/07_哈希表/hashtable.py

Lines changed: 50 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55

66
class Array(object):
77

8-
def __init__(self, size=32):
8+
def __init__(self, size=32, init=None):
99
self._size = size
10-
self._items = [None] * size
10+
self._items = [init] * size
1111

1212
def __getitem__(self, index):
1313
return self._items[index]
@@ -41,96 +41,88 @@ def __init__(self, key, value):
4141

4242
class HashTable(object):
4343

44-
UNUSED = None # 没被使用过的槽,作为该类变量的一个单例,下边都是is 判断
45-
EMPTY = Slot(None, None) # 使用过但是被删除的槽
44+
UNUSED = None # 没被使用过
45+
EMPTY = Slot(None, None) # 使用却被删除过
4646

4747
def __init__(self):
48-
self._table = Array(7)
48+
self._table = Array(8, init=HashTable.UNUSED) # 保持大小为 2 的 i 次方(2**i)
4949
self.length = 0
5050

5151
@property
5252
def _load_factor(self):
53-
# load factor 超过 2/3 就重新分配空间
53+
# load_factor 超过 0.8 重新分配
5454
return self.length / float(len(self._table))
5555

5656
def __len__(self):
5757
return self.length
5858

59-
def _hash1(self, key):
60-
""" 计算key的hash值"""
59+
def _hash(self, key):
6160
return abs(hash(key)) % len(self._table)
6261

63-
def _find_slot(self, key, for_insert=False):
64-
"""_find_slot
65-
66-
:param key:
67-
:param for_insert: 是否插入,还是仅仅查询
68-
:return: slot index or None
69-
"""
70-
index = self._hash1(key)
71-
base_index = index
72-
hash_times = 1
62+
def _find_key(self, key):
63+
index = self._hash(key)
7364
_len = len(self._table)
74-
75-
if not for_insert: # 查找是否存在 key
76-
while self._table[index] is not HashTable.UNUSED:
77-
if self._table[index] is HashTable.EMPTY:
78-
index = (base_index + hash_times * hash_times) % _len # 一个简单的二次方探查
79-
continue
80-
elif self._table[index].key == key:
81-
return index
82-
index = (base_index + hash_times * hash_times) % _len
83-
hash_times += 1
84-
return None
85-
else:
86-
while not self._slot_can_insert(index): # 循环直到找到一个可以插入的槽
87-
index = (base_index + hash_times * hash_times) % _len
88-
hash_times += 1
89-
return index
65+
while self._table[index] is not HashTable.UNUSED:
66+
if self._table[index] is HashTable.EMPTY:
67+
index = (index*5 + 1) % _len
68+
continue
69+
elif self._table[index].key == key:
70+
return index
71+
else:
72+
index = (index*5 + 1) % _len
73+
return None
74+
75+
def _find_slot_for_insert(self, key):
76+
index = self._hash(key)
77+
_len = len(self._table)
78+
while not self._slot_can_insert(index):
79+
index = (index*5 + 1) % _len
80+
return index
9081

9182
def _slot_can_insert(self, index):
9283
return (self._table[index] is HashTable.EMPTY or self._table[index] is HashTable.UNUSED)
9384

94-
def __contains__(self, key): # in operator
95-
index = self._find_slot(key, for_insert=False)
85+
def __contains__(self, key): # in operator
86+
index = self._find_key(key)
9687
return index is not None
9788

9889
def add(self, key, value):
99-
if key in self: # key 相同值不一样的时候,用新的值
100-
index = self._find_slot(key, for_insert=False)
90+
if key in self:
91+
index = self._find_key(key)
10192
self._table[index].value = value
10293
return False
10394
else:
104-
index = self._find_slot(key, for_insert=True)
95+
index = self._find_slot_for_insert(key)
10596
self._table[index] = Slot(key, value)
10697
self.length += 1
107-
if self._load_factor >= 0.8: # 注意超过了 阈值 rehashing
98+
if self._load_factor >= 0.8:
10899
self._rehash()
109100
return True
110101

111102
def _rehash(self):
112103
old_table = self._table
113-
newsize = len(self._table) * 2 + 1 # 扩大 2*n + 1
114-
self._table = Array(newsize)
104+
newsize = len(self._table) * 2
105+
self._table = Array(newsize, HashTable.UNUSED)
115106

116107
self.length = 0
117108

118109
for slot in old_table:
119110
if slot is not HashTable.UNUSED and slot is not HashTable.EMPTY:
120-
index = self._find_slot(slot.key, for_insert=True)
111+
index = self._find_slot_for_insert(slot.key)
121112
self._table[index] = slot
122113
self.length += 1
123114

124115
def get(self, key, default=None):
125-
index = self._find_slot(key, for_insert=False)
116+
index = self._find_key(key)
126117
if index is None:
127118
return default
128119
else:
129120
return self._table[index].value
130121

131122
def remove(self, key):
132-
assert key in self, 'keyerror'
133-
index = self._find_slot(key, for_insert=False)
123+
index = self._find_key(key)
124+
if index is None:
125+
raise KeyError()
134126
value = self._table[index].value
135127
self.length -= 1
136128
self._table[index] = HashTable.EMPTY
@@ -139,28 +131,34 @@ def remove(self, key):
139131
def __iter__(self):
140132
for slot in self._table:
141133
if slot not in (HashTable.EMPTY, HashTable.UNUSED):
142-
yield slot.key # 和 python dict 一样,默认遍历 key,需要value 的话写个 items() 方法
134+
yield slot.key
143135

144136

145137
def test_hash_table():
146138
h = HashTable()
147139
h.add('a', 0)
148140
h.add('b', 1)
149141
h.add('c', 2)
150-
151142
assert len(h) == 3
152143
assert h.get('a') == 0
153144
assert h.get('b') == 1
154145
assert h.get('hehe') is None
155146

156147
h.remove('a')
157148
assert h.get('a') is None
158-
159149
assert sorted(list(h)) == ['b', 'c']
160150

161-
# 50 超过了 HashTable 的原始 size,我们测试下是否 reshah 操作能正确工作
162-
for i in range(50):
151+
n = 50
152+
for i in range(n):
163153
h.add(i, i)
164154

165-
for i in range(50):
155+
for i in range(n):
166156
assert h.get(i) == i
157+
158+
159+
if __name__ == '__main__':
160+
print(
161+
'beg',
162+
test_hash_table(),
163+
'end',
164+
)

0 commit comments

Comments
 (0)