
Commit 01cbc49

Author: liwei
Message: update
1 parent 115e507 commit 01cbc49

3 files changed, 376 insertions(+), 0 deletions(-)

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -33,3 +33,6 @@ nosetests.xml
 .mr.developer.cfg
 .project
 .pydevproject
+
+# linux swap file
+*.swp

dimension_reduction.py

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
#!/usr/bin/python

# chapter 11

import math
import numpy as np

EPSILON = 0.00001

def perfect_data():
    U = np.array([[.14, .42, .56, .70, 0, 0, 0],
                  [0, 0, 0, 0, .60, .75, .30]]).T
    VT = np.array([[.58, .58, .58, 0, 0],
                   [0, 0, 0, .71, .71]])
    Sigma = np.array([[12.4, 0],
                      [0, 9.5]])
    return U, Sigma, VT

def noise_data():
    # NOTE: the first row below has six entries while the second has seven,
    # so the transposed array is ragged; one value appears to be missing.
    U = np.array([[0.13, 0.41, 0.55, 0.68, 0.15, 0.07],
                  [0.02, 0.07, .09, .11, -.59, -.73, -.29]]).T
    VT = np.array([[0.56, 0.59, 0.56, 0.09, 0.09],
                   [0.12, -0.02, 0.12, -0.69, -0.69]])
    Sigma = np.array([[12.4, 0],
                      [0, 0.5]])

    return U, Sigma, VT

def demo_querying_using_concepts():
    # q represents a user's rating of movies
    q = np.array([4, 0, 0, 0, 0])
    _, _, VT = perfect_data()

    # map q to concept space.
    concept = np.dot(q, VT.T)
    print concept
    # map back
    print np.dot(concept, VT)

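# A worked instance of the query mapping above (expected numbers only,
# derived by hand from the perfect_data() factors): with q = [4, 0, 0, 0, 0],
#   concept = q . VT.T = [4 * .58, 0] = [2.32, 0.0]
#   back:  concept . VT = [1.3456, 1.3456, 1.3456, 0.0, 0.0]
# i.e. the query is explained entirely by the first concept (the
# "science fiction" movies in the book's running example).
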
def frobenius_norm_square(V):
    return np.sum(np.square(V))

def frobenius_norm(V):
    return math.sqrt(frobenius_norm_square(V))

def power_iteration(M, max_loop=100, power_iteration_epsilon=EPSILON):
    n, m = M.shape
    x = np.ones(n)

    while max_loop >= 1:
        x_k = np.dot(M, x)
        # re-normalize so the iterate stays a unit vector
        x_k = x_k / (frobenius_norm(x_k) * 1.0)
        if frobenius_norm(x_k - x) < power_iteration_epsilon:
            x = x_k
            break
        x = x_k
        max_loop -= 1

    eigen_vector = x
    # Rayleigh quotient: for a unit eigenvector x, x.T M x equals the eigenvalue
    eigen_value = x.T.dot(M).dot(x)
    return eigen_vector, eigen_value

def eigen_solver(M, min_eigvalue=0.01, *args, **kw):
    """Find eigenvectors/eigenvalues by repeated power iteration.

    Note: the caller's M is not modified in place; a freshly deflated
    matrix is built on every iteration."""
    E = []
    Sigma = []
    while 1:
        eigvec, eigvalue = power_iteration(M, *args, **kw)
        if abs(eigvalue) <= min_eigvalue:
            break
        if eigvec[0] < 0:
            # modify the direction.
            eigvec = eigvec * (-1)
        E.append(eigvec)
        Sigma.append(eigvalue)
        # deflation: remove the component that was just found
        M = M - eigvalue * np.outer(eigvec, eigvec)

    return np.array(E).T, np.diag(Sigma)

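# A worked check for eigen_solver (illustrative expectation, not asserted
# anywhere): for the symmetric matrix [[3, 2], [2, 6]] the eigenvalues are
# 7 and 2, with unit eigenvectors (1, 2)/sqrt(5) and (2, -1)/sqrt(5) after
# the sign fix above, so eigen_solver should return approximately
#   E     = [[0.447,  0.894],
#            [0.894, -0.447]]
#   Sigma = diag(7, 2)
# (test_eigen_solver() below prints this for the same matrix.)
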
def svd(M, *args, **kw):
    # right singular vectors: eigenvectors of M^T M
    tmp = M.T.dot(M)
    V, SigmaSquare = eigen_solver(tmp, *args, **kw)
    # left singular vectors: eigenvectors of M M^T
    tmp = M.dot(M.T)
    U, SigmaSquare = eigen_solver(tmp, *args, **kw)
    # singular values are the square roots of the shared eigenvalues.
    # Note: U and V are computed independently, so their relative signs are
    # not guaranteed to be consistent.
    return U, np.sqrt(SigmaSquare), V.T

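# A minimal way to sanity-check the singular values against numpy (a sketch,
# not wired into the tests; `arr` stands for any test matrix, e.g. the one in
# test_svd() below; numpy returns the values in descending order, and the
# min_eigvalue cutoff above may drop small trailing values):
#
#   _, S, _ = svd(arr)
#   print np.diag(S)
#   print np.linalg.svd(arr, compute_uv=False)
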
# column/row selection strategies for CUR
RANDOMLY = 0
RANDOM_BY_PROB = 1
MAX_BY_PROB = 2

def _select_randomly(M, r):
    raise NotImplementedError

def _select_randomly_by_prob(M, r):
    raise NotImplementedError


def _calc_prob(v, fnorm):
    # selection probability of a row/column: ||v||^2 / ||M||_F^2
    return 1.0 * frobenius_norm_square(v) / fnorm

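# Worked instance of the probability above (illustrative numbers only):
# for M = [[1, 2], [3, 4]], ||M||_F^2 = 1 + 4 + 9 + 16 = 30, so the column
# probabilities are (1 + 9)/30 = 1/3 and (4 + 16)/30 = 2/3, and the row
# probabilities are (1 + 4)/30 = 1/6 and (9 + 16)/30 = 5/6.
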
def kbig(vec, k):
    if k <= 0:
        return None

    # (stub) intended to pick out the k largest entries of vec
    for offset, value in enumerate(vec):
        pass

def _select_max_by_prob(M, r):
    fnorm = frobenius_norm_square(M)
    # selection probability of each column
    column_probs = np.apply_along_axis(_calc_prob, 0, M, fnorm)
    # selection probability of each row
    row_probs = np.apply_along_axis(_calc_prob, 1, M, fnorm)


def cur(M, r, select_method=MAX_BY_PROB, *args, **kw):
    """CUR decomposition.
    M is the matrix to be decomposed, r is the estimated rank of
    the matrix."""
    def select_cr():
        if select_method == RANDOMLY:
            return _select_randomly(M, r)
        elif select_method == RANDOM_BY_PROB:
            return _select_randomly_by_prob(M, r)
        return _select_max_by_prob(M, r)

    raise NotImplementedError


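# Sketch of how the decomposition would continue (following the book's
# chapter 11 CUR construction; none of this is implemented above):
#   1. pick r columns of M to form C and r rows to form R, scaling each
#      chosen column/row by 1 / sqrt(r * its selection probability);
#   2. let W be the r x r intersection of the chosen rows and columns;
#   3. with an SVD W = X Sigma Y^T, take U = Y (Sigma^+)^2 X^T, where
#      Sigma^+ is the Moore-Penrose pseudoinverse of Sigma;
#   4. then C U R approximates M.
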
def test_eigen_solver():
    arr = np.array([[3, 2],
                    [2, 6]])
    print eigen_solver(arr)
    arr = np.array([[.8, .3],
                    [.2, .7]])
    print eigen_solver(arr)

def test_svd():
    arr = np.array([[1, 1, 1, 0, 0],
                    [3, 3, 3, 0, 0],
                    [4, 4, 4, 0, 0],
                    [5, 5, 5, 0, 0],
                    [0, 2, 0, 4, 4],
                    [0, 0, 0, 5, 5],
                    [0, 1, 0, 2, 2]])
    U, Sigma, VT = svd(arr)
    print U
    print "***************"
    print Sigma
    print "***************"
    print VT

#demo_querying_using_concepts()
#test_eigen_solver()
test_svd()

minhash2.py

Lines changed: 221 additions & 0 deletions
@@ -0,0 +1,221 @@
#!/usr/bin/python

"""This implements the exercises of the minhash part of the book 'Mining of
Massive Datasets' (v1.3)."""

import sys
import itertools as itor
import math
import re
import numpy as np

def fig_3_2():
    """Return the data of Fig. 3.2 (column-major)."""

    # a,b,c,d,e
    s1 = [1,0,0,1,0]
    s2 = [0,0,1,0,0]
    s3 = [0,1,0,1,1]
    s4 = [1,0,1,1,0]

    mat = [s1,s2,s3,s4]

    # num_rows, num_cols = len(mat[0]), len(mat)

    #data = np.array(num_rows, num_cols)
    #for i in num_cols:
    #    for j in num_rows:
    #        data[j,i] = mat[i][j]

    return mat

def fig_3_4():
    """Return the data of Fig. 3.4"""

    # The data is the same as in Fig. 3.2
    return fig_3_2()

def fig_3_5():
    """Column-major"""

    s1 = [0,0,1,0,0,1]
    s2 = [1,1,0,0,0,0]
    s3 = [0,0,0,1,1,0]
    s4 = [1,0,1,0,1,0]

    return [s1,s2,s3,s4]

# For two sets s1, s2, the rows are divided into:
#   Type X rows: 1 in both columns
#   Type Y rows: 1 in one column, 0 in the other column
#   Type Z rows: 0 in both columns
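# Worked instance of the X/Y/Z bookkeeping (numbers derived by hand from
# fig_3_2, shown only as an illustration): for columns s1 = [1,0,0,1,0] and
# s4 = [1,0,1,1,0] there are X = 2 rows (a, d), Y = 1 row (c) and Z = 2
# rows (b, e), so
#   SIM(s1, s4) = X / (X + Y) = 2 / 3
# which is also the probability that a random row permutation minhashes the
# two columns to the same value.
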
def shape(col_major):
    num_rows, num_cols = len(col_major[0]), len(col_major)
    return num_rows, num_cols

def jaccard_sim_naive(char_mat):
    """The naive method to calculate the Jaccard similarity from the
    characteristic matrix char_mat."""

    _, set_num = shape(char_mat) # num cols
    sim_mat = np.zeros((set_num, set_num))
    for i,s1 in enumerate(char_mat):
        for j,s2 in enumerate(char_mat):
            if i == j:
                sim_mat[i,j] = 1
            else:
                num_X, num_Y = 0, 0
                # Count the type X and type Y rows
                for v1, v2 in zip(s1, s2):
                    if v1 == v2 == 1:
                        num_X += 1
                    elif v1 != v2:
                        num_Y += 1
                sim_mat[i,j] = float(num_X) / float(num_X + num_Y)

    return sim_mat


def minhash_naive(char_mat):
    num_rows = len(char_mat[0])
    set_num = len(char_mat)
    sim_mat = np.zeros((set_num, set_num))
    # try all num_rows! row orders explicitly (120 for the 5-row figures)
    total_num = math.factorial(num_rows)
    for i,s1 in enumerate(char_mat):
        for j,s2 in enumerate(char_mat):
            same_hash_num = 0
            for perm in itor.permutations(xrange(num_rows)):
                for idx in perm:
                    if s1[idx] == s2[idx] == 1:
                        # permutation that makes the two columns hash
                        # to the same value
                        same_hash_num += 1
                        break
                    elif s1[idx] != s2[idx]:
                        # permutation that makes the two columns hash to
                        # different values
                        break

            sim_mat[i,j] = float(same_hash_num) / float(total_num)

    return sim_mat

def calc_sig_mat(char_mat, hash_funcs, elements=None):
    """Calculate the signature matrix from the characteristic matrix."""
    num_rows, num_cols = len(char_mat[0]), len(char_mat)
    num_hf = len(hash_funcs)
    sig_mat = np.zeros((num_hf, num_cols))

    elements = elements or xrange(num_rows)

    # initialise every signature entry to "infinity"
    for i in xrange(num_hf):
        for j in xrange(num_cols):
            sig_mat[i,j] = sys.maxint

    # one pass over the rows: whenever a column has a 1 in row k, its
    # signature entry for each hash function drops to the smallest hash
    # value seen so far
    for k in xrange(num_rows):
        for i,hf in enumerate(hash_funcs):
            hash_value = hf(elements[k])
            for j,cols in enumerate(char_mat):
                if cols[k] == 1:
                    sig_mat[i,j] = min(sig_mat[i,j], hash_value)

    return sig_mat

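# Worked instance (numbers derived by hand, for illustration only): for
# fig_3_4() with h1(x) = (x+1) % 5 and h2(x) = (3x+1) % 5, the rows a..e
# hash to
#   h1: 1,2,3,4,0    h2: 1,4,2,0,3
# and taking the minimum over the rows where each column has a 1 gives
#   calc_sig_mat(fig_3_4(), [h1, h2]) ==
#       [[1, 3, 0, 1],
#        [0, 2, 0, 0]]
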
def jaccard_sim(sig_mat):

    num_rows, num_cols = sig_mat.shape # num_hf X num_sets

    sim_mat = np.zeros((num_cols, num_cols))

    for i in xrange(num_cols):
        for j in xrange(num_cols):
            if i == j:
                sim_mat[i,j] = 0
            else:
                # estimate: fraction of hash functions on which the two
                # columns' signatures agree
                same_hash_num = 0
                for k in xrange(num_rows):
                    if sig_mat[k,i] == sig_mat[k,j]:
                        same_hash_num += 1
                sim_mat[i,j] = float(same_hash_num) / float(num_rows)

    return sim_mat

def exercise_3_3_1():
    """Exercise 3.3.1: Verify the theorem from Section 3.3.3, which relates
    the Jaccard similarity to the probability of minhashing to equal values,
    for the particular case of Fig. 3.2."""

    char_mat = fig_3_2()

    def a():
        """Compute the Jaccard similarity of each of the pairs of columns in
        Fig. 3.2."""
        print jaccard_sim_naive(char_mat)

    def b():
        """Compute, for each pair of columns of that figure, the fraction of
        the 120 permutations of the rows that make the two columns hash to
        the same value."""
        print minhash_naive(char_mat)

    #a()
    #b()

def exercise_3_3_2():
    def h1(x):
        return (x+1)%5

    def h2(x):
        return (3*x+1)%5

    # the exercise defines h3 and h4 modulo 5 as well
    def h3(x):
        return (2*x+4)%5

    def h4(x):
        return (3*x-1)%5

    #print calc_sig_mat(fig_3_4(), [h1,h2,h3,h4])

def exercise_3_3_3():
    char_mat = fig_3_5()

    hash_funcs = [lambda x: (2*x+1)%6, lambda x: (3*x+2)%6, lambda x: (5*x+2)%6]

    def a():
        """Compute the minhash signature for each column if we use the
        following three hash functions: h1(x) = 2x + 1 mod 6;
        h2(x) = 3x + 2 mod 6; h3(x) = 5x + 2 mod 6."""

        print calc_sig_mat(char_mat, hash_funcs)

    def b():
        """Which of these hash functions are true permutations?"""
        for hf in hash_funcs:
            res = []
            for i in xrange(6):
                res.append(str(hf(i)))
            print ','.join(res)

        # Therefore, h3(x) is a true permutation
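        # Expected tables for x = 0..5 (worked out by hand, for reference):
        #   h1(x) = (2x+1) % 6 -> 1,3,5,1,3,5   (repeats, not a permutation)
        #   h2(x) = (3x+2) % 6 -> 2,5,2,5,2,5   (repeats, not a permutation)
        #   h3(x) = (5x+2) % 6 -> 2,1,0,5,4,3   (hits every value exactly once)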

    def c():
        print "True jaccard similarity:"
        print jaccard_sim_naive(char_mat)
        print "\nEstimated jaccard similarity:"
        print jaccard_sim(calc_sig_mat(char_mat, hash_funcs))

    #a()
    #b()
    #c()

def exercise_3_3_4():
    pass

def exec_exercises(_globals):
    # run every module-level function whose name starts with "exercise_"
    prog = re.compile('exercise_.*')
    for func_name in _globals:
        m = prog.match(func_name)
        if m:
            _globals[m.group(0)]()

exec_exercises(globals())
