update

liwei · liwei · commit 01cbc49a2954 · 2013-06-13T09:05:56.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -33,3 +33,6 @@ nosetests.xml
 .mr.developer.cfg
 .project
 .pydevproject
+
+# Vim swap files
+*.swp
diff --git a/dimension_reduction.py b/dimension_reduction.py
@@ -0,0 +1,152 @@
+#!/usr/bin/python
+
+# chapter 11
+
+import math
+import numpy as np
+
+# Default convergence tolerance of power_iteration().
+EPSILON = 0.00001
+
+def perfect_data():
+    """Return the hand-written rank-2 factors (U, Sigma, VT).
+
+    U is 7x2, Sigma is the 2x2 diagonal matrix diag(12.4, 9.5) and VT is
+    2x5.
+    """
+    u_rows = [[.14, .42, .56, .70, 0, 0, 0],
+              [0, 0, 0, 0, .60, .75, .30]]
+    sigma_rows = [[12.4, 0],
+                  [0, 9.5]]
+    vt_rows = [[.58, .58, .58, 0, 0],
+               [0, 0, 0, .71, .71]]
+    # The U literal is written one column per line, hence the transpose.
+    return np.array(u_rows).T, np.array(sigma_rows), np.array(vt_rows)
+
+def noise_data():
+    """Return hand-written rank-2 factors (U, Sigma, VT) for the noisy data.
+
+    U is 7x2, Sigma is 2x2 diagonal and VT is 2x5, i.e. the same shapes as
+    perfect_data() returns.
+    """
+    # The first line used to hold only six values, which made the array
+    # ragged.  The missing seventh value is 0.07: row 7 of the matrix in
+    # test_svd() is half of row 5, so its U entries are half of row 5's
+    # (0.15 / 2 and -0.59 / 2, matching the -0.29 already present).
+    U = np.array([[0.13, 0.41, 0.55, 0.68, 0.15, 0.07, 0.07],
+                  [0.02, 0.07, 0.09, 0.11, -0.59, -0.73, -0.29]]).T
+    VT = np.array([[0.56, 0.59, 0.56, 0.09, 0.09],
+                   [0.12, -0.02, 0.12, -0.69, -0.69]])
+    # NOTE(review): the second singular value was 0.5, assumed to be a typo
+    # for 9.5 (the value used in perfect_data()); confirm against the book.
+    Sigma = np.array([[12.4, 0],
+                      [0, 9.5]])
+
+    return U, Sigma, VT
+
+def demo_querying_using_concepts():
+    """Print a user's movie ratings mapped to concept space and back."""
+    VT = perfect_data()[2]
+    # One user's ratings of the movies.
+    ratings = np.array([4, 0, 0, 0, 0])
+
+    # Project the ratings onto the concepts ...
+    in_concept_space = ratings.dot(VT.T)
+    print(in_concept_space)
+    # ... and map the concept vector back to movie space.
+    print(in_concept_space.dot(VT))
+
+
+def frobenius_norm_square(V):
+    """Return the sum of the squared entries of V."""
+    squared_entries = np.square(V)
+    return squared_entries.sum()
+
+def frobenius_norm(V):
+    """Return the square root of the sum of the squared entries of V."""
+    norm_squared = frobenius_norm_square(V)
+    return math.sqrt(norm_squared)
+
+def power_iteration(M, max_loop=100, power_iteration_epsilon=EPSILON):
+    """Estimate one eigenpair of the square matrix M by power iteration.
+
+    Starting from the all-ones vector, x is repeatedly replaced by the
+    normalised product M.dot(x).  Iteration stops after max_loop rounds, or
+    earlier once two successive iterates differ by less than
+    power_iteration_epsilon in Frobenius norm.
+
+    Returns (eigen_vector, eigen_value), where eigen_value is x.T * M * x
+    for the final iterate x.
+    """
+    x = np.ones(M.shape[0])
+
+    for _ in range(max_loop):
+        x_k = np.dot(M, x)
+        norm = frobenius_norm(x_k)
+        if norm == 0:
+            # M maps x to the zero vector: x is an eigenvector for the
+            # eigenvalue 0.  Normalising would divide by zero and fill the
+            # result with NaNs, so stop here.
+            return x, 0.0
+        x_k = x_k / norm
+        converged = frobenius_norm(x_k - x) < power_iteration_epsilon
+        x = x_k
+        if converged:
+            break
+
+    return x, x.T.dot(M).dot(x)
+
+def eigen_solver(M, min_eigvalue=0.01, *args, **kw):
+    """Find eigenpairs of M by repeated power iteration and deflation.
+
+    After an eigenpair (v, lam) is found, lam * outer(v, v) is subtracted
+    from the working matrix before the next power iteration.  The caller's
+    array is not modified: each deflation creates a new array.  The search
+    stops at the first eigenvalue whose magnitude is not above
+    min_eigvalue, and never returns more pairs than M has rows.
+
+    Extra positional and keyword arguments are passed to power_iteration().
+
+    Returns (E, Sigma): the eigenvectors as the columns of E (each flipped
+    so that its first component is non-negative) and the matching
+    eigenvalues on the diagonal of Sigma, in the order they were found.
+    """
+    E = []
+    Sigma = []
+    # An n x n matrix has at most n eigenpairs; bounding the loop guarantees
+    # termination even when deflation leaves numerical residue behind.
+    for _ in range(M.shape[0]):
+        eigvec, eigvalue = power_iteration(M, *args, **kw)
+        # Written as "not >" so that a NaN eigenvalue also ends the search.
+        if not abs(eigvalue) > min_eigvalue:
+            break
+        if eigvec[0] < 0:
+            # Normalise the direction: v and -v describe the same eigenvector.
+            eigvec = -eigvec
+        E.append(eigvec)
+        Sigma.append(eigvalue)
+        M = M - eigvalue * np.outer(eigvec, eigvec)
+
+    return np.array(E).T, np.diag(Sigma)
+
+def svd(M, *args, **kw):
+    """Return (U, Sigma, VT) such that M is approximately U * Sigma * VT.
+
+    The right singular vectors and the squared singular values come from
+    eigen_solver() applied to M.T * M; extra arguments are passed on to it,
+    so its min_eigvalue threshold applies to the squared singular values.
+    Sigma is a square diagonal matrix with one entry per singular value
+    found.
+    """
+    V, SigmaSquare = eigen_solver(M.T.dot(M), *args, **kw)
+    sigma = np.sqrt(np.diag(SigmaSquare))
+    if sigma.size == 0:
+        # Nothing above the threshold: return empty, shape-consistent factors.
+        return (np.zeros((M.shape[0], 0)), np.diag(sigma),
+                np.zeros((0, M.shape[1])))
+
+    # Derive each left singular vector from its right partner,
+    # u_i = M * v_i / sigma_i.  Solving M * M.T separately picks the sign of
+    # every u_i independently of v_i, so the product U * Sigma * VT would
+    # not be guaranteed to reproduce M.
+    U = M.dot(V) / sigma
+    return U, np.diag(sigma), V.T
+
+
+# Selection strategies for the `select_method` argument of cur().
+RANDOMLY = 0
+RANDOM_BY_PROB = 1
+MAX_BY_PROB = 2
+
+def _select_randomly(M, r):
+    """Not implemented yet; always raises NotImplementedError."""
+    raise NotImplementedError()
+
+def _select_randomly_by_prob(M, r):
+    """Not implemented yet; always raises NotImplementedError."""
+    raise NotImplementedError()
+
+
+def _calc_prob(v, fnorm):
+    """Return the sum of the squared entries of v divided by fnorm.
+
+    The caller in this file passes the squared Frobenius norm of the whole
+    matrix as fnorm.
+    """
+    # Multiply by 1.0 first so the division is never an integer division.
+    weight = 1.0 * frobenius_norm_square(v)
+    return weight / fnorm
+
+
+def kbig(vec, k):
+    """Unfinished helper: currently returns None for every input.
+
+    NOTE(review): the name suggests it should pick the k largest entries of
+    vec -- confirm the intended result before implementing.
+    """
+    if k <= 0:
+        return None
+
+    # TODO: the selection logic is missing; the loop body is a placeholder.
+    for offset,value in enumerate(vec):
+        pass
+
+def _select_max_by_prob(M, r):
+    """Unfinished: computes column and row probabilities, returns nothing.
+
+    Each probability is the sum of squared entries of one column (or row)
+    of M divided by the sum of squared entries of all of M.  r is not used
+    yet and the computed probabilities are discarded.
+    """
+    # Despite its name, fnorm holds the *squared* Frobenius norm of M.
+    fnorm = frobenius_norm_square(M)
+    # One probability per column (axis 0 hands each column to _calc_prob).
+    column_probs = np.apply_along_axis(_calc_prob, 0, M, fnorm)
+    # One probability per row (axis 1 hands each row to _calc_prob).
+    row_probs = np.apply_along_axis(_calc_prob, 1, M, fnorm)
+    
+
+def cur(M, r, select_method=MAX_BY_PROB, *args, **kw):
+    """CUR decomposition (unfinished).
+
+    M is the matrix to be decomposed, r is the estimated rank of the
+    matrix and select_method is one of RANDOMLY, RANDOM_BY_PROB or
+    MAX_BY_PROB.
+
+    Raises NotImplementedError: only the dispatch on select_method exists
+    so far.  The original body stopped in the middle of an `if` statement,
+    which was a syntax error that kept the whole module from loading.
+    """
+    def select_cr():
+        # Map each strategy constant to its selection helper.
+        if select_method == RANDOMLY:
+            return _select_randomly(M, r)
+        if select_method == RANDOM_BY_PROB:
+            return _select_randomly_by_prob(M, r)
+        if select_method == MAX_BY_PROB:
+            return _select_max_by_prob(M, r)
+        raise ValueError("unknown select_method: %r" % (select_method,))
+
+    # TODO: call select_cr() and build the C, U and R factors from it.
+    raise NotImplementedError("CUR decomposition is not implemented yet")
+
+
+def test_eigen_solver():
+    """Print the result of eigen_solver() for two 2x2 example matrices."""
+    samples = ([[3, 2],
+                [2, 6]],
+               [[.8, .3],
+                [.2, .7]])
+    for rows in samples:
+        print(eigen_solver(np.array(rows)))
+
+def test_svd():
+    """Print U, Sigma and VT of a 7x5 example matrix, separated by rules."""
+    separator = "***************"
+    matrix = np.array([[1, 1, 1, 0, 0],
+                       [3, 3, 3, 0, 0],
+                       [4, 4, 4, 0, 0],
+                       [5, 5, 5, 0, 0],
+                       [0, 2, 0, 4, 4],
+                       [0, 0, 0, 5, 5],
+                       [0, 1, 0, 2, 2]])
+    U, Sigma, VT = svd(matrix)
+    print(U)
+    print(separator)
+    print(Sigma)
+    print(separator)
+    print(VT)
+
+# Manual entry points: un-comment a line to run that demo.
+# NOTE: the active call also runs whenever this module is imported.
+#demo_querying_using_concepts()
+#test_eigen_solver()
+test_svd()
diff --git a/minhash2.py b/minhash2.py
@@ -0,0 +1,221 @@
+#!/usr/bin/python
+
+"""This implements the exercises of minhash part for book: 'Mining the massive
+dataset(v1.3)'. """
+
+import sys
+import itertools as itor
+import math
+import re
+import numpy as np
+
+def fig_3_2():
+    """Return the data of Fig. 3.2 in column-major form.
+
+    Each inner list is one set; its entries are the rows a, b, c, d, e.
+    """
+    return [
+        # a, b, c, d, e
+        [1, 0, 0, 1, 0],  # s1
+        [0, 0, 1, 0, 0],  # s2
+        [0, 1, 0, 1, 1],  # s3
+        [1, 0, 1, 1, 0],  # s4
+    ]
+
+def fig_3_4():
+    """Return the data of Fig. 3.4, which is the same as that of Fig. 3.2."""
+    data = fig_3_2()
+    return data
+
+def fig_3_5():
+    """Return the data of Fig. 3.5 in column-major form.
+
+    Each inner list is one set with one 0/1 entry per row.
+    """
+    return [
+        [0, 0, 1, 0, 0, 1],  # s1
+        [1, 1, 0, 0, 0, 0],  # s2
+        [0, 0, 0, 1, 1, 0],  # s3
+        [1, 0, 1, 0, 1, 0],  # s4
+    ]
+# For two sets s1, s2, rows divided into:
+# Type X rows: 1 in both columns
+# Type Y rows: 1 in one column, 0 in the other column
+# Type Z rows: 0 in both columns
+
+def shape(col_major):
+    """Return (num_rows, num_cols) of a matrix stored as a list of columns."""
+    first_column = col_major[0]
+    return len(first_column), len(col_major)
+
+def jaccard_sim_naive(char_mat):
+    """Return the exact pairwise Jaccard similarities of the columns.
+
+    char_mat is a characteristic matrix stored as a list of 0/1 columns,
+    one column per set.  Entry [i, j] of the result is the similarity of
+    sets i and j: the number of rows that are 1 in both columns divided by
+    the number of rows that are 1 in at least one of them.  Two empty sets
+    are treated as identical (similarity 1) instead of dividing by zero.
+    An empty char_mat yields a 0x0 matrix.
+    """
+    set_num = len(char_mat)
+    sim_mat = np.zeros((set_num, set_num))
+    for i, s1 in enumerate(char_mat):
+        for j, s2 in enumerate(char_mat):
+            if i == j:
+                sim_mat[i, j] = 1
+                continue
+            # Type X rows: 1 in both columns.  Type Y rows: columns differ.
+            num_X = sum(1 for v1, v2 in zip(s1, s2) if v1 == v2 == 1)
+            num_Y = sum(1 for v1, v2 in zip(s1, s2) if v1 != v2)
+            if num_X + num_Y == 0:
+                # Both sets are empty; match the value used on the diagonal.
+                sim_mat[i, j] = 1
+            else:
+                sim_mat[i, j] = float(num_X) / float(num_X + num_Y)
+
+    return sim_mat
+
+
+def minhash_naive(char_mat):
+    """Return, for each pair of columns, the fraction of row permutations
+    under which both columns minhash to the same value.
+
+    char_mat is a list of columns.  Every permutation of the rows is
+    enumerated for every ordered pair of columns, so this is only usable
+    for very small matrices.
+    """
+    def same_minhash(perm, s1, s2):
+        # Walk the rows in permuted order until one decides the outcome.
+        for idx in perm:
+            first, second = s1[idx], s2[idx]
+            if first != second:
+                # The columns hash to different values.
+                return False
+            if first == 1:
+                # Both columns have their first 1 in this row.
+                return True
+        return False
+
+    set_num = len(char_mat)
+    num_rows = len(char_mat[0])
+    total_num = math.factorial(num_rows)
+    sim_mat = np.zeros((set_num, set_num))
+    for i, s1 in enumerate(char_mat):
+        for j, s2 in enumerate(char_mat):
+            same_hash_num = sum(
+                1 for perm in itor.permutations(xrange(num_rows))
+                if same_minhash(perm, s1, s2))
+            sim_mat[i, j] = float(same_hash_num) / float(total_num)
+
+    return sim_mat
+
+def calc_sig_mat(char_mat, hash_funcs, elements=None):
+    """Calculate the minhash signature matrix of a characteristic matrix.
+
+    char_mat is a list of 0/1 columns, one per set.  hash_funcs is a list
+    of functions applied to the element of each row.  elements gives the
+    element for each row index; when it is None or empty the row indices
+    themselves are hashed.
+
+    Returns a float array with one row per hash function and one column
+    per set.  Entry [i, j] is the smallest value of hash function i over
+    the rows where column j is 1.  Columns without any 1 keep the initial
+    sentinel value (sys.maxint on Python 2).
+    """
+    num_rows, num_cols = len(char_mat[0]), len(char_mat)
+    # "elements or default" raises ValueError for numpy arrays, so test
+    # for None / emptiness explicitly.
+    if elements is None or len(elements) == 0:
+        elements = range(num_rows)
+
+    sig_mat = np.empty((len(hash_funcs), num_cols))
+    # sys.maxint is absent on Python 3; sys.maxsize is the fallback there.
+    sig_mat.fill(getattr(sys, 'maxint', sys.maxsize))
+
+    for k in range(num_rows):
+        element = elements[k]
+        for i, hf in enumerate(hash_funcs):
+            hash_value = hf(element)
+            for j, col in enumerate(char_mat):
+                if col[k] == 1:
+                    sig_mat[i, j] = min(sig_mat[i, j], hash_value)
+
+    return sig_mat
+
+
+def jaccard_sim(sig_mat):
+    """Estimate pairwise Jaccard similarities from a signature matrix.
+
+    sig_mat has one row per hash function and one column per set.  Entry
+    [i, j] of the result is the fraction of hash functions for which
+    columns i and j hold the same value.  The diagonal is 1, as in
+    jaccard_sim_naive(); it used to be 0.
+
+    Raises ZeroDivisionError when sig_mat has no rows but at least two
+    columns.
+    """
+    num_hashes, num_sets = sig_mat.shape
+
+    sim_mat = np.zeros((num_sets, num_sets))
+
+    for i in range(num_sets):
+        for j in range(num_sets):
+            if i == j:
+                # A set is always identical to itself.
+                sim_mat[i, j] = 1
+            else:
+                same_hash_num = np.sum(sig_mat[:, i] == sig_mat[:, j])
+                sim_mat[i, j] = float(same_hash_num) / float(num_hashes)
+
+    return sim_mat
+
+def exercise_3_3_1():
+    """Exercise 3.3.1: verify the theorem from Section 3.3.3, which relates
+    the Jaccard similarity to the probability of minhashing to equal
+    values, for the particular case of Fig. 3.2.
+
+    Both parts are currently disabled; un-comment the calls at the end of
+    the function to print the results.
+    """
+    matrix = fig_3_2()
+
+    def a():
+        """Print the Jaccard similarity of each pair of columns."""
+        print(jaccard_sim_naive(matrix))
+
+    def b():
+        """Print, for each pair of columns, the fraction of the row
+        permutations that make the two columns hash to the same value."""
+        print(minhash_naive(matrix))
+
+    #a()
+    #b()
+
+def exercise_3_3_2():
+    """Exercise 3.3.2: signatures of the Fig. 3.4 data under h1..h4.
+
+    The computation is currently disabled; un-comment the last line to
+    print the signature matrix.
+    """
+    def h1(x):
+        return (x + 1) % 5
+
+    def h2(x):
+        return (3 * x + 1) % 5
+
+    # h3 and h4 lacked the "% 5" reduction that h1 and h2 apply, so their
+    # values left the 0..4 row range of the five-row matrix.
+    def h3(x):
+        return (2 * x + 4) % 5
+
+    def h4(x):
+        return (3 * x - 1) % 5
+
+    #print calc_sig_mat(fig_3_4(), [h1,h2,h3,h4])
+
+def exercise_3_3_3():
+    """Exercise 3.3.3, worked on the data of Fig. 3.5.
+
+    All three parts are currently disabled; un-comment the calls at the end
+    of the function to run them.
+    """
+    def h1(x):
+        return (2*x+1)%6
+
+    def h2(x):
+        return (3*x+2)%6
+
+    def h3(x):
+        return (5*x+2)%6
+
+    hash_funcs = [h1, h2, h3]
+    char_mat = fig_3_5()
+
+    def a():
+        """Print the minhash signature of each column under the three hash
+        functions h1(x) = 2x + 1 mod 6, h2(x) = 3x + 2 mod 6 and
+        h3(x) = 5x + 2 mod 6."""
+        print(calc_sig_mat(char_mat, hash_funcs))
+
+    def b():
+        """Print the values each hash function takes on 0..5, to see which
+        of them are true permutations."""
+        for hf in hash_funcs:
+            print(','.join(str(hf(i)) for i in xrange(6)))
+
+        # Only h3 takes six distinct values, so only h3 is a permutation.
+
+    def c():
+        """Print the true and the estimated Jaccard similarities."""
+        print("True jaccard similarity:")
+        print(jaccard_sim_naive(char_mat))
+        print("\nEsitmated jaccard similarity:")
+        print(jaccard_sim(calc_sig_mat(char_mat, hash_funcs)))
+
+    #a()
+    #b()
+    #c()
+
+def exercise_3_3_4():
+    """Not implemented yet; does nothing and returns None."""
+    return None
+
+def exec_exercises(_globals):
+    """Call every callable in _globals whose name starts with 'exercise_'.
+
+    _globals is a name -> object mapping such as the one returned by
+    globals().  The functions are called without arguments, in sorted name
+    order, so the sequence is the same on every run.  Entries that match
+    the prefix but are not callable are skipped.
+    """
+    # sorted() also snapshots the names, so an exercise that adds a global
+    # cannot break the iteration.
+    names = sorted(name for name in _globals if name.startswith('exercise_'))
+    for func_name in names:
+        func = _globals[func_name]
+        if callable(func):
+            func()
+
+# Run every exercise_* function of this module (also happens on import).
+exec_exercises(globals())