Commit af6825e

Full implementation, but buggy
1 parent 31d9a77 commit af6825e

5 files changed: +81 additions, -53 deletions

compute_numerical_gradient.py

Lines changed: 10 additions & 6 deletions

@@ -6,7 +6,7 @@ def compute_numerical_gradient(func, theta):
     # func: a function that outputs a real-number. Calling y = J(theta) will return the
     # function value at theta.
 
-    # Initialize numgrad with zeros
+    # Initialize numgrad (no need to initialize to zero, empty_like is a good fit here)
     numgrad = np.empty_like(theta)
 
     # Instructions:
@@ -18,15 +18,19 @@ def compute_numerical_gradient(func, theta):
     # respect to theta(i).
     #
    # Hint: You will probably want to compute the elements of numgrad one at a time.
-    EPSILON = 1e-4
+    epsilon = 1e-4
     for i in xrange(theta.size):
+        # temporarily save the value
         theta_i = theta[i]
-        theta[i] = theta_i+EPSILON
+        # temporarily increase the value
+        theta[i] = theta_i+epsilon
         val_plus = func(theta)
-        theta[i] = theta_i-EPSILON
+        # temporarily decrease the value
+        theta[i] = theta_i-epsilon
         val_minus = func(theta)
-        numgrad[i] = (val_plus-val_minus)/(EPSILON*2)
-        # recover theta
+        # compute numerical gradient
+        numgrad[i] = (val_plus-val_minus)/(epsilon*2)
+        # restore theta
         theta[i] = theta_i
 
     return numgrad
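
Note: the central-difference routine above is easy to sanity-check against a function whose gradient is known in closed form. A minimal sketch (not part of this commit), assuming compute_numerical_gradient is importable from this module:

import numpy as np
from compute_numerical_gradient import compute_numerical_gradient

# f(theta) = sum(theta^2) has the analytic gradient 2*theta
func = lambda t: np.sum(t**2)
theta = np.array([1.0, -2.0, 0.5])

numgrad = compute_numerical_gradient(func, theta)
analytic = 2*theta

# relative difference should be tiny (roughly 1e-9 or smaller)
print(np.linalg.norm(numgrad-analytic)/np.linalg.norm(numgrad+analytic))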

display_network.py

Lines changed: 1 addition & 0 deletions

@@ -56,6 +56,7 @@ def display_network(data, cols=-1, opt_normalize=True, opt_graycolor=True):
             array[buf+i*(sz+buf):buf+i*(sz+buf)+sz, buf+j*(sz+buf):buf+j*(sz+buf)+sz] = data[k, :].reshape([sz, sz])/clim
             k += 1
 
+    # simulate imagesc
     ax = plt.figure().gca()
     pix_width = 5
     h, w = array.shape
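
The new comment refers to simulating MATLAB's imagesc. For reference, a minimal matplotlib sketch of that idea (standalone, using a random array rather than the tiled weight image this file builds):

import numpy as np
import matplotlib.pyplot as plt

array = np.random.rand(64, 64)   # stand-in for the tiled image assembled by display_network

# imagesc-style rendering: map the array's value range onto a gray colormap
plt.imshow(array, cmap='gray', interpolation='nearest')
plt.axis('off')
plt.show()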

sample_images.py

Lines changed: 1 addition & 4 deletions

@@ -25,17 +25,14 @@ def normalize_data(patches):
     return patches
 
 
-def sample_images():
+def sample_images(patch_size, num_patches):
     """
     :return: 10000 patches for training
     """
     ## Get IMAGES.mat from http://ufldl.stanford.edu/wiki/resources/sparseae_exercise.zip
     images = loadmat('IMAGES.mat')['IMAGES']  # load images from disk
     num_images = images.shape[2]
 
-    patch_size = 8  # we'll use 8x8 patches
-    num_patches = 10000
-
     # Initialize patches
     patches = np.empty([num_patches, patch_size*patch_size])
 
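The sampling loop itself falls outside this hunk. For context, a minimal sketch of how random patch extraction is commonly done for this exercise; the (rows, cols, num_images) layout and the helper name sample_patches are assumptions, not taken from the diff:

import numpy as np

def sample_patches(images, patch_size, num_patches):
    # images is assumed to have shape (rows, cols, num_images)
    rows, cols, num_images = images.shape
    patches = np.empty([num_patches, patch_size*patch_size])
    for n in range(num_patches):
        img = np.random.randint(num_images)        # random image
        r = np.random.randint(rows-patch_size+1)   # random top-left corner
        c = np.random.randint(cols-patch_size+1)
        patches[n, :] = images[r:r+patch_size, c:c+patch_size, img].flatten()
    return patches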

sparse_autoencoder_cost.py

Lines changed: 48 additions & 25 deletions

@@ -4,14 +4,14 @@
 import numpy as np
 
 
-def initialize_parameters(hidden_size, visible_size):
+def initialize_parameters(visible_size, hidden_size):
     # Initialize parameters randomly based on layer sizes.
     r = sqrt(6) / sqrt(hidden_size+visible_size+1)  # we'll choose weights uniformly from the interval [-r, r]
-    w1 = np.random.rand(hidden_size, visible_size) * 2 * r - r
-    w2 = np.random.rand(visible_size, hidden_size) * 2 * r - r
+    w1 = np.random.rand(visible_size, hidden_size) * 2 * r - r
+    w2 = np.random.rand(hidden_size, visible_size) * 2 * r - r
 
-    b1 = np.zeros((hidden_size, 1))
-    b2 = np.zeros((visible_size, 1))
+    b1 = np.zeros((1, hidden_size))
+    b2 = np.zeros((1, visible_size))
 
     # Convert weights and bias gradients to the vector form.
     # This step will "unroll" (flatten and concatenate together) all
@@ -42,18 +42,10 @@ def sparse_autoencoder_cost_and_grad(theta, visible_size, hidden_size, decay_lam
     # follows the notation convention of the lecture notes.
 
     num_combinations = visible_size*hidden_size
-    w1 = theta[0:num_combinations].reshape((hidden_size, visible_size))
-    w2 = theta[num_combinations:2*num_combinations].reshape((visible_size, hidden_size))
+    w1 = theta[0:num_combinations].reshape((visible_size, hidden_size))
+    w2 = theta[num_combinations:2*num_combinations].reshape((hidden_size, visible_size))
     b1 = theta[2*num_combinations:2*num_combinations+hidden_size]
     b2 = theta[2*num_combinations+hidden_size:]
-
-    # Cost and gradient variables (your code needs to compute these values).
-    # Here, we initialize them to zeros.
-    cost = 0
-    w1grad = np.zeros_like(w1)
-    w2grad = np.zeros_like(w2)
-    b1grad = np.zeros_like(b1)
-    b2grad = np.zeros_like(b2)
 
     # Instructions: Compute the cost/optimization objective J_sparse(W,b) for the Sparse Autoencoder,
     # and the corresponding gradients W1grad, W2grad, b1grad, b2grad.
@@ -68,20 +60,51 @@ def sparse_autoencoder_cost_and_grad(theta, visible_size, hidden_size, decay_lam
     #
     # Stated differently, if we were using batch gradient descent to optimize the parameters,
     # the gradient descent update to W1 would be W1 := W1 - alpha * W1grad, and similarly for W2, b1, b2.
-    #
-
-
-
-
-
-
-
-
+
+    # autoencoder, y = x
+    y = data
+
+    # feedforward pass
+    a1 = data
+    z2 = np.dot(a1, w1)+b1
+    a2 = sigmoid(z2)
+    z3 = np.dot(a2, w2)+b2
+    a3 = sigmoid(z3)
+
+    # compute all deltas
+    # output layer
+    prime3 = np.multiply(a3, (1.0-a3))
+    delta3 = -np.multiply(y-a3, prime3)
+    # hidden layer
+    one_over_m = 1.0/np.float32(data.shape[0])
+    sparsity_avg = one_over_m*np.sum(a2, axis=0)
+    sparsity_term = -sparsity_param/sparsity_avg+(1.0-sparsity_param)/(1.0-sparsity_avg)
+    prime2 = np.multiply(a2, (1.0-a2))
+    delta2 = np.multiply(np.dot(delta3, np.transpose(w2)) + beta*sparsity_term, prime2)
+
+    # compute gradient
+    w1grad = np.zeros_like(w1)
+    for i in xrange(data.shape[0]):
+        w1grad += np.dot(a1[i, :].reshape((visible_size, 1)), delta2[i, :].reshape((1, hidden_size)))
+    w1grad = one_over_m*w1grad + decay_lambda*w1
+    w2grad = np.zeros_like(w2)
+    for i in xrange(data.shape[0]):
+        w2grad += np.dot(a2[i, :].reshape((hidden_size, 1)), delta3[i, :].reshape((1, visible_size)))
+    w2grad = one_over_m*w2grad + decay_lambda*w2
+    b1grad = one_over_m*np.sum(delta2, axis=0)
+    b2grad = one_over_m*np.sum(delta3, axis=0)
+
+    # compute cost
+    error_flatten = (a3-y).flatten()
+    w1_flatten = w1.flatten()
+    w2_flatten = w2.flatten()
+    cost = np.dot(error_flatten, error_flatten)*one_over_m/2.0 + \
+        decay_lambda*(np.dot(w1_flatten, w1_flatten)+np.dot(w2_flatten, w2_flatten))/2.0 + \
+        beta*(np.sum(sparsity_param*np.log(sparsity_param/sparsity_avg)+(1.0-sparsity_param)*np.log((1.0-sparsity_param)/(1.0-sparsity_avg))))
 
     # After computing the cost and gradient, we will convert the gradients back
     # to a vector format (suitable for minFunc). Specifically, we will unroll
     # your gradient matrices into a vector.
-
     grad = np.concatenate((w1grad.flatten(), w2grad.flatten(), b1grad.flatten(), b2grad.flatten()))
 
     return cost, grad
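
One note on the gradient step above (not part of the commit): each per-example accumulation loop computes exactly the matrix product a1.T·delta2 (respectively a2.T·delta3), so both loops can be collapsed into single np.dot calls. A sketch assuming the same row-per-example array shapes used in the diff:

import numpy as np

def weight_gradients(a1, delta2, a2, delta3, w1, w2, decay_lambda):
    # vectorized equivalent of the two accumulation loops, plus weight decay
    one_over_m = 1.0/a1.shape[0]
    w1grad = one_over_m*np.dot(a1.T, delta2) + decay_lambda*w1   # sum_i outer(a1[i], delta2[i])
    w2grad = one_over_m*np.dot(a2.T, delta3) + decay_lambda*w2   # sum_i outer(a2[i], delta3[i])
    return w1grad, w2grad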

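sigmoid is called in the feedforward pass but not defined in this hunk; presumably it is the standard logistic function, whose derivative a*(1-a) is what the prime2/prime3 lines compute. A minimal sketch under that assumption:

import numpy as np

def sigmoid(z):
    # logistic function; derivative is sigmoid(z)*(1-sigmoid(z))
    return 1.0/(1.0+np.exp(-z))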
train.py

Lines changed: 21 additions & 18 deletions

@@ -13,8 +13,10 @@ def train():
     # allow your sparse autoencoder to get good filters; you do not need to
     # change the parameters below.
 
-    visible_size = 8*8  # number of input units
-    hidden_size = 25  # number of hidden units
+    patch_size = 8
+    num_patches = 10000
+    visible_size = patch_size**2  # number of input units
+    hidden_size = 25  # number of hidden units
     sparsity_param = 0.01  # desired average activation of the hidden units.
                            # (This was denoted by the Greek alphabet rho, which looks like a lower-case "p",
                            # in the lecture notes).
@@ -25,12 +27,12 @@ def train():
     # After implementing sampleIMAGES, the display_network command should
     # display a random sample of 200 patches from the dataset
 
-    patches = sample_images()
+    patches = sample_images(patch_size, num_patches)
     # list = [randint(0, patches.shape[0]-1) for i in xrange(64)]
     # display_network(patches[list, :], 8)
 
     # Obtain random parameters theta
-    theta = initialize_parameters(hidden_size, visible_size)
+    # theta = initialize_parameters(visible_size, hidden_size)
 
     # STEP 2: Implement sparseAutoencoderCost
     #
@@ -57,8 +59,8 @@ def train():
     # and/or lambda to zero may be helpful for debugging.) However, in your
     # final submission of the visualized weights, please use parameters we
     # gave in Step 0 above.
-    cost, grad = sparse_autoencoder_cost_and_grad(theta, visible_size, hidden_size,
-                                                  decay_lambda, sparsity_param, beta, patches)
+    # cost, grad = sparse_autoencoder_cost_and_grad(theta, visible_size, hidden_size,
+    #                                               decay_lambda, sparsity_param, beta, patches)
 
     # STEP 3: Gradient Checking
     #
@@ -69,35 +71,36 @@ def train():
     # First, lets make sure your numerical gradient computation is correct for a
     # simple function. After you have implemented compute_numerical_gradient,
     # run the following:
-    check_numerical_gradient()
+    # check_numerical_gradient()
 
     # Now we can use it to check your cost function and derivative calculations
     # for the sparse autoencoder.
-    func = lambda x: sparse_autoencoder_cost(x, visible_size, hidden_size,
-                                             decay_lambda, sparsity_param, beta, patches)
-    numgrad = compute_numerical_gradient(func, theta)
+    # func = lambda x: sparse_autoencoder_cost(x, visible_size, hidden_size,
+    #                                          decay_lambda, sparsity_param, beta, patches)
+    # numgrad = compute_numerical_gradient(func, theta)
 
     # Use this to visually compare the gradients side by side
-    print numgrad, grad
+    # print numgrad, grad
 
     # Compare numerically computed gradients with the ones obtained from backpropagation
-    diff = np.linalg.norm(numgrad-grad)/np.linalg.norm(numgrad+grad)
+    # diff = np.linalg.norm(numgrad-grad)/np.linalg.norm(numgrad+grad)
     # Should be small. In our implementation, these values are usually less than 1e-9.
-    print diff
+    # print diff
 
     # STEP 4: After verifying that your implementation of
-    # sparse_autoencoder_cost is correct, You can start training your sparse
-    # autoencoder with minFunc (L-BFGS).
+    #   sparse_autoencoder_cost is correct, You can start training your sparse
+    #   autoencoder with minFunc (L-BFGS).
 
-    # Randomly initialize the parameters
-    theta = initialize_parameters(hidden_size, visible_size)
+    # Randomly initialize the parameters
+    # Use minimize interface, and set jac=True, so it can accept cost and grad together
+    theta = initialize_parameters(visible_size, hidden_size)
     func_args = (visible_size, hidden_size, decay_lambda, sparsity_param, beta, patches)
     res = minimize(sparse_autoencoder_cost_and_grad, x0=theta, args=func_args, method='L-BFGS-B',
                    jac=True, options={'maxiter': 400, 'disp': True})
 
     # STEP 5: Visualization
     w1 = res.x[0: hidden_size*visible_size].reshape((hidden_size, visible_size))
-    # display_network(w1, 12)
+    display_network(w1, 5)
 
 
 if __name__ == "__main__":
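
The new "Use minimize interface, and set jac=True" comment describes the pattern used here: with jac=True, scipy.optimize.minimize expects the objective to return the cost and gradient together as a pair, which is exactly what sparse_autoencoder_cost_and_grad does. A standalone sketch of that pattern with a toy quadratic (not the autoencoder objective):

import numpy as np
from scipy.optimize import minimize

def cost_and_grad(x):
    # f(x) = ||x||^2 / 2, gradient = x
    return 0.5*np.dot(x, x), x

res = minimize(cost_and_grad, x0=np.ones(5), method='L-BFGS-B',
               jac=True, options={'maxiter': 400, 'disp': True})
print(res.x)   # converges to (approximately) zero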
