
Commit 7122ef0

Michael Lou committed
Compute Numerical Gradient and Gradient checking
1 parent d542a89

3 files changed: +111 -3 lines changed


check_numerical_gradient.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+import numpy as np
+
+from compute_numerical_gradient import compute_numerical_gradient
+
+
+def simple_quadratic_function(x):
+    # This function accepts a 2D vector as input.
+    # Its outputs are:
+    #   value: h(x1, x2) = x1^2 + 3*x1*x2
+    #   grad: a 2x1 vector giving the partial derivatives of h with respect to x1 and x2
+    # Note that when we pass simple_quadratic_function to check_numerical_gradient, we assume
+    # that compute_numerical_gradient uses only the first returned value of this function.
+    value = x[0]**2 + 3*x[0]*x[1]
+
+    grad = np.empty_like(x)
+    grad[0] = 2*x[0] + 3*x[1]
+    grad[1] = 3*x[0]
+
+    return value, grad
+
+
+def check_numerical_gradient():
+    # This code can be used to check your numerical gradient implementation
+    # in compute_numerical_gradient.py.
+    # It analytically evaluates the gradient of a very simple function called
+    # simple_quadratic_function (see above) and compares the result with your numerical
+    # solution. Your numerical gradient implementation is incorrect if
+    # your numerical solution deviates too much from the analytical solution.
+
+    # Evaluate the function and gradient at x = [4, 10] (here, x is a 2D vector).
+    x = np.array([4.0, 10.0])
+    value, grad = simple_quadratic_function(x)
+
+    # Use your code to numerically compute the gradient of simple_quadratic_function at x.
+    # (Here simple_quadratic_function is passed as a function object.)
+    numgrad = compute_numerical_gradient(simple_quadratic_function, x)
+
+    # Visually examine the two gradient computations. The two columns
+    # you get should be very similar.
+    print(numgrad, grad)
+    print('The above two columns you get should be very similar.\n'
+          '(Left - your numerical gradient, Right - analytical gradient)\n')
+
+    # Evaluate the norm of the difference between the two solutions.
+    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
+    # in compute_numerical_gradient.py, then diff below should be about 2.1452e-12.
+    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
+    print(diff)
+    print('Norm of the difference between numerical and analytical gradients (should be < 1e-9)\n')
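
For context on the numbers above: at x = [4, 10] the analytical gradient works out to [2*4 + 3*10, 3*4] = [38, 12], and a correct centered-difference implementation should match it to roughly 1e-12 relative error. A minimal standalone sketch of the same check, assuming only NumPy (the names here are illustrative, not part of the commit):

import numpy as np

def h(x):
    # The same toy function as above: h(x1, x2) = x1^2 + 3*x1*x2
    return x[0]**2 + 3*x[0]*x[1]

x = np.array([4.0, 10.0])
grad = np.array([2*x[0] + 3*x[1], 3*x[0]])   # analytical gradient: [38., 12.]

eps = 1e-4
numgrad = np.empty_like(x)
for i in range(x.size):
    e = np.zeros_like(x)
    e[i] = eps
    numgrad[i] = (h(x + e) - h(x - e)) / (2*eps)   # centered difference

print(numgrad)   # approximately [38. 12.]
print(np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad))   # ~1e-12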

compute_numerical_gradient.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+import numpy as np
+
+
+def compute_numerical_gradient(func, theta):
+    # theta: a vector of parameters
+    # func: a function that outputs a real number. Calling value, _ = func(theta) returns
+    #       the function value at theta.
+
+    # Initialize numgrad
+    numgrad = np.empty_like(theta)
+
+    # Instructions:
+    # Implement numerical gradient checking, and return the result in numgrad.
+    # (See Section 2.3 of the lecture notes.)
+    # You should write code so that numgrad[i] is (the numerical approximation to) the
+    # partial derivative of func with respect to the i-th input argument, evaluated at theta.
+    # I.e., numgrad[i] should be (approximately) the partial derivative of func with
+    # respect to theta[i].
+    #
+    # Hint: You will probably want to compute the elements of numgrad one at a time.
+    EPSILON = 1e-4
+    for i in range(theta.size):
+        theta_i = theta[i]
+        theta[i] = theta_i + EPSILON
+        val_plus = func(theta)[0]
+        theta[i] = theta_i - EPSILON
+        val_minus = func(theta)[0]
+        numgrad[i] = (val_plus - val_minus) / (2*EPSILON)
+        # restore theta[i]
+        theta[i] = theta_i
+
+    return numgrad
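
A usage note on the interface above (illustrative, not part of the commit): because the loop reads func(theta)[0], anything passed as func must return a tuple or sequence whose first element is the scalar function value. A minimal sketch under that assumption:

import numpy as np
from compute_numerical_gradient import compute_numerical_gradient

# Hypothetical test function: f(theta) = sum(theta^2), so the true gradient is 2*theta.
# The second tuple element is a placeholder so that func(theta)[0] picks out the value.
func = lambda theta: ((theta**2).sum(), None)

theta = np.array([1.0, -2.0, 3.0])
numgrad = compute_numerical_gradient(func, theta)
print(numgrad)    # approximately [ 2. -4.  6.]
print(2*theta)    # analytical gradient, for comparison

Note that the implementation perturbs theta in place and restores each entry afterwards, so the caller's theta is unchanged on return.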

train.py

Lines changed: 30 additions & 3 deletions
@@ -6,6 +6,8 @@
 from sample_images import sample_images
 from display_network import display_network
 from sparse_autoencoder_cost import sparse_autoencoder_cost
+from check_numerical_gradient import check_numerical_gradient
+from compute_numerical_gradient import compute_numerical_gradient
 
 
 def initialize_parameters(hidden_size, visible_size):
@@ -43,8 +45,8 @@ def train():
     # display a random sample of 200 patches from the dataset
 
     patches = sample_images()
-    list = [randint(0, patches.shape[0]-1) for i in xrange(64)]
-    display_network(patches[list, :], 8)
+    # list = [randint(0, patches.shape[0]-1) for i in xrange(64)]
+    # display_network(patches[list, :], 8)
 
     # Obtain random parameters theta
     theta = initialize_parameters(hidden_size, visible_size)
@@ -77,7 +79,32 @@ def train():
 
     cost, grad = sparse_autoencoder_cost(theta, visible_size, hidden_size, decay_lambda, sparsity_param, beta, patches)
 
-
+    ## STEP 3: Gradient Checking
+    #
+    # Hint: If you are debugging your code, performing gradient checking on smaller models
+    # and smaller training sets (e.g., using only 10 training examples and 1-2 hidden
+    # units) may speed things up.
+
+    # First, let's make sure your numerical gradient computation is correct for a
+    # simple function. After you have implemented compute_numerical_gradient,
+    # run the following:
+    check_numerical_gradient()
+
+    # Now we can use it to check your cost function and derivative calculations
+    # for the sparse autoencoder.
+    func = lambda x: sparse_autoencoder_cost(x, visible_size, hidden_size,
+                                             decay_lambda, sparsity_param, beta, patches)
+    numgrad = compute_numerical_gradient(func, theta)
+
+    # Use this to visually compare the gradients side by side.
+    print(numgrad, grad)
+
+    # Compare the numerically computed gradients with the ones obtained from backpropagation.
+    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
+    # Should be small. In our implementation, these values are usually less than 1e-9.
+    print(diff)
+
+    # When you get this working, congratulations!
 
 
 if __name__ == "__main__":
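
Following the hint in STEP 3, gradient checking runs much faster on a shrunken problem, since compute_numerical_gradient calls the cost function twice per parameter. A rough sketch of what that might look like inside train(), reusing the variables defined above; the small_* names are hypothetical, and the slicing assumes patches stores one training example per row:

# Hypothetical shrunken configuration for a quick gradient check.
small_visible, small_hidden = 8, 2
small_patches = patches[:10, :small_visible]   # only 10 training examples
small_theta = initialize_parameters(small_hidden, small_visible)
small_func = lambda x: sparse_autoencoder_cost(x, small_visible, small_hidden,
                                               decay_lambda, sparsity_param, beta, small_patches)
print(compute_numerical_gradient(small_func, small_theta))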
