Commit af6825e

Full implementation, but buggy
1 parent 31d9a77 commit af6825e

5 files changed: +81 additions, -53 deletions

compute_numerical_gradient.py

Lines changed: 10 additions & 6 deletions

@@ -6,7 +6,7 @@ def compute_numerical_gradient(func, theta):
     # func: a function that outputs a real-number. Calling y = J(theta) will return the
     # function value at theta.
 
-    # Initialize numgrad with zeros
+    # Initialize numgrad (no need to initialize to zero, empty_like is a good fit here)
     numgrad = np.empty_like(theta)
 
     # Instructions:
@@ -18,15 +18,19 @@ def compute_numerical_gradient(func, theta):
     # respect to theta(i).
     #
    # Hint: You will probably want to compute the elements of numgrad one at a time.
-    EPSILON = 1e-4
+    epsilon = 1e-4
     for i in xrange(theta.size):
+        # temporarily save the value
         theta_i = theta[i]
-        theta[i] = theta_i+EPSILON
+        # temporarily increase the value
+        theta[i] = theta_i+epsilon
         val_plus = func(theta)
-        theta[i] = theta_i-EPSILON
+        # temporarily decrease the value
+        theta[i] = theta_i-epsilon
         val_minus = func(theta)
-        numgrad[i] = (val_plus-val_minus)/(EPSILON*2)
-        # recover theta
+        # compute numerical gradient
+        numgrad[i] = (val_plus-val_minus)/(epsilon*2)
+        # restore theta
         theta[i] = theta_i
 
     return numgrad
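
Note: the central-difference routine above is easy to sanity-check against a function whose gradient is known in closed form. A minimal sketch (not part of this commit), assuming compute_numerical_gradient is importable from this module:

import numpy as np
from compute_numerical_gradient import compute_numerical_gradient

# f(theta) = sum(theta^2) has the analytic gradient 2*theta
func = lambda t: np.sum(t**2)
theta = np.array([1.0, -2.0, 0.5])

numgrad = compute_numerical_gradient(func, theta)
analytic = 2*theta

# relative difference should be tiny (roughly 1e-9 or smaller)
print(np.linalg.norm(numgrad-analytic)/np.linalg.norm(numgrad+analytic))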

display_network.py

Lines changed: 1 addition & 0 deletions

@@ -56,6 +56,7 @@ def display_network(data, cols=-1, opt_normalize=True, opt_graycolor=True):
             array[buf+i*(sz+buf):buf+i*(sz+buf)+sz, buf+j*(sz+buf):buf+j*(sz+buf)+sz] = data[k, :].reshape([sz, sz])/clim
             k += 1
 
+    # simulate imagesc
     ax = plt.figure().gca()
     pix_width = 5
     h, w = array.shape
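
The new comment refers to simulating MATLAB's imagesc. For reference, a minimal matplotlib sketch of that idea (standalone, using a random array rather than the tiled weight image this file builds):

import numpy as np
import matplotlib.pyplot as plt

array = np.random.rand(64, 64)   # stand-in for the tiled image assembled by display_network

# imagesc-style rendering: map the array's value range onto a gray colormap
plt.imshow(array, cmap='gray', interpolation='nearest')
plt.axis('off')
plt.show()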

sample_images.py

Lines changed: 1 addition & 4 deletions

@@ -25,17 +25,14 @@ def normalize_data(patches):
     return patches
 
 
-def sample_images():
+def sample_images(patch_size, num_patches):
     """
     :return: 10000 patches for training
     """
     ## Get IMAGES.mat from http://ufldl.stanford.edu/wiki/resources/sparseae_exercise.zip
     images = loadmat('IMAGES.mat')['IMAGES']  # load images from disk
     num_images = images.shape[2]
 
-    patch_size = 8  # we'll use 8x8 patches
-    num_patches = 10000
-
     # Initialize patches
     patches = np.empty([num_patches, patch_size*patch_size])
 
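The sampling loop itself falls outside this hunk. For context, a minimal sketch of how random patch extraction is commonly done for this exercise; the (rows, cols, num_images) layout and the helper name sample_patches are assumptions, not taken from the diff:

import numpy as np

def sample_patches(images, patch_size, num_patches):
    # images is assumed to have shape (rows, cols, num_images)
    rows, cols, num_images = images.shape
    patches = np.empty([num_patches, patch_size*patch_size])
    for n in range(num_patches):
        img = np.random.randint(num_images)        # random image
        r = np.random.randint(rows-patch_size+1)   # random top-left corner
        c = np.random.randint(cols-patch_size+1)
        patches[n, :] = images[r:r+patch_size, c:c+patch_size, img].flatten()
    return patches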

sparse_autoencoder_cost.py

Lines changed: 48 additions & 25 deletions

@@ -4,14 +4,14 @@
 import numpy as np
 
 
-def initialize_parameters(hidden_size, visible_size):
+def initialize_parameters(visible_size, hidden_size):
     # Initialize parameters randomly based on layer sizes.
     r = sqrt(6) / sqrt(hidden_size+visible_size+1)  # we'll choose weights uniformly from the interval [-r, r]
-    w1 = np.random.rand(hidden_size, visible_size) * 2 * r - r
-    w2 = np.random.rand(visible_size, hidden_size) * 2 * r - r
+    w1 = np.random.rand(visible_size, hidden_size) * 2 * r - r
+    w2 = np.random.rand(hidden_size, visible_size) * 2 * r - r
 
-    b1 = np.zeros((hidden_size, 1))
-    b2 = np.zeros((visible_size, 1))
+    b1 = np.zeros((1, hidden_size))
+    b2 = np.zeros((1, visible_size))
 
     # Convert weights and bias gradients to the vector form.
     # This step will "unroll" (flatten and concatenate together) all
@@ -42,18 +42,10 @@ def sparse_autoencoder_cost_and_grad(theta, visible_size, hidden_size, decay_lam
     # follows the notation convention of the lecture notes.
 
     num_combinations = visible_size*hidden_size
-    w1 = theta[0:num_combinations].reshape((hidden_size, visible_size))
-    w2 = theta[num_combinations:2*num_combinations].reshape((visible_size, hidden_size))
+    w1 = theta[0:num_combinations].reshape((visible_size, hidden_size))
+    w2 = theta[num_combinations:2*num_combinations].reshape((hidden_size, visible_size))
     b1 = theta[2*num_combinations:2*num_combinations+hidden_size]
     b2 = theta[2*num_combinations+hidden_size:]
-
-    # Cost and gradient variables (your code needs to compute these values).
-    # Here, we initialize them to zeros.
-    cost = 0
-    w1grad = np.zeros_like(w1)
-    w2grad = np.zeros_like(w2)
-    b1grad = np.zeros_like(b1)
-    b2grad = np.zeros_like(b2)
 
     # Instructions: Compute the cost/optimization objective J_sparse(W,b) for the Sparse Autoencoder,
     # and the corresponding gradients W1grad, W2grad, b1grad, b2grad.
@@ -68,20 +60,51 @@ def sparse_autoencoder_cost_and_grad(theta, visible_size, hidden_size, decay_lam
     #
     # Stated differently, if we were using batch gradient descent to optimize the parameters,
     # the gradient descent update to W1 would be W1 := W1 - alpha * W1grad, and similarly for W2, b1, b2.
-    #
-
-
-
-
-
-
-
-
+
+    # autoencoder, y = x
+    y = data
+
+    # feedforward pass
+    a1 = data
+    z2 = np.dot(a1, w1)+b1
+    a2 = sigmoid(z2)
+    z3 = np.dot(a2, w2)+b2
+    a3 = sigmoid(z3)
+
+    # compute all deltas
+    # output layer
+    prime3 = np.multiply(a3, (1.0-a3))
+    delta3 = -np.multiply(y-a3, prime3)
+    # hidden layer
+    one_over_m = 1.0/np.float32(data.shape[0])
+    sparsity_avg = one_over_m*np.sum(a2, axis=0)
+    sparsity_term = -sparsity_param/sparsity_avg+(1.0-sparsity_param)/(1.0-sparsity_avg)
+    prime2 = np.multiply(a2, (1.0-a2))
+    delta2 = np.multiply(np.dot(delta3, np.transpose(w2)) + beta*sparsity_term, prime2)
+
+    # compute gradient
+    w1grad = np.zeros_like(w1)
+    for i in xrange(data.shape[0]):
+        w1grad += np.dot(a1[i, :].reshape((visible_size, 1)), delta2[i, :].reshape((1, hidden_size)))
+    w1grad = one_over_m*w1grad + decay_lambda*w1
+    w2grad = np.zeros_like(w2)
+    for i in xrange(data.shape[0]):
+        w2grad += np.dot(a2[i, :].reshape((hidden_size, 1)), delta3[i, :].reshape((1, visible_size)))
+    w2grad = one_over_m*w2grad + decay_lambda*w2
+    b1grad = one_over_m*np.sum(delta2, axis=0)
+    b2grad = one_over_m*np.sum(delta3, axis=0)
+
+    # compute cost
+    error_flatten = (a3-y).flatten()
+    w1_flatten = w1.flatten()
+    w2_flatten = w2.flatten()
+    cost = np.dot(error_flatten, error_flatten)*one_over_m/2.0 + \
+        decay_lambda*(np.dot(w1_flatten, w1_flatten)+np.dot(w2_flatten, w2_flatten))/2.0 + \
+        beta*(np.sum(sparsity_param*np.log(sparsity_param/sparsity_avg)+(1.0-sparsity_param)*np.log((1.0-sparsity_param)/(1.0-sparsity_avg))))
 
     # After computing the cost and gradient, we will convert the gradients back
     # to a vector format (suitable for minFunc). Specifically, we will unroll
     # your gradient matrices into a vector.
-
     grad = np.concatenate((w1grad.flatten(), w2grad.flatten(), b1grad.flatten(), b2grad.flatten()))
 
     return cost, grad
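
One note on the gradient step above (not part of the commit): each per-example accumulation loop computes exactly the matrix product a1.T·delta2 (respectively a2.T·delta3), so both loops can be collapsed into single np.dot calls. A sketch assuming the same row-per-example array shapes used in the diff:

import numpy as np

def weight_gradients(a1, delta2, a2, delta3, w1, w2, decay_lambda):
    # vectorized equivalent of the two accumulation loops, plus weight decay
    one_over_m = 1.0/a1.shape[0]
    w1grad = one_over_m*np.dot(a1.T, delta2) + decay_lambda*w1   # sum_i outer(a1[i], delta2[i])
    w2grad = one_over_m*np.dot(a2.T, delta3) + decay_lambda*w2   # sum_i outer(a2[i], delta3[i])
    return w1grad, w2grad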

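sigmoid is called in the feedforward pass but not defined in this hunk; presumably it is the standard logistic function, whose derivative a*(1-a) is what the prime2/prime3 lines compute. A minimal sketch under that assumption:

import numpy as np

def sigmoid(z):
    # logistic function; derivative is sigmoid(z)*(1-sigmoid(z))
    return 1.0/(1.0+np.exp(-z))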
train.py

Lines changed: 21 additions & 18 deletions

@@ -13,8 +13,10 @@ def train():
     # allow your sparse autoencoder to get good filters; you do not need to
     # change the parameters below.
 
-    visible_size = 8*8  # number of input units
-    hidden_size = 25  # number of hidden units
+    patch_size = 8
+    num_patches = 10000
+    visible_size = patch_size**2  # number of input units
+    hidden_size = 25  # number of hidden units
     sparsity_param = 0.01  # desired average activation of the hidden units.
                            # (This was denoted by the Greek alphabet rho, which looks like a lower-case "p",
                            # in the lecture notes).
@@ -25,12 +27,12 @@ def train():
     # After implementing sampleIMAGES, the display_network command should
     # display a random sample of 200 patches from the dataset
 
-    patches = sample_images()
+    patches = sample_images(patch_size, num_patches)
     # list = [randint(0, patches.shape[0]-1) for i in xrange(64)]
     # display_network(patches[list, :], 8)
 
     # Obtain random parameters theta
-    theta = initialize_parameters(hidden_size, visible_size)
+    # theta = initialize_parameters(visible_size, hidden_size)
 
     # STEP 2: Implement sparseAutoencoderCost
     #
@@ -57,8 +59,8 @@ def train():
     # and/or lambda to zero may be helpful for debugging.) However, in your
     # final submission of the visualized weights, please use parameters we
     # gave in Step 0 above.
-    cost, grad = sparse_autoencoder_cost_and_grad(theta, visible_size, hidden_size,
-                                                  decay_lambda, sparsity_param, beta, patches)
+    # cost, grad = sparse_autoencoder_cost_and_grad(theta, visible_size, hidden_size,
+    #                                               decay_lambda, sparsity_param, beta, patches)
 
     # STEP 3: Gradient Checking
     #
@@ -69,35 +71,36 @@ def train():
     # First, lets make sure your numerical gradient computation is correct for a
     # simple function. After you have implemented compute_numerical_gradient,
     # run the following:
-    check_numerical_gradient()
+    # check_numerical_gradient()
 
     # Now we can use it to check your cost function and derivative calculations
     # for the sparse autoencoder.
-    func = lambda x: sparse_autoencoder_cost(x, visible_size, hidden_size,
-                                             decay_lambda, sparsity_param, beta, patches)
-    numgrad = compute_numerical_gradient(func, theta)
+    # func = lambda x: sparse_autoencoder_cost(x, visible_size, hidden_size,
+    #                                          decay_lambda, sparsity_param, beta, patches)
+    # numgrad = compute_numerical_gradient(func, theta)
 
     # Use this to visually compare the gradients side by side
-    print numgrad, grad
+    # print numgrad, grad
 
     # Compare numerically computed gradients with the ones obtained from backpropagation
-    diff = np.linalg.norm(numgrad-grad)/np.linalg.norm(numgrad+grad)
+    # diff = np.linalg.norm(numgrad-grad)/np.linalg.norm(numgrad+grad)
     # Should be small. In our implementation, these values are usually less than 1e-9.
-    print diff
+    # print diff
 
     # STEP 4: After verifying that your implementation of
-    # sparse_autoencoder_cost is correct, You can start training your sparse
-    # autoencoder with minFunc (L-BFGS).
+    #   sparse_autoencoder_cost is correct, You can start training your sparse
+    #   autoencoder with minFunc (L-BFGS).
 
-    # Randomly initialize the parameters
-    theta = initialize_parameters(hidden_size, visible_size)
+    # Randomly initialize the parameters
+    # Use minimize interface, and set jac=True, so it can accept cost and grad together
+    theta = initialize_parameters(visible_size, hidden_size)
     func_args = (visible_size, hidden_size, decay_lambda, sparsity_param, beta, patches)
     res = minimize(sparse_autoencoder_cost_and_grad, x0=theta, args=func_args, method='L-BFGS-B',
                    jac=True, options={'maxiter': 400, 'disp': True})
 
     # STEP 5: Visualization
     w1 = res.x[0: hidden_size*visible_size].reshape((hidden_size, visible_size))
-    # display_network(w1, 12)
+    display_network(w1, 5)
 
 
 if __name__ == "__main__":
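
The new "Use minimize interface, and set jac=True" comment describes the pattern used here: with jac=True, scipy.optimize.minimize expects the objective to return the cost and gradient together as a pair, which is exactly what sparse_autoencoder_cost_and_grad does. A standalone sketch of that pattern with a toy quadratic (not the autoencoder objective):

import numpy as np
from scipy.optimize import minimize

def cost_and_grad(x):
    # f(x) = ||x||^2 / 2, gradient = x
    return 0.5*np.dot(x, x), x

res = minimize(cost_and_grad, x0=np.ones(5), method='L-BFGS-B',
               jac=True, options={'maxiter': 400, 'disp': True})
print(res.x)   # converges to (approximately) zero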
