
Commit ac28ed7

committed
Self-Taught Learning
1 parent 209a200 commit ac28ed7

7 files changed: +95, -115 lines

self_taught_learning/feed_forward_autoencoder.py

Lines changed: 4 additions & 4 deletions
@@ -12,14 +12,14 @@ def feed_forward_autoencoder(theta, hidden_size, visible_size, data):
     # We first convert theta to the (W1, W2, b1, b2) matrix/vector format, so that this
     # follows the notation convention of the lecture notes.
-    W1 = reshape(theta(1:hiddenSize*visibleSize), hiddenSize, visibleSize);
-    b1 = theta(2*hiddenSize*visibleSize+1:2*hiddenSize*visibleSize+hiddenSize);
+    num_combinations = visible_size*hidden_size
+    w1 = theta[0:num_combinations].reshape((visible_size, hidden_size))
+    b1 = theta[2*num_combinations:2*num_combinations+hidden_size]

     # Instructions: Compute the activation of the hidden layer for the Sparse Autoencoder.
-
+    activation = sigmoid(np.dot(data, w1)+b1)

     #-------------------------------------------------------------------
-
     return activation
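Note: the added activation line relies on a `sigmoid` helper and a NumPy import defined elsewhere in the module (they are not part of this hunk). A minimal sketch of the assumed helper, for reference only:

    import numpy as np

    def sigmoid(z):
        # element-wise logistic function; maps any real value into (0, 1)
        return 1.0 / (1.0 + np.exp(-z))

With `data` laid out one example per row, `np.dot(data, w1) + b1` has shape (num_examples, hidden_size), so the function returns one hidden-layer feature vector per example.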

self_taught_learning/stl_exercise.py

Lines changed: 70 additions & 81 deletions
@@ -1,3 +1,15 @@
+from sklearn.datasets import fetch_mldata
+import numpy as np
+from scipy.optimize import minimize
+from random import shuffle
+
+from sparse_autoencoder.display_network import display_network
+from sparse_autoencoder.sparse_autoencoder_cost import initialize_parameters, sparse_autoencoder_cost_and_grad
+from softmax_regression.softmax_train import softmax_train
+from softmax_regression.softmax_predict import softmax_predict
+
+from feed_forward_autoencoder import feed_forward_autoencoder
+
 ## CS294A/CS294W Self-taught Learning Exercise

 # Instructions
@@ -13,15 +25,15 @@
 # allow your sparse autoencoder to get good filters; you do not need to
 # change the parameters below.

-inputSize = 28 * 28;
-numLabels = 5;
-hiddenSize = 200;
-sparsityParam = 0.1; # desired average activation of the hidden units.
-# (This was denoted by the Greek alphabet rho, which looks like a lower-case "p",
-# in the lecture notes).
-lambda = 3e-3; # weight decay parameter
-beta = 3; # weight of sparsity penalty term
-maxIter = 400;
+input_size = 28 * 28
+num_labels = 5
+hidden_size = 200
+sparsity_param = 0.1  # desired average activation of the hidden units.
+# (This was denoted by the Greek alphabet rho, which looks like a lower-case "p",
+# in the lecture notes).
+decay_lambda = 3e-3  # weight decay parameter
+beta = 3  # weight of sparsity penalty term
+max_iter = 400
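One rename in this block is forced rather than stylistic: `lambda` is a reserved keyword in Python, so the weight decay parameter becomes `decay_lambda`; the remaining changes are straight camelCase-to-snake_case translations of the Matlab names.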

 ## ======================================================================
 # STEP 1: Load data from the MNIST database
@@ -31,77 +43,75 @@
 # change it.

 # Load MNIST database files
-mnistData = loadMNISTImages('mnist/train-images-idx3-ubyte');
-mnistLabels = loadMNISTLabels('mnist/train-labels-idx1-ubyte');
+mnist = fetch_mldata('MNIST original', data_home='../data/')
+images = np.float32(mnist.data)/255.0
+labels = mnist.target

 # Set Unlabeled Set (All Images)

 # Simulate a Labeled and Unlabeled set
-labeledSet = find(mnistLabels >= 0 & mnistLabels <= 4);
-unlabeledSet = find(mnistLabels >= 5);
-
-numTrain = round(numel(labeledSet)/2);
-trainSet = labeledSet(1:numTrain);
-testSet = labeledSet(numTrain+1:end);
-
-unlabeledData = mnistData(:, unlabeledSet);
-
-trainData = mnistData(:, trainSet);
-trainLabels = mnistLabels(trainSet)' + 1; # Shift Labels to the Range 1-5
+labeled_set = np.where((labels >= 0) & (labels <= 5))[0]
+unlabeled_set = np.where(labels >= 6)[0]

-testData = mnistData(:, testSet);
-testLabels = mnistLabels(testSet)' + 1; # Shift Labels to the Range 1-5
+unlabeled_data = images[unlabeled_set]

 # Output Some Statistics
-fprintf('# examples in unlabeled set: #d\n', size(unlabeledData, 2));
-fprintf('# examples in supervised training set: #d\n\n', size(trainData, 2));
-fprintf('# examples in supervised testing set: #d\n\n', size(testData, 2));
+print '# examples in unlabeled set: {0}'.format(unlabeled_data.shape[0])
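For orientation (not part of the commit): `fetch_mldata('MNIST original')` returns the digits as flat uint8 vectors, hence the division by 255.0 to put pixel values in [0, 1], and the two `np.where` calls split every example into either the labeled or the unlabeled pool. A small sanity-check sketch under those assumptions:

    # images: one 28*28 = 784-pixel row per example, values in [0, 1]
    assert images.shape[1] == 28 * 28
    # every example falls into exactly one of the two index sets
    assert labeled_set.shape[0] + unlabeled_set.shape[0] == labels.shape[0]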

 ## ======================================================================
 # STEP 2: Train the sparse autoencoder
 # This trains the sparse autoencoder on the unlabeled training
 # images.

-# Randomly initialize the parameters
-theta = initializeParameters(hiddenSize, inputSize);
-
-## ----------------- YOUR CODE HERE ----------------------
-# Find opttheta by running the sparse autoencoder on
-# unlabeledTrainingImages
-
-opttheta = theta;
-
-
-
-
-
-
-
-
+trained_theta_file = '../data/opttheta.npy'
+TRAIN = False
+if TRAIN:
+    # Randomly initialize the parameters
+    theta = initialize_parameters(input_size, hidden_size)
+
+    # Find opttheta by running the sparse autoencoder on
+    # unlabeledTrainingImages
+    func_args = (input_size, hidden_size, decay_lambda, sparsity_param, beta, unlabeled_data)
+    res = minimize(sparse_autoencoder_cost_and_grad, x0=theta, args=func_args, method='L-BFGS-B',
+                   jac=True, options={'maxiter': max_iter, 'disp': True})
+    opttheta = res.x
+    np.save(trained_theta_file, opttheta)
+else:
+    opttheta = np.load(trained_theta_file)

 ## -----------------------------------------------------

 # Visualize weights
-W1 = reshape(opttheta(1:hiddenSize * inputSize), hiddenSize, inputSize);
-display_network(W1');
+w1 = opttheta[0: hidden_size*input_size].reshape((input_size, hidden_size))
+display_network(w1.T, save_figure_path='../data/stl.png')
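A note on the L-BFGS-B call above: with `jac=True`, `scipy.optimize.minimize` treats the objective as returning the cost and its gradient together as a `(cost, grad)` pair, which is the contract `sparse_autoencoder_cost_and_grad` follows (see its `return cost, grad` further down in this commit). A self-contained sketch of that calling convention on a toy quadratic:

    import numpy as np
    from scipy.optimize import minimize

    def cost_and_grad(x):
        # toy objective 0.5*||x||^2; its gradient is simply x
        return 0.5 * np.dot(x, x), x

    res = minimize(cost_and_grad, x0=np.ones(3), method='L-BFGS-B', jac=True)
    # res.x holds the optimized vector, analogous to opttheta above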

 ##======================================================================
 ## STEP 3: Extract Features from the Supervised Dataset
 #
 # You need to complete the code in feedForwardAutoencoder.m so that the
 # following command will extract features from the data.

-trainFeatures = feedForwardAutoencoder(opttheta, hiddenSize, inputSize, ...
-                                       trainData);
+num_train = np.round(labeled_set.shape[0]/2)
+indices = [i for i in xrange(labeled_set.shape[0])]
+shuffle(indices)
+train_set = labeled_set[indices[0:num_train]]
+test_set = labeled_set[indices[num_train:]]
+
+train_data = images[train_set]
+train_labels = labels[train_set]  # Shift Labels to the Range 1-5

-testFeatures = feedForwardAutoencoder(opttheta, hiddenSize, inputSize, ...
-                                      testData);
+test_data = images[test_set]
+test_labels = labels[test_set]  # Shift Labels to the Range 1-5
+
+print '# examples in supervised training set: {0}'.format(train_data.shape[0])
+print '# examples in supervised testing set: {0}'.format(test_data.shape[0])
+
+train_features = feed_forward_autoencoder(opttheta, hidden_size, input_size, train_data)
+
+test_features = feed_forward_autoencoder(opttheta, hidden_size, input_size, test_data)

 ##======================================================================
 ## STEP 4: Train the softmax classifier
-
-softmaxModel = struct;
-## ----------------- YOUR CODE HERE ----------------------
 # Use softmaxTrain.m from the previous exercise to train a multi-class
 # classifier.
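The point of STEP 3 is that the classifier never sees raw pixels again: `train_features` and `test_features` have `hidden_size` (200) columns of autoencoder activations instead of the 784 pixel columns, which is why `softmax_train` in the next hunk is given `hidden_size` as its input dimension. Roughly:

    # train_data:     (num_train, 784)  raw pixels
    # train_features: (num_train, 200)  hidden-layer activations
    # test_features:  (num_test,  200)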

@@ -110,43 +120,22 @@
 # You need to compute softmaxModel using softmaxTrain on trainFeatures and
 # trainLabels

-
-
-
-
-
-
-
-
-
-## -----------------------------------------------------
-
+num_classes = 10
+decay_lambda = 1e-4
+options = {'maxiter': 100}
+softmax_model = softmax_train(hidden_size, num_classes, decay_lambda, train_features, train_labels, options)

 ##======================================================================
 ## STEP 5: Testing

-## ----------------- YOUR CODE HERE ----------------------
 # Compute Predictions on the test set (testFeatures) using softmaxPredict
 # and softmaxModel
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+pred = softmax_predict(softmax_model, test_features)

 ## -----------------------------------------------------
-
 # Classification Score
-fprintf('Test Accuracy: #f##\n', 100*mean(pred(:) == testLabels(:)));
+acc = np.mean(test_labels == pred)
+print 'Test Accuracy: {0:.3f}\n'.format(100*acc)

 # (note that we shift the labels by 1, so that digit 0 now corresponds to
 # label 1)

softmax_regression/softmax_cost.py

Lines changed: 2 additions & 3 deletions
@@ -38,12 +38,11 @@ def softmax_cost_and_grad(theta, num_classes, input_size, decay_lambda, data, la
            decay_lambda/2.0*np.dot(theta.flatten(), theta.flatten())

     # compute gradient
-    thetagrad = -np.dot(data.T, ground_truth-hypothesis)/num_cases + \
-                decay_lambda*theta
+    thetagrad = -np.dot(data.T, ground_truth-hypothesis)/num_cases + decay_lambda*theta

     # ------------------------------------------------------------------
     # Unroll the gradient matrices into a vector for minFunc
-    grad = thetagrad.flatten()
+    grad = thetagrad.ravel()

     return cost, grad
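The flatten-to-ravel switch here (and in sparse_autoencoder_cost.py below) is a small memory optimization rather than a behavior change: `ndarray.flatten()` always allocates a copy, while `ndarray.ravel()` returns a view when the array is already contiguous, so the dot products and `np.concatenate` see the same values without the extra allocation. A quick illustration:

    import numpy as np

    a = np.zeros((2, 3))
    assert a.flatten().base is None   # flatten always copies
    assert a.ravel().base is a        # ravel can return a view of a contiguous array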

softmax_regression/softmax_exercise.py

Lines changed: 1 addition & 1 deletion
@@ -120,7 +120,7 @@
 pred = softmax_predict(softmax_model, input_data)

 acc = np.mean(labels == pred)
-print ('Accuracy: #0.3f##\n', acc * 100)
+print 'Accuracy: {0:.3f}\n'.format(acc * 100)

 # Accuracy is the proportion of correctly classified images
 # After 100 iterations, the results for our implementation were:

sparse_autoencoder/display_network.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ def display_network(data, cols=-1, opt_normalize=True, opt_graycolor=True, save_
     k = 0
     for i in xrange(m):
         for j in xrange(n):
-            if k > num:
+            if k >= num:
                 continue
             if opt_normalize:
                 clim = np.amax(np.absolute(data[k, :]))

sparse_autoencoder/sparse_autoencoder_cost.py

Lines changed: 16 additions & 24 deletions
@@ -1,4 +1,5 @@
 from math import sqrt
+import gc

 import numpy as np

@@ -65,47 +66,38 @@ def sparse_autoencoder_cost_and_grad(theta, visible_size, hidden_size, decay_lam

     # feedforward pass
     a1 = data
-    z2 = np.dot(a1, w1)+b1
-    a2 = sigmoid(z2)
-    z3 = np.dot(a2, w2)+b2
-    a3 = sigmoid(z3)
+    a2 = sigmoid(np.dot(a1, w1)+b1)
+    a3 = sigmoid(np.dot(a2, w2)+b2)

     # compute all deltas
     # output layer
-    prime3 = np.multiply(a3, (1.0-a3))
-    delta3 = -np.multiply(y-a3, prime3)
+    delta3 = (a3-y)*a3*(1.0-a3)
     # hidden layer
     one_over_m = 1.0/np.float32(data.shape[0])
     sparsity_avg = one_over_m*np.sum(a2, axis=0)
     sparsity_term = -sparsity_param/sparsity_avg+(1.0-sparsity_param)/(1.0-sparsity_avg)
-    prime2 = np.multiply(a2, (1.0-a2))
-    delta2 = np.multiply(np.dot(delta3, np.transpose(w2)) + beta*sparsity_term, prime2)
-
-    # compute partial gradient
-    w1grad_p = np.dot(a1.T, delta2)
-    w2grad_p = np.dot(a2.T, delta3)
-    b1grad_p = delta2
-    b2grad_p = delta3
+    delta2 = (np.dot(delta3, w2.T) + beta*sparsity_term)*a2*(1.0-a2)
+    del sparsity_term
+    gc.collect()

     # compute gradient
-    w1grad = one_over_m*w1grad_p + decay_lambda*w1
-    w2grad = one_over_m*w2grad_p + decay_lambda*w2
-    b1grad = one_over_m*np.sum(b1grad_p, axis=0)
-    b2grad = one_over_m*np.sum(b2grad_p, axis=0)
+    w1grad = one_over_m*np.dot(a1.T, delta2) + decay_lambda*w1
+    w2grad = one_over_m*np.dot(a2.T, delta3) + decay_lambda*w2
+    b1grad = one_over_m*np.sum(delta2, axis=0)
+    b2grad = one_over_m*np.sum(delta3, axis=0)

     # compute cost
-    error_flatten = (a3-y).flatten()
-    w1_flatten = w1.flatten()
-    w2_flatten = w2.flatten()
-    cost = np.dot(error_flatten, error_flatten)*one_over_m/2.0 + \
-           decay_lambda*(np.dot(w1_flatten, w1_flatten)+np.dot(w2_flatten, w2_flatten))/2.0 + \
+    w1_ravel = w1.ravel()
+    w2_ravel = w2.ravel()
+    cost = np.dot((a3-y).ravel(), (a3-y).ravel())*one_over_m/2.0 + \
+           decay_lambda*(np.dot(w1_ravel, w1_ravel)+np.dot(w2_ravel, w2_ravel))/2.0 + \
           beta*(np.sum(sparsity_param*np.log(sparsity_param/sparsity_avg) +
                        (1.0-sparsity_param)*np.log((1.0-sparsity_param)/(1.0-sparsity_avg))))

     # After computing the cost and gradient, we will convert the gradients back
     # to a vector format (suitable for minFunc). Specifically, we will unroll
     # your gradient matrices into a vector.
-    grad = np.concatenate((w1grad.flatten(), w2grad.flatten(), b1grad.flatten(), b2grad.flatten()))
+    grad = np.concatenate((w1grad.ravel(), w2grad.ravel(), b1grad.ravel(), b2grad.ravel()))

     return cost, grad
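Since the backpropagation above was rewritten (for example, `delta3 = (a3-y)*a3*(1.0-a3)` is algebraically the same as the old `-np.multiply(y-a3, prime3)` with `prime3 = a3*(1.0-a3)`), a numerical gradient check is the standard way to confirm the refactor preserved the gradient. A self-contained sketch; the helper name `numerical_gradient` is hypothetical and not part of this repository, and it only assumes the cost function returns a `(cost, grad)` pair:

    import numpy as np

    def numerical_gradient(cost_fn, theta, eps=1e-4):
        # central finite differences, one coordinate at a time
        grad = np.zeros_like(theta)
        for i in xrange(theta.shape[0]):
            step = np.zeros_like(theta)
            step[i] = eps
            grad[i] = (cost_fn(theta + step)[0] - cost_fn(theta - step)[0]) / (2.0 * eps)
        return grad

    # usage sketch on a tiny problem (sizes illustrative):
    # func = lambda t: sparse_autoencoder_cost_and_grad(t, 8, 5, decay_lambda,
    #                                                   sparsity_param, beta, small_data)
    # np.linalg.norm(numerical_gradient(func, theta) - func(theta)[1]) should be ~1e-9 or smaller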

sparse_autoencoder/train.py

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ def train():

     # STEP 5: Visualization
     w1 = res.x[0: hidden_size*visible_size].reshape((visible_size, hidden_size))
-    display_network(np.transpose(w1), 5, save_figure_path='../data/sparse_autoencoder.png')
+    display_network(w1.T, 5, save_figure_path='../data/sparse_autoencoder.png')


 if __name__ == "__main__":
