@@ -59,52 +59,62 @@ def SGD(self, training_data, epochs, mini_batch_size, eta,
                 training_data[k:k+mini_batch_size]
                 for k in xrange(0, n, mini_batch_size)]
             for mini_batch in mini_batches:
-                self.backprop(mini_batch, eta)
+                self.update_mini_batch(mini_batch, eta)
             if test_data:
                 print "Epoch {}: {} / {}".format(
                     j, self.evaluate(test_data), n_test)
             else:
                 print "Epoch %s complete" % j

-    def backprop(self, training_data, eta):
-        """Update the network's weights and biases by applying a single
-        iteration of gradient descent using backpropagation. The
-        ``training_data`` is a list of tuples ``(x, y)``. It need not
-        include the entire training data set --- it might be a
-        mini-batch, or even a single training example. ``eta`` is the
-        learning rate."""
+    def update_mini_batch(self, mini_batch, eta):
+        """Update the network's weights and biases by applying
+        gradient descent using backpropagation to a single mini batch.
+        The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta``
+        is the learning rate."""
         nabla_b = [np.zeros(b.shape) for b in self.biases]
         nabla_w = [np.zeros(w.shape) for w in self.weights]
-        for x, y in training_data:
-            # feedforward
-            activation = x
-            activations = [x] # list to store all the activations, layer by layer
-            zs = [] # list to store all the z vectors, layer by layer
-            for b, w in zip(self.biases, self.weights):
-                z = np.dot(w, activation)+b
-                zs.append(z)
-                activation = sigmoid_vec(z)
-                activations.append(activation)
-            # backward pass
-            delta = self.cost_derivative(activations[-1], y) * \
-                sigmoid_prime_vec(zs[-1])
-            nabla_b[-1] += delta
-            nabla_w[-1] += np.dot(delta, activations[-2].transpose())
-            # Note that the variable l in the loop below is used a
-            # little differently to the notation in Chapter 2 of the book.
-            # Here, l = 1 means the last layer of neurons, l = 2 is the
-            # second-last layer, and so on. It's a renumbering of the
-            # scheme used in the book, used here to take advantage of the
-            # fact that Python can use negative indices in lists.
-            for l in xrange(2, self.num_layers):
-                z = zs[-l]
-                spv = sigmoid_prime_vec(z)
-                delta = np.dot(self.weights[-l+1].transpose(), delta) * spv
-                nabla_b[-l] += delta
-                nabla_w[-l] += np.dot(delta, activations[-l-1].transpose())
+        for x, y in mini_batch:
+            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
+            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
+            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
         self.weights = [w-eta*nw for w, nw in zip(self.weights, nabla_w)]
         self.biases = [b-eta*nb for b, nb in zip(self.biases, nabla_b)]

+    def backprop(self, x, y):
+        """Return a tuple ``(nabla_b, nabla_w)`` representing the
+        gradient for the cost function C_x. ``nabla_b`` and
+        ``nabla_w`` are layer-by-layer lists of numpy arrays, similar
+        to ``self.biases`` and ``self.weights``."""
+        nabla_b = [np.zeros(b.shape) for b in self.biases]
+        nabla_w = [np.zeros(w.shape) for w in self.weights]
+        # feedforward
+        activation = x
+        activations = [x] # list to store all the activations, layer by layer
+        zs = [] # list to store all the z vectors, layer by layer
+        for b, w in zip(self.biases, self.weights):
+            z = np.dot(w, activation)+b
+            zs.append(z)
+            activation = sigmoid_vec(z)
+            activations.append(activation)
+        # backward pass
+        delta = self.cost_derivative(activations[-1], y) * \
+            sigmoid_prime_vec(zs[-1])
+        nabla_b[-1] = delta
+        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
+        # Note that the variable l in the loop below is used a little
+        # differently to the notation in Chapter 2 of the book. Here,
+        # l = 1 means the last layer of neurons, l = 2 is the
+        # second-last layer, and so on. It's a renumbering of the
+        # scheme in the book, used here to take advantage of the fact
+        # that Python can use negative indices in lists.
+        for l in xrange(2, self.num_layers):
+            z = zs[-l]
+            spv = sigmoid_prime_vec(z)
+            delta = np.dot(self.weights[-l+1].transpose(), delta) * spv
+            nabla_b[-l] = delta
+            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
+        return (nabla_b, nabla_w)
+
     def evaluate(self, test_data):
         """Return the number of test inputs for which the neural
         network outputs the correct result. Note that the neural
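For reference, the step that the new ``update_mini_batch`` takes is plain gradient descent on the summed per-example gradients: each call to ``self.backprop(x, y)`` contributes one gradient, the contributions are accumulated into ``nabla_b``/``nabla_w``, and the parameters are then moved by ``eta`` times that sum (this version does not divide by ``len(mini_batch)``). Below is a minimal standalone sketch of that accumulate-then-step pattern, not the repository's code: the array shapes and the ``fake_grads`` list are made up purely for illustration, with toy numpy arrays standing in for ``self.weights``.

import numpy as np

eta = 3.0
# Toy stand-ins for self.weights: one 3x2 and one 1x3 weight matrix.
weights = [np.random.randn(3, 2), np.random.randn(1, 3)]
nabla_w = [np.zeros(w.shape) for w in weights]

# Pretend each entry came from a call like self.backprop(x, y)
# (hypothetical values; backprop itself is not reproduced here).
fake_grads = [[0.1 * np.ones(w.shape) for w in weights] for _ in range(10)]

# Accumulate the per-example gradients, as update_mini_batch does.
for delta_nabla_w in fake_grads:
    nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]

# Take one gradient-descent step; note that eta scales the raw sum
# of gradients, not the mini-batch average.
weights = [w - eta * nw for w, nw in zip(weights, nabla_w)]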