|
3 | 3 | ~~~~~~~~~~~~
|
4 | 4 |
|
5 | 5 | A library to load the MNIST image data. For details of the data
|
6 |
| -structures that are returned, see the doc string for ``load_data``. |
7 |
| -The library also contains a helper method ``load_data_nn`` which |
8 |
| -returns the data in a format well adapted for use with our neural |
9 |
| -network code. |
| 6 | +structures that are returned, see the doc strings for ``load_data`` |
| 7 | +and ``load_data_wrapper``. In practice, ``load_data_wrapper`` is the |
| 8 | +function usually called by our neural network code. |
10 | 9 |
|
11 | 10 | Note that the code requires the file ``../data/mnist.pkl``. This is not
|
12 | 11 | included in the repository. It may be downloaded from:
|
@@ -40,46 +39,49 @@ def load_data():
|
40 | 39 | The ``validation_data`` and ``test_data`` are similar, except
|
41 | 40 | each contains only 10,000 images.
|
42 | 41 |
|
43 |
| - Note that the format the data is returned in is well adapted for |
44 |
| - use by scikit-learn's SVM method, but not so well adapted for our |
45 |
| - neural network code. For that, see the wrapper function |
46 |
| - ``load_data_nn``. |
| 42 | + This is a nice and convenient data format, but for use in neural |
| 43 | + networks it's actually helpful to modify the format of the |
| 44 | + ``training_data`` a little. That's done in the wrapper function |
| 45 | + ``load_data_wrapper()``, see below. |
47 | 46 | """
|
48 | 47 | f = open('../data/mnist.pkl', 'rb')
|
49 | 48 | training_data, validation_data, test_data = cPickle.load(f)
|
50 | 49 | f.close()
|
51 | 50 | return (training_data, validation_data, test_data)
|
52 | 51 |
|
53 |
| -def load_data_nn(): |
54 |
| - """Return a tuple containing ``(training_data, test_inputs, |
55 |
| - actual_test_results)`` from the MNIST data. The tuples are in a |
56 |
| - format optimized for use by our neural network code. This |
57 |
| - function makesuse of ``load_data()``, but does some additional |
58 |
| - processing to put the data in the right format. |
59 |
| -
|
60 |
| - ``training_data`` is a list containing 50,000 2-tuples ``(x, y)``. |
61 |
| - ``x`` is a 784-dimensional numpy.ndarray containing the input |
62 |
| - image. ``y`` is a 10-dimensional numpy.ndarray representing the |
63 |
| - unit vector corresponding to the correct digit for ``x``. |
64 |
| -
|
65 |
| - ``test_inputs`` is a list containing 10,000 x 784-dimensional |
66 |
| - numpy.ndarray objects, representing test images. |
67 |
| -
|
68 |
| - ``actual_test_results`` is a list containing the 10,000 digit |
69 |
| - values (integers) corresponding to the ``test_inputs``. |
70 |
| -
|
71 |
| - Obviously, we're using slightly different formats for the training |
72 |
| - and test data. These formats turn out to be the most convenient |
73 |
| - for use in our neural network code.""" |
74 |
| - training_data, validation_data, test_data = load_data() |
75 |
| - inputs = [np.reshape(x, (784, 1)) for x in training_data[0]] |
76 |
| - results = [vectorized_result(y) for y in training_data[1]] |
77 |
| - training_data = zip(inputs, results) |
78 |
| - test_inputs = [np.reshape(x, (784, 1)) for x in test_data[0]] |
79 |
| - return (training_data, test_inputs, test_data[1]) |
| 52 | +def load_data_wrapper(): |
| 53 | + """Return a tuple containing ``(training_data, validation_data, |
| 54 | + test_data)``. Based on ``load_data``, but the format is a little more |
| 55 | + convenient for use in neural networks. |
| 56 | +
|
| 57 | + In particular, ``training_data`` is a list containing 50,000 |
| 58 | + 2-tuples ``(x, y)``. ``x`` is a 784-dimensional numpy.ndarray |
| 59 | + containing the input image. ``y`` is a 10-dimensional |
| 60 | + numpy.ndarray representing the unit vector corresponding to the |
| 61 | + correct digit for ``x``. |
| 62 | +
|
| 63 | + ``validation_data`` and ``test_data`` are lists containing 10,000 |
| 64 | + 2-tuples ``(x, y)``. In each case, ``x`` is a 784-dimensional |
| 65 | + numpy.ndarray containing the input image, and ``y`` is the |
| 66 | + corresponding classification, i.e., the digit values (integers) |
| 67 | + corresponding to ``x``. |
| 68 | +
|
| 69 | + Obviously, this means we're using slightly different formats for |
| 70 | + the training data and the validation / test data. These formats |
| 71 | + turn out to be the most convenient for use in our neural network |
| 72 | + code.""" |
| 73 | + tr_d, va_d, te_d = load_data() |
| 74 | + training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]] |
| 75 | + training_results = [vectorized_result(y) for y in tr_d[1]] |
| 76 | + training_data = zip(training_inputs, training_results) |
| 77 | + validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]] |
| 78 | + validation_data = zip(validation_inputs, va_d[1]) |
| 79 | + test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]] |
| 80 | + test_data = zip(test_inputs, te_d[1]) |
| 81 | + return (training_data, validation_data, test_data) |
80 | 82 |
|
81 | 83 | def vectorized_result(j):
|
82 |
| - """ Return a 10-dimensional unit vector with a 1.0 in the jth |
| 84 | + """Return a 10-dimensional unit vector with a 1.0 in the jth |
83 | 85 | position and zeroes elsewhere. This is used to convert a digit
|
84 | 86 | (0...9) into a corresponding desired output from the neural
|
85 | 87 | network."""
|
|
0 commit comments