diff --git a/README.md b/README.md index 05218427..cbcd3155 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,14 @@ Building Machine Learning Systems with Python ============================================= -Source Code for the book Building Machine Learning Systems with Python by -[Willi Richert](http://twotoreal.com) and [Luis Pedro -Coelho](http://luispedro.org). +Source Code for the book Building Machine Learning Systems with Python by [Luis +Pedro Coelho](http://luispedro.org) and [Willi Richert](http://twotoreal.com). -The book was published in 2013 by Packt Publishing and is available [from their +The book was published in 2013 (second edition in 2015) by Packt Publishing and +is available [from their website](http://www.packtpub.com/building-machine-learning-systems-with-python/book). +The code in the repository corresponds to the second edition. Code for the +first edition is available in [first\_edition +branch](https://github.com/luispedro/BuildingMachineLearningSystemsWithPython/tree/first_edition). diff --git a/SimpleImageDataset/building02.jpg b/SimpleImageDataset/building02.jpg new file mode 100644 index 00000000..343c5242 Binary files /dev/null and b/SimpleImageDataset/building02.jpg differ diff --git a/SimpleImageDataset/scene01.jpg b/SimpleImageDataset/scene01.jpg index 29b327d2..e7b416d3 100644 Binary files a/SimpleImageDataset/scene01.jpg and b/SimpleImageDataset/scene01.jpg differ diff --git a/SimpleImageDataset/scene02.jpg b/SimpleImageDataset/scene02.jpg new file mode 100644 index 00000000..89fd6b27 Binary files /dev/null and b/SimpleImageDataset/scene02.jpg differ diff --git a/SimpleImageDataset/scene08.jpg b/SimpleImageDataset/scene08.jpg new file mode 100644 index 00000000..7dee8860 Binary files /dev/null and b/SimpleImageDataset/scene08.jpg differ diff --git a/ch01/analyze_webstats.py b/ch01/analyze_webstats.py index a053cec4..5da892e2 100644 --- a/ch01/analyze_webstats.py +++ b/ch01/analyze_webstats.py @@ -6,13 +6,15 @@ # It is made available under the MIT License import os +from utils import DATA_DIR, CHART_DIR import scipy as sp import matplotlib.pyplot as plt -from utils import DATA_DIR, CHART_DIR +sp.random.seed(3) # to reproduce the data later on data = sp.genfromtxt(os.path.join(DATA_DIR, "web_traffic.tsv"), delimiter="\t") print(data[:10]) +print(data.shape) # all examples will have three classes in this file colors = ['g', 'k', 'b', 'm', 'r'] @@ -24,10 +26,11 @@ x = x[~sp.isnan(y)] y = y[~sp.isnan(y)] -# plot input data - def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None): + ''' plot input data ''' + + plt.figure(num=None, figsize=(8, 6)) plt.clf() plt.scatter(x, y, s=10) plt.title("Web traffic over the last month") @@ -59,11 +62,15 @@ def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None): plot_models(x, y, None, os.path.join(CHART_DIR, "1400_01_01.png")) # create and plot models -fp1, res, rank, sv, rcond = sp.polyfit(x, y, 1, full=True) -print("Model parameters: %s" % fp1) -print("Error of the model:", res) +fp1, res1, rank1, sv1, rcond1 = sp.polyfit(x, y, 1, full=True) +print("Model parameters of fp1: %s" % fp1) +print("Error of the model of fp1:", res1) f1 = sp.poly1d(fp1) -f2 = sp.poly1d(sp.polyfit(x, y, 2)) + +fp2, res2, rank2, sv2, rcond2 = sp.polyfit(x, y, 2, full=True) +print("Model parameters of fp2: %s" % fp2) +print("Error of the model of fp2:", res2) +f2 = sp.poly1d(fp2) f3 = sp.poly1d(sp.polyfit(x, y, 3)) f10 = sp.poly1d(sp.polyfit(x, y, 10)) f100 = sp.poly1d(sp.polyfit(x, y, 100)) @@ -102,7 +109,8 @@ def 
error(f, x, y): # extrapolating into the future plot_models( - x, y, [f1, f2, f3, f10, f100], os.path.join(CHART_DIR, "1400_01_06.png"), + x, y, [f1, f2, f3, f10, f100], + os.path.join(CHART_DIR, "1400_01_06.png"), mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100), ymax=10000, xmin=0 * 7 * 24) @@ -118,7 +126,8 @@ def error(f, x, y): print("Error d=%i: %f" % (f.order, error(f, xb, yb))) plot_models( - x, y, [fb1, fb2, fb3, fb10, fb100], os.path.join(CHART_DIR, "1400_01_07.png"), + x, y, [fb1, fb2, fb3, fb10, fb100], + os.path.join(CHART_DIR, "1400_01_07.png"), mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100), ymax=10000, xmin=0 * 7 * 24) @@ -130,6 +139,8 @@ def error(f, x, y): train = sorted(shuffled[split_idx:]) fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1)) fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2)) +print("fbt2(x)= \n%s" % fbt2) +print("fbt2(x)-100,000= \n%s" % (fbt2-100000)) fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3)) fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10)) fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100)) @@ -139,13 +150,13 @@ def error(f, x, y): print("Error d=%i: %f" % (f.order, error(f, xb[test], yb[test]))) plot_models( - x, y, [fbt1, fbt2, fbt3, fbt10, fbt100], os.path.join(CHART_DIR, - "1400_01_08.png"), + x, y, [fbt1, fbt2, fbt3, fbt10, fbt100], + os.path.join(CHART_DIR, "1400_01_08.png"), mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100), ymax=10000, xmin=0 * 7 * 24) from scipy.optimize import fsolve print(fbt2) print(fbt2 - 100000) -reached_max = fsolve(fbt2 - 100000, 800) / (7 * 24) +reached_max = fsolve(fbt2 - 100000, x0=800) / (7 * 24) print("100,000 hits/hour expected at week %f" % reached_max[0]) diff --git a/ch01/gen_webstats.py b/ch01/gen_webstats.py index 2aea928a..61d0b738 100644 --- a/ch01/gen_webstats.py +++ b/ch01/gen_webstats.py @@ -17,26 +17,22 @@ sp.random.seed(3) # to reproduce the data later on -x = sp.arange(1, 31 * 24) -y = sp.array(200 * (sp.sin(2 * sp.pi * x / (7 * 24))), dtype=int) +x = sp.arange(1, 31*24) +y = sp.array(200*(sp.sin(2*sp.pi*x/(7*24))), dtype=int) y += gamma.rvs(15, loc=0, scale=100, size=len(x)) -y += 2 * sp.exp(x / 100.0) -y = sp.ma.array(y, mask=[y < 0]) -print(sum(y), sum(y < 0)) +y += 2 * sp.exp(x/100.0) +y = sp.ma.array(y, mask=[y<0]) +print(sum(y), sum(y<0)) plt.scatter(x, y) plt.title("Web traffic over the last month") plt.xlabel("Time") plt.ylabel("Hits/hour") -plt.xticks([w * 7 * 24 for w in [0, 1, 2, 3, 4]], ['week %i' % (w + 1) for w in [ - 0, 1, 2, 3, 4]]) - +plt.xticks([w*7*24 for w in range(5)], + ['week %i' %(w+1) for w in range(5)]) plt.autoscale(tight=True) plt.grid() plt.savefig(os.path.join(CHART_DIR, "1400_01_01.png")) -# sp.savetxt(os.path.join("..", "web_traffic.tsv"), -# zip(x[~y.mask],y[~y.mask]), delimiter="\t", fmt="%i") - -sp.savetxt(os.path.join( - DATA_DIR, "web_traffic.tsv"), list(zip(x, y)), delimiter="\t", fmt="%s") +sp.savetxt(os.path.join(DATA_DIR, "web_traffic.tsv"), + list(zip(x, y)), delimiter="\t", fmt="%s") diff --git a/ch01/performance_test.py b/ch01/performance_test.py index 16f3792d..f2111732 100644 --- a/ch01/performance_test.py +++ b/ch01/performance_test.py @@ -8,7 +8,7 @@ import timeit -normal_py_sec = timeit.timeit('sum(x*x for x in xrange(1000))', +normal_py_sec = timeit.timeit('sum(x*x for x in range(1000))', number=10000) naive_np_sec = timeit.timeit('sum(na*na)', setup="import numpy as np; na=np.arange(1000)", diff --git a/ch02/README.rst b/ch02/README.rst new file mode 100644 index 00000000..e2cb729a --- /dev/null +++ b/ch02/README.rst @@ -0,0 +1,55 @@ 
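(A note on the ch01 scripts above: the fitting-and-extrapolation workflow of analyze_webstats.py can be tried in isolation. The sketch below is not part of the repository; it uses made-up synthetic data and the same scipy-as-numpy aliasing the book's code relies on.)

import scipy as sp
from scipy.optimize import fsolve

sp.random.seed(3)
x = sp.arange(1, 31 * 24)                              # hours over one month
y = 100 + 0.5 * x + sp.random.normal(0, 20, len(x))    # invented hits/hour

# Least-squares fits of degree 1 and 2; full=True also returns the residuals
fp1, res1, _, _, _ = sp.polyfit(x, y, 1, full=True)
fp2, res2, _, _, _ = sp.polyfit(x, y, 2, full=True)
f1, f2 = sp.poly1d(fp1), sp.poly1d(fp2)
print("Residual d=1: %s, residual d=2: %s" % (res1, res2))

# When does the degree-2 model reach 1,000 hits/hour?
# fsolve finds a root of the polynomial f2(x) - 1000, starting from the guess x0=800.
reached = fsolve(f2 - 1000, x0=800) / (7 * 24)
print("1,000 hits/hour expected at week %f" % reached[0])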
+========= +Chapter 2 +========= + +Support code for *Chapter 2: Learning How to Classify with Real-world +Examples*. The directory data contains the seeds dataset, originally downloaded +from https://archive.ics.uci.edu/ml/datasets/seeds + +chapter.py + The code as printed in the book. + +figure1.py + Figure 1 in the book: all 2-by-2 scatter plots + +figure2.py + Figure 2 in the book: threshold & decision area + +figure4_5_sklearn.py + Figures 4 and 5 in the book: kNN decision borders before and after feature + normalization. This also produces a version of the figure using 11 + neighbors (not in the book), which shows that the result is smoother, not + as sensitive to exact positions of each datapoint. + +figure4_5_no_sklearn.py + Alternative code for Figures 4 and 5 without using scikit-learn + +load.py + Code to load the seeds data + +simple_threshold.py + Code from the book: finds the first partition, between Setosa and the other classes. + +stump.py + Code from the book: finds the second partition, between Virginica and Versicolor. + +threshold.py + Functional implementation of a threshold classifier + +heldout.py + Evaluate the threshold model on held-out data + +seeds_knn_sklearn.py + Demonstrate cross-validation and feature normalization using scikit-learn + +seeds_threshold.py + Test the thresholding model on the seeds dataset (result mentioned in the book, but no code) + +seeds_knn_increasing_k.py + Test effect of increasing num_neighbors on accuracy. + +knn.py + Implementation of k-nearest neighbors without using scikit-learn. + +seeds_knn.py + Demonstrate cross-validation (without scikit-learn) diff --git a/ch02/chapter.py b/ch02/chapter.py new file mode 100644 index 00000000..c68b45ab --- /dev/null +++ b/ch02/chapter.py @@ -0,0 +1,164 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + + +from matplotlib import pyplot as plt +import numpy as np + +# We load the data with load_iris from sklearn +from sklearn.datasets import load_iris +data = load_iris() + +# load_iris returns an object with several fields +features = data.data +feature_names = data.feature_names +target = data.target +target_names = data.target_names + +for t in range(3): + if t == 0: + c = 'r' + marker = '>' + elif t == 1: + c = 'g' + marker = 'o' + elif t == 2: + c = 'b' + marker = 'x' + plt.scatter(features[target == t, 0], + features[target == t, 1], + marker=marker, + c=c) +# We use NumPy fancy indexing to get an array of strings: +labels = target_names[target] + +# The petal length is the feature at position 2 +plength = features[:, 2] + +# Build an array of booleans: +is_setosa = (labels == 'setosa') + +# This is the important step: +max_setosa = plength[is_setosa].max() +min_non_setosa = plength[~is_setosa].min() +print('Maximum of setosa: {0}.'.format(max_setosa)) + +print('Minimum of others: {0}.'.format(min_non_setosa)) + +# ~ is the boolean negation operator +features = features[~is_setosa] +labels = labels[~is_setosa] +# Build a new target variable, is_virginica +is_virginica = (labels == 'virginica') + +# Initialize best_acc to an impossibly low value +best_acc = -1.0 +for fi in range(features.shape[1]): + # We are going to test all possible thresholds + thresh = features[:,fi] + for t in thresh: + + # Get the vector for feature `fi` + feature_i = features[:, fi] + # apply threshold `t` + pred = (feature_i > t) + acc = (pred == 
is_virginica).mean() + rev_acc = (pred == ~is_virginica).mean() + if rev_acc > acc: + reverse = True + acc = rev_acc + else: + reverse = False + + if acc > best_acc: + best_acc = acc + best_fi = fi + best_t = t + best_reverse = reverse + +print(best_fi, best_t, best_reverse, best_acc) + +def is_virginica_test(fi, t, reverse, example): + 'Apply threshold model to a new example' + test = example[fi] > t + if reverse: + test = not test + return test +from threshold import fit_model, predict + +# ning accuracy was 96.0%. +# ing accuracy was 90.0% (N = 50). +correct = 0.0 + +for ei in range(len(features)): + # select all but the one at position `ei`: + training = np.ones(len(features), bool) + training[ei] = False + testing = ~training + model = fit_model(features[training], is_virginica[training]) + predictions = predict(model, features[testing]) + correct += np.sum(predictions == is_virginica[testing]) +acc = correct/float(len(features)) +print('Accuracy: {0:.1%}'.format(acc)) + + +########################################### +############## SEEDS DATASET ############## +########################################### + +from load import load_dataset + +feature_names = [ + 'area', + 'perimeter', + 'compactness', + 'length of kernel', + 'width of kernel', + 'asymmetry coefficien', + 'length of kernel groove', +] +features, labels = load_dataset('seeds') + + + +from sklearn.neighbors import KNeighborsClassifier +classifier = KNeighborsClassifier(n_neighbors=1) +from sklearn.cross_validation import KFold + +kf = KFold(len(features), n_folds=5, shuffle=True) +means = [] +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns fraction + # of correct decisions for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Mean accuracy: {:.1%}'.format(np.mean(means))) + + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +classifier = KNeighborsClassifier(n_neighbors=1) +classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)]) + +means = [] +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns fraction + # of correct decisions for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Mean accuracy: {:.1%}'.format(np.mean(means))) diff --git a/ch02/extra/create_tsv.py b/ch02/extra/create_tsv.py index b0ddee89..e6d7b4fd 100644 --- a/ch02/extra/create_tsv.py +++ b/ch02/extra/create_tsv.py @@ -5,7 +5,6 @@ # # It is made available under the MIT License -import milksets.iris import milksets.seeds @@ -16,5 +15,4 @@ def save_as_tsv(fname, module): for f, n in zip(features, nlabels): print >>ofile, "\t".join(map(str, f) + [n]) -save_as_tsv('iris.tsv', milksets.iris) save_as_tsv('seeds.tsv', milksets.seeds) diff --git a/ch02/figure1.py b/ch02/figure1.py index 922b8ce1..4ec6fff8 100644 --- a/ch02/figure1.py +++ b/ch02/figure1.py @@ -5,24 +5,38 @@ # # It is made available under the MIT License -import numpy as np -from sklearn.datasets import load_iris from matplotlib import pyplot as plt -data = load_iris() -features = 
data['data'] -feature_names = data['feature_names'] -target = data['target'] +# We load the data with load_iris from sklearn +from sklearn.datasets import load_iris +# load_iris returns an object with several fields +data = load_iris() +features = data.data +feature_names = data.feature_names +target = data.target +target_names = data.target_names +fig,axes = plt.subplots(2, 3) pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)] + +# Set up 3 different pairs of (color, marker) +color_markers = [ + ('r', '>'), + ('g', 'o'), + ('b', 'x'), + ] for i, (p0, p1) in enumerate(pairs): - plt.subplot(2, 3, i + 1) - for t, marker, c in zip(range(3), ">ox", "rgb"): - plt.scatter(features[target == t, p0], features[ + ax = axes.flat[i] + + for t in range(3): + # Use a different color/marker for each class `t` + c,marker = color_markers[t] + ax.scatter(features[target == t, p0], features[ target == t, p1], marker=marker, c=c) - plt.xlabel(feature_names[p0]) - plt.ylabel(feature_names[p1]) - plt.xticks([]) - plt.yticks([]) -plt.savefig('figure1.png') + ax.set_xlabel(feature_names[p0]) + ax.set_ylabel(feature_names[p1]) + ax.set_xticks([]) + ax.set_yticks([]) +fig.tight_layout() +fig.savefig('figure1.png') diff --git a/ch02/figure2.py b/ch02/figure2.py index a04562f8..0b69d395 100644 --- a/ch02/figure2.py +++ b/ch02/figure2.py @@ -10,17 +10,25 @@ from matplotlib import pyplot as plt from sklearn.datasets import load_iris data = load_iris() -features = data['data'] -feature_names = data['feature_names'] -species = data['target_names'][data['target']] +features = data.data +feature_names = data.feature_names +target = data.target +target_names = data.target_names -setosa = (species == 'setosa') -features = features[~setosa] -species = species[~setosa] -virginica = species == 'virginica' +# We use NumPy fancy indexing to get an array of strings: +labels = target_names[target] -t = 1.75 -p0, p1 = 3, 2 +is_setosa = (labels == 'setosa') +features = features[~is_setosa] +labels = labels[~is_setosa] +is_virginica = (labels == 'virginica') + +# Hand fixed thresholds: +t = 1.65 +t2 = 1.75 + +# Features to use: 3 & 2 +f0, f1 = 3, 2 if COLOUR_FIGURE: area1c = (1., .8, .8) @@ -29,19 +37,27 @@ area1c = (1., 1, 1) area2c = (.7, .7, .7) -x0, x1 = [features[:, p0].min() * .9, features[:, p0].max() * 1.1] -y0, y1 = [features[:, p1].min() * .9, features[:, p1].max() * 1.1] - -plt.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c) -plt.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c) -plt.plot([t, t], [y0, y1], 'k--', lw=2) -plt.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2) -plt.scatter(features[virginica, p0], - features[virginica, p1], c='b', marker='o') -plt.scatter(features[~virginica, p0], - features[~virginica, p1], c='r', marker='x') -plt.ylim(y0, y1) -plt.xlim(x0, x1) -plt.xlabel(feature_names[p0]) -plt.ylabel(feature_names[p1]) -plt.savefig('figure2.png') +# Plot from 90% of smallest value to 110% of largest value +# (all feature values are positive, otherwise this would not work very well) + +x0 = features[:, f0].min() * .9 +x1 = features[:, f0].max() * 1.1 + +y0 = features[:, f1].min() * .9 +y1 = features[:, f1].max() * 1.1 + +fig,ax = plt.subplots() +ax.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c) +ax.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c) +ax.plot([t, t], [y0, y1], 'k--', lw=2) +ax.plot([t2, t2], [y0, y1], 'k:', lw=2) +ax.scatter(features[is_virginica, f0], + features[is_virginica, f1], c='b', marker='o', s=40) +ax.scatter(features[~is_virginica, f0], + 
features[~is_virginica, f1], c='r', marker='x', s=40) +ax.set_ylim(y0, y1) +ax.set_xlim(x0, x1) +ax.set_xlabel(feature_names[f0]) +ax.set_ylabel(feature_names[f1]) +fig.tight_layout() +fig.savefig('figure2.png') diff --git a/ch02/figure4_5.py b/ch02/figure4_5_no_sklearn.py similarity index 60% rename from ch02/figure4_5.py rename to ch02/figure4_5_no_sklearn.py index 3ddece0c..adc83d73 100644 --- a/ch02/figure4_5.py +++ b/ch02/figure4_5_no_sklearn.py @@ -11,7 +11,7 @@ from matplotlib.colors import ListedColormap from load import load_dataset import numpy as np -from knn import learn_model, apply_model, accuracy +from knn import fit_model, predict feature_names = [ 'area', @@ -24,42 +24,56 @@ ] -def train_plot(features, labels): +def plot_decision(features, labels): + '''Plots decision boundary for KNN + + Parameters + ---------- + features : ndarray + labels : sequence + + Returns + ------- + fig : Matplotlib Figure + ax : Matplotlib Axes + ''' y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1 x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1 X = np.linspace(x0, x1, 100) Y = np.linspace(y0, y1, 100) X, Y = np.meshgrid(X, Y) - model = learn_model(1, features[:, (0, 2)], np.array(labels)) - C = apply_model( - np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape) + model = fit_model(1, features[:, (0, 2)], np.array(labels)) + C = predict( + model, np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape) if COLOUR_FIGURE: cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)]) else: cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)]) - plt.xlim(x0, x1) - plt.ylim(y0, y1) - plt.xlabel(feature_names[0]) - plt.ylabel(feature_names[2]) - plt.pcolormesh(X, Y, C, cmap=cmap) + fig,ax = plt.subplots() + ax.set_xlim(x0, x1) + ax.set_ylim(y0, y1) + ax.set_xlabel(feature_names[0]) + ax.set_ylabel(feature_names[2]) + ax.pcolormesh(X, Y, C, cmap=cmap) if COLOUR_FIGURE: cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)]) - plt.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap) + ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap) else: for lab, ma in zip(range(3), "Do^"): - plt.plot(features[labels == lab, 0], features[ + ax.plot(features[labels == lab, 0], features[ labels == lab, 2], ma, c=(1., 1., 1.)) + return fig,ax features, labels = load_dataset('seeds') names = sorted(set(labels)) labels = np.array([names.index(ell) for ell in labels]) -train_plot(features, labels) -plt.savefig('figure4.png') +fig,ax = plot_decision(features, labels) +fig.savefig('figure4.png') features -= features.mean(0) features /= features.std(0) -train_plot(features, labels) -plt.savefig('figure5.png') +fig,ax = plot_decision(features, labels) +fig.savefig('figure5.png') diff --git a/ch02/figure4_5_sklearn.py b/ch02/figure4_5_sklearn.py new file mode 100644 index 00000000..55ac0c80 --- /dev/null +++ b/ch02/figure4_5_sklearn.py @@ -0,0 +1,85 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +COLOUR_FIGURE = False + +from matplotlib import pyplot as plt +from matplotlib.colors import ListedColormap +from load import load_dataset +import numpy as np +from sklearn.neighbors import KNeighborsClassifier + +feature_names = [ + 'area', + 'perimeter', + 'compactness', + 'length of kernel', + 'width of kernel', + 'asymmetry coefficien', + 'length of kernel groove', +] + + 
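(The figure4_5_no_sklearn.py rename above switches to the new fit_model/predict names from ch02/knn.py. The repository's knn.py internals are not reproduced in this diff, so the following is my own minimal 1-nearest-neighbor sketch of the same idea; function names and data are illustrative only.)

# Minimal 1-NN sketch (illustration only, not the code in ch02/knn.py):
# the "model" is just a copy of the training data; prediction copies the
# label of the closest stored point.
import numpy as np

def fit_model_1nn(features, labels):
    return features.copy(), np.asarray(labels).copy()

def predict_1nn(model, features):
    train_feats, train_labels = model
    preds = []
    for f in features:
        dists = ((train_feats - f) ** 2).sum(axis=1)   # squared Euclidean distance
        preds.append(train_labels[dists.argmin()])
    return np.array(preds)

# Usage on made-up points:
model = fit_model_1nn(np.array([[0., 0.], [1., 1.]]), ['a', 'b'])
print(predict_1nn(model, np.array([[0.1, 0.2], [0.9, 0.8]])))   # -> ['a' 'b']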
+def plot_decision(features, labels, num_neighbors=1): + '''Plots decision boundary for KNN + + Parameters + ---------- + features : ndarray + labels : sequence + + Returns + ------- + fig : Matplotlib Figure + ax : Matplotlib Axes + ''' + y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1 + x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1 + X = np.linspace(x0, x1, 1000) + Y = np.linspace(y0, y1, 1000) + X, Y = np.meshgrid(X, Y) + + model = KNeighborsClassifier(num_neighbors) + model.fit(features[:, (0,2)], labels) + C = model.predict(np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape) + if COLOUR_FIGURE: + cmap = ListedColormap([(1., .7, .7), (.7, 1., .7), (.7, .7, 1.)]) + else: + cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)]) + fig,ax = plt.subplots() + ax.set_xlim(x0, x1) + ax.set_ylim(y0, y1) + ax.set_xlabel(feature_names[0]) + ax.set_ylabel(feature_names[2]) + ax.pcolormesh(X, Y, C, cmap=cmap) + if COLOUR_FIGURE: + cmap = ListedColormap([(1., .0, .0), (.1, .6, .1), (.0, .0, 1.)]) + ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap) + else: + for lab, ma in zip(range(3), "Do^"): + ax.plot(features[labels == lab, 0], features[ + labels == lab, 2], ma, c=(1., 1., 1.), ms=6) + return fig,ax + + +features, labels = load_dataset('seeds') +names = sorted(set(labels)) +labels = np.array([names.index(ell) for ell in labels]) + +fig,ax = plot_decision(features, labels) +fig.tight_layout() +fig.savefig('figure4sklearn.png') + +features -= features.mean(0) +features /= features.std(0) +fig,ax = plot_decision(features, labels) +fig.tight_layout() +fig.savefig('figure5sklearn.png') + +fig,ax = plot_decision(features, labels, 11) +fig.tight_layout() +fig.savefig('figure5sklearn_with_11_neighbors.png') diff --git a/ch02/heldout.py b/ch02/heldout.py index a14b2df3..e381e706 100644 --- a/ch02/heldout.py +++ b/ch02/heldout.py @@ -8,30 +8,32 @@ # This script demonstrates the difference between the training accuracy and # testing (held-out) accuracy. -from matplotlib import pyplot as plt import numpy as np from sklearn.datasets import load_iris -from threshold import learn_model, apply_model, accuracy +from threshold import fit_model, accuracy data = load_iris() features = data['data'] labels = data['target_names'][data['target']] # We are going to remove the setosa examples as they are too easy: -setosa = (labels == 'setosa') -features = features[~setosa] -labels = labels[~setosa] +is_setosa = (labels == 'setosa') +features = features[~is_setosa] +labels = labels[~is_setosa] # Now we classify virginica vs non-virginica -virginica = (labels == 'virginica') +is_virginica = (labels == 'virginica') # Split the data in two: testing and training testing = np.tile([True, False], 50) # testing = [True,False,True,False,True,False...] + +# Training is the negation of testing: i.e., datapoints not used for testing, +# will be used for training training = ~testing -model = learn_model(features[training], virginica[training]) -train_accuracy = accuracy(features[training], virginica[training], model) -test_accuracy = accuracy(features[testing], virginica[testing], model) +model = fit_model(features[training], is_virginica[training]) +train_accuracy = accuracy(features[training], is_virginica[training], model) +test_accuracy = accuracy(features[testing], is_virginica[testing], model) print('''\ Training accuracy was {0:.1%}. 
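(heldout.py above keeps the book's alternating train/test mask. The following self-contained sketch shows the same evaluation pattern, with scikit-learn's KNeighborsClassifier standing in for the chapter's threshold model, so the accuracies will not match the book's numbers.)

# Sketch: training vs. held-out accuracy with an alternating boolean mask.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

data = load_iris()
features = data.data
labels = data.target_names[data.target]

# Drop the easy setosa examples and classify virginica vs. the rest
is_setosa = (labels == 'setosa')
features = features[~is_setosa]
is_virginica = (labels[~is_setosa] == 'virginica')

# Every other example goes to testing; the rest is used for training
testing = np.tile([True, False], 50)
training = ~testing

clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(features[training], is_virginica[training])
print('Training accuracy: {0:.1%}'.format(
    np.mean(clf.predict(features[training]) == is_virginica[training])))
print('Testing accuracy:  {0:.1%}'.format(
    np.mean(clf.predict(features[testing]) == is_virginica[testing])))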
diff --git a/ch02/knn.py b/ch02/knn.py index 0c9151c9..89ebfdb4 100644 --- a/ch02/knn.py +++ b/ch02/knn.py @@ -7,8 +7,8 @@ import numpy as np - -def learn_model(k, features, labels): +# This function was called ``learn_model`` in the first edition +def fit_model(k, features, labels): '''Learn a k-nn model''' # There is no model in k-nn, just a copy of the inputs return k, features.copy(), labels.copy() @@ -25,8 +25,8 @@ def plurality(xs): if v == maxv: return k - -def apply_model(features, model): +# This function was called ``apply_model`` in the first edition +def predict(model, features): '''Apply k-nn model''' k, train_feats, labels = model results = [] @@ -42,5 +42,5 @@ def apply_model(features, model): def accuracy(features, labels, model): - preds = apply_model(features, model) + preds = predict(model, features) return np.mean(preds == labels) diff --git a/ch02/seeds_knn.py b/ch02/seeds_knn.py index 158b21b2..c18d9592 100644 --- a/ch02/seeds_knn.py +++ b/ch02/seeds_knn.py @@ -7,7 +7,7 @@ from load import load_dataset import numpy as np -from knn import learn_model, apply_model, accuracy +from knn import fit_model, accuracy features, labels = load_dataset('seeds') @@ -19,7 +19,7 @@ def cross_validate(features, labels): training = np.ones(len(features), bool) training[fold::10] = 0 testing = ~training - model = learn_model(1, features[training], labels[training]) + model = fit_model(1, features[training], labels[training]) test_error = accuracy(features[testing], labels[testing], model) error += test_error diff --git a/ch02/seeds_knn_increasing_k.py b/ch02/seeds_knn_increasing_k.py new file mode 100644 index 00000000..7cd8b3f9 --- /dev/null +++ b/ch02/seeds_knn_increasing_k.py @@ -0,0 +1,48 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +# Basic imports +from __future__ import print_function +import numpy as np +from matplotlib import pyplot as plt +from load import load_dataset + + +from sklearn.neighbors import KNeighborsClassifier + +from sklearn.cross_validation import cross_val_score +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + + +features, labels = load_dataset('seeds') + +# Values of k to consider: all in 1 .. 160 +ks = np.arange(1,161) + +# We build a classifier object here with the default number of neighbors +# (It happens to be 5, but it does not matter as we will be changing it below +classifier = KNeighborsClassifier() +classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)]) + +# accuracies will hold our results +accuracies = [] +for k in ks: + # set the classifier parameter + classifier.set_params(knn__n_neighbors=k) + crossed = cross_val_score(classifier, features, labels) + + # Save only the average + accuracies.append(crossed.mean()) + +accuracies = np.array(accuracies) + +# Scale the accuracies by 100 to plot as a percentage instead of as a fraction +plt.plot(ks, accuracies*100) +plt.xlabel('Value for k (nr. 
of neighbors)') +plt.ylabel('Accuracy (%)') +plt.savefig('figure6.png') diff --git a/ch02/seeds_knn_sklearn.py b/ch02/seeds_knn_sklearn.py new file mode 100644 index 00000000..ac89bb59 --- /dev/null +++ b/ch02/seeds_knn_sklearn.py @@ -0,0 +1,90 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +# Basic imports +from __future__ import print_function +import numpy as np +from load import load_dataset + + +# Import sklearn implementation of KNN +from sklearn.neighbors import KNeighborsClassifier + +features, labels = load_dataset('seeds') +classifier = KNeighborsClassifier(n_neighbors=4) + + +n = len(features) +correct = 0.0 +for ei in range(n): + training = np.ones(n, bool) + training[ei] = 0 + testing = ~training + classifier.fit(features[training], labels[training]) + pred = classifier.predict(features[ei]) + correct += (pred == labels[ei]) +print('Result of leave-one-out: {}'.format(correct/n)) + +# Import KFold object +from sklearn.cross_validation import KFold + +# means will hold the mean for each fold +means = [] + +# kf is a generator of pairs (training,testing) so that each iteration +# implements a separate fold. +kf = KFold(len(features), n_folds=3, shuffle=True) +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns the fraction of correct decisions + # for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Result of cross-validation using KFold: {}'.format(means)) + +# The function cross_val_score does the same thing as the loop above with a +# single function call + +from sklearn.cross_validation import cross_val_score +crossed = cross_val_score(classifier, features, labels) +print('Result of cross-validation using cross_val_score: {}'.format(crossed)) + +# The results above use the features as is, which we learned was not optimal +# except if the features happen to all be in the same scale. 
We can pre-scale +# the features as explained in the main text: + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)]) +crossed = cross_val_score(classifier, features, labels) +print('Result with prescaling: {}'.format(crossed)) + + +# Now, generate & print a cross-validated confusion matrix for the same result +from sklearn.metrics import confusion_matrix +names = list(set(labels)) +labels = np.array([names.index(ell) for ell in labels]) +preds = labels.copy() +preds[:] = -1 +for train, test in kf: + classifier.fit(features[train], labels[train]) + preds[test] = classifier.predict(features[test]) + +cmat = confusion_matrix(labels, preds) +print() +print('Confusion matrix: [rows represent true outcome, columns predicted outcome]') +print(cmat) + +# The explicit float() conversion is necessary in Python 2 +# (Otherwise, result is rounded to 0) +acc = cmat.trace()/float(cmat.sum()) +print('Accuracy: {0:.1%}'.format(acc)) + diff --git a/ch02/seeds_threshold.py b/ch02/seeds_threshold.py index cd184107..6b1f87d0 100644 --- a/ch02/seeds_threshold.py +++ b/ch02/seeds_threshold.py @@ -7,7 +7,7 @@ from load import load_dataset import numpy as np -from threshold import learn_model, apply_model, accuracy +from threshold import fit_model, accuracy features, labels = load_dataset('seeds') @@ -24,7 +24,7 @@ # whatever is not training is for testing testing = ~training - model = learn_model(features[training], labels[training]) + model = fit_model(features[training], labels[training]) test_error = accuracy(features[testing], labels[testing], model) error += test_error diff --git a/ch02/simple_threshold.py b/ch02/simple_threshold.py index 3f432512..d174f283 100644 --- a/ch02/simple_threshold.py +++ b/ch02/simple_threshold.py @@ -5,7 +5,6 @@ # # It is made available under the MIT License -import numpy as np from sklearn.datasets import load_iris data = load_iris() @@ -13,8 +12,14 @@ target = data['target'] target_names = data['target_names'] labels = target_names[target] - plength = features[:, 2] + +# To use numpy operations to get setosa features, +# we build a boolean array is_setosa = (labels == 'setosa') -print('Maximum of setosa: {0}.'.format(plength[is_setosa].max())) -print('Minimum of others: {0}.'.format(plength[~is_setosa].min())) + +max_setosa = plength[is_setosa].max() +min_non_setosa = plength[~is_setosa].min() + +print('Maximum of setosa: {0}.'.format(max_setosa)) +print('Minimum of others: {0}.'.format(min_non_setosa)) diff --git a/ch02/stump.py b/ch02/stump.py index 841ce983..0dfaec85 100644 --- a/ch02/stump.py +++ b/ch02/stump.py @@ -5,29 +5,51 @@ # # It is made available under the MIT License -from matplotlib import pyplot as plt from sklearn.datasets import load_iris data = load_iris() -features = data['data'] -labels = data['target_names'][data['target']] +features = data.data +labels = data.target_names[data.target] -setosa = (labels == 'setosa') -features = features[~setosa] -labels = labels[~setosa] -virginica = (labels == 'virginica') +is_setosa = (labels == 'setosa') +features = features[~is_setosa] +labels = labels[~is_setosa] +is_virginica = (labels == 'virginica') +# Initialize to a value that is worse than any possible test best_acc = -1.0 + +# Loop over all the features for fi in range(features.shape[1]): + # Test every possible threshold value for feature fi thresh = features[:, fi].copy() + + # Test them in order thresh.sort() for t in thresh: + + # Generate 
predictions using t as a threshold pred = (features[:, fi] > t) - acc = (pred == virginica).mean() + + # Accuracy is the fraction of predictions that match reality + acc = (pred == is_virginica).mean() + + # We test whether negating the test is a better threshold: + acc_neg = ((~pred) == is_virginica).mean() + if acc_neg > acc: + acc = acc_neg + negated = True + else: + negated = False + + # If this is better than previous best, then this is now the new best: + if acc > best_acc: best_acc = acc best_fi = fi best_t = t -print('Best cut is {0} on feature {1}, which achieves accuracy of {2:.1%}.'.format( - best_t, best_fi, best_acc)) + best_is_negated = negated + +print('Best threshold is {0} on feature {1} (index {2}), which achieves accuracy of {3:.1%}.'.format( + best_t, data.feature_names[best_fi], best_fi, best_acc)) diff --git a/ch02/threshold.py b/ch02/threshold.py index 40337433..d621a350 100644 --- a/ch02/threshold.py +++ b/ch02/threshold.py @@ -8,7 +8,8 @@ import numpy as np -def learn_model(features, labels): +# This function was called ``learn_model`` in the first edition +def fit_model(features, labels): '''Learn a simple threshold model''' best_acc = -1.0 # Loop over all the features: @@ -21,22 +22,34 @@ def learn_model(features, labels): # Measure the accuracy of this acc = (pred == labels).mean() + + rev_acc = (pred == ~labels).mean() + if rev_acc > acc: + acc = rev_acc + reverse = True + else: + reverse = False if acc > best_acc: best_acc = acc best_fi = fi best_t = t + best_reverse = reverse # A model is a threshold and an index - return best_t, best_fi + return best_t, best_fi, best_reverse -def apply_model(features, model): +# This function was called ``apply_model`` in the first edition +def predict(model, features): '''Apply a learned model''' - # A model is a pair as returned by learn_model - t, fi = model - return features[:, fi] > t + # A model is a pair as returned by fit_model + t, fi, reverse = model + if reverse: + return features[:, fi] <= t + else: + return features[:, fi] > t def accuracy(features, labels, model): '''Compute the accuracy of the model''' - preds = apply_model(features, model) + preds = predict(model, features) return np.mean(preds == labels) diff --git a/ch03/data/toy/03.txt b/ch03/data/toy/03.txt index 7ca1f741..c5e03e90 100644 --- a/ch03/data/toy/03.txt +++ b/ch03/data/toy/03.txt @@ -1 +1 @@ -Most imaging databases safe images permanently. \ No newline at end of file +Most imaging databases save images permanently. 
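(Usage sketch for the refactored ch02/threshold.py API above: fit_model now returns a (threshold, feature index, reverse) triple and predict takes the model as its first argument. The toy arrays below are invented; only the imported names come from the repository.)

# Toy usage of the second-edition threshold module API.
import numpy as np
from threshold import fit_model, predict, accuracy

# Two features; only the first one separates the two classes
features = np.array([[1.0, 5.0],
                     [1.2, 3.0],
                     [3.1, 4.0],
                     [3.5, 2.0]])
labels = np.array([False, False, True, True])

model = fit_model(features, labels)   # (best_t, best_fi, best_reverse)
print('Threshold {0} on feature {1} (reversed: {2})'.format(*model))
print('Predictions:', predict(model, features))
print('Training accuracy: {0:.1%}'.format(accuracy(features, labels, model)))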
diff --git a/ch03/noise_analysis.py b/ch03/noise_analysis.py new file mode 100644 index 00000000..11ca13f4 --- /dev/null +++ b/ch03/noise_analysis.py @@ -0,0 +1,69 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import sklearn.datasets + +groups = [ + 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', + 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space'] +train_data = sklearn.datasets.fetch_20newsgroups(subset="train", + categories=groups) + +labels = train_data.target +num_clusters = 50 # sp.unique(labels).shape[0] + +import nltk.stem +english_stemmer = nltk.stem.SnowballStemmer('english') + +from sklearn.feature_extraction.text import TfidfVectorizer + + +class StemmedTfidfVectorizer(TfidfVectorizer): + + def build_analyzer(self): + analyzer = super(TfidfVectorizer, self).build_analyzer() + return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc)) + +vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, + stop_words='english', decode_error='ignore' + ) +vectorized = vectorizer.fit_transform(train_data.data) + +post_group = zip(train_data.data, train_data.target) +# Create a list of tuples that can be sorted by +# the length of the posts +all = [(len(post[0]), post[0], train_data.target_names[post[1]]) + for post in post_group] +graphics = sorted([post for post in all if post[2] == 'comp.graphics']) +print(graphics[5]) +# (245, 'From: SITUNAYA@IBM3090.BHAM.AC.UK\nSubject: test....(sorry)\nOrganization: +# The University of Birmingham, United Kingdom\nLines: 1\nNNTP-Posting-Host: ibm3090.bham.ac.uk +# \n\n==============================================================================\n', +# 'comp.graphics') + +noise_post = graphics[5][1] + +analyzer = vectorizer.build_analyzer() +print(list(analyzer(noise_post))) + +useful = set(analyzer(noise_post)).intersection(vectorizer.get_feature_names()) +print(sorted(useful)) +# ['ac', 'birmingham', 'host', 'kingdom', 'nntp', 'sorri', 'test', 'uk', 'unit', 'univers'] + +for term in sorted(useful): + print('IDF(%s)=%.2f' % (term, + vectorizer._tfidf.idf_[vectorizer.vocabulary_[term]])) +# IDF(ac)=3.51 +# IDF(birmingham)=6.77 +# IDF(host)=1.74 +# IDF(kingdom)=6.68 +# IDF(nntp)=1.77 +# IDF(sorri)=4.14 +# IDF(test)=3.83 +# IDF(uk)=3.70 +# IDF(unit)=4.42 +# IDF(univers)=1.91 diff --git a/ch03/plot_kmeans_example.py b/ch03/plot_kmeans_example.py index c012894c..43b9b71b 100644 --- a/ch03/plot_kmeans_example.py +++ b/ch03/plot_kmeans_example.py @@ -15,7 +15,7 @@ from matplotlib import pylab from sklearn.cluster import KMeans -from utils import DATA_DIR, CHART_DIR +from utils import CHART_DIR seed = 2 sp.random.seed(seed) # to reproduce the data later on @@ -33,7 +33,6 @@ def plot_clustering(x, y, title, mx=None, ymax=None, xmin=None, km=None): pylab.title(title) pylab.xlabel("Occurrence word 1") pylab.ylabel("Occurrence word 2") - # pylab.xticks([w*7*24 for w in range(10)], ['week %i'%w for w in range(10)]) pylab.autoscale(tight=True) pylab.ylim(ymin=0, ymax=1) diff --git a/ch03/rel_post_01.py b/ch03/rel_post_01.py index 2f807f3d..9c516180 100644 --- a/ch03/rel_post_01.py +++ b/ch03/rel_post_01.py @@ -43,8 +43,7 @@ def build_analyzer(self): return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc)) vectorizer = StemmedTfidfVectorizer( - min_df=1, stop_words='english', charset_error='ignore') -print(vectorizer) + min_df=1, 
stop_words='english', decode_error='ignore') X_train = vectorizer.fit_transform(posts) diff --git a/ch03/rel_post_mlcomp_01.py b/ch03/rel_post_20news.py similarity index 57% rename from ch03/rel_post_mlcomp_01.py rename to ch03/rel_post_20news.py index cc738f2c..6dad9431 100644 --- a/ch03/rel_post_mlcomp_01.py +++ b/ch03/rel_post_20news.py @@ -5,19 +5,9 @@ # # It is made available under the MIT License -import os -import sys import sklearn.datasets import scipy as sp -from utils import DATA_DIR - -if not os.path.exists(DATA_DIR): - print("""\ -It seems that you have not yet downloaded the MLCOMP data set. -Please do so and place it into %s."""%DATA_DIR) - sys.exit(1) - new_post = \ """Disk drive problems. Hi, I have a problem with my hard disk. After 1 year it is working only sporadically now. @@ -25,15 +15,28 @@ Any ideas? Thanks. """ +print("""\ +Dear reader of the 1st edition of 'Building Machine Learning Systems with Python'! +For the 2nd edition we introduced a couple of changes that will result into +results that differ from the results in the 1st edition. +E.g. we now fully rely on scikit's fetch_20newsgroups() instead of requiring +you to download the data manually from MLCOMP. +If you have any questions, please ask at http://www.twotoreal.com +""") + +all_data = sklearn.datasets.fetch_20newsgroups(subset="all") +print("Number of total posts: %i" % len(all_data.filenames)) +# Number of total posts: 18846 + groups = [ 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', - 'comp.sys.ma c.hardware', 'comp.windows.x', 'sci.space'] -dataset = sklearn.datasets.load_mlcomp("20news-18828", "train", - mlcomp_root=DATA_DIR, - categories=groups) -print("Number of posts:", len(dataset.filenames)) + 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space'] +train_data = sklearn.datasets.fetch_20newsgroups(subset="train", + categories=groups) +print("Number of training posts in tech groups:", len(train_data.filenames)) +# Number of training posts in tech groups: 3529 -labels = dataset.target +labels = train_data.target num_clusters = 50 # sp.unique(labels).shape[0] import nltk.stem @@ -49,31 +52,41 @@ def build_analyzer(self): return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc)) vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, - # max_features=1000, - stop_words='english', charset_error='ignore' + stop_words='english', decode_error='ignore' ) -vectorized = vectorizer.fit_transform(dataset.data) + +vectorized = vectorizer.fit_transform(train_data.data) num_samples, num_features = vectorized.shape print("#samples: %d, #features: %d" % (num_samples, num_features)) - +# samples: 3529, #features: 4712 from sklearn.cluster import KMeans -km = KMeans(n_clusters=num_clusters, init='k-means++', n_init=1, - verbose=1) - +km = KMeans(n_clusters=num_clusters, n_init=1, verbose=1, random_state=3) clustered = km.fit(vectorized) +print("km.labels_=%s" % km.labels_) +# km.labels_=[ 6 34 22 ..., 2 21 26] + +print("km.labels_.shape=%s" % km.labels_.shape) +# km.labels_.shape=3529 + from sklearn import metrics print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)) +# Homogeneity: 0.400 print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)) +# Completeness: 0.206 print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)) +# V-measure: 0.272 print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels, km.labels_)) +# Adjusted Rand Index: 0.064 print("Adjusted Mutual Information: %0.3f" % 
metrics.adjusted_mutual_info_score(labels, km.labels_)) +# Adjusted Mutual Information: 0.197 print(("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(vectorized, labels, sample_size=1000))) +# Silhouette Coefficient: 0.006 new_post_vec = vectorizer.transform([new_post]) new_post_label = km.predict(new_post_vec)[0] @@ -83,13 +96,14 @@ def build_analyzer(self): similar = [] for i in similar_indices: dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray()) - similar.append((dist, dataset.data[i])) + similar.append((dist, train_data.data[i])) similar = sorted(similar) +print("Count similar: %i" % len(similar)) show_at_1 = similar[0] -show_at_2 = similar[len(similar) / 2] -show_at_3 = similar[-1] +show_at_2 = similar[int(len(similar) / 10)] +show_at_3 = similar[int(len(similar) / 2)] print("=== #1 ===") print(show_at_1) diff --git a/ch03/utils.py b/ch03/utils.py index ffb20929..29d46836 100644 --- a/ch03/utils.py +++ b/ch03/utils.py @@ -15,3 +15,8 @@ print("Uh, we were expecting a data directory, which contains the toy data") sys.exit(1) +CHART_DIR = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "charts") +if not os.path.exists(CHART_DIR): + os.mkdir(CHART_DIR) + diff --git a/ch04/.gitignore b/ch04/.gitignore new file mode 100644 index 00000000..c4c0b18a --- /dev/null +++ b/ch04/.gitignore @@ -0,0 +1,6 @@ +wiki_lda.pkl +wiki_lda.pkl.state +*.png +*.npy +*.pkl +topics.txt diff --git a/ch04/README.rst b/ch04/README.rst new file mode 100644 index 00000000..99a3c186 --- /dev/null +++ b/ch04/README.rst @@ -0,0 +1,65 @@ +========= +Chapter 4 +========= + +Support code for *Chapter 4: Topic Modeling* + + +AP Data +------- + +To download the AP data, use the ``download_ap.sh`` script inside the ``data`` +directory:: + + cd data + ./download_ap.sh + +Word cloud creation +------------------- + +Word cloud creation requires that ``pytagcloud`` be installed (in turn, this +requires ``pygame``). Since this is not an essential part of the chapter, the +code will work even if you have not installed it (naturally, the cloud image +will not be generated and a warning will be printed). + + +Wikipedia processing +-------------------- + +You will need **a lot of disk space**. The download of the Wikipedia text is +11GB and preprocessing it takes another 24GB to save it in the intermediate +format that gensim uses for a total of 34GB! + +Run the following two commands inside the ``data/`` directory:: + + ./download_wp.sh + ./preprocess-wikidata.sh + +As the filenames indicate, the first step will download the data and the second +one will preprocess it. Preprocessing can take several hours, but it is +feasible to run it on a modern laptop. Once the second step is finished, you +may remove the input file if you want to save disk space +(``data/enwiki-latest-pages-articles.xml.bz2``). + +To generate the model, you can run the ``wikitopics_create.py`` script, while +the ``wikitopics_plot.py`` script will plot the most heavily discussed topic as +well as the least heavily discussed one. The code is split into steps as the +first one can take a very long time. Then it saves the results so that you can +later explore them at leisure. + +You should not expect that your results will exactly match the results in the +book, for two reasons: + +1. The LDA algorithm is a probabilistic algorithm and can give different + results every time it is run. +2. Wikipedia keeps changing. Thus, even your input data will be different. + +Scripts +------- + +blei_lda.py + Computes LDA using the AP Corpus. 
+wikitopics_create.py + Create the topic model for Wikipedia using LDA (must download wikipedia database first) +wikitopics_create_hdp.py + Create the topic model for Wikipedia using HDP (must download wikipedia database first) diff --git a/ch04/blei_lda.py b/ch04/blei_lda.py index 6c4a9f54..7f6ac2b3 100644 --- a/ch04/blei_lda.py +++ b/ch04/blei_lda.py @@ -6,56 +6,81 @@ # It is made available under the MIT License from __future__ import print_function +from wordcloud import create_cloud try: - from gensim import corpora, models, similarities + from gensim import corpora, models, matutils except: print("import gensim failed.") print() print("Please install it") raise -try: - from mpltools import style - style.use('ggplot') -except: - print("Could not import mpltools: plots will not be styled correctly") - import matplotlib.pyplot as plt import numpy as np from os import path +NUM_TOPICS = 100 + +# Check that data exists if not path.exists('./data/ap/ap.dat'): print('Error: Expected data to be present at data/ap/') print('Please cd into ./data & run ./download_ap.sh') +# Load the data corpus = corpora.BleiCorpus('./data/ap/ap.dat', './data/ap/vocab.txt') + +# Build the topic model model = models.ldamodel.LdaModel( - corpus, num_topics=100, id2word=corpus.id2word, alpha=None) + corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=None) -for ti in xrange(84): +# Iterate over all the topics in the model +for ti in range(model.num_topics): words = model.show_topic(ti, 64) - tf = sum(f for f, w in words) - print('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for f, w in words)) - print() - print() - print() + tf = sum(f for _, f in words) + with open('topics.txt', 'w') as output: + output.write('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for w, f in words)) + output.write("\n\n\n") + +# We first identify the most discussed topic, i.e., the one with the +# highest total weight -thetas = [model[c] for c in corpus] -plt.hist([len(t) for t in thetas], np.arange(42)) -plt.ylabel('Nr of documents') -plt.xlabel('Nr of topics') -plt.savefig('../1400OS_04_01+.png') +topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics) +weight = topics.sum(1) +max_topic = weight.argmax() + + +# Get the top 64 words for this topic +# Without the argument, show_topic would return only 10 words +words = model.show_topic(max_topic, 64) + +# This function will actually check for the presence of pytagcloud and is otherwise a no-op +create_cloud('cloud_blei_lda.png', words) + +num_topics_used = [len(model[doc]) for doc in corpus] +fig,ax = plt.subplots() +ax.hist(num_topics_used, np.arange(42)) +ax.set_ylabel('Nr of documents') +ax.set_xlabel('Nr of topics') +fig.tight_layout() +fig.savefig('Figure_04_01.png') + + +# Now, repeat the same exercise using alpha=1.0 +# You can edit the constant below to play around with this parameter +ALPHA = 1.0 model1 = models.ldamodel.LdaModel( - corpus, num_topics=100, id2word=corpus.id2word, alpha=1.) 
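(The updated blei_lda.py above counts how many topics each document uses via len(model[doc]). The sketch below inspects that sparse representation directly; it assumes the AP corpus has already been fetched with data/download_ap.sh and that training a fresh 100-topic model is acceptable.)

# Sketch: inspect the sparse document/topic representation used in blei_lda.py.
from gensim import corpora, models

corpus = corpora.BleiCorpus('./data/ap/ap.dat', './data/ap/vocab.txt')
model = models.ldamodel.LdaModel(
    corpus, num_topics=100, id2word=corpus.id2word)

doc = next(iter(corpus))
doc_topics = model[doc]   # list of (topic_id, weight) pairs; tiny weights are omitted
print('Topics used by the first document: {}'.format(len(doc_topics)))
for topic_id, weight in sorted(doc_topics, key=lambda tw: -tw[1])[:5]:
    # show_topic returns (word, weight) pairs in the gensim versions this diff targets
    top_words = ', '.join(w for w, _ in model.show_topic(topic_id, 5))
    print('  topic {:3d} (weight {:.2f}): {}'.format(topic_id, weight, top_words))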
-thetas1 = [model1[c] for c in corpus] - -#model8 = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=corpus.id2word, alpha=1.e-8) -#thetas8 = [model8[c] for c in corpus] -plt.clf() -plt.hist([[len(t) for t in thetas], [len(t) for t in thetas1]], np.arange(42)) -plt.ylabel('Nr of documents') -plt.xlabel('Nr of topics') -plt.text(9, 223, r'default alpha') -plt.text(26, 156, 'alpha=1.0') -plt.savefig('../1400OS_04_02+.png') + corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=ALPHA) +num_topics_used1 = [len(model1[doc]) for doc in corpus] + +fig,ax = plt.subplots() +ax.hist([num_topics_used, num_topics_used1], np.arange(42)) +ax.set_ylabel('Nr of documents') +ax.set_xlabel('Nr of topics') + +# The coordinates below were fit by trial and error to look good +ax.text(9, 223, r'default alpha') +ax.text(26, 156, 'alpha=1.0') +fig.tight_layout() +fig.savefig('Figure_04_02.png') + diff --git a/ch04/build_lda.py b/ch04/build_lda.py index 192e3718..a0ee9c5f 100644 --- a/ch04/build_lda.py +++ b/ch04/build_lda.py @@ -14,8 +14,7 @@ raise from scipy.spatial import distance import numpy as np -import string -from gensim import corpora, models, similarities +from gensim import corpora, models import sklearn.datasets import nltk.stem from collections import defaultdict @@ -80,7 +79,7 @@ def __len__(self): distances = distance.squareform(distance.pdist(thetas)) large = distances.max() + 1 -for i in xrange(len(distances)): +for i in range(len(distances)): distances[i, i] = large print(otexts[1]) diff --git a/ch04/data/.gitignore b/ch04/data/.gitignore index 8eb48cbf..91f24bbc 100644 --- a/ch04/data/.gitignore +++ b/ch04/data/.gitignore @@ -1,8 +1,12 @@ ap.tgz ap/ +dataset-379-20news-18828_HJRZF.zip +379/ enwiki-latest-pages-articles.xml.bz2 wiki_en_output_bow.mm +wiki_en_output_bow.mm.gz wiki_en_output_bow.mm.index wiki_en_output_tfidf.mm +wiki_en_output_tfidf.mm.gz wiki_en_output_tfidf.mm.index -wiki_en_output_wordids.txt +wiki_en_output_wordids.txt.bz2 diff --git a/ch04/data/download_ap.sh b/ch04/data/download_ap.sh index 6de8ded8..da27814a 100755 --- a/ch04/data/download_ap.sh +++ b/ch04/data/download_ap.sh @@ -1,3 +1,3 @@ #!/bin/sh -wget http://www.cs.princeton.edu/~blei/lda-c/ap.tgz +wget http://www.cs.columbia.edu/~blei/lda-c/ap.tgz tar xzf ap.tgz diff --git a/ch04/data/preprocess-wikidata.sh b/ch04/data/preprocess-wikidata.sh new file mode 100755 index 00000000..d9a6ebb3 --- /dev/null +++ b/ch04/data/preprocess-wikidata.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +python -m gensim.scripts.make_wiki enwiki-latest-pages-articles.xml.bz2 wiki_en_output diff --git a/ch04/wikitopics.py b/ch04/wikitopics.py deleted file mode 100644 index f04698dc..00000000 --- a/ch04/wikitopics.py +++ /dev/null @@ -1,49 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from __future__ import print_function -import numpy as np -import logging -import gensim -logging.basicConfig( - format='%(asctime)s : %(levelname)s : %(message)s', - level=logging.INFO) -id2word = gensim.corpora.Dictionary.load_from_text( - 'data/wiki_en_output_wordids.txt') -mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') -model = gensim.models.ldamodel.LdaModel( - corpus=mm, - id2word=id2word, - num_topics=100, - update_every=1, - chunksize=10000, - passes=1) -model.save('wiki_lda.pkl') -topics = [model[doc] for doc in mm] -lens = np.array([len(t) for t in 
topics]) -print(np.mean(lens <= 10)) -print(np.mean(lens)) - -counts = np.zeros(100) -for doc_top in topics: - for ti, _ in doc_toc: - counts[ti] += 1 - -for doc_top in topics: - for ti, _ in doc_top: - counts[ti] += 1 - -words = model.show_topic(counts.argmax(), 64) -print(words) -print() -print() -print() -words = model.show_topic(counts.argmin(), 64) -print(words) -print() -print() -print() diff --git a/ch04/wikitopics_create.py b/ch04/wikitopics_create.py new file mode 100644 index 00000000..d1aff278 --- /dev/null +++ b/ch04/wikitopics_create.py @@ -0,0 +1,51 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import logging +import gensim +import numpy as np + +NR_OF_TOPICS = 100 + +# Set up logging in order to get progress information as the model is being built: +logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', + level=logging.INFO) + +# Load the preprocessed corpus (id2word & mm): +id2word = gensim.corpora.Dictionary.load_from_text( + 'data/wiki_en_output_wordids.txt.bz2') +mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') + +# Calling the constructor is enough to build the model +# This call will take a few hours! +model = gensim.models.ldamodel.LdaModel( + corpus=mm, + id2word=id2word, + num_topics=NR_OF_TOPICS, + update_every=1, + chunksize=10000, + passes=1) + +# Save the model so we do not need to learn it again. +model.save('wiki_lda.pkl') + +# Compute the document/topic matrix +topics = np.zeros((len(mm), model.num_topics)) +for di,doc in enumerate(mm): + doc_top = model[doc] + for ti,tv in doc_top: + topics[di,ti] += tv +np.save('topics.npy', topics) + +# Alternatively, we create a sparse matrix and save that. This alternative +# saves disk space, at the cost of slightly more complex code: + +## from scipy import sparse, io +## sp = sparse.csr_matrix(topics) +## io.savemat('topics.mat', {'topics': sp}) diff --git a/ch04/wikitopics_create_hdp.py b/ch04/wikitopics_create_hdp.py new file mode 100644 index 00000000..951d1850 --- /dev/null +++ b/ch04/wikitopics_create_hdp.py @@ -0,0 +1,39 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import logging +import gensim +import numpy as np + +# Set up logging in order to get progress information as the model is being built: +logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', + level=logging.INFO) + +# Load the preprocessed corpus (id2word & mm): +id2word = gensim.corpora.Dictionary.load_from_text( + 'data/wiki_en_output_wordids.txt.bz2') +mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') + +# Calling the constructor is enough to build the model +# This call will take a few hours! +model = gensim.models.hdpmodel.HdpModel( + corpus=mm, + id2word=id2word, + chunksize=10000) + +# Save the model so we do not need to learn it again. 
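(Back to wikitopics_create.py above: its closing comments mention a sparse alternative to the dense topics.npy file. Written out, that alternative might look like the following sketch; the file names are the ones the script itself uses.)

# Sketch: store the document/topic matrix sparsely instead of as a dense .npy file.
import numpy as np
from scipy import sparse, io

topics = np.load('topics.npy')          # dense matrix written by wikitopics_create.py

sp_topics = sparse.csr_matrix(topics)   # most entries are zero, so CSR is much smaller
io.savemat('topics.mat', {'topics': sp_topics})

# Loading it back: scipy.io.loadmat returns a dict and the matrix stays sparse
loaded = io.loadmat('topics.mat')['topics']
dense_again = np.asarray(loaded.todense())
assert dense_again.shape == topics.shape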
+model.save('wiki_hdp.pkl') + +# Compute the document/topic matrix +topics = np.zeros((len(mm), model.num_topics)) +for di,doc in enumerate(mm): + doc_top = model[doc] + for ti,tv in doc_top: + topics[di,ti] += tv +np.save('topics_hdp.npy', topics) diff --git a/ch04/wikitopics_plot.py b/ch04/wikitopics_plot.py new file mode 100644 index 00000000..04adf780 --- /dev/null +++ b/ch04/wikitopics_plot.py @@ -0,0 +1,64 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import numpy as np +import gensim +from os import path +from wordcloud import create_cloud + +if not path.exists('wiki_lda.pkl'): + import sys + sys.stderr.write('''\ +This script must be run after wikitopics_create.py! + +That script creates and saves the LDA model (this must only be done once). +This script is responsible for the analysis.''') + sys.exit(1) + +# Load the preprocessed Wikipedia corpus (id2word and mm) +id2word = gensim.corpora.Dictionary.load_from_text( + 'data/wiki_en_output_wordids.txt.bz2') +mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') + +# Load the precomputed model +model = gensim.models.ldamodel.LdaModel.load('wiki_lda.pkl') + +topics = np.load('topics.npy', mmap_mode='r') + +# Compute the number of topics mentioned in each document +lens = (topics > 0).sum(axis=1) +print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens))) +print('Percentage of articles mentioning less than 10 topics: {0:.1%}'.format(np.mean(lens <= 10))) + +# Weights will be the total weight of each topic +weights = topics.sum(0) + +# Retrieve the most heavily used topic and plot it as a word cloud: +words = model.show_topic(weights.argmax(), 64) + +# The parameter ``maxsize`` often needs some manual tuning to make it look nice. +create_cloud('Wikipedia_most.png', words, maxsize=250, fontname='Cardo') + +fraction_mention = np.mean(topics[:,weights.argmax()] > 0) +print("The most mentioned topic is mentioned in {:.1%} of documents.".format(fraction_mention)) +total_weight = np.mean(topics[:,weights.argmax()]) +print("It represents {:.1%} of the total number of words.".format(total_weight)) +print() +print() +print() + +# Retrieve the **least** heavily used topic and plot it as a word cloud: +words = model.show_topic(weights.argmin(), 64) +create_cloud('Wikipedia_least.png', words, maxsize=150, fontname='Cardo') +fraction_mention = np.mean(topics[:,weights.argmin()] > 0) +print("The least mentioned topic is mentioned in {:.1%} of documents.".format(fraction_mention)) +total_weight = np.mean(topics[:,weights.argmin()]) +print("It represents {:.1%} of the total number of words.".format(total_weight)) +print() +print() +print() diff --git a/ch04/wordcloud.py b/ch04/wordcloud.py new file mode 100644 index 00000000..accca2d6 --- /dev/null +++ b/ch04/wordcloud.py @@ -0,0 +1,29 @@ +from __future__ import print_function +warned_of_error = False + +def create_cloud(oname, words, maxsize=120, fontname='Lobster'): + '''Creates a word cloud (when pytagcloud is installed) + + Parameters + ---------- + oname : output filename + words : list of (str, value) + maxsize : int, optional + Size of maximum word. The best setting for this parameter will often + require some manual tuning for each input. + fontname : str, optional + Font to use. 
+ ''' + try: + from pytagcloud import create_tag_image, make_tags + except ImportError: + if not warned_of_error: + print("Could not import pytagcloud. Skipping cloud generation") + return + + # gensim returns a weight between 0 and 1 for each word, while pytagcloud + # expects an integer word count. So, we multiply by a large number and + # round. For a visualization this is an adequate approximation. + words = [(w,int(v*10000)) for w,v in words] + tags = make_tags(words, maxsize=maxsize) + create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname) diff --git a/ch05/README.md b/ch05/README.md index 776c90bc..39253f22 100644 --- a/ch05/README.md +++ b/ch05/README.md @@ -1,11 +1,10 @@ Chapter 5 - Classification - Detecting Poor Answers =================================================== -The book chapter is based on StackExchange's data blob from August 2012: -[http://www.clearbits.net/get/2076-aug-2012.torrent](http://www.clearbits.net/get/2076-aug-2012.torrent) +The book chapter is based on StackExchange's data blob from August 2012 for the first edition. -After publishing the book, StackExchange stayed as awesome as it always has been and released an updated version: -[https://archive.org/download/stackexchange/stackexchange_archive.torrent](https://archive.org/download/stackexchange/stackexchange_archive.torrent) +After publishing the book, StackExchange released the May 2014 version at +[https://archive.org/download/stackexchange/stackexchange_archive.torrent](https://archive.org/download/stackexchange/stackexchange_archive.torrent). Note that using the latest version, you will get slightly different results. diff --git a/ch05/chose_instances.py b/ch05/chose_instances.py index e997e0fb..0614cd32 100644 --- a/ch05/chose_instances.py +++ b/ch05/chose_instances.py @@ -23,9 +23,12 @@ will not be used in the chapter. If, however, you want to experiment with them (highly encouraged!), you can get the library from http://packages.python.org/pyenchant/. """) + class EnchantMock: + def __init__(self): pass + def check(self, word): return True speller = EnchantMock() @@ -35,7 +38,6 @@ def check(self, word): filtered_meta = json.load(open(filtered_meta, "r")) - def misspelled_fraction(p): tokens = p.split() if not tokens: diff --git a/ch05/classify.py b/ch05/classify.py index 69832b56..76699819 100644 --- a/ch05/classify.py +++ b/ch05/classify.py @@ -16,7 +16,7 @@ from sklearn import neighbors from data import chosen, chosen_meta -from utils import plot_roc, plot_pr +from utils import plot_pr from utils import plot_feat_importance from utils import load_meta from utils import fetch_posts @@ -30,11 +30,12 @@ import nltk -# splitting questions into train (70%) and test(30%) and then take their -# answers -all_posts = list(meta.keys()) -all_questions = [q for q, v in meta.items() if v['ParentId'] == -1] -all_answers = [q for q, v in meta.items() if v['ParentId'] != -1] # [:500] +# The sorting below is only to ensure reproducable numbers. Further down +# we will occasionally skip a fold when it contains instances of only +# one label. The two lines below ensure that the behavior is exactly the +# same for different runs. 
+all_questions = sorted([q for q, v in meta.items() if v['ParentId'] == -1]) +all_answers = sorted([q for q, v in meta.items() if v['ParentId'] != -1]) feature_names = np.array(( 'NumTextTokens', @@ -47,20 +48,15 @@ 'NumImages' )) -# activate the following for reduced feature space -""" -feature_names = np.array(( - 'NumTextTokens', - 'LinkCount', -)) -""" - def prepare_sent_features(): for pid, text in fetch_posts(chosen, with_index=True): if not text: meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0 else: + from platform import python_version + if python_version().startswith('2'): + text = text.decode('utf-8') sent_lens = [len(nltk.word_tokenize( sent)) for sent in nltk.sent_tokenize(text)] meta[pid]['AvgSentLen'] = np.mean(sent_lens) @@ -80,17 +76,26 @@ def get_features(aid): return tuple(meta[aid][fn] for fn in feature_names) qa_X = np.asarray([get_features(aid) for aid in all_answers]) -# Score > 0 tests => positive class is good answer -# Score <= 0 tests => positive class is poor answer -qa_Y = np.asarray([meta[aid]['Score'] > 0 for aid in all_answers]) + classifying_answer = "good" +#classifying_answer = "poor" + +if classifying_answer == "good": + # Score > 0 tests => positive class is good answer + qa_Y = np.asarray([meta[aid]['Score'] > 0 for aid in all_answers]) +elif classifying_answer == "poor": + # Score <= 0 tests => positive class is poor answer + qa_Y = np.asarray([meta[aid]['Score'] <= 0 for aid in all_answers]) +else: + raise Exception("classifying_answer='%s' is not supported" % + classifying_answer) for idx, feat in enumerate(feature_names): plot_feat_hist([(qa_X[:, idx], feat)]) -""" -plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [1,0]], 'feat_hist_two.png') -plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [3,4,5,6]], 'feat_hist_four.png') -""" + +#plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [1,0]], 'feat_hist_two.png') +#plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [3,4,5,6]], 'feat_hist_four.png') + avg_scores_summary = [] @@ -115,10 +120,16 @@ def measure(clf_class, parameters, name, data_size=None, plot=False): pr_scores = [] precisions, recalls, thresholds = [], [], [] - for train, test in cv: + for fold_idx, (train, test) in enumerate(cv): X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] + only_one_class_in_train = len(set(y_train)) == 1 + only_one_class_in_test = len(set(y_test)) == 1 + if only_one_class_in_train or only_one_class_in_test: + # this would pose problems later on + continue + clf = clf_class(**parameters) clf.fit(X_train, y_train) @@ -145,12 +156,20 @@ def measure(clf_class, parameters, name, data_size=None, plot=False): precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) + + # This threshold is determined at the end of the chapter 5, + # where we find conditions such that precision is in the area of + # about 80%. With it we trade off recall for precision. 
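A hedged sketch of how such a cut-off can be derived, assuming precision and pr_thresholds are the per-fold arrays appended above (i.e. the output of sklearn's precision_recall_curve); the fixed value 0.59 used below is the one the authors settled on:

# Sketch only: smallest threshold whose precision already reaches ~80%,
# which keeps as much recall as possible at that precision level.
good_enough = precision[:-1] >= 0.8   # precision has one more entry than pr_thresholds
candidates = pr_thresholds[good_enough]
if len(candidates) > 0:
    suggested_threshold = float(candidates.min())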
+ threshold_for_detecting_good_answers = 0.59 + + print("Clone #%i" % fold_idx) print(classification_report(y_test, proba[:, label_idx] > - 0.63, target_names=['not accepted', 'accepted'])) + threshold_for_detecting_good_answers, target_names=['not accepted', 'accepted'])) # get medium clone scores_to_sort = pr_scores # roc_scores medium = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] + print("Medium clone is #%i" % medium) if plot: #plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium]) @@ -178,6 +197,7 @@ def measure(clf_class, parameters, name, data_size=None, plot=False): def bias_variance_analysis(clf_class, parameters, name): + #import ipdb;ipdb.set_trace() data_sizes = np.arange(60, 2000, 4) train_errors = [] @@ -208,16 +228,16 @@ def k_complexity_analysis(clf_class, parameters): plot_k_complexity(ks, train_errors, test_errors) -for k in [5]: # [5, 10, 40, 90]: +for k in [5]: +# for k in [5, 10, 40]: + #measure(neighbors.KNeighborsClassifier, {'n_neighbors': k}, "%iNN" % k) bias_variance_analysis(neighbors.KNeighborsClassifier, { - 'n_neighbors': k, 'warn_on_equidistant': False}, "%iNN" % k) - k_complexity_analysis(neighbors.KNeighborsClassifier, {'n_neighbors': k, - 'warn_on_equidistant': False}) - # measure(neighbors.KNeighborsClassifier, {'n_neighbors': k, 'p': 2, - #'warn_on_equidistant': False}, "%iNN" % k) + 'n_neighbors': k}, "%iNN" % k) + k_complexity_analysis(neighbors.KNeighborsClassifier, {'n_neighbors': k}) from sklearn.linear_model import LogisticRegression -for C in [0.1]: # [0.01, 0.1, 1.0, 10.0]: +for C in [0.1]: +# for C in [0.01, 0.1, 1.0, 10.0]: name = "LogReg C=%.2f" % C bias_variance_analysis(LogisticRegression, {'penalty': 'l2', 'C': C}, name) measure(LogisticRegression, {'penalty': 'l2', 'C': C}, name, plot=True) diff --git a/ch05/data.py b/ch05/data.py index 62ddbc6d..e3b7e4bb 100644 --- a/ch05/data.py +++ b/ch05/data.py @@ -7,7 +7,7 @@ import os -DATA_DIR = "data" # put your posts-2011-12.xml into this directory +DATA_DIR = "data" # put your posts-2012.xml into this directory CHART_DIR = "charts" filtered = os.path.join(DATA_DIR, "filtered.tsv") diff --git a/ch05/log_reg_example.py b/ch05/log_reg_example.py index 7efc0eb1..bfbbd227 100644 --- a/ch05/log_reg_example.py +++ b/ch05/log_reg_example.py @@ -38,7 +38,8 @@ def lr_model(clf, X): pyplot.xlabel("feature value") pyplot.ylabel("class") pyplot.grid(True, linestyle='-', color='0.75') -pyplot.savefig(os.path.join(CHART_DIR, "log_reg_example_data.png"), bbox_inches="tight") +pyplot.savefig( + os.path.join(CHART_DIR, "log_reg_example_data.png"), bbox_inches="tight") def lin_model(clf, X): @@ -69,7 +70,8 @@ def lin_model(clf, X): pyplot.ylabel("class") pyplot.title("linear fit on additional data") pyplot.grid(True, linestyle='-', color='0.75') -pyplot.savefig(os.path.join(CHART_DIR, "log_reg_log_linear_fit.png"), bbox_inches="tight") +pyplot.savefig( + os.path.join(CHART_DIR, "log_reg_log_linear_fit.png"), bbox_inches="tight") pyplot.figure(figsize=(10, 4)) pyplot.xlim((-5, 20)) @@ -79,7 +81,8 @@ def lin_model(clf, X): pyplot.xlabel("feature value") pyplot.ylabel("class") pyplot.grid(True, linestyle='-', color='0.75') -pyplot.savefig(os.path.join(CHART_DIR, "log_reg_example_fitted.png"), bbox_inches="tight") +pyplot.savefig( + os.path.join(CHART_DIR, "log_reg_example_fitted.png"), bbox_inches="tight") X = np.arange(0, 1, 0.001) pyplot.figure(figsize=(10, 4)) @@ -97,4 +100,5 @@ def lin_model(clf, X): pyplot.xlabel("P") pyplot.ylabel("log(odds) = log(P / (1-P))") pyplot.grid(True, linestyle='-', 
color='0.75') -pyplot.savefig(os.path.join(CHART_DIR, "log_reg_log_odds.png"), bbox_inches="tight") +pyplot.savefig( + os.path.join(CHART_DIR, "log_reg_log_odds.png"), bbox_inches="tight") diff --git a/ch05/so_xml_to_tsv.py b/ch05/so_xml_to_tsv.py index f425a2e5..7ce5b150 100644 --- a/ch05/so_xml_to_tsv.py +++ b/ch05/so_xml_to_tsv.py @@ -10,6 +10,7 @@ # to a question that has been asked in 2011 or 2012. # +import sys import os import re try: @@ -24,8 +25,13 @@ from data import DATA_DIR -filename = os.path.join(DATA_DIR, "posts-2011-12.xml") +#filename = os.path.join(DATA_DIR, "posts-2011-12.xml") +filename = os.path.join(DATA_DIR, "posts-2012.xml") +print("Reading from xml %s" % filename) filename_filtered = os.path.join(DATA_DIR, "filtered.tsv") +print("Filtered: %s" % filename_filtered) +filename_filtered_meta = os.path.join(DATA_DIR, "filtered-meta.json") +print("Meta: %s" % filename_filtered_meta) q_creation = {} # creation datetimes of questions q_accepted = {} # id of accepted answer @@ -56,20 +62,19 @@ def filter_html(s): # sometimes source code contain links, which we don't want to count link_count_in_code += len(link_match.findall(match_str)) - anchors = link_match.findall(s) - link_count = len(anchors) + links = link_match.findall(s) + link_count = len(links) link_count -= link_count_in_code - html_free_s = re.sub( + link_free_s = re.sub( " +", " ", tag_match.sub('', code_free_s)).replace("\n", "") - link_free_s = html_free_s - for anchor in anchors: - if anchor.lower().startswith("http://"): - link_free_s = link_free_s.replace(anchor, '') + for link in links: + if link.lower().startswith("http://"): + link_free_s = link_free_s.replace(link, '') - num_text_tokens = html_free_s.count(" ") + num_text_tokens = link_free_s.count(" ") return link_free_s, num_text_tokens, num_code_lines, link_count, num_images @@ -77,21 +82,26 @@ def filter_html(s): num_questions = 0 num_answers = 0 -from itertools import imap +if sys.version_info.major < 3: + # Python 2, map() returns a list, which will lead to out of memory errors. + # The following import ensures that the script behaves like being executed + # with Python 3. 
+ from itertools import imap as map + def parsexml(filename): global num_questions, num_answers counter = 0 - it = imap(itemgetter(1), + it = map(itemgetter(1), iter(etree.iterparse(filename, events=('start',)))) root = next(it) # get posts element for elem in it: if counter % 100000 == 0: - print(counter) + print("Processed %i elements" % counter) counter += 1 @@ -147,12 +157,12 @@ def parsexml(filename): root.clear() # preserve memory -with open(os.path.join(DATA_DIR, filename_filtered), "w") as f: +with open(filename_filtered, "w") as f: for values in parsexml(filename): line = "\t".join(map(str, values)) f.write(line + "\n") -with open(os.path.join(DATA_DIR, "filtered-meta.json"), "w") as f: +with open(filename_filtered_meta, "w") as f: json.dump(meta, f) print("years:", years) diff --git a/ch05/utils.py b/ch05/utils.py index 03429cef..c6e497a5 100644 --- a/ch05/utils.py +++ b/ch05/utils.py @@ -171,8 +171,8 @@ def plot_feat_hist(data_name_list, filename=None): assert filename is not None pylab.figure(num=None, figsize=(8, 6)) - num_rows = 1 + (len(data_name_list) - 1) / 2 - num_cols = 1 if len(data_name_list) == 1 else 2 + num_rows = int(1 + (len(data_name_list) - 1) / 2) + num_cols = int(1 if len(data_name_list) == 1 else 2) pylab.figure(figsize=(5 * num_cols, 4 * num_rows)) for i in range(num_rows): @@ -191,7 +191,7 @@ def plot_feat_hist(data_name_list, filename=None): else: bins = max_val n, bins, patches = pylab.hist( - x, bins=bins, normed=1, facecolor='blue', alpha=0.75) + x, bins=bins, normed=1, alpha=0.75) pylab.grid(True) @@ -209,7 +209,7 @@ def plot_bias_variance(data_sizes, train_errors, test_errors, name, title): pylab.title("Bias-Variance for '%s'" % name) pylab.plot( data_sizes, test_errors, "--", data_sizes, train_errors, "b-", lw=1) - pylab.legend(["train error", "test error"], loc="upper right") + pylab.legend(["test error", "train error"], loc="upper right") pylab.grid(True, linestyle='-', color='0.75') pylab.savefig( os.path.join(CHART_DIR, "bv_" + name.replace(" ", "_") + ".png"), bbox_inches="tight") @@ -220,10 +220,10 @@ def plot_k_complexity(ks, train_errors, test_errors): pylab.ylim([0.0, 1.0]) pylab.xlabel('k') pylab.ylabel('Error') - pylab.title('Errors for for different values of k') + pylab.title('Errors for for different values of $k$') pylab.plot( ks, test_errors, "--", ks, train_errors, "-", lw=1) - pylab.legend(["train error", "test error"], loc="upper right") + pylab.legend(["test error", "train error"], loc="upper right") pylab.grid(True, linestyle='-', color='0.75') pylab.savefig( os.path.join(CHART_DIR, "kcomplexity.png"), bbox_inches="tight") diff --git a/ch06/01_start.py b/ch06/01_start.py index 2940b0fa..0bdbdd12 100644 --- a/ch06/01_start.py +++ b/ch06/01_start.py @@ -40,7 +40,7 @@ def create_ngram_model(): def train_model(clf_factory, X, Y, name="NB ngram", plot=False): cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) train_errors = [] test_errors = [] @@ -83,7 +83,7 @@ def train_model(clf_factory, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -94,18 +94,18 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of 
%i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) if __name__ == "__main__": X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] @@ -113,19 +113,19 @@ def print_incorrect(clf, X, Y): train_model(create_ngram_model, X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) train_model(create_ngram_model, X, Y, name="sent vs rest", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(create_ngram_model, X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(create_ngram_model, X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) diff --git a/ch06/02_tuning.py b/ch06/02_tuning.py index a48d3edc..5b6a835f 100644 --- a/ch06/02_tuning.py +++ b/ch06/02_tuning.py @@ -45,7 +45,7 @@ def create_ngram_model(params=None): def grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], @@ -64,7 +64,7 @@ def grid_search_model(clf_factory, X, Y): verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ - print clf + print(clf) return clf @@ -114,7 +114,7 @@ def train_model(clf, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -125,9 +125,9 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) def get_best_model(): @@ -149,16 +149,16 @@ def get_best_model(): X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) @@ -166,16 +166,16 @@ def get_best_model(): # rest", plot=True) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) diff --git a/ch06/03_clean.py b/ch06/03_clean.py index f00c2c44..0170ed82 100644 --- a/ch06/03_clean.py +++ b/ch06/03_clean.py @@ -57,7 +57,7 @@ } emo_repl_order = [k for (k_len, k) in reversed( - sorted([(len(k), k) for k in emo_repl.keys()]))] + sorted([(len(k), k) for k in list(emo_repl.keys())]))] re_repl = { r"\br\b": "are", @@ -84,7 +84,7 @@ def preprocessor(tweet): for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) - for r, repl in re_repl.iteritems(): + for r, repl in re_repl.items(): tweet = re.sub(r, repl, tweet) return tweet @@ -103,7 +103,7 @@ def preprocessor(tweet): def train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) train_errors = [] test_errors = [] @@ -150,7 +150,7 @@ def train_model(clf, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -161,9 +161,9 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) def get_best_model(): @@ -185,16 +185,16 @@ def get_best_model(): X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) @@ -202,16 +202,16 @@ def get_best_model(): # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) diff --git a/ch06/04_sent.py b/ch06/04_sent.py index 87fbafc9..c09a435f 100644 --- a/ch06/04_sent.py +++ b/ch06/04_sent.py @@ -153,7 +153,7 @@ def transform(self, documents): } emo_repl_order = [k for (k_len, k) in reversed( - sorted([(len(k), k) for k in emo_repl.keys()]))] + sorted([(len(k), k) for k in list(emo_repl.keys())]))] re_repl = { r"\br\b": "are", @@ -179,7 +179,7 @@ def preprocessor(tweet): for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) - for r, repl in re_repl.iteritems(): + for r, repl in re_repl.items(): tweet = re.sub(r, repl, tweet) return tweet.replace("-", " ").replace("_", " ") @@ -202,7 +202,7 @@ def preprocessor(tweet): def __grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], @@ -220,7 +220,7 @@ def __grid_search_model(clf_factory, X, Y): verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ - print clf + print(clf) return clf @@ -228,7 +228,7 @@ def __grid_search_model(clf_factory, X, Y): def train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) train_errors = [] test_errors = [] @@ -275,7 +275,7 @@ def train_model(clf, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -286,9 +286,9 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) def get_best_model(): @@ -315,16 +315,16 @@ def get_best_model(): #Y_orig = Y_orig[:100,] classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) @@ -332,18 +332,18 @@ def get_best_model(): # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) json.dump(poscache, open(poscache_filename, "w")) diff --git a/ch06/install.py b/ch06/install.py index c07a195f..be61f485 100644 --- a/ch06/install.py +++ b/ch06/install.py @@ -11,23 +11,19 @@ # # Pulls tweet data from Twitter because ToS prevents distributing it directly. # -# Right now we use unauthenticated requests, which are rate-limited to 150/hr. -# We use 125/hr to stay safe. -# -# # - Niek Sanders # njs@sananalytics.com # October 20, 2011 # # -# Excuse the ugly code. I threw this together as quickly as possible and I -# don't normally code in Python. -# # In Sanders' original form, the code was using Twitter API 1.0. # Now that Twitter moved to 1.1, we had to make a few changes. # Cf. twitterauth.py for the details. +# Regarding rate limiting, please check +# https://dev.twitter.com/rest/public/rate-limiting + import sys import csv import json @@ -38,24 +34,21 @@ import twitter except ImportError: print("""\ -You need to install python-twitter: - pip install python-twitter +You need to ... + pip install twitter If pip is not found you might have to install it using easy_install. If it does not work on your system, you might want to follow instructions -at https://github.com/bear/python-twitter, most likely: - $ git clone https://github.com/bear/python-twitter - $ cd python-twitter +at https://github.com/sixohsix/twitter, most likely: + $ git clone https://github.com/sixohsix/twitter + $ cd twitter $ sudo python setup.py install """) sys.exit(1) from twitterauth import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN_KEY, ACCESS_TOKEN_SECRET -api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, - access_token_key=ACCESS_TOKEN_KEY, access_token_secret=ACCESS_TOKEN_SECRET) - - -MAX_TWEETS_PER_HR = 350 +api = twitter.Twitter(auth=twitter.OAuth(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, + token=ACCESS_TOKEN_KEY, token_secret=ACCESS_TOKEN_SECRET)) DATA_PATH = "data" @@ -87,16 +80,15 @@ def get_user_params(DATA_PATH): def dump_user_params(user_params): # dump user params for confirmation - print 'Input: ' + user_params['inList'] - print 'Output: ' + user_params['outList'] - print 'Raw data: ' + user_params['rawDir'] - return + print('Input: ' + user_params['inList']) + print('Output: ' + user_params['outList']) + print('Raw data: ' + user_params['rawDir']) def read_total_list(in_filename): # read total fetch list csv - fp = open(in_filename, 'rb') + fp = open(in_filename, 'rt') reader = csv.reader(fp, delimiter=',', quotechar='"') if os.path.exists(MISSING_ID_FILE): @@ -111,10 +103,12 @@ def read_total_list(in_filename): else: not_authed_ids = [] - print "We will skip %i tweets that are not available/visible any more on twitter" % (len(missing_ids) + len(not_authed_ids)) + print("We will skip %i tweets that are not available or visible any more on twitter" % ( + len(missing_ids) + len(not_authed_ids))) ignore_ids = set(missing_ids + not_authed_ids) total_list = [] + for row in reader: if row[2] not in ignore_ids: total_list.append(row) @@ -140,12 +134,12 @@ def purge_already_fetched(fetch_list, raw_dir): parse_tweet_json(tweet_file) count_done += 1 except RuntimeError: - print "Error parsing", item + print("Error parsing", item) rem_list.append(item) else: rem_list.append(item) - print "We have already downloaded %i tweets." 
% count_done + print("We have already downloaded %i tweets." % count_done) return rem_list @@ -156,66 +150,50 @@ def download_tweets(fetch_list, raw_dir): if not os.path.exists(raw_dir): os.mkdir(raw_dir) - # stay within rate limits - download_pause_sec = 3600 / MAX_TWEETS_PER_HR - # download tweets for idx in range(0, len(fetch_list)): - # stay in Twitter API rate limits - print 'Pausing %d sec to obey Twitter API rate limits' % \ - (download_pause_sec) - time.sleep(download_pause_sec) - # current item item = fetch_list[idx] - print item - - # print status - print '--> downloading tweet #%s (%d of %d)' % \ - (item[2], idx + 1, len(fetch_list)) + print(item) - # Old Twitter API 1.0 - # pull data - # url = 'https://api.twitter.com/1/statuses/show.json?id=' + item[2] - # print url - # urllib.urlretrieve(url, raw_dir + item[2] + '.json') + print('--> downloading tweet #%s (%d of %d)' % + (item[2], idx + 1, len(fetch_list))) - # New Twitter API 1.1 try: - sec = api.GetSleepTime('/statuses/show/:id') - if sec > 0: - print "Sleeping %i seconds to conform to Twitter's rate limiting" % sec - time.sleep(sec) + #import pdb;pdb.set_trace() + response = api.statuses.show(_id=item[2]) - result = api.GetStatus(item[2]) - json_data = result.AsJsonString() + if response.rate_limit_remaining <= 0: + wait_seconds = response.rate_limit_reset - time.time() + print("Rate limiting requests us to wait %f seconds" % + wait_seconds) + time.sleep(wait_seconds+5) - except twitter.TwitterError, e: + except twitter.TwitterError as e: fatal = True - for m in e.message: + print(e) + for m in json.loads(e.response_data.decode())['errors']: if m['code'] == 34: - print "Tweet missing: ", item - # [{u'message': u'Sorry, that page does not exist', u'code': 34}] - with open(MISSING_ID_FILE, "a") as f: + print("Tweet missing: ", item) + with open(MISSING_ID_FILE, "at") as f: f.write(item[2] + "\n") fatal = False break elif m['code'] == 63: - print "User of tweet '%s' has been suspended." % item - # [{u'message': u'Sorry, that page does not exist', u'code': 34}] - with open(MISSING_ID_FILE, "a") as f: + print("User of tweet '%s' has been suspended." % item) + with open(MISSING_ID_FILE, "at") as f: f.write(item[2] + "\n") fatal = False break elif m['code'] == 88: - print "Rate limit exceeded. Please lower max_tweets_per_hr." + print("Rate limit exceeded.") fatal = True break elif m['code'] == 179: - print "Not authorized to view this tweet." 
- with open(NOT_AUTHORIZED_ID_FILE, "a") as f: + print("Not authorized to view this tweet.") + with open(NOT_AUTHORIZED_ID_FILE, "at") as f: f.write(item[2] + "\n") fatal = False break @@ -225,8 +203,8 @@ def download_tweets(fetch_list, raw_dir): else: continue - with open(raw_dir + item[2] + '.json', "w") as f: - f.write(json_data + "\n") + with open(raw_dir + item[2] + '.json', "wt") as f: + f.write(json.dumps(dict(response)) + "\n") return @@ -234,12 +212,13 @@ def download_tweets(fetch_list, raw_dir): def parse_tweet_json(filename): # read tweet - fp = open(filename, 'rb') + fp = open(filename, 'r') # parse json try: tweet_json = json.load(fp) - except ValueError: + except ValueError as e: + print(e) raise RuntimeError('error parsing json') # look for twitter api error msgs @@ -281,20 +260,20 @@ def build_output_corpus(out_filename, raw_dir, total_list): writer.writerow(full_row) except RuntimeError: - print '--> bad data in tweet #' + item[2] + print('--> bad data in tweet #' + item[2]) missing_count += 1 else: - print '--> missing tweet #' + item[2] + print('--> missing tweet #' + item[2]) missing_count += 1 # indicate success if missing_count == 0: - print '\nSuccessfully downloaded corpus!' - print 'Output in: ' + out_filename + '\n' + print('\nSuccessfully downloaded corpus!') + print('Output in: ' + out_filename + '\n') else: - print '\nMissing %d of %d tweets!' % (missing_count, len(total_list)) - print 'Partial output in: ' + out_filename + '\n' + print('\nMissing %d of %d tweets!' % (missing_count, len(total_list))) + print('Partial output in: ' + out_filename + '\n') return @@ -302,7 +281,7 @@ def build_output_corpus(out_filename, raw_dir, total_list): def main(): # get user parameters user_params = get_user_params(DATA_PATH) - print user_params + print(user_params) dump_user_params(user_params) # get fetch list @@ -310,7 +289,7 @@ def main(): # remove already fetched or missing tweets fetch_list = purge_already_fetched(total_list, user_params['rawDir']) - print "Fetching %i tweets..." % len(fetch_list) + print("Fetching %i tweets..." % len(fetch_list)) if fetch_list: # start fetching data from twitter @@ -319,10 +298,11 @@ def main(): # second pass for any failed downloads fetch_list = purge_already_fetched(total_list, user_params['rawDir']) if fetch_list: - print '\nStarting second pass to retry %i failed downloads...' % len(fetch_list) + print('\nStarting second pass to retry %i failed downloads...' % + len(fetch_list)) download_tweets(fetch_list, user_params['rawDir']) else: - print "Nothing to fetch any more." + print("Nothing to fetch any more.") # build output corpus build_output_corpus(user_params['outList'], user_params['rawDir'], diff --git a/ch06/utils.py b/ch06/utils.py index ca8b2357..6757354f 100644 --- a/ch06/utils.py +++ b/ch06/utils.py @@ -6,6 +6,7 @@ # It is made available under the MIT License import os +import sys import collections import csv import json @@ -57,7 +58,7 @@ def load_sanders_data(dirname=".", line_count=-1): try: tweet = json.load(open(tweet_fn, "r")) except IOError: - print("Tweet '%s' not found. Skip."%tweet_fn) + print(("Tweet '%s' not found. Skip." 
% tweet_fn)) continue if 'text' in tweet and tweet['user']['lang'] == "en": @@ -84,14 +85,14 @@ def plot_pr(auc_score, name, phase, precision, recall, label=None): pylab.title('P/R curve (AUC=%0.2f) / %s' % (auc_score, label)) filename = name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, "pr_%s_%s.png" % - (filename, phase)), bbox_inches="tight") + (filename, phase)), bbox_inches="tight") def show_most_informative_features(vectorizer, clf, n=20): c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names())) - top = zip(c_f[:n], c_f[:-(n + 1):-1]) + top = list(zip(c_f[:n], c_f[:-(n + 1):-1])) for (c1, f1), (c2, f2) in top: - print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2) + print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2)) def plot_log(): @@ -119,7 +120,7 @@ def plot_feat_importance(feature_names, clf, name): inds = np.argsort(coef) f_imp = f_imp[inds] coef = coef[inds] - xpos = np.array(range(len(coef))) + xpos = np.array(list(range(len(coef)))) pylab.bar(xpos, coef, width=1) pylab.title('Feature importance for %s' % (name)) @@ -181,8 +182,13 @@ def plot_bias_variance(data_sizes, train_errors, test_errors, name): def load_sent_word_net(): sent_scores = collections.defaultdict(list) + sentiwordnet_path = os.path.join(DATA_DIR, "SentiWordNet_3.0.0_20130122.txt") - with open(os.path.join(DATA_DIR, "SentiWordNet_3.0.0_20130122.txt"), "r") as csvfile: + if not os.path.exists(sentiwordnet_path): + print("Please download SentiWordNet_3.0.0 from http://sentiwordnet.isti.cnr.it/download.php, extract it and put it into the data directory") + sys.exit(1) + + with open(sentiwordnet_path, 'r') as csvfile: reader = csv.reader(csvfile, delimiter='\t', quotechar='"') for line in reader: if line[0].startswith("#"): @@ -200,7 +206,7 @@ def load_sent_word_net(): term = term.replace("-", " ").replace("_", " ") key = "%s/%s" % (POS, term.split("#")[0]) sent_scores[key].append((float(PosScore), float(NegScore))) - for key, value in sent_scores.iteritems(): + for key, value in sent_scores.items(): sent_scores[key] = np.mean(value, axis=0) return sent_scores diff --git a/ch07/.gitignore b/ch07/.gitignore index d21bb3d1..e33609d2 100644 --- a/ch07/.gitignore +++ b/ch07/.gitignore @@ -1 +1 @@ -.formula_*/ +*.png diff --git a/ch07/README.rst b/ch07/README.rst new file mode 100644 index 00000000..12a7b051 --- /dev/null +++ b/ch07/README.rst @@ -0,0 +1,42 @@ +========= +Chapter 7 +========= + +Support code for *Chapter 7: Regression* + + +Boston data analysis +-------------------- + +This dataset is shipped with sklearn. Thus, no extra download is required. + + +boston1.py + Fit a linear regression model to the Boston house price data +boston1numpy.py + Version of above script using numpy operations for linear regression +boston_cv_penalized.py + Test different penalized (and OLS) regression schemes on the Boston dataset +figure1_2.py + Show the regression line for Boston data +figure3.py + Show the regression line for Boston data with OLS and Lasso +figure4.py + Scatter plot of predicted-vs-actual for multidimensional regression + +10K data analysis +----------------- + +lr10k.py + Linear regression on 10K dataset, evaluation by cross-validation +predict10k_en.py + Elastic nets (including with inner cross-validation for parameter + settings). Produces scatter plot. + + +MovieLens data analysis +----------------------- + +In this chapter, we only consider a very simple approach, which is implemented +in the ``usermodel.py`` script. 
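As the README above notes, the Boston data ships with scikit-learn. A short sketch of what that provides (standard scikit-learn calls, not code from this patch); column 5 is the RM feature that the figure scripts below plot:

from sklearn.datasets import load_boston

boston = load_boston()
print(boston.data.shape, boston.target.shape)  # (506, 13) features, (506,) house prices
print(boston.feature_names[5])                 # 'RM': average number of rooms per dwelling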
+ diff --git a/ch07/boston1.py b/ch07/boston1.py index 0074f927..d0b30447 100644 --- a/ch07/boston1.py +++ b/ch07/boston1.py @@ -7,25 +7,33 @@ # This script shows an example of simple (ordinary) linear regression +# The first edition of the book NumPy functions only for this operation. See +# the file boston1numpy.py for that version. + import numpy as np from sklearn.datasets import load_boston -import pylab as plt +from sklearn.linear_model import LinearRegression +from matplotlib import pyplot as plt boston = load_boston() -x = np.array([np.concatenate((v, [1])) for v in boston.data]) +x = boston.data y = boston.target -# np.linal.lstsq implements least-squares linear regression -s, total_error, _, _ = np.linalg.lstsq(x, y) +# Fitting a model is trivial: call the ``fit`` method in LinearRegression: +lr = LinearRegression() +lr.fit(x, y) + +# The instance member `residues_` contains the sum of the squared residues +rmse = np.sqrt(lr.residues_/len(x)) +print('RMSE: {}'.format(rmse)) -rmse = np.sqrt(total_error[0] / len(x)) -print('Residual: {}'.format(rmse)) +fig, ax = plt.subplots() +# Plot a diagonal (for reference): +ax.plot([0, 50], [0, 50], '-', color=(.9,.3,.3), lw=4) # Plot the prediction versus real: -plt.plot(np.dot(x, s), boston.target, 'ro') +ax.scatter(lr.predict(x), boston.target) -# Plot a diagonal (for reference): -plt.plot([0, 50], [0, 50], 'g-') -plt.xlabel('predicted') -plt.ylabel('real') -plt.show() +ax.set_xlabel('predicted') +ax.set_ylabel('real') +fig.savefig('Figure_07_08.png') diff --git a/ch07/boston1numpy.py b/ch07/boston1numpy.py new file mode 100644 index 00000000..0074f927 --- /dev/null +++ b/ch07/boston1numpy.py @@ -0,0 +1,31 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +# This script shows an example of simple (ordinary) linear regression + +import numpy as np +from sklearn.datasets import load_boston +import pylab as plt + +boston = load_boston() +x = np.array([np.concatenate((v, [1])) for v in boston.data]) +y = boston.target + +# np.linal.lstsq implements least-squares linear regression +s, total_error, _, _ = np.linalg.lstsq(x, y) + +rmse = np.sqrt(total_error[0] / len(x)) +print('Residual: {}'.format(rmse)) + +# Plot the prediction versus real: +plt.plot(np.dot(x, s), boston.target, 'ro') + +# Plot a diagonal (for reference): +plt.plot([0, 50], [0, 50], 'g-') +plt.xlabel('predicted') +plt.ylabel('real') +plt.show() diff --git a/ch07/boston_cv10_penalized.py b/ch07/boston_cv10_penalized.py deleted file mode 100644 index c9f28636..00000000 --- a/ch07/boston_cv10_penalized.py +++ /dev/null @@ -1,50 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -# This script fits several forms of penalized regression - -from __future__ import print_function -from sklearn.cross_validation import KFold -from sklearn.linear_model import ElasticNet, Lasso, Ridge -from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV -import numpy as np -from sklearn.datasets import load_boston -boston = load_boston() -x = np.array([np.concatenate((v, [1])) for v in boston.data]) -y = boston.target - -for name, met in [ - ('elastic-net(.5)', ElasticNet(fit_intercept=True, alpha=0.5)), - ('lasso(.5)', 
Lasso(fit_intercept=True, alpha=0.5)), - ('ridge(.5)', Ridge(fit_intercept=True, alpha=0.5)), -]: - # Fit on the whole data: - met.fit(x, y) - - # Predict on the whole data: - p = np.array([met.predict(xi) for xi in x]) - - e = p - y - # np.dot(e, e) == sum(ei**2 for ei in e) but faster - total_error = np.dot(e, e) - rmse_train = np.sqrt(total_error / len(p)) - - # Now, we use 10 fold cross-validation to estimate generalization error - kf = KFold(len(x), n_folds=10) - err = 0 - for train, test in kf: - met.fit(x[train], y[train]) - p = np.array([met.predict(xi) for xi in x[test]]) - e = p - y[test] - err += np.dot(e, e) - - rmse_10cv = np.sqrt(err / len(x)) - print('Method: {}'.format(name)) - print('RMSE on training: {}'.format(rmse_train)) - print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) - print() - print() diff --git a/ch07/boston_cv_penalized.py b/ch07/boston_cv_penalized.py new file mode 100644 index 00000000..c894c4fa --- /dev/null +++ b/ch07/boston_cv_penalized.py @@ -0,0 +1,46 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +# This script fits several forms of penalized regression + +from __future__ import print_function +import numpy as np +from sklearn.cross_validation import KFold +from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge +from sklearn.metrics import r2_score +from sklearn.datasets import load_boston +boston = load_boston() +x = boston.data +y = boston.target + +for name, met in [ + ('linear regression', LinearRegression()), + ('lasso()', Lasso()), + ('elastic-net(.5)', ElasticNet(alpha=0.5)), + ('lasso(.5)', Lasso(alpha=0.5)), + ('ridge(.5)', Ridge(alpha=0.5)), +]: + # Fit on the whole data: + met.fit(x, y) + + # Predict on the whole data: + p = met.predict(x) + r2_train = r2_score(y, p) + + # Now, we use 10 fold cross-validation to estimate generalization error + kf = KFold(len(x), n_folds=5) + p = np.zeros_like(y) + for train, test in kf: + met.fit(x[train], y[train]) + p[test] = met.predict(x[test]) + + r2_cv = r2_score(y, p) + print('Method: {}'.format(name)) + print('R2 on training: {}'.format(r2_train)) + print('R2 on 5-fold CV: {}'.format(r2_cv)) + print() + print() diff --git a/ch07/cv10_lr.py b/ch07/cv10_lr.py deleted file mode 100644 index 2ae52e02..00000000 --- a/ch07/cv10_lr.py +++ /dev/null @@ -1,37 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from sklearn.cross_validation import KFold -from sklearn.linear_model import LinearRegression, ElasticNet -import numpy as np -from sklearn.datasets import load_boston -boston = load_boston() -x = np.array([np.concatenate((v, [1])) for v in boston.data]) -y = boston.target -FIT_EN = False - -if FIT_EN: - model = ElasticNet(fit_intercept=True, alpha=0.5) -else: - model = LinearRegression(fit_intercept=True) -model.fit(x, y) -p = np.array([model.predict(xi) for xi in x]) -e = p - y -total_error = np.dot(e, e) -rmse_train = np.sqrt(total_error / len(p)) - -kf = KFold(len(x), n_folds=10) -err = 0 -for train, test in kf: - model.fit(x[train], y[train]) - p = np.array([model.predict(xi) for xi in x[test]]) - e = p - y[test] - err += np.dot(e, e) - -rmse_10cv = np.sqrt(err / len(x)) -print('RMSE on training: {}'.format(rmse_train)) 
-print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) diff --git a/ch07/data/.gitignore b/ch07/data/.gitignore new file mode 100644 index 00000000..3286ba0f --- /dev/null +++ b/ch07/data/.gitignore @@ -0,0 +1 @@ +E2006.train diff --git a/ch07/data/download.sh b/ch07/data/download.sh old mode 100644 new mode 100755 index 6f938e06..74753364 --- a/ch07/data/download.sh +++ b/ch07/data/download.sh @@ -1,5 +1,3 @@ #!/usr/bin/env bash -wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 +curl -O http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 bunzip2 E2006.train.bz2 -wget http://www.grouplens.org/system/files/ml-100k.zip -unzip ml-100k.zip diff --git a/ch07/figure1.py b/ch07/figure1.py deleted file mode 100644 index a9c23af7..00000000 --- a/ch07/figure1.py +++ /dev/null @@ -1,30 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.datasets import load_boston -import pylab as plt -from mpltools import style -style.use('ggplot') - -boston = load_boston() -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("RM") -plt.ylabel("House Price") - - -x = boston.data[:, 5] -x = np.array([[v] for v in x]) -y = boston.target - -slope, res, _, _ = np.linalg.lstsq(x, y) -plt.plot([0, boston.data[:, 5].max() + 1], - [0, slope * (boston.data[:, 5].max() + 1)], '-', lw=4) -plt.savefig('Figure1.png', dpi=150) - -rmse = np.sqrt(res[0] / len(x)) -print('Residual: {}'.format(rmse)) diff --git a/ch07/figure1_2.py b/ch07/figure1_2.py new file mode 100644 index 00000000..3f11a0c7 --- /dev/null +++ b/ch07/figure1_2.py @@ -0,0 +1,63 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +from sklearn.datasets import load_boston +from sklearn.linear_model import LinearRegression +from sklearn.metrics import mean_squared_error, r2_score +from matplotlib import pyplot as plt + +boston = load_boston() + +# Index number five in the number of rooms +fig,ax = plt.subplots() +ax.scatter(boston.data[:, 5], boston.target) +ax.set_xlabel("Average number of rooms (RM)") +ax.set_ylabel("House Price") + +x = boston.data[:, 5] +# fit (used below) takes a two-dimensional array as input. 
We use np.atleast_2d +# to convert from one to two dimensional, then transpose to make sure that the +# format matches: +x = np.transpose(np.atleast_2d(x)) + +y = boston.target + +lr = LinearRegression(fit_intercept=False) +lr.fit(x, y) + +ax.plot([0, boston.data[:, 5].max() + 1], + [0, lr.predict(boston.data[:, 5].max() + 1)], '-', lw=4) +fig.savefig('Figure1.png') + +mse = mean_squared_error(y, lr.predict(x)) +rmse = np.sqrt(mse) +print('RMSE (no intercept): {}'.format(rmse)) + +# Repeat, but fitting an intercept this time: +lr = LinearRegression(fit_intercept=True) + +lr.fit(x, y) + +fig,ax = plt.subplots() +ax.set_xlabel("Average number of rooms (RM)") +ax.set_ylabel("House Price") +ax.scatter(boston.data[:, 5], boston.target) +xmin = x.min() +xmax = x.max() +ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]) , '-', lw=4) +fig.savefig('Figure2.png') + +mse = mean_squared_error(y, lr.predict(x)) +print("Mean squared error (of training data): {:.3}".format(mse)) + +rmse = np.sqrt(mse) +print("Root mean squared error (of training data): {:.3}".format(rmse)) + +cod = r2_score(y, lr.predict(x)) +print('COD (on training data): {:.2}'.format(cod)) + diff --git a/ch07/figure2.py b/ch07/figure2.py deleted file mode 100644 index c5977207..00000000 --- a/ch07/figure2.py +++ /dev/null @@ -1,31 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.datasets import load_boston -import pylab as plt -from mpltools import style -style.use('ggplot') - -boston = load_boston() -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("RM") -plt.ylabel("House Price") - - -x = boston.data[:, 5] -xmin = x.min() -xmax = x.max() -x = np.array([[v, 1] for v in x]) -y = boston.target - -(slope, bias), res, _, _ = np.linalg.lstsq(x, y) -plt.plot([xmin, xmax], [slope * xmin + bias, slope * xmax + bias], '-', lw=4) -plt.savefig('Figure2.png', dpi=150) - -rmse = np.sqrt(res[0] / len(x)) -print('Residual: {}'.format(rmse)) diff --git a/ch07/figure3.py b/ch07/figure3.py new file mode 100644 index 00000000..7543c1ec --- /dev/null +++ b/ch07/figure3.py @@ -0,0 +1,33 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from sklearn.linear_model import LinearRegression, Lasso +import numpy as np +from sklearn.datasets import load_boston +from matplotlib import pyplot as plt + +boston = load_boston() +fig, ax = plt.subplots() +ax.scatter(boston.data[:, 5], boston.target) +ax.set_xlabel("Number of rooms (RM)") +ax.set_ylabel("House Price") + + +x = boston.data[:, 5] +xmin = x.min() +xmax = x.max() +x = np.transpose(np.atleast_2d(x)) +y = boston.target + +lr = LinearRegression() +lr.fit(x, y) +ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]), ':', lw=4, label='OLS model') + +las = Lasso() +las.fit(x, y) +ax.plot([xmin, xmax], las.predict([ [xmin], [xmax] ]), '-', lw=4, label='Lasso model') +fig.savefig('Figure3.png') diff --git a/ch07/figure4.py b/ch07/figure4.py index ab71b3a8..a24d48be 100644 --- a/ch07/figure4.py +++ b/ch07/figure4.py @@ -5,31 +5,29 @@ # # It is made available under the MIT License -from sklearn.linear_model import Lasso + +# This script plots prediction-vs-actual on training set for the Boston dataset +# using OLS 
regression import numpy as np +from sklearn.linear_model import LinearRegression from sklearn.datasets import load_boston -import pylab as plt -from mpltools import style -style.use('ggplot') +from sklearn.metrics import mean_squared_error +from matplotlib import pyplot as plt boston = load_boston() -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("RM") -plt.ylabel("House Price") - -x = boston.data[:, 5] -xmin = x.min() -xmax = x.max() -x = np.array([[v, 1] for v in x]) +x = boston.data y = boston.target -(slope, bias), res, _, _ = np.linalg.lstsq(x, y) -plt.plot([xmin, xmax], [slope * xmin + bias, slope * xmax + bias], ':', lw=4) +lr = LinearRegression() +lr.fit(x, y) +p = lr.predict(x) +print("RMSE: {:.2}.".format(np.sqrt(mean_squared_error(y, p)))) +print("R2: {:.2}.".format(lr.score(x, y))) +fig,ax = plt.subplots() +ax.scatter(p, y) +ax.set_xlabel('Predicted price') +ax.set_ylabel('Actual price') +ax.plot([y.min(), y.max()], [y.min(), y.max()], lw=4) -las = Lasso() -las.fit(x, y) -y0 = las.predict([xmin, 1]) -y1 = las.predict([xmax, 1]) -plt.plot([xmin, xmax], [y0, y1], '-', lw=4) -plt.savefig('Figure3.png', dpi=150) +fig.savefig('Figure4.png') diff --git a/ch07/lasso_path_plot.py b/ch07/lasso_path_plot.py new file mode 100644 index 00000000..eab64c26 --- /dev/null +++ b/ch07/lasso_path_plot.py @@ -0,0 +1,29 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from sklearn.linear_model import Lasso +from sklearn.datasets import load_boston +from matplotlib import pyplot as plt +import numpy as np + +boston = load_boston() +x = boston.data +y = boston.target + +las = Lasso(normalize=1) +alphas = np.logspace(-5, 2, 1000) +alphas, coefs, _= las.path(x, y, alphas=alphas) + +fig,ax = plt.subplots() +ax.plot(alphas, coefs.T) +ax.set_xscale('log') +ax.set_xlim(alphas.max(), alphas.min()) +ax.set_xlabel('Lasso coefficient path as a function of alpha') +ax.set_xlabel('Alpha') +ax.set_ylabel('Coefficient weight') +fig.savefig('Figure_LassoPath.png') + diff --git a/ch07/lr10k.py b/ch07/lr10k.py index 71afd412..831706a1 100644 --- a/ch07/lr10k.py +++ b/ch07/lr10k.py @@ -6,31 +6,33 @@ # It is made available under the MIT License import numpy as np +from sklearn.metrics import mean_squared_error, r2_score from sklearn.datasets import load_svmlight_file -from sklearn.linear_model import ElasticNet, LinearRegression -data, target = load_svmlight_file('E2006.train') -lr = LinearRegression(fit_intercept=True) - +from sklearn.linear_model import LinearRegression from sklearn.cross_validation import KFold -kf = KFold(len(target), n_folds=10) -err = 0 -for train, test in kf: - lr.fit(data[train], target[train]) - p = map(lr.predict, data[test]) - p = np.array(p).ravel() - e = p - target[test] - err += np.dot(e, e) -rmse_10cv = np.sqrt(err / len(target)) +# Whether to use Elastic nets (otherwise, ordinary linear regression is used) + +# Load data: +data, target = load_svmlight_file('data/E2006.train') +lr = LinearRegression() + +# Compute error on training data to demonstrate that we can obtain near perfect +# scores: lr.fit(data, target) -p = np.array(map(lr.predict, data)) -p = p.ravel() -e = p - target -total_error = np.dot(e, e) -rmse_train = np.sqrt(total_error / len(p)) +pred = lr.predict(data) + +print('RMSE on training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('R2 on training, 
{:.2}'.format(r2_score(target, pred))) +print('') +pred = np.zeros_like(target) +kf = KFold(len(target), n_folds=5) +for train, test in kf: + lr.fit(data[train], target[train]) + pred[test] = lr.predict(data[test]) -print('RMSE on training: {}'.format(rmse_train)) -print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) +print('RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) diff --git a/ch07/predict10k_en.py b/ch07/predict10k_en.py index 35372e81..a7dd960a 100644 --- a/ch07/predict10k_en.py +++ b/ch07/predict10k_en.py @@ -8,33 +8,66 @@ import numpy as np from sklearn.datasets import load_svmlight_file from sklearn.cross_validation import KFold -from sklearn.linear_model import ElasticNet, LinearRegression +from sklearn.linear_model import ElasticNetCV, ElasticNet +from sklearn.metrics import mean_squared_error, r2_score +from matplotlib import pyplot as plt data, target = load_svmlight_file('data/E2006.train') # Edit the lines below if you want to switch method: -# met = LinearRegression(fit_intercept=True) -met = ElasticNet(fit_intercept=True, alpha=.1) +# from sklearn.linear_model import Lasso +# met = Lasso(alpha=0.1) +met = ElasticNet(alpha=0.1) -kf = KFold(len(target), n_folds=10) -err = 0 +kf = KFold(len(target), n_folds=5) +pred = np.zeros_like(target) for train, test in kf: met.fit(data[train], target[train]) - p = map(met.predict, data[test]) - p = np.array(p).ravel() - e = p - target[test] - err += np.dot(e, e) + pred[test] = met.predict(data[test]) -rmse_10cv = np.sqrt(err / len(target)) +print('[EN 0.1] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN 0.1] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) +print('') +# Construct an ElasticNetCV object (use all available CPUs) +met = ElasticNetCV(n_jobs=-1) + +kf = KFold(len(target), n_folds=5) +pred = np.zeros_like(target) +for train, test in kf: + met.fit(data[train], target[train]) + pred[test] = met.predict(data[test]) + +print('[EN CV] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN CV] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) +print('') met.fit(data, target) -p = np.array(map(met.predict, data)) -p = p.ravel() -e = p - target -total_error = np.dot(e, e) -rmse_train = np.sqrt(total_error / len(p)) +pred = met.predict(data) +print('[EN CV] RMSE on training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN CV] R2 on training, {:.2}'.format(r2_score(target, pred))) + + +# Construct an ElasticNetCV object (use all available CPUs) +met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99]) + +kf = KFold(len(target), n_folds=5) +pred = np.zeros_like(target) +for train, test in kf: + met.fit(data[train], target[train]) + pred[test] = met.predict(data[test]) + + +print('[EN CV l1_ratio] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN CV l1_ratio] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) +print('') + +fig, ax = plt.subplots() +y = target +ax.scatter(y, pred, c='k') +ax.plot([-5,-1], [-5,-1], 'r-', lw=2) +ax.set_xlabel('Actual value') +ax.set_ylabel('Predicted value') +fig.savefig('Figure_10k_scatter_EN_l1_ratio.png') -print('RMSE on training: {}'.format(rmse_train)) -print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) diff --git a/ch07/usermodel.py b/ch07/usermodel.py deleted file mode 100644 index 
cae220fb..00000000 --- a/ch07/usermodel.py +++ /dev/null @@ -1,61 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from scipy import sparse -from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV -from sklearn.cross_validation import KFold - -data = np.array([[int(tok) for tok in line.split('\t')[:3]] - for line in open('data/ml-100k/u.data')]) -ij = data[:, :2] -ij -= 1 # original data is in 1-based system -values = data[:, 2] -reviews = sparse.csc_matrix((values, ij.T)).astype(float) - -reg = ElasticNetCV(fit_intercept=True, alphas=[ - 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) - - -def movie_norm(xc): - '''Normalize per movie''' - xc = xc.copy().toarray() - # x1 is the mean of the positive items - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in range(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - return xc, x1 - - -def learn_for(i): - u = reviews[i] - us = np.delete(np.arange(reviews.shape[0]), i) - ps, = np.where(u.toarray().ravel() > 0) - x = reviews[us][:, ps].T - y = u.data - err = 0 - eb = 0 - kf = KFold(len(y), n_folds=4) - for train, test in kf: - xc, x1 = movie_norm(x[train]) - reg.fit(xc, y[train] - x1) - - xc, x1 = movie_norm(x[test]) - p = np.array([reg.predict(xi) for xi in xc]).ravel() - e = (p + x1) - y[test] - err += np.sum(e * e) - eb += np.sum((y[train].mean() - y[test]) ** 2) - return np.sqrt(err / float(len(y))), np.sqrt(eb / float(len(y))) - -whole_data = [] -for i in range(reviews.shape[0]): - s = learn_for(i) - print(s[0] < s[1]) - print(s) - whole_data.append(s) diff --git a/ch08/README.rst b/ch08/README.rst new file mode 100644 index 00000000..488844e6 --- /dev/null +++ b/ch08/README.rst @@ -0,0 +1,41 @@ +========= +Chapter 8 +========= + +Support code for *Chapter 8: Recommendations*. + +The code refers to the second edition of the book and has been +significantly refactored compared to the first edition. + +Ratings Prediction +------------------ + +Note that since the partition of the data into training and testing is random, +every time you run the code, the results will be different. A minimal usage +sketch follows the list below. + + +load_ml100k.py + Load data & partition into test/train +norm.py + Normalize the data +corrneighbours.py + Neighbour models based on correlation +regression.py + Regression models +stacked.py + Stacked predictions +averaged.py + Averaging of predictions (mentioned in book, but code is not shown there).
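The sketch below shows how these scripts fit together (it is an illustration, not repository code; it assumes the MovieLens data has been fetched with ``data/download.sh`` and that it is run from ``ch08/``)::

    from sklearn import metrics
    import load_ml100k
    import norm

    # A fixed random_state makes the random train/test partition repeatable
    train, test = load_ml100k.get_train_test(random_state=12)
    predicted = norm.predict(train)
    r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
    print('R2 (normalization baseline): {:.1%}'.format(r2))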
+ +Association Rule Mining +----------------------- + +Check the folder ``apriori/`` + +apriori/histogram.py + Print a histogram of how many times each product was bought +apriori/apriori.py + Implementation of Apriori algorithm and association rule building +apriori/apriori_example.py + Example of Apriori algorithm in retail dataset + diff --git a/ch08/all_correlations.py b/ch08/all_correlations.py index df9c30bc..d3817bf0 100644 --- a/ch08/all_correlations.py +++ b/ch08/all_correlations.py @@ -7,10 +7,15 @@ import numpy as np -# This is the version in the book: - - -def all_correlations(bait, target): +def all_correlations(y, X): + from scipy import spatial + y = np.atleast_2d(y) + sp = spatial.distance.cdist(X, y, 'correlation') + # The "correlation distance" is 1 - corr(x,y); so we invert that to obtain the correlation + return 1 - sp.ravel() + +# This is the version in the book (1st Edition): +def all_correlations_book_version(bait, target): ''' corrs = all_correlations(bait, target) @@ -21,9 +26,7 @@ def all_correlations(bait, target): for c in target]) # This is a faster, but harder to read, implementation: - - -def all_correlations(y, X): +def all_correlations_fast_no_scipy(y, X): ''' Cs = all_correlations(y, X) @@ -41,3 +44,5 @@ def all_correlations(y, X): xs_ += 1e-5 # Handle zeros in x return (xy - x_ * y_ * n) / n / xs_ / ys_ + + diff --git a/ch08/apriori/.gitignore b/ch08/apriori/.gitignore index 05697a95..6379e20f 100644 --- a/ch08/apriori/.gitignore +++ b/ch08/apriori/.gitignore @@ -1 +1 @@ -retail.dat +retail.dat.gz diff --git a/ch08/apriori/apriori.py b/ch08/apriori/apriori.py index 1ebb7e54..eeaf0d46 100644 --- a/ch08/apriori/apriori.py +++ b/ch08/apriori/apriori.py @@ -5,9 +5,12 @@ # # It is made available under the MIT License +from collections import namedtuple + + def apriori(dataset, minsupport, maxsize): ''' - freqsets, baskets = apriori(dataset, minsupport, maxsize) + freqsets, support = apriori(dataset, minsupport, maxsize) Parameters ---------- @@ -21,48 +24,70 @@ def apriori(dataset, minsupport, maxsize): Returns ------- freqsets : sequence of sequences - baskets : dictionary + support : dictionary + This associates each itemset (represented as a frozenset) with a float + (the support of that itemset) ''' from collections import defaultdict baskets = defaultdict(list) pointers = defaultdict(list) + for i, ds in enumerate(dataset): for ell in ds: pointers[ell].append(i) baskets[frozenset([ell])].append(i) - pointers = dict([(k, frozenset(v)) for k, v in pointers.items()]) - baskets = dict([(k, frozenset(v)) for k, v in baskets.items()]) - valid = set(list(el)[0] - for el, c in baskets.items() if (len(c) >= minsupport)) - dataset = [[el for el in ds if (el in valid)] for ds in dataset] - dataset = [ds for ds in dataset if len(ds) > 1] - dataset = map(frozenset, dataset) + # Convert pointer items to frozensets to speed up operations later + new_pointers = dict() + for k in pointers: + if len(pointers[k]) >= minsupport: + new_pointers[k] = frozenset(pointers[k]) + pointers = new_pointers + for k in baskets: + baskets[k] = frozenset(baskets[k]) + + # Valid are all elements whose support is >= minsupport + valid = set() + for el, c in baskets.items(): + if len(c) >= minsupport: + valid.update(el) + + # Itemsets at first iteration are simply all singleton with valid elements: itemsets = [frozenset([v]) for v in valid] freqsets = [] for i in range(maxsize - 1): - print(len(itemsets)) + print("At iteration {}, number of frequent baskets: {}".format( + i, len(itemsets))) 
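+        # Candidate generation: try to extend each current itemset by one
+        # element; the transactions supporting the extension are the
+        # intersection of those supporting the itemset and those containing
+        # the new element, and the candidate is kept only if that
+        # intersection still reaches minsupport.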
newsets = [] - for i, ell in enumerate(itemsets): - ccounts = baskets[ell] - for v_, pv in pointers.items(): - if v_ not in ell: + for it in itemsets: + ccounts = baskets[it] + + for v, pv in pointers.items(): + if v not in it: csup = (ccounts & pv) if len(csup) >= minsupport: - new = frozenset(ell | set([v_])) + new = frozenset(it | frozenset([v])) if new not in baskets: newsets.append(new) baskets[new] = csup freqsets.extend(itemsets) itemsets = newsets - return freqsets, baskets + if not len(itemsets): + break + support = {} + for k in baskets: + support[k] = float(len(baskets[k])) + return freqsets, support -def association_rules(dataset, freqsets, baskets, minlift): +# A namedtuple to collect all values that may be interesting +AssociationRule = namedtuple('AssociationRule', ['antecendent', 'consequent', 'base', 'py_x', 'lift']) + +def association_rules(dataset, freqsets, support, minlift): ''' - for (antecendent, consequent, base, py_x, lift) in association_rules(dataset, freqsets, baskets, minlift): + for assoc_rule in association_rules(dataset, freqsets, support, minlift): ... This function takes the returns from ``apriori``. @@ -72,9 +97,13 @@ def association_rules(dataset, freqsets, baskets, minlift): dataset : sequence of sequences input dataset freqsets : sequence of sequences - baskets : dictionary + support : dictionary minlift : int minimal lift of yielded rules + + Returns + ------- + assoc_rule : sequence of AssociationRule objects ''' nr_transactions = float(len(dataset)) freqsets = [f for f in freqsets if len(f) > 1] @@ -82,8 +111,9 @@ def association_rules(dataset, freqsets, baskets, minlift): for f in fset: consequent = frozenset([f]) antecendent = fset - consequent - base = len(baskets[consequent]) / nr_transactions - py_x = len(baskets[fset]) / float(len(baskets[antecendent])) + py_x = support[fset] / support[antecendent] + base = support[consequent] / nr_transactions lift = py_x / base if lift > minlift: - yield (antecendent, consequent, base, py_x, lift) + yield AssociationRule(antecendent, consequent, base, py_x, lift) + diff --git a/ch08/apriori/apriori_example.py b/ch08/apriori/apriori_example.py index da77e96c..971ed4d5 100644 --- a/ch08/apriori/apriori_example.py +++ b/ch08/apriori/apriori_example.py @@ -7,10 +7,17 @@ from apriori import apriori, association_rules from gzip import GzipFile + +# Load dataset dataset = [[int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')] -freqsets, baskets = apriori(dataset, 80, maxsize=5) -nr_transactions = float(len(dataset)) -for ant, con, base, pyx, lift in association_rules(dataset, freqsets, baskets, 30): - print('{} | {} | {} ({:%}) | {} | {} | {}' - .format(ant, con, len(baskets[con]), len(baskets[con]) / nr_transactions, len(baskets[ant]), len(baskets[con | ant]), int(lift))) + +freqsets, support = apriori(dataset, 80, maxsize=16) +rules = list(association_rules(dataset, freqsets, support, minlift=30.0)) + +rules.sort(key=(lambda ar: -ar.lift)) +for ar in rules: + print('{} -> {} (lift = {:.4})' + .format(set(ar.antecendent), + set(ar.consequent), + ar.lift)) diff --git a/ch08/apriori/apriori_naive.py b/ch08/apriori/apriori_naive.py index cb50a8e1..03ab9130 100644 --- a/ch08/apriori/apriori_naive.py +++ b/ch08/apriori/apriori_naive.py @@ -5,35 +5,83 @@ # # It is made available under the MIT License -import numpy as np from collections import defaultdict from itertools import chain from gzip import GzipFile -minsupport = 44 +minsupport = 80 dataset = [[int(tok) for tok in 
line.strip().split()] for line in GzipFile('retail.dat.gz')] -dataset = dataset[::20] counts = defaultdict(int) for elem in chain(*dataset): counts[elem] += 1 +# Only elements that have at least minsupport should be considered. valid = set(el for el, c in counts.items() if (c >= minsupport)) + +# Filter the dataset to contain only valid elements +# (This step is not strictly necessary, but will make the rest of the code +# faster as the itemsets will be smaller): dataset = [[el for el in ds if (el in valid)] for ds in dataset] -dataset = [frozenset(ds) for ds in dataset if len(ds) > 1] +# Convert to frozenset for fast processing +dataset = [frozenset(ds) for ds in dataset] itemsets = [frozenset([v]) for v in valid] -allsets = [itemsets] +freqsets = itemsets[:] for i in range(16): - print(len(itemsets)) + print("At iteration {}, number of frequent baskets: {}".format( + i, len(itemsets))) nextsets = [] - for i, ell in enumerate(itemsets): - for v_ in valid: - if v_ not in ell: - c = (ell | set([v_])) - if sum(1 for d in dataset if d.issuperset(c)) > minsupport: + + tested = set() + for it in itemsets: + for v in valid: + if v not in it: + # Create a new candidate set by adding v to it + c = (it | frozenset([v])) + + # Check if we have tested it already: + if c in tested: + continue + tested.add(c) + + # Count support by looping over dataset + # This step is slow. + # Check `apriori.py` for a better implementation. + support_c = sum(1 for d in dataset if d.issuperset(c)) + if support_c > minsupport: nextsets.append(c) - allsets.append(nextsets) + freqsets.extend(nextsets) itemsets = nextsets + if not len(itemsets): + break +print("Finished!") + + +def rules_from_itemset(itemset, dataset, minlift=1.): + nr_transactions = float(len(dataset)) + for item in itemset: + consequent = frozenset([item]) + antecedent = itemset-consequent + base = 0.0 + # acount: antecedent count + acount = 0.0 + + # ccount : consequent count + ccount = 0.0 + for d in dataset: + if item in d: base += 1 + if d.issuperset(itemset): ccount += 1 + if d.issuperset(antecedent): acount += 1 + base /= nr_transactions + p_y_given_x = ccount/acount + lift = p_y_given_x / base + if lift > minlift: + print('Rule {0} -> {1} has lift {2}' + .format(antecedent, consequent,lift)) + +for itemset in freqsets: + if len(itemset) > 1: + rules_from_itemset(itemset, dataset, minlift=4.) 
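As a sanity check on the confidence/lift arithmetic in ``rules_from_itemset`` above, here is a tiny worked example on a made-up five-transaction dataset (illustration only, not repository code)::

    toy = [frozenset(t) for t in ({1, 2}, {1, 2, 3}, {2, 3}, {1, 3}, {2})]
    nr_transactions = float(len(toy))
    antecedent, consequent = frozenset([1]), frozenset([2])
    itemset = antecedent | consequent

    base = sum(1 for d in toy if d.issuperset(consequent)) / nr_transactions  # P({2}) = 4/5
    acount = sum(1 for d in toy if d.issuperset(antecedent))  # 3 baskets contain {1}
    ccount = sum(1 for d in toy if d.issuperset(itemset))     # 2 baskets contain {1, 2}
    p_y_given_x = ccount / float(acount)                      # confidence = 2/3
    lift = p_y_given_x / base                                 # (2/3) / (4/5) ~ 0.83
    print('confidence = {:.2f}, lift = {:.2f}'.format(p_y_given_x, lift))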
diff --git a/ch08/averaged.py b/ch08/averaged.py new file mode 100644 index 00000000..5b19bba7 --- /dev/null +++ b/ch08/averaged.py @@ -0,0 +1,33 @@ +import numpy as np +import load_ml100k +import regression +import corrneighbours +from sklearn import metrics +import norm + +def predict(train): + predicted0 = regression.predict(train) + predicted1 = regression.predict(train.T).T + predicted2 = corrneighbours.predict(train) + predicted3 = corrneighbours.predict(train.T).T + predicted4 = norm.predict(train) + predicted5 = norm.predict(train.T).T + stack = np.array([ + predicted0, + predicted1, + predicted2, + predicted3, + predicted4, + predicted5, + ]) + return stack.mean(0) + + +def main(): + train,test = load_ml100k.get_train_test(random_state=12) + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 averaged: {:.2%}'.format(r2)) + +if __name__ == '__main__': + main() diff --git a/ch08/chapter.py b/ch08/chapter.py new file mode 100644 index 00000000..d039d93f --- /dev/null +++ b/ch08/chapter.py @@ -0,0 +1,208 @@ +import numpy as np # NOT IN BOOK +from matplotlib import pyplot as plt # NOT IN BOOK + +def load(): + import numpy as np + from scipy import sparse + + data = np.loadtxt('data/ml-100k/u.data') + ij = data[:, :2] + ij -= 1 # original data is in 1-based system + values = data[:, 2] + reviews = sparse.csc_matrix((values, ij.T)).astype(float) + return reviews.toarray() +reviews = load() +U,M = np.where(reviews) +import random +test_idxs = np.array(random.sample(range(len(U)), len(U)//10)) + +train = reviews.copy() +train[U[test_idxs], M[test_idxs]] = 0 + +test = np.zeros_like(reviews) +test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]] + +class NormalizePositive(object): + def __init__(self, axis=0): + self.axis = axis + + def fit(self, features, y=None): + if self.axis == 1: + features = features.T + # count features that are greater than zero in axis 0: + binary = (features > 0) + + count0 = binary.sum(axis=0) + + # to avoid division by zero, set zero counts to one: + count0[count0 == 0] = 1. 
+ + # computing the mean is easy: + self.mean = features.sum(axis=0)/count0 + + # only consider differences where binary is True: + diff = (features - self.mean) * binary + diff **= 2 + # regularize the estimate of std by adding 0.1 + self.std = np.sqrt(0.1 + diff.sum(axis=0)/count0) + return self + + + def transform(self, features): + if self.axis == 1: + features = features.T + binary = (features > 0) + features = features - self.mean + features /= self.std + features *= binary + if self.axis == 1: + features = features.T + return features + + def inverse_transform(self, features, copy=True): + if copy: + features = features.copy() + if self.axis == 1: + features = features.T + features *= self.std + features += self.mean + if self.axis == 1: + features = features.T + return features + + def fit_transform(self, features): + return self.fit(features).transform(features) + + +norm = NormalizePositive(axis=1) +binary = (train > 0) +train = norm.fit_transform(train) +# plot just 200x200 area for space reasons +plt.imshow(binary[:200, :200], interpolation='nearest') + +from scipy.spatial import distance +# compute all pair-wise distances: +dists = distance.pdist(binary, 'correlation') +# Convert to square form, so that dists[i,j] +# is distance between binary[i] and binary[j]: +dists = distance.squareform(dists) +neighbors = dists.argsort(axis=1) + +# We are going to fill this matrix with results +filled = train.copy() +for u in range(filled.shape[0]): + # n_u is neighbors of user + n_u = neighbors[u, 1:] + for m in range(filled.shape[1]): + # get relevant reviews in order! + revs = [train[neigh, m] + for neigh in n_u + if binary [neigh, m]] + if len(revs): + # n is the number of reviews for this movie + n = len(revs) + # take half of the reviews plus one into consideration: + n //= 2 + n += 1 + revs = revs[:n] + filled[u,m] = np.mean(revs) + +predicted = norm.inverse_transform(filled) +from sklearn import metrics +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (binary neighbors): {:.1%}'.format(r2)) + +reviews = reviews.T +# use same code as before +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (binary movie neighbors): {:.1%}'.format(r2)) + + +from sklearn.linear_model import ElasticNetCV # NOT IN BOOK + +reg = ElasticNetCV(alphas=[ + 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) +filled = train.copy() +# iterate over all users: +for u in range(train.shape[0]): + curtrain = np.delete(train, u, axis=0) + bu = binary[u] + reg.fit(curtrain[:,bu].T, train[u, bu]) + filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) +predicted = norm.inverse_transform(filled) +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (user regression): {:.1%}'.format(r2)) + + +# SHOPPING BASKET ANALYSIS +# This is the slow version of the code, which will take a long time to +# complete. + + +from collections import defaultdict +from itertools import chain + +# File is downloaded as a compressed file +import gzip +# file format is a line per transaction +# of the form '12 34 342 5...' 
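+# e.g. the line '12 34 342' is parsed below into [12, 34, 342] and then
+# converted to the set {12, 34, 342} (these token values are only examples)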
+dataset = [[int(tok) for tok in line.strip().split()] + for line in gzip.open('data/retail.dat.gz')] +dataset = [set(d) for d in dataset] +# count how often each product was purchased: +counts = defaultdict(int) +for elem in chain(*dataset): + counts[elem] += 1 + +minsupport = 80 +valid = set(k for k,v in counts.items() if (v >= minsupport)) +itemsets = [frozenset([v]) for v in valid] +freqsets = [] +for i in range(16): + nextsets = [] + tested = set() + for it in itemsets: + for v in valid: + if v not in it: + # Create a new candidate set by adding v to it + c = (it | frozenset([v])) + # check If we have tested it already + if c in tested: + continue + tested.add(c) + + # Count support by looping over dataset + # This step is slow. + # Check `apriori.py` for a better implementation. + support_c = sum(1 for d in dataset if d.issuperset(c)) + if support_c > minsupport: + nextsets.append(c) + freqsets.extend(nextsets) + itemsets = nextsets + if not len(itemsets): + break +print("Finished!") + + +minlift = 5.0 +nr_transactions = float(len(dataset)) +for itemset in freqsets: + for item in itemset: + consequent = frozenset([item]) + antecedent = itemset-consequent + base = 0.0 + # acount: antecedent count + acount = 0.0 + + # ccount : consequent count + ccount = 0.0 + for d in dataset: + if item in d: base += 1 + if d.issuperset(itemset): ccount += 1 + if d.issuperset(antecedent): acount += 1 + base /= nr_transactions + p_y_given_x = ccount/acount + lift = p_y_given_x / base + if lift > minlift: + print('Rule {0} -> {1} has lift {2}' + .format(antecedent, consequent,lift)) diff --git a/ch08/corrneighbours.py b/ch08/corrneighbours.py index 6f9b2e4e..eb30e685 100644 --- a/ch08/corrneighbours.py +++ b/ch08/corrneighbours.py @@ -6,50 +6,53 @@ # It is made available under the MIT License from __future__ import print_function -from all_correlations import all_correlations import numpy as np -from scipy import sparse -from load_ml100k import load -reviews = load() - - -def estimate_user(user, rest): - bu = user > 0 - br = rest > 0 - ws = all_correlations(bu, br) - selected = ws.argsort()[-100:] - estimates = rest[selected].mean(0) - estimates /= (.1 + br[selected].mean(0)) - return estimates - - -def train_test(user, rest): - estimates = estimate_user(user, rest) - bu = user > 0 - br = rest > 0 - err = estimates[bu] - user[bu] - null = rest.mean(0) - null /= (.1 + br.mean(0)) - nerr = null[bu] - user[bu] - return np.dot(err, err), np.dot(nerr, nerr) - - -def cross_validate_all(): - err = [] - for i in xrange(reviews.shape[0]): - err.append( - train_test(reviews[i], np.delete(reviews, i, 0)) - ) - revs = (reviews > 0).sum(1) - err = np.array(err) - rmse = np.sqrt(err / revs[:, None]) - print(np.mean(rmse, 0)) - print(np.mean(rmse[revs > 60], 0)) - - -def all_estimates(reviews): - reviews = reviews.toarray() - estimates = np.zeros_like(reviews) - for i in xrange(reviews.shape[0]): - estimates[i] = estimate_user(reviews[i], np.delete(reviews, i, 0)) - return estimates +from load_ml100k import get_train_test +from scipy.spatial import distance +from sklearn import metrics + +from norm import NormalizePositive + +def predict(otrain): + binary = (otrain > 0) + norm = NormalizePositive(axis=1) + train = norm.fit_transform(otrain) + + dists = distance.pdist(binary, 'correlation') + dists = distance.squareform(dists) + + neighbors = dists.argsort(axis=1) + filled = train.copy() + for u in range(filled.shape[0]): + # n_u are the neighbors of user + n_u = neighbors[u, 1:] + for m in range(filled.shape[1]): + # 
This code could be faster using numpy indexing trickery as the + # cost of readibility (this is left as an exercise to the reader): + revs = [train[neigh, m] + for neigh in n_u + if binary[neigh, m]] + if len(revs): + n = len(revs) + n //= 2 + n += 1 + revs = revs[:n] + filled[u,m] = np.mean(revs) + + return norm.inverse_transform(filled) + +def main(transpose_inputs=False): + train, test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 score (binary {} neighbours): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) + +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/data/.gitignore b/ch08/data/.gitignore new file mode 100644 index 00000000..c391d625 --- /dev/null +++ b/ch08/data/.gitignore @@ -0,0 +1,3 @@ +retail.dat.gz +ml-100k.zip +/ml-100k/ diff --git a/ch08/data/download.sh b/ch08/data/download.sh new file mode 100755 index 00000000..b671171d --- /dev/null +++ b/ch08/data/download.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +curl -L -O http://files.grouplens.org/papers/ml-100k.zip +unzip ml-100k.zip +curl -L -O http://fimi.ua.ac.be/data/retail.dat.gz diff --git a/ch08/figure3.py b/ch08/figure3.py index da699902..daafc300 100644 --- a/ch08/figure3.py +++ b/ch08/figure3.py @@ -8,9 +8,8 @@ from load_ml100k import load from matplotlib import pyplot as plt data = load() -data = data.toarray() plt.gray() plt.imshow(data[:200, :200], interpolation='nearest') plt.xlabel('User ID') plt.ylabel('Film ID') -plt.savefig('../1400_08_03+.png') +plt.savefig('Figure_08_03_DataMatrix.png') diff --git a/ch08/load_ml100k.py b/ch08/load_ml100k.py index e636a59c..7096e75c 100644 --- a/ch08/load_ml100k.py +++ b/ch08/load_ml100k.py @@ -5,15 +5,55 @@ # # It is made available under the MIT License -import numpy as np -from scipy import sparse +def load(): + '''Load ML-100k data + Returns the review matrix as a numpy array''' + import numpy as np + from scipy import sparse + from os import path -def load(): - data = np.array([[int(t) for t in line.split('\t')[:3]] - for line in open('data/ml-100k/u.data')]) + if not path.exists('data/ml-100k/u.data'): + raise IOError("Data has not been downloaded.\nTry the following:\n\n\tcd data\n\t./download.sh") + + # The input is in the form of a CSC sparse matrix, so it's a natural fit to + # load the data, but we then convert to a more traditional array before + # returning + data = np.loadtxt('data/ml-100k/u.data') ij = data[:, :2] ij -= 1 # original data is in 1-based system values = data[:, 2] reviews = sparse.csc_matrix((values, ij.T)).astype(float) - return reviews + return reviews.toarray() + +def get_train_test(reviews=None, random_state=None): + '''Split data into training & testing + + Parameters + ---------- + reviews : ndarray, optional + Input data + + Returns + ------- + train : ndarray + training data + test : ndarray + testing data + ''' + import numpy as np + import random + r = random.Random(random_state) + + if reviews is None: + reviews = load() + U,M = np.where(reviews) + test_idxs = np.array(r.sample(range(len(U)), len(U)//10)) + train = reviews.copy() + train[U[test_idxs], M[test_idxs]] = 0 + + test = np.zeros_like(reviews) + test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]] + + return train, test + diff --git a/ch08/norm.py b/ch08/norm.py new file mode 100644 index 00000000..2925bbca --- /dev/null +++ b/ch08/norm.py @@ -0,0 +1,75 @@ 
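+# NormalizePositive treats zero entries as "missing" ratings: the mean and std
+# along the chosen axis are estimated from the positive entries only;
+# transform() re-zeroes the missing entries and inverse_transform() undoes the
+# scaling so that predictions come back on the original rating scale.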
+import numpy as np + +class NormalizePositive(object): + + def __init__(self, axis=0): + self.axis = axis + + def fit(self, features, y=None): + # count features that are greater than zero in axis `self.axis`: + if self.axis == 1: + features = features.T + binary = (features > 0) + count = binary.sum(axis=0) + + # to avoid division by zero, set zero counts to one: + count[count == 0] = 1. + + self.mean = features.sum(axis=0)/count + + # Compute variance by average squared difference to the mean, but only + # consider differences where binary is True (i.e., where there was a + # true rating): + diff = (features - self.mean) * binary + diff **= 2 + # regularize the estimate of std by adding 0.1 + self.std = np.sqrt(0.1 + diff.sum(axis=0)/count) + return self + + def transform(self, features): + if self.axis == 1: + features = features.T + binary = (features > 0) + features = features - self.mean + features /= self.std + features *= binary + if self.axis == 1: + features = features.T + return features + + def inverse_transform(self, features, copy=True): + if copy: + features = features.copy() + if self.axis == 1: + features = features.T + features *= self.std + features += self.mean + if self.axis == 1: + features = features.T + return features + + def fit_transform(self, features): + return self.fit(features).transform(features) + + +def predict(train): + norm = NormalizePositive() + train = norm.fit_transform(train) + return norm.inverse_transform(train * 0.) + + +def main(transpose_inputs=False): + from load_ml100k import get_train_test + from sklearn import metrics + train,test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 score ({} normalization): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/regression.py b/ch08/regression.py new file mode 100644 index 00000000..693e99a4 --- /dev/null +++ b/ch08/regression.py @@ -0,0 +1,50 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +from sklearn.linear_model import ElasticNetCV +from norm import NormalizePositive +from sklearn import metrics + + +def predict(train): + binary = (train > 0) + reg = ElasticNetCV(fit_intercept=True, alphas=[ + 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) + norm = NormalizePositive() + train = norm.fit_transform(train) + + filled = train.copy() + # iterate over all users + for u in range(train.shape[0]): + # remove the current user for training + curtrain = np.delete(train, u, axis=0) + bu = binary[u] + if np.sum(bu) > 5: + reg.fit(curtrain[:,bu].T, train[u, bu]) + + # Fill the values that were not there already + filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) + return norm.inverse_transform(filled) + + +def main(transpose_inputs=False): + from load_ml100k import get_train_test + train,test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + filled = predict(train) + r2 = metrics.r2_score(test[test > 0], filled[test > 0]) + + print('R2 score ({} regression): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) + +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/similar_movie.py 
b/ch08/similar_movie.py index bbb38ee6..cd49a162 100644 --- a/ch08/similar_movie.py +++ b/ch08/similar_movie.py @@ -7,11 +7,26 @@ from __future__ import print_function import numpy as np -from load_ml100k import load -from all_correlations import all_correlations def nn_movie(ureviews, reviews, uid, mid, k=1): + '''Movie neighbor based classifier + + Parameters + ---------- + ureviews : ndarray + reviews : ndarray + uid : int + index of user + mid : int + index of movie + k : int + index of neighbor to return + + Returns + ------- + pred : float + ''' X = ureviews y = ureviews[mid].copy() y -= y.mean() @@ -33,23 +48,27 @@ def nn_movie(ureviews, reviews, uid, mid, k=1): def all_estimates(reviews, k=1): + '''Estimate all review ratings + ''' reviews = reviews.astype(float) k -= 1 nusers, nmovies = reviews.shape estimates = np.zeros_like(reviews) for u in range(nusers): - ureviews = np.delete(reviews, u, 0) + ureviews = np.delete(reviews, u, axis=0) ureviews -= ureviews.mean(0) - ureviews /= (ureviews.std(0) + 1e-4) + ureviews /= (ureviews.std(0) + 1e-5) ureviews = ureviews.T.copy() for m in np.where(reviews[u] > 0)[0]: estimates[u, m] = nn_movie(ureviews, reviews, u, m, k) return estimates if __name__ == '__main__': - reviews = load().toarray() + from load_ml100k import load + reviews = load() estimates = all_estimates(reviews) error = (estimates - reviews) error **= 2 error = error[reviews > 0] - print(np.sqrt(error).mean()) + rmse = np.sqrt(error.mean()) + print("RMSE is {0}.".format(rmse)) diff --git a/ch08/stacked.py b/ch08/stacked.py index 1d6e9fc7..8fa6344e 100644 --- a/ch08/stacked.py +++ b/ch08/stacked.py @@ -1,43 +1,47 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from __future__ import print_function -from sklearn.linear_model import LinearRegression -from load_ml100k import load import numpy as np -import similar_movie -import usermodel +import load_ml100k +import regression import corrneighbours +from sklearn import linear_model, metrics +import norm + +def predict(train): + tr_train,tr_test = load_ml100k.get_train_test(train, random_state=34) + tr_predicted0 = regression.predict(tr_train) + tr_predicted1 = regression.predict(tr_train.T).T + tr_predicted2 = corrneighbours.predict(tr_train) + tr_predicted3 = corrneighbours.predict(tr_train.T).T + tr_predicted4 = norm.predict(tr_train) + tr_predicted5 = norm.predict(tr_train.T).T + stack_tr = np.array([ + tr_predicted0[tr_test > 0], + tr_predicted1[tr_test > 0], + tr_predicted2[tr_test > 0], + tr_predicted3[tr_test > 0], + tr_predicted4[tr_test > 0], + tr_predicted5[tr_test > 0], + ]).T + + lr = linear_model.LinearRegression() + lr.fit(stack_tr, tr_test[tr_test > 0]) -reviews = load() -reg = LinearRegression() -es = np.array([ - usermodel.all_estimates(reviews), - corrneighbours.all_estimates(reviews), - similar_movies.all_estimates(reviews), -]) + stack_te = np.array([ + tr_predicted0.ravel(), + tr_predicted1.ravel(), + tr_predicted2.ravel(), + tr_predicted3.ravel(), + tr_predicted4.ravel(), + tr_predicted5.ravel(), + ]).T -reviews = reviews.toarray() + return lr.predict(stack_te).reshape(train.shape) -total_error = 0.0 -coefficients = [] -for u in xrange(reviews.shape[0]): - es0 = np.delete(es, u, 1) - r0 = np.delete(reviews, u, 0) - X, Y = np.where(r0 > 0) - X = es[:, X, Y] - y = r0[r0 > 0] - reg.fit(X.T, y) - coefficients.append(reg.coef_) +def 
main(): + train,test = load_ml100k.get_train_test(random_state=12) + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 stacked: {:.2%}'.format(r2)) - r0 = reviews[u] - X = np.where(r0 > 0) - p0 = reg.predict(es[:, u, X].squeeze().T) - err0 = r0[r0 > 0] - p0 - total_error += np.dot(err0, err0) - print(u) +if __name__ == '__main__': + main() diff --git a/ch08/stacked5.py b/ch08/stacked5.py deleted file mode 100644 index 12f3eacd..00000000 --- a/ch08/stacked5.py +++ /dev/null @@ -1,43 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from sklearn.linear_model import LinearRegression -from load_ml100k import load -import numpy as np -import similar_movie -import usermodel -import corrneighbours - -sreviews = load() -reviews = sreviews.toarray() -reg = LinearRegression() -es = np.array([ - usermodel.all_estimates(sreviews), - similar_movie.all_estimates(reviews, k=1), - similar_movie.all_estimates(reviews, k=2), - similar_movie.all_estimates(reviews, k=3), - similar_movie.all_estimates(reviews, k=4), - similar_movie.all_estimates(reviews, k=5), -]) - -total_error = 0.0 -coefficients = [] -for u in xrange(reviews.shape[0]): - es0 = np.delete(es, u, 1) - r0 = np.delete(reviews, u, 0) - X, Y = np.where(r0 > 0) - X = es[:, X, Y] - y = r0[r0 > 0] - reg.fit(X.T, y) - coefficients.append(reg.coef_) - - r0 = reviews[u] - X = np.where(r0 > 0) - p0 = reg.predict(es[:, u, X].squeeze().T) - err0 = r0[r0 > 0] - p0 - total_error += np.dot(err0, err0) -coefficients = np.array(coefficients) diff --git a/ch08/usermodel.py b/ch08/usermodel.py deleted file mode 100644 index 40a90a2a..00000000 --- a/ch08/usermodel.py +++ /dev/null @@ -1,52 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV -from sklearn.cross_validation import KFold -from load_ml100k import load - - -def learn_for(reviews, i): - reg = ElasticNetCV(fit_intercept=True, alphas=[ - 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) - u = reviews[i] - us = range(reviews.shape[0]) - del us[i] - ps, = np.where(u.toarray().ravel() > 0) - x = reviews[us][:, ps].T - y = u.data - kf = KFold(len(y), n_folds=4) - predictions = np.zeros(len(ps)) - for train, test in kf: - xc = x[train].copy().toarray() - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in xrange(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - - reg.fit(xc, y[train] - x1) - - xc = x[test].copy().toarray() - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in xrange(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - - p = np.array(map(reg.predict, xc)).ravel() - predictions[test] = p - return predictions - - -def all_estimates(reviews): - whole_data = [] - for i in xrange(reviews.shape[0]): - s = learn_for(reviews, i) - whole_data.append(s) - return np.array(whole_data) diff --git a/ch09/01_fft_based_classifier.py b/ch09/01_fft_based_classifier.py index 2a3f1bd8..84249724 100644 --- a/ch09/01_fft_based_classifier.py +++ b/ch09/01_fft_based_classifier.py @@ -14,7 +14,7 @@ from sklearn.metrics import confusion_matrix -from utils import plot_pr, 
plot_roc, plot_confusion_matrix, GENRE_LIST, TEST_DIR +from utils import plot_pr, plot_roc, plot_confusion_matrix, GENRE_LIST from fft import read_fft @@ -81,7 +81,7 @@ def train_model(clf_factory, X, Y, name, plot=False): if plot: for label in labels: - print("Plotting %s"%genre_list[label]) + print("Plotting %s" % genre_list[label]) scores_to_sort = roc_scores[label] median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] diff --git a/ch09/02_ceps_based_classifier.py b/ch09/02_ceps_based_classifier.py index 3b9e7237..2791267f 100644 --- a/ch09/02_ceps_based_classifier.py +++ b/ch09/02_ceps_based_classifier.py @@ -14,7 +14,7 @@ from sklearn.metrics import confusion_matrix -from utils import plot_roc, plot_confusion_matrix, GENRE_LIST, TEST_DIR +from utils import plot_roc, plot_confusion_matrix, GENRE_LIST from ceps import read_ceps @@ -82,7 +82,7 @@ def train_model(clf_factory, X, Y, name, plot=False): if plot: for label in labels: - print("Plotting %s"%genre_list[label]) + print("Plotting %s" % genre_list[label]) scores_to_sort = roc_scores[label] median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] diff --git a/ch09/ceps.py b/ch09/ceps.py index 239b0885..30a6f1cf 100644 --- a/ch09/ceps.py +++ b/ch09/ceps.py @@ -24,7 +24,7 @@ def write_ceps(ceps, fn): base_fn, ext = os.path.splitext(fn) data_fn = base_fn + ".ceps" np.save(data_fn, ceps) - print "Written", data_fn + print("Written %s" % data_fn) def create_ceps(fn): @@ -51,6 +51,6 @@ def read_ceps(genre_list, base_dir=GENRE_DIR): if __name__ == "__main__": os.chdir(GENRE_DIR) glob_wav = os.path.join(sys.argv[1], "*.wav") - print glob_wav + print(glob_wav) for fn in glob.glob(glob_wav): create_ceps(fn) diff --git a/ch09/fft.py b/ch09/fft.py index 5ff5a96f..0754c3ad 100644 --- a/ch09/fft.py +++ b/ch09/fft.py @@ -27,7 +27,7 @@ def write_fft(fft_features, fn): data_fn = base_fn + ".fft" np.save(data_fn, fft_features) - print "Written", data_fn + print("Written %s" % data_fn) def create_fft(fn): diff --git a/ch09/utils.py b/ch09/utils.py index 3fea02f7..d74f2169 100644 --- a/ch09/utils.py +++ b/ch09/utils.py @@ -29,11 +29,10 @@ TEST_DIR = None if GENRE_DIR is None or TEST_DIR is None: - print("Please set GENRE_DIR and TEST_DIR in utils.py") + print("Please set GENRE_DIR and TEST_DIR in utils.py") sys.exit(1) - def plot_confusion_matrix(cm, genre_list, name, title): pylab.clf() pylab.matshow(cm, fignum=False, cmap='Blues', vmin=0, vmax=1.0) @@ -93,7 +92,7 @@ def show_most_informative_features(vectorizer, clf, n=20): c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names())) top = zip(c_f[:n], c_f[:-(n + 1):-1]) for (c1, f1), (c2, f2) in top: - print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2) + print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2)) def plot_log(): diff --git a/ch10/.gitignore b/ch10/.gitignore new file mode 100644 index 00000000..2f266195 --- /dev/null +++ b/ch10/.gitignore @@ -0,0 +1 @@ +AnimTransDistr/ diff --git a/ch10/README.rst b/ch10/README.rst new file mode 100644 index 00000000..91e32756 --- /dev/null +++ b/ch10/README.rst @@ -0,0 +1,37 @@ +========== +Chapter 10 +========== + +Support code for *Chapter 10: Pattern Recognition & Computer Vision* + +Data +---- + +This chapter relies on a publicly available dataset (which can be downloaded +using the ``download.sh`` script inside the ``data/`` directory) as well as the +dataset that is packaged with the repository at ``../SimpleImageDataset/``. + +Running ``download.sh`` will retrieve the other dataset into a directory +``AnimTransDistr/``.
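As a quick orientation (a sketch that assumes ``mahotas`` is installed and that the ``texture``/``chist`` helpers from ``features.py``, described below, are importable), the per-image representation used throughout this chapter concatenates Haralick texture descriptors with a colour histogram::

    import numpy as np
    import mahotas as mh
    from features import chist, texture

    im = mh.imread('../SimpleImageDataset/building00.jpg')
    grey = mh.colors.rgb2grey(im, dtype=np.uint8)
    # ravelled Haralick texture features plus a 64-bin colour histogram
    fs = np.concatenate([texture(grey), chist(im)])
    print('feature vector of length {}'.format(len(fs)))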
+ +Scripts +------- + +chapter.py + Code as written in the book. +thresholded_figure.py + Computes the thresholded figures, including after Gaussian blurring +lena-ring.py + Lena image with center in focus and blurred edges +figure10.py + Just paste two images next to each others +features.py + Contains the color histogram function from the book as well as a simple + wrapper around ``mahotas.texture.haralick`` +simple_classification.py + Classify SimpleImageDataset with texture features + color histogram features +large_classification.py + Classify ``AnimTransDistr`` with both texture and SURF features. +neighbors.py + Computes image neighbors as well as the neighbor figure from the book. + diff --git a/ch10/chapter.py b/ch10/chapter.py new file mode 100644 index 00000000..233720bb --- /dev/null +++ b/ch10/chapter.py @@ -0,0 +1,186 @@ +import numpy as np +import mahotas as mh +image = mh.imread('scene00.jpg') +from matplotlib import pyplot as plt +plt.imshow(image) +plt.show() +image = mh.colors.rgb2grey(image, dtype=np.uint8) +plt.imshow(image) # Display the image +plt.gray() +thresh = mh.thresholding.otsu(image) +print('Otsu threshold is {}.'.format(thresh)) +# Otsu threshold is 138. +plt.imshow(image > thresh) + +im16 = mh.gaussian_filter(image,16) +im = mh.demos.load('lenna') + +r,g,b = im.transpose(2,0,1) +r12 = mh.gaussian_filter(r, 12.) +g12 = mh.gaussian_filter(g, 12.) +b12 = mh.gaussian_filter(b, 12.) +im12 = mh.as_rgb(r12,g12,b12) +h, w = r.shape # height and width +Y, X = np.mgrid[:h,:w] +Y = Y-h/2. # center at h/2 +Y = Y / Y.max() # normalize to -1 .. +1 + +X = X-w/2. +X = X / X.max() + +C = np.exp(-2.*(X**2+ Y**2)) + +# Normalize again to 0..1 +C = C - C.min() +C = C / C.ptp() +C = C[:,:,None] # This adds a dummy third dimension to C + +ringed = mh.stretch(im*C + (1-C)*im12) + +haralick_features = mh.features.haralick(image) +haralick_features_mean = np.mean(haralick_features, axis=0) +haralick_features_all = np.ravel(haralick_features) + +from glob import glob +images = glob('../SimpleImageDataset/*.jpg') +features = [] +labels = [] +for im in images: + labels.append(im[:-len('00.jpg')]) + im = mh.imread(im) + im = mh.colors.rgb2gray(im, dtype=np.uint8) + features.append(mh.features.haralick(im).ravel()) + +features = np.array(features) +labels = np.array(labels) + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + + +from sklearn import cross_validation +cv = cross_validation.LeaveOneOut(len(images)) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 81.1% + +def chist(im): + im = im // 64 + r,g,b = im.transpose((2,0,1)) + pixels = 1 * r + 4 * b + 16 * g + hist = np.bincount(pixels.ravel(), minlength=64) + hist = hist.astype(float) + hist = np.log1p(hist) + return hist + +features = [] +for im in images: + im = mh.imread(im) + features.append(chist(im)) + + + +features = [] +for im in images: + imcolor = mh.imread(im) + im = mh.colors.rgb2gray(imcolor, dtype=np.uint8) + features.append(np.concatenate([ + mh.features.haralick(im).ravel(), + chist(imcolor), + ])) + + +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 95.6% + + +features = [] +for im in images: + imcolor = mh.imread(im) + # Ignore everything in the 200 pixels 
close to the borders + imcolor = imcolor[200:-200, 200:-200] + im = mh.colors.rgb2gray(imcolor, dtype=np.uint8) + features.append(np.concatenate([ + mh.features.haralick(im).ravel(), + chist(imcolor), + ])) + +sc = StandardScaler() +features = sc.fit_transform(features) +from scipy.spatial import distance +dists = distance.squareform(distance.pdist(features)) + + +fig, axes = plt.subplots(2, 9) +for ci,i in enumerate(range(0,90,10)): + left = images[i] + dists_left = dists[i] + right = dists_left.argsort() + # right[0] is the same as left[i], so pick the next closest element + right = right[1] + right = images[right] + left = mh.imread(left) + right = mh.imread(right) + axes[0, ci].imshow(left) + axes[1, ci].imshow(right) + + + +from sklearn.grid_search import GridSearchCV +C_range = 10.0 ** np.arange(-4, 3) +grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range}) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', grid)]) + +cv = cross_validation.KFold(len(features), 5, + shuffle=True, random_state=123) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) + + + + +from mahotas.features import surf +image = mh.demos.load('lena') +image = mh.colors.rgb2gray(image, dtype=np.uint8) +descriptors = surf.surf(image, descriptor_only=True) + +from mahotas.features import surf +descriptors = surf.dense(image, spacing=16) +alldescriptors = [] +for im in images: + im = mh.imread(im, as_grey=True) + im = im.astype(np.uint8) + # compute descriptors for each dataset image `im` (not the Lena `image`) + alldescriptors.append(surf.dense(im, spacing=16)) +# get all descriptors into a single array +concatenated = np.concatenate(alldescriptors) +print('Number of descriptors: {}'.format( + len(concatenated))) +# use only every 64th vector +concatenated = concatenated[::64] +from sklearn.cluster import KMeans # FIXME CAPITALIZATION +k = 256 +km = KMeans(k) +km.fit(concatenated) + +features = [] +for d in alldescriptors: + c = km.predict(d) + features.append( + np.array([np.sum(c == ci) for ci in range(k)]) + ) +# build single array and convert to float +features = np.array(features, dtype=float) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 62.6% + + diff --git a/ch10/get-anim-dataset.sh b/ch10/download.sh similarity index 70% rename from ch10/get-anim-dataset.sh rename to ch10/download.sh index 93121a7f..fb623f3d 100755 --- a/ch10/get-anim-dataset.sh +++ b/ch10/download.sh @@ -2,7 +2,7 @@ mkdir -p AnimTransDistr cd AnimTransDistr -wget http://vision.stanford.edu/Datasets/AnimTransDistr.rar +curl -O http://vision.stanford.edu/Datasets/AnimTransDistr.rar unrar x AnimTransDistr.rar # The following file is a weird file: rm Anims/104034.jpg diff --git a/ch10/edginess.py b/ch10/edginess.py deleted file mode 100644 index 5815d654..00000000 --- a/ch10/edginess.py +++ /dev/null @@ -1,22 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -import mahotas as mh - - -def edginess_sobel(image): - '''Measure the "edginess" of an image - - image should be a 2d numpy array (an image) - - Returns a floating point value which is higher the "edgier" the image is.
- - ''' - edges = mh.sobel(image, just_filter=True) - edges = edges.ravel() - return np.sqrt(np.dot(edges, edges)) diff --git a/ch10/features.py b/ch10/features.py new file mode 100644 index 00000000..42847b30 --- /dev/null +++ b/ch10/features.py @@ -0,0 +1,70 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +import mahotas as mh + + +def edginess_sobel(image): + '''Measure the "edginess" of an image + + image should be a 2d numpy array (an image) + + Returns a floating point value which is higher the "edgier" the image is. + + ''' + edges = mh.sobel(image, just_filter=True) + edges = edges.ravel() + return np.sqrt(np.dot(edges, edges)) + +def texture(im): + '''Compute features for an image + + Parameters + ---------- + im : ndarray + + Returns + ------- + fs : ndarray + 1-D array of features + ''' + im = im.astype(np.uint8) + return mh.features.haralick(im).ravel() + + +def chist(im): + '''Compute color histogram of input image + + Parameters + ---------- + im : ndarray + should be an RGB image + + Returns + ------- + c : ndarray + 1-D array of histogram values + ''' + + # Downsample pixel values: + im = im // 64 + + # We can also implement the following by using np.histogramdd + # im = im.reshape((-1,3)) + # bins = [np.arange(5), np.arange(5), np.arange(5)] + # hist = np.histogramdd(im, bins=bins)[0] + # hist = hist.ravel() + + # Separate RGB channels: + r,g,b = im.transpose((2,0,1)) + + pixels = 1 * r + 4 * g + 16 * b + hist = np.bincount(pixels.ravel(), minlength=64) + hist = hist.astype(float) + return np.log1p(hist) + diff --git a/ch10/figure10.py b/ch10/figure10.py index 0f5a1b5d..6cb45e7a 100644 --- a/ch10/figure10.py +++ b/ch10/figure10.py @@ -10,11 +10,11 @@ # This little script just builds an image with two examples, side-by-side: -text = mh.imread("simple-dataset/text21.jpg") -scene = mh.imread("simple-dataset/scene00.jpg") +text = mh.imread("../SimpleImageDataset/text21.jpg") +building = mh.imread("../SimpleImageDataset/building00.jpg") h, w, _ = text.shape canvas = np.zeros((h, 2 * w + 128, 3), np.uint8) -canvas[:, -w:] = scene +canvas[:, -w:] = building canvas[:, :w] = text canvas = canvas[::4, ::4] -mh.imsave('../1400OS_10_10+.jpg', canvas) +mh.imsave('figure10.jpg', canvas) diff --git a/ch10/figure13.py b/ch10/figure13.py deleted file mode 100644 index ce635ef9..00000000 --- a/ch10/figure13.py +++ /dev/null @@ -1,28 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import mahotas as mh -from mahotas.colors import rgb2grey -import numpy as np - -# Adds a little salt-n-pepper noise to an image - -im = mh.imread('lenna.jpg') -im = rgb2grey(im) - -# Salt & pepper arrays -salt = np.random.random(im.shape) > .975 -pepper = np.random.random(im.shape) > .975 - -# salt is 170 & pepper is 30 -# Some playing around showed that setting these to more extreme values looks -# very artificial. 
These look nicer - -im = np.maximum(salt * 170, mh.stretch(im)) -im = np.minimum(pepper * 30 + im * (~pepper), im) - -mh.imsave('../1400OS_10_13+.jpg', im.astype(np.uint8)) diff --git a/ch10/figure18.py b/ch10/figure18.py deleted file mode 100644 index ee11cf05..00000000 --- a/ch10/figure18.py +++ /dev/null @@ -1,118 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression -from mpltools import style -from matplotlib import pyplot as plt -import numpy as np -from glob import glob - -basedir = 'AnimTransDistr' - - -def features_for(images): - fs = [] - for im in images: - im = mh.imread(im, as_grey=True).astype(np.uint8) - fs.append(mh.features.haralick(im).mean(0)) - return np.array(fs) - - -def features_labels(groups): - labels = np.zeros(sum(map(len, groups))) - st = 0 - for i, g in enumerate(groups): - labels[st:st + len(g)] = i - st += len(g) - return np.vstack(groups), labels - -classes = [ - 'Anims', - 'Cars', - 'Distras', - 'Trans', -] - -features = [] -labels = [] -for ci, cl in enumerate(classes): - images = glob('{}/{}/*.jpg'.format(basedir, cl)) - features.extend(features_for(images)) - labels.extend([ci for _ in images]) - -features = np.array(features) -labels = np.array(labels) - -scores0 = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=10) -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: %s%%' % ( - 0.1 * round(1000 * scores0.mean()))) - -tfeatures = features - -from sklearn.cluster import KMeans -from mahotas.features import surf - -images = [] -labels = [] - -for ci, cl in enumerate(classes): - curimages = glob('{}/{}/*.jpg'.format(basedir, cl)) - images.extend(curimages) - labels.extend([ci for _ in curimages]) -labels = np.array(labels) - -alldescriptors = [] -for im in images: - im = mh.imread(im, as_grey=1) - im = im.astype(np.uint8) - - #alldescriptors.append(surf.dense(im, spacing=max(im.shape)//32)) - alldescriptors.append(surf.surf(im, descriptor_only=True)) - -print('Descriptors done') -k = 256 -km = KMeans(k) - -concatenated = np.concatenate(alldescriptors) -concatenated = concatenated[::64] -print('k-meaning...') -km.fit(concatenated) -features = [] -for d in alldescriptors: - c = km.predict(d) - features.append( - np.array([np.sum(c == i) for i in xrange(k)]) - ) -features = np.array(features) -print('predicting...') -scoreSURFlr = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Log. Reg [SURF features]: %s%%' % ( - 0.1 * round(1000 * scoreSURFlr.mean()))) - -print('combined...') -allfeatures = np.hstack([features, tfeatures]) -scoreSURFplr = cross_validation.cross_val_score( - LogisticRegression(), allfeatures, labels, cv=5).mean() - -print('Accuracy (5 fold x-val) with Log. 
Reg [All features]: %s%%' % ( - 0.1 * round(1000 * scoreSURFplr.mean()))) - -style.use('ggplot') -plt.plot([0, 1, 2], 100 * - np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), 'k-', lw=8) -plt.plot( - [0, 1, 2], 100 * np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), - 'o', mec='#cccccc', mew=12, mfc='white') -plt.xlim(-.5, 2.5) -plt.ylim(scores0.mean() * 90., scoreSURFplr * 110) -plt.xticks([0, 1, 2], ["baseline", "SURF", "combined"]) -plt.ylabel('Accuracy (%)') -plt.savefig('../1400OS_10_18+.png') diff --git a/ch10/figure5_6.py b/ch10/figure5_6.py deleted file mode 100644 index 100c84a7..00000000 --- a/ch10/figure5_6.py +++ /dev/null @@ -1,26 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from matplotlib import pyplot as plt -import numpy as np -import mahotas as mh -image = mh.imread('../1400OS_10_01.jpeg') -image = mh.colors.rgb2gray(image) -im8 = mh.gaussian_filter(image, 8) -im16 = mh.gaussian_filter(image, 16) -im32 = mh.gaussian_filter(image, 32) -h, w = im8.shape -canvas = np.ones((h, 3 * w + 256), np.uint8) -canvas *= 255 -canvas[:, :w] = im8 -canvas[:, w + 128:2 * w + 128] = im16 -canvas[:, -w:] = im32 -mh.imsave('../1400OS_10_05+.jpg', canvas[:, ::2]) - -im32 = mh.stretch(im32) -ot32 = mh.otsu(im32) -mh.imsave('../1400OS_10_06+.jpg', (im32 > ot32).astype(np.uint8) * 255) diff --git a/ch10/figure9.py b/ch10/figure9.py deleted file mode 100644 index 6d3db1a4..00000000 --- a/ch10/figure9.py +++ /dev/null @@ -1,26 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from matplotlib import pyplot as plt -import numpy as np -import mahotas as mh -image = mh.imread('../1400OS_10_01.jpeg') -image = mh.colors.rgb2gray(image, dtype=np.uint8) -image = image[::4, ::4] -thresh = mh.sobel(image) -filtered = mh.sobel(image, just_filter=True) - -thresh = mh.dilate(thresh, np.ones((7, 7))) -filtered = mh.dilate(mh.stretch(filtered), np.ones((7, 7))) - - -h, w = thresh.shape -canvas = 255 * np.ones((h, w * 2 + 64), np.uint8) -canvas[:, :w] = thresh * 255 -canvas[:, -w:] = filtered - -mh.imsave('../1400OS_10_09+.jpg', canvas) diff --git a/ch10/large_classification.py b/ch10/large_classification.py new file mode 100644 index 00000000..8db3571b --- /dev/null +++ b/ch10/large_classification.py @@ -0,0 +1,108 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import mahotas as mh +from glob import glob +from sklearn import cross_validation +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.grid_search import GridSearchCV +import numpy as np + +basedir = 'AnimTransDistr' +print('This script will test classification of the AnimTransDistr dataset') + +C_range = 10.0 ** np.arange(-4, 3) +grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range}) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', grid)]) + +def features_for(im): + from features import chist + im = mh.imread(im) + img = 
mh.colors.rgb2grey(im).astype(np.uint8) + return np.concatenate([mh.features.haralick(img).ravel(), + chist(im)]) + +def images(): + '''Iterate over all (image,label) pairs + + This function will return + ''' + for ci, cl in enumerate(classes): + images = glob('{}/{}/*.jpg'.format(basedir, cl)) + for im in sorted(images): + yield im, ci + +classes = [ + 'Anims', + 'Cars', + 'Distras', + 'Trans', +] + +print('Computing whole-image texture features...') +ifeatures = [] +labels = [] +for im, ell in images(): + ifeatures.append(features_for(im)) + labels.append(ell) + +ifeatures = np.array(ifeatures) +labels = np.array(labels) + +cv = cross_validation.KFold(len(ifeatures), 5, shuffle=True, random_state=123) +scores0 = cross_validation.cross_val_score( + clf, ifeatures, labels, cv=cv) +print('Accuracy (5 fold x-val) with Logistic Regression [image features]: {:.1%}'.format( + scores0.mean())) + + +from sklearn.cluster import KMeans +from mahotas.features import surf + + +print('Computing SURF descriptors...') +alldescriptors = [] +for im,_ in images(): + im = mh.imread(im, as_grey=True) + im = im.astype(np.uint8) + + # To use dense sampling, you can try the following line: + # alldescriptors.append(surf.dense(im, spacing=16)) + alldescriptors.append(surf.surf(im, descriptor_only=True)) + +print('Descriptor computation complete.') +k = 256 +km = KMeans(k) + +concatenated = np.concatenate(alldescriptors) +print('Number of descriptors: {}'.format( + len(concatenated))) +concatenated = concatenated[::64] +print('Clustering with K-means...') +km.fit(concatenated) +sfeatures = [] +for d in alldescriptors: + c = km.predict(d) + sfeatures.append(np.bincount(c, minlength=k)) +sfeatures = np.array(sfeatures, dtype=float) +print('predicting...') +score_SURF = cross_validation.cross_val_score( + clf, sfeatures, labels, cv=cv).mean() +print('Accuracy (5 fold x-val) with Logistic Regression [SURF features]: {:.1%}'.format( + score_SURF.mean())) + + +print('Performing classification with all features combined...') +allfeatures = np.hstack([sfeatures, ifeatures]) +score_SURF_global = cross_validation.cross_val_score( + clf, allfeatures, labels, cv=cv).mean() +print('Accuracy (5 fold x-val) with Logistic Regression [All features]: {:.1%}'.format( + score_SURF_global.mean())) diff --git a/ch10/lenna-ring.py b/ch10/lena-ring.py similarity index 93% rename from ch10/lenna-ring.py rename to ch10/lena-ring.py index 9fa9eff3..fb28b53d 100644 --- a/ch10/lenna-ring.py +++ b/ch10/lena-ring.py @@ -9,7 +9,7 @@ import numpy as np # Read in the image -im = mh.imread('lenna.jpg') +im = mh.demos.load('lena') # This breaks up the image into RGB channels r, g, b = im.transpose(2, 0, 1) @@ -38,4 +38,4 @@ # The final result is sharp in the centre and smooths out to the borders: ring = mh.stretch(im * C + (1 - C) * im12) -mh.imsave('lenna-ring.jpg', ring) +mh.imsave('lena-ring.jpg', ring) diff --git a/ch10/neighbors.py b/ch10/neighbors.py new file mode 100644 index 00000000..1f71d0de --- /dev/null +++ b/ch10/neighbors.py @@ -0,0 +1,64 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing + +import numpy as np +import mahotas as mh +from glob import glob +from features import texture, chist +from matplotlib import pyplot as plt +from sklearn.preprocessing import StandardScaler +from scipy.spatial import distance + +basedir = '../SimpleImageDataset/' + + +haralicks = [] +chists = [] + +print('Computing 
features...') +# Use glob to get all the images +images = glob('{}/*.jpg'.format(basedir)) +# We sort the images to ensure that they are always processed in the same order +# Otherwise, this would introduce some variation just based on the random +# ordering that the filesystem uses +images.sort() + +for fname in images: + imc = mh.imread(fname) + imc = imc[200:-200,200:-200] + haralicks.append(texture(mh.colors.rgb2grey(imc))) + chists.append(chist(imc)) + +haralicks = np.array(haralicks) +chists = np.array(chists) +features = np.hstack([chists, haralicks]) + +print('Computing neighbors...') +sc = StandardScaler() +features = sc.fit_transform(features) +dists = distance.squareform(distance.pdist(features)) + +print('Plotting...') +fig, axes = plt.subplots(2, 9, figsize=(16,8)) + +# Remove ticks from all subplots +for ax in axes.flat: + ax.set_xticks([]) + ax.set_yticks([]) + +for ci,i in enumerate(range(0,90,10)): + left = images[i] + dists_left = dists[i] + right = dists_left.argsort() + # right[0] is the query image itself (at distance zero), so pick the next closest element + right = right[1] + right = images[right] + left = mh.imread(left) + right = mh.imread(right) + axes[0, ci].imshow(left) + axes[1, ci].imshow(right) + +fig.tight_layout() +fig.savefig('figure_neighbors.png', dpi=300) diff --git a/ch10/scene00.jpg b/ch10/scene00.jpg new file mode 100644 index 00000000..ed727a50 Binary files /dev/null and b/ch10/scene00.jpg differ diff --git a/ch10/simple_classification.py b/ch10/simple_classification.py index 2f9ebfa4..a5a448d2 100644 --- a/ch10/simple_classification.py +++ b/ch10/simple_classification.py @@ -6,37 +6,65 @@ # It is made available under the MIT License import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression import numpy as np from glob import glob -from edginess import edginess_sobel -basedir = 'simple-dataset' +from features import texture, chist +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +basedir = '../SimpleImageDataset/' -def features_for(im): - im = mh.imread(im, as_grey=True).astype(np.uint8) - return mh.features.haralick(im).mean(0) -features = [] -sobels = [] +haralicks = [] labels = [] +chists = [] + +print('This script will test (with cross-validation) classification of the simple 3-class dataset') +print('Computing features...') +# Use glob to get all the images images = glob('{}/*.jpg'.format(basedir)) -for im in images: - features.append(features_for(im)) - sobels.append(edginess_sobel(mh.imread(im, as_grey=True))) - labels.append(im[:-len('00.jpg')]) -features = np.array(features) +# We sort the images to ensure that they are always processed in the same order +# Otherwise, this would introduce some variation just based on the random +# ordering that the filesystem uses +for fname in sorted(images): + imc = mh.imread(fname) + haralicks.append(texture(mh.colors.rgb2grey(imc))) + chists.append(chist(imc)) + + # Files are named like building00.jpg, scene23.jpg... 
+ labels.append(fname[:-len('xx.jpg')]) + +print('Finished computing features.') + +haralicks = np.array(haralicks) labels = np.array(labels) +chists = np.array(chists) + +haralick_plus_chists = np.hstack([chists, haralicks]) + +# We use Logistic Regression because it achieves high accuracy on small(ish) datasets +# Feel free to experiment with other classifiers +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + +from sklearn import cross_validation +cv = cross_validation.LeaveOneOut(len(images)) scores = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=5) -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) + clf, haralicks, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [haralick features]: {:.1%}'.format( + scores.mean())) scores = cross_validation.cross_val_score( - LogisticRegression(), np.hstack([np.atleast_2d(sobels).T, features]), labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features + sobel]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) + clf, chists, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [color histograms]: {:.1%}'.format( + scores.mean())) + +scores = cross_validation.cross_val_score( + clf, haralick_plus_chists, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [texture features + color histograms]: {:.1%}'.format( + scores.mean())) + diff --git a/ch10/threshold.py b/ch10/threshold.py index 51f32861..d7361bfb 100644 --- a/ch10/threshold.py +++ b/ch10/threshold.py @@ -7,15 +7,28 @@ import numpy as np import mahotas as mh -image = mh.imread('../1400OS_10_01.jpeg') + +# Load our example image: +image = mh.imread('../SimpleImageDataset/building05.jpg') + +# Convert to greyscale image = mh.colors.rgb2gray(image, dtype=np.uint8) + +# Compute a threshold value: thresh = mh.thresholding.otsu(image) -print(thresh) +print('Otsu threshold is {0}'.format(thresh)) + +# Compute the thresholded image otsubin = (image > thresh) +print('Saving thresholded image (with Otsu threshold) to otsu-threshold.jpeg') mh.imsave('otsu-threshold.jpeg', otsubin.astype(np.uint8) * 255) -otsubin = ~ mh.close(~otsubin, np.ones((15, 15))) + +# Execute morphological opening to smooth out the edges +otsubin = mh.open(otsubin, np.ones((15, 15))) mh.imsave('otsu-closed.jpeg', otsubin.astype(np.uint8) * 255) +# An alternative thresholding method: thresh = mh.thresholding.rc(image) -print(thresh) +print('Ridley-Calvard threshold is {0}'.format(thresh)) +print('Saving thresholded image (with Ridley-Calvard threshold) to rc-threshold.jpeg') mh.imsave('rc-threshold.jpeg', (image > thresh).astype(np.uint8) * 255) diff --git a/ch10/thresholded_figure.py b/ch10/thresholded_figure.py new file mode 100644 index 00000000..947762e8 --- /dev/null +++ b/ch10/thresholded_figure.py @@ -0,0 +1,31 @@ +import mahotas as mh +import numpy as np +from matplotlib import pyplot as plt + +# Load image & convert to B&W +image = mh.imread('../SimpleImageDataset/scene00.jpg') +image = mh.colors.rgb2grey(image, dtype=np.uint8) +plt.imshow(image) +plt.gray() +plt.title('original image') + +thresh = mh.thresholding.otsu(image) +print('Otsu threshold is {}.'.format(thresh)) + +threshed = (image > thresh) +plt.figure() +plt.imshow(threshed) +plt.title('thresholded image') +mh.imsave('thresholded.png', threshed.astype(np.uint8)*255) + +im16 = mh.gaussian_filter(image, 16) + +# Repeat the 
thresholding operations with the blurred image +thresh = mh.thresholding.otsu(im16.astype(np.uint8)) +threshed = (im16 > thresh) +plt.figure() +plt.imshow(threshed) +plt.title('thresholded image (after blurring)') +print('Otsu threshold after blurring is {}.'.format(thresh)) +mh.imsave('thresholded16.png', threshed.astype(np.uint8)*255) +plt.show() diff --git a/ch12/.gitignore b/ch12/.gitignore new file mode 100644 index 00000000..fa8fbcc2 --- /dev/null +++ b/ch12/.gitignore @@ -0,0 +1,3 @@ +*.jugdata/ +output.txt +results.image.txt diff --git a/ch12/README.rst b/ch12/README.rst new file mode 100644 index 00000000..fba88d4a --- /dev/null +++ b/ch12/README.rst @@ -0,0 +1,28 @@ +========== +Chapter 12 +========== + +Support code for *Chapter 12: Big(ger) Data* + +Data +---- + +This chapter relies only on the image dataset that is packaged with the +repository at ``../SimpleImageDataset/``. + +Scripts +------- + +chapter.py + Code as written in the book +jugfile.py + Example jugfile +image-classification.py + Jugfile implementation of image classification from Chapter 10 + +setup-aws.txt + Commands to set up an Amazon Web Services machine +run-jugfile.sh + Wrapper script to run jug on jugfile.py +run-image-classification.sh + Wrapper script to run jug on image-classification.py diff --git a/ch12/chapter.py b/ch12/chapter.py new file mode 100644 index 00000000..15326680 --- /dev/null +++ b/ch12/chapter.py @@ -0,0 +1,95 @@ +from jug import TaskGenerator +from glob import glob +import mahotas as mh +@TaskGenerator +def compute_texture(im): + from features import texture + imc = mh.imread(im) + return texture(mh.colors.rgb2gray(imc)) + +@TaskGenerator +def chist_file(fname): + from features import chist + im = mh.imread(fname) + return chist(im) + +import numpy as np +to_array = TaskGenerator(np.array) +hstack = TaskGenerator(np.hstack) + +haralicks = [] +chists = [] +labels = [] + +# Change this variable to point to +# the location of the dataset on disk +basedir = '../SimpleImageDataset/' +# Use glob to get all the images +images = glob('{}/*.jpg'.format(basedir)) + +for fname in sorted(images): + haralicks.append(compute_texture(fname)) + chists.append(chist_file(fname)) + # The class is encoded in the filename as xxxx00.jpg + labels.append(fname[:-len('00.jpg')]) + +haralicks = to_array(haralicks) +chists = to_array(chists) +labels = to_array(labels) + +@TaskGenerator +def accuracy(features, labels): + from sklearn.linear_model import LogisticRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn import cross_validation + + clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + cv = cross_validation.LeaveOneOut(len(features)) + scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) + return scores.mean() +scores_base = accuracy(haralicks, labels) +scores_chist = accuracy(chists, labels) + +combined = hstack([chists, haralicks]) +scores_combined = accuracy(combined, labels) + +@TaskGenerator +def print_results(scores): + with open('results.image.txt', 'w') as output: + for k,v in scores: + output.write('Accuracy [{}]: {:.1%}\n'.format( + k, v.mean())) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('combined' , scores_combined), + ]) + +@TaskGenerator +def compute_lbp(fname): + from mahotas.features import lbp + imc = mh.imread(fname) + im = mh.colors.rgb2grey(imc) + return lbp(im, radius=8, points=6) + +lbps = [] +for fname in sorted(images): + 
# the rest of the loop as before + lbps.append(compute_lbp(fname)) +lbps = to_array(lbps) + +scores_lbps = accuracy(lbps, labels) +combined_all = hstack([chists, haralicks, lbps]) +scores_combined_all = accuracy(combined_all, labels) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('lbps', scores_lbps), + ('combined' , scores_combined), + ('combined_all' , scores_combined_all), + ]) diff --git a/ch12/features.py b/ch12/features.py new file mode 120000 index 00000000..142a324d --- /dev/null +++ b/ch12/features.py @@ -0,0 +1 @@ +../ch10/features.py \ No newline at end of file diff --git a/ch12/image-classification.py b/ch12/image-classification.py new file mode 100644 index 00000000..6f76d26d --- /dev/null +++ b/ch12/image-classification.py @@ -0,0 +1,116 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import mahotas as mh +import numpy as np +from glob import glob +from jug import TaskGenerator + +# We need to use the `features` module from chapter 10. +from sys import path +path.append('../ch10') + + +# This is the jug-enabled version of the script ``figure18.py`` in Chapter 10 + +basedir = '../SimpleImageDataset/' + +@TaskGenerator +def compute_texture(im): + '''Compute features for an image + + Parameters + ---------- + im : str + filepath for image to process + + Returns + ------- + fs : ndarray + 1-D array of features + ''' + from features import texture + imc = mh.imread(im) + return texture(mh.colors.rgb2grey(imc)) + +@TaskGenerator +def chist(fname): + from features import chist as color_histogram + im = mh.imread(fname) + return color_histogram(im) + +@TaskGenerator +def compute_lbp(fname): + from mahotas.features import lbp + imc = mh.imread(fname) + im = mh.colors.rgb2grey(imc) + return lbp(im, radius=8, points=6) + + +@TaskGenerator +def accuracy(features, labels): + from sklearn.linear_model import LogisticRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn import cross_validation + # We use logistic regression because it is very fast. 
+ # Feel free to experiment with other classifiers + clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + cv = cross_validation.LeaveOneOut(len(features)) + scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) + return scores.mean() + + +@TaskGenerator +def print_results(scores): + with open('results.image.txt', 'w') as output: + for k,v in scores: + output.write('Accuracy (LOO x-val) with Logistic Regression [{0}]: {1:.1%}\n'.format( + k, v.mean())) + + +to_array = TaskGenerator(np.array) +hstack = TaskGenerator(np.hstack) + +haralicks = [] +chists = [] +lbps = [] +labels = [] + +# Use glob to get all the images +images = glob('{0}/*.jpg'.format(basedir)) +for fname in sorted(images): + haralicks.append(compute_texture(fname)) + chists.append(chist(fname)) + lbps.append(compute_lbp(fname)) + labels.append(fname[:-len('00.jpg')]) # The class is encoded in the filename as xxxx00.jpg + +haralicks = to_array(haralicks) +chists = to_array(chists) +lbps = to_array(lbps) +labels = to_array(labels) + +scores_base = accuracy(haralicks, labels) +scores_chist = accuracy(chists, labels) +scores_lbps = accuracy(lbps, labels) + +combined = hstack([chists, haralicks]) +scores_combined = accuracy(combined, labels) + +combined_all = hstack([chists, haralicks, lbps]) +scores_combined_all = accuracy(combined_all, labels) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('lbps', scores_lbps), + ('combined' , scores_combined), + ('combined_all' , scores_combined_all), + ]) + diff --git a/ch12/jugfile.py b/ch12/jugfile.py index 91306ccb..9d7e2b7a 100644 --- a/ch12/jugfile.py +++ b/ch12/jugfile.py @@ -23,7 +23,7 @@ def add(a, b): @TaskGenerator def print_final_result(oname, value): with open(oname, 'w') as output: - print >>output, "Final result:", value + output.write("Final result: {0}\n".format(value)) input = 2 y = double(input) diff --git a/ch12/run-image-classification.sh b/ch12/run-image-classification.sh new file mode 100755 index 00000000..868d07fa --- /dev/null +++ b/ch12/run-image-classification.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +jug execute image-classification.py diff --git a/ch12/run-jugfile.sh b/ch12/run-jugfile.sh new file mode 100755 index 00000000..0ff59131 --- /dev/null +++ b/ch12/run-jugfile.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +jug execute + diff --git a/ch12/setup-aws.txt b/ch12/setup-aws.txt new file mode 100644 index 00000000..292654eb --- /dev/null +++ b/ch12/setup-aws.txt @@ -0,0 +1,7 @@ +sudo yum update +sudo yum -y install python-devel python-pip numpy scipy python-matplotlib +sudo yum -y install gcc-c++ +sudo yum -y install git +sudo pip-python install -U pip +sudo pip install scikit-learn jug mahotas +
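A minimal usage sketch for the jug-based scripts above, assuming the commands are issued from the ch12/ directory after installing the dependencies listed in setup-aws.txt: jug's standard subcommands `jug execute` and `jug status` drive the pipeline, jug coordinates workers through the `.jugdata/` store listed in the chapter's .gitignore, and results.image.txt is the file written by print_results.

    # Start two workers in parallel; they share work through the .jugdata/ directory
    jug execute image-classification.py &
    jug execute image-classification.py &

    # Check how many tasks are still waiting, ready, or already finished
    jug status image-classification.py

    # Once all tasks are done, inspect the accuracies written by print_results
    wait
    cat results.image.txt

The run-jugfile.sh and run-image-classification.sh wrappers issue the corresponding `jug execute` calls for jugfile.py and image-classification.py.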