diff --git a/README.md b/README.md index 05218427..cbcd3155 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,14 @@ Building Machine Learning Systems with Python ============================================= -Source Code for the book Building Machine Learning Systems with Python by -[Willi Richert](http://twotoreal.com) and [Luis Pedro -Coelho](http://luispedro.org). +Source Code for the book Building Machine Learning Systems with Python by [Luis +Pedro Coelho](http://luispedro.org) and [Willi Richert](http://twotoreal.com). -The book was published in 2013 by Packt Publishing and is available [from their +The book was published in 2013 (second edition in 2015) by Packt Publishing and +is available [from their website](http://www.packtpub.com/building-machine-learning-systems-with-python/book). +The code in the repository corresponds to the second edition. Code for the +first edition is available in [first\_edition +branch](https://github.com/luispedro/BuildingMachineLearningSystemsWithPython/tree/first_edition). diff --git a/ch01/analyze_webstats.py b/ch01/analyze_webstats.py index 8d3c4c41..5da892e2 100644 --- a/ch01/analyze_webstats.py +++ b/ch01/analyze_webstats.py @@ -26,8 +26,9 @@ x = x[~sp.isnan(y)] y = y[~sp.isnan(y)] -# plot input data + def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None): + ''' plot input data ''' plt.figure(num=None, figsize=(8, 6)) plt.clf() @@ -138,8 +139,8 @@ def error(f, x, y): train = sorted(shuffled[split_idx:]) fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1)) fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2)) -print("fbt2(x)= \n%s"%fbt2) -print("fbt2(x)-100,000= \n%s"%(fbt2-100000)) +print("fbt2(x)= \n%s" % fbt2) +print("fbt2(x)-100,000= \n%s" % (fbt2-100000)) fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3)) fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10)) fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100)) diff --git a/ch01/gen_webstats.py b/ch01/gen_webstats.py index fa133d76..61d0b738 100644 --- a/ch01/gen_webstats.py +++ b/ch01/gen_webstats.py @@ -17,26 +17,22 @@ sp.random.seed(3) # to reproduce the data later on -x = sp.arange(1, 31 * 24) -y = sp.array(200 * (sp.sin(2 * sp.pi * x / (7 * 24))), dtype=int) +x = sp.arange(1, 31*24) +y = sp.array(200*(sp.sin(2*sp.pi*x/(7*24))), dtype=int) y += gamma.rvs(15, loc=0, scale=100, size=len(x)) -y += 2 * sp.exp(x / 100.0) -y = sp.ma.array(y, mask=[y < 0]) -print(sum(y), sum(y < 0)) +y += 2 * sp.exp(x/100.0) +y = sp.ma.array(y, mask=[y<0]) +print(sum(y), sum(y<0)) plt.scatter(x, y) plt.title("Web traffic over the last month") plt.xlabel("Time") plt.ylabel("Hits/hour") -plt.xticks([w * 7 * 24 for w in [0, 1, 2, 3, 4]], ['week %i' % (w + 1) for w in - [0, 1, 2, 3, 4]]) - +plt.xticks([w*7*24 for w in range(5)], + ['week %i' %(w+1) for w in range(5)]) plt.autoscale(tight=True) plt.grid() plt.savefig(os.path.join(CHART_DIR, "1400_01_01.png")) -# sp.savetxt(os.path.join("..", "web_traffic.tsv"), -# zip(x[~y.mask],y[~y.mask]), delimiter="\t", fmt="%i") - -sp.savetxt(os.path.join( - DATA_DIR, "web_traffic.tsv"), list(zip(x, y)), delimiter="\t", fmt="%s") +sp.savetxt(os.path.join(DATA_DIR, "web_traffic.tsv"), + list(zip(x, y)), delimiter="\t", fmt="%s") diff --git a/ch02/README.rst b/ch02/README.rst index 4dc0b211..e2cb729a 100644 --- a/ch02/README.rst +++ b/ch02/README.rst @@ -6,6 +6,9 @@ Support code for *Chapter 2: Learning How to Classify with Real-world Examples*. 
The directory data contains the seeds dataset, originally downloaded from https://archive.ics.uci.edu/ml/datasets/seeds +chapter.py + The code as printed in the book. + figure1.py Figure 1 in the book: all 2-by-2 scatter plots diff --git a/ch02/chapter.py b/ch02/chapter.py new file mode 100644 index 00000000..c68b45ab --- /dev/null +++ b/ch02/chapter.py @@ -0,0 +1,164 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + + +from matplotlib import pyplot as plt +import numpy as np + +# We load the data with load_iris from sklearn +from sklearn.datasets import load_iris +data = load_iris() + +# load_iris returns an object with several fields +features = data.data +feature_names = data.feature_names +target = data.target +target_names = data.target_names + +for t in range(3): + if t == 0: + c = 'r' + marker = '>' + elif t == 1: + c = 'g' + marker = 'o' + elif t == 2: + c = 'b' + marker = 'x' + plt.scatter(features[target == t, 0], + features[target == t, 1], + marker=marker, + c=c) +# We use NumPy fancy indexing to get an array of strings: +labels = target_names[target] + +# The petal length is the feature at position 2 +plength = features[:, 2] + +# Build an array of booleans: +is_setosa = (labels == 'setosa') + +# This is the important step: +max_setosa =plength[is_setosa].max() +min_non_setosa = plength[~is_setosa].min() +print('Maximum of setosa: {0}.'.format(max_setosa)) + +print('Minimum of others: {0}.'.format(min_non_setosa)) + +# ~ is the boolean negation operator +features = features[~is_setosa] +labels = labels[~is_setosa] +# Build a new target variable, is_virigina +is_virginica = (labels == 'virginica') + +# Initialize best_acc to impossibly low value +best_acc = -1.0 +for fi in range(features.shape[1]): + # We are going to test all possible thresholds + thresh = features[:,fi] + for t in thresh: + + # Get the vector for feature `fi` + feature_i = features[:, fi] + # apply threshold `t` + pred = (feature_i > t) + acc = (pred == is_virginica).mean() + rev_acc = (pred == ~is_virginica).mean() + if rev_acc > acc: + reverse = True + acc = rev_acc + else: + reverse = False + + if acc > best_acc: + best_acc = acc + best_fi = fi + best_t = t + best_reverse = reverse + +print(best_fi, best_t, best_reverse, best_acc) + +def is_virginica_test(fi, t, reverse, example): + 'Apply threshold model to a new example' + test = example[fi] > t + if reverse: + test = not test + return test +from threshold import fit_model, predict + +# ning accuracy was 96.0%. +# ing accuracy was 90.0% (N = 50). 
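# Leave-one-out evaluation of the threshold model imported above:
# fit_model(features, labels) from ch02/threshold.py returns the learned model
# (which predict() unpacks as a threshold, a feature index and a reverse flag),
# and predict(model, features) applies it, taking the model as its first
# argument. Each round of the loop below holds out one example, fits on the
# remaining ones and counts whether the held-out example is classified
# correctly; dividing by the number of examples gives the printed accuracy.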
+correct = 0.0 + +for ei in range(len(features)): + # select all but the one at position `ei`: + training = np.ones(len(features), bool) + training[ei] = False + testing = ~training + model = fit_model(features[training], is_virginica[training]) + predictions = predict(model, features[testing]) + correct += np.sum(predictions == is_virginica[testing]) +acc = correct/float(len(features)) +print('Accuracy: {0:.1%}'.format(acc)) + + +########################################### +############## SEEDS DATASET ############## +########################################### + +from load import load_dataset + +feature_names = [ + 'area', + 'perimeter', + 'compactness', + 'length of kernel', + 'width of kernel', + 'asymmetry coefficien', + 'length of kernel groove', +] +features, labels = load_dataset('seeds') + + + +from sklearn.neighbors import KNeighborsClassifier +classifier = KNeighborsClassifier(n_neighbors=1) +from sklearn.cross_validation import KFold + +kf = KFold(len(features), n_folds=5, shuffle=True) +means = [] +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns fraction + # of correct decisions for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Mean accuracy: {:.1%}'.format(np.mean(means))) + + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +classifier = KNeighborsClassifier(n_neighbors=1) +classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)]) + +means = [] +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns fraction + # of correct decisions for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Mean accuracy: {:.1%}'.format(np.mean(means))) diff --git a/ch02/figure1.py b/ch02/figure1.py index 2bbdfb9c..4ec6fff8 100644 --- a/ch02/figure1.py +++ b/ch02/figure1.py @@ -19,13 +19,21 @@ fig,axes = plt.subplots(2, 3) pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)] + +# Set up 3 different pairs of (color, marker) +color_markers = [ + ('r', '>'), + ('g', 'o'), + ('b', 'x'), + ] for i, (p0, p1) in enumerate(pairs): ax = axes.flat[i] - # Use a different marker/color for each class `t` - for t, marker, c in zip(range(3), ">ox", "rgb"): + for t in range(3): + # Use a different color/marker for each class `t` + c,marker = color_markers[t] ax.scatter(features[target == t, p0], features[ - target == t, p1], marker=marker, c=c, s=40) + target == t, p1], marker=marker, c=c) ax.set_xlabel(feature_names[p0]) ax.set_ylabel(feature_names[p1]) ax.set_xticks([]) diff --git a/ch02/figure2.py b/ch02/figure2.py index 29e6d96e..0b69d395 100644 --- a/ch02/figure2.py +++ b/ch02/figure2.py @@ -23,8 +23,9 @@ labels = labels[~is_setosa] is_virginica = (labels == 'virginica') -# Hand fixed threshold: -t = 1.75 +# Hand fixed thresholds: +t = 1.65 +t2 = 1.75 # Features to use: 3 & 2 f0, f1 = 3, 2 @@ -49,7 +50,7 @@ ax.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c) ax.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c) ax.plot([t, t], [y0, y1], 'k--', lw=2) -ax.plot([t - .1, t - .1], [y0, y1], 
'k:', lw=2) +ax.plot([t2, t2], [y0, y1], 'k:', lw=2) ax.scatter(features[is_virginica, f0], features[is_virginica, f1], c='b', marker='o', s=40) ax.scatter(features[~is_virginica, f0], diff --git a/ch02/figure4_5_no_sklearn.py b/ch02/figure4_5_no_sklearn.py index 5f67e0d7..adc83d73 100644 --- a/ch02/figure4_5_no_sklearn.py +++ b/ch02/figure4_5_no_sklearn.py @@ -45,7 +45,7 @@ def plot_decision(features, labels): model = fit_model(1, features[:, (0, 2)], np.array(labels)) C = predict( - np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape) + model, np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape) if COLOUR_FIGURE: cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)]) else: diff --git a/ch02/figure4_5_sklearn.py b/ch02/figure4_5_sklearn.py index 9d20c7cb..55ac0c80 100644 --- a/ch02/figure4_5_sklearn.py +++ b/ch02/figure4_5_sklearn.py @@ -58,11 +58,11 @@ def plot_decision(features, labels, num_neighbors=1): ax.pcolormesh(X, Y, C, cmap=cmap) if COLOUR_FIGURE: cmap = ListedColormap([(1., .0, .0), (.1, .6, .1), (.0, .0, 1.)]) - ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap, s=40) + ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap) else: for lab, ma in zip(range(3), "Do^"): ax.plot(features[labels == lab, 0], features[ - labels == lab, 2], ma, c=(1., 1., 1.), ms=8) + labels == lab, 2], ma, c=(1., 1., 1.), ms=6) return fig,ax diff --git a/ch02/knn.py b/ch02/knn.py index e1d48ce0..89ebfdb4 100644 --- a/ch02/knn.py +++ b/ch02/knn.py @@ -26,7 +26,7 @@ def plurality(xs): return k # This function was called ``apply_model`` in the first edition -def predict(features, model): +def predict(model, features): '''Apply k-nn model''' k, train_feats, labels = model results = [] @@ -42,5 +42,5 @@ def predict(features, model): def accuracy(features, labels, model): - preds = predict(features, model) + preds = predict(model, features) return np.mean(preds == labels) diff --git a/ch02/threshold.py b/ch02/threshold.py index efbd942b..d621a350 100644 --- a/ch02/threshold.py +++ b/ch02/threshold.py @@ -40,7 +40,7 @@ def fit_model(features, labels): # This function was called ``apply_model`` in the first edition -def predict(features, model): +def predict(model, features): '''Apply a learned model''' # A model is a pair as returned by fit_model t, fi, reverse = model @@ -51,5 +51,5 @@ def predict(features, model): def accuracy(features, labels, model): '''Compute the accuracy of the model''' - preds = predict(features, model) + preds = predict(model, features) return np.mean(preds == labels) diff --git a/ch04/.gitignore b/ch04/.gitignore index 0a7a205c..c4c0b18a 100644 --- a/ch04/.gitignore +++ b/ch04/.gitignore @@ -1,2 +1,6 @@ wiki_lda.pkl wiki_lda.pkl.state +*.png +*.npy +*.pkl +topics.txt diff --git a/ch04/README.rst b/ch04/README.rst index 7582ba54..99a3c186 100644 --- a/ch04/README.rst +++ b/ch04/README.rst @@ -4,6 +4,16 @@ Chapter 4 Support code for *Chapter 4: Topic Modeling* + +AP Data +------- + +To download the AP data, use the ``download_ap.sh`` script inside the ``data`` +directory:: + + cd data + ./download_ap.sh + Word cloud creation ------------------- @@ -49,3 +59,7 @@ Scripts blei_lda.py Computes LDA using the AP Corpus. 
+wikitopics_create.py + Create the topic model for Wikipedia using LDA (must download wikipedia database first) +wikitopics_create_hdp.py + Create the topic model for Wikipedia using HDP (must download wikipedia database first) diff --git a/ch04/blei_lda.py b/ch04/blei_lda.py index d6cc37b7..7f6ac2b3 100644 --- a/ch04/blei_lda.py +++ b/ch04/blei_lda.py @@ -8,19 +8,13 @@ from __future__ import print_function from wordcloud import create_cloud try: - from gensim import corpora, models + from gensim import corpora, models, matutils except: print("import gensim failed.") print() print("Please install it") raise -try: - from mpltools import style - style.use('ggplot') -except: - print("Could not import mpltools: plots will not be styled correctly") - import matplotlib.pyplot as plt import numpy as np from os import path @@ -39,27 +33,22 @@ model = models.ldamodel.LdaModel( corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=None) -ti = 0 # Iterate over all the topics in the model -for ti in xrange(model.num_topics): +for ti in range(model.num_topics): words = model.show_topic(ti, 64) - tf = sum(f for f, w in words) + tf = sum(f for _, f in words) with open('topics.txt', 'w') as output: - output.write('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for f, w in words)) + output.write('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for w, f in words)) output.write("\n\n\n") # We first identify the most discussed topic, i.e., the one with the # highest total weight -# First, we need to sum up the weights across all the documents -weight = np.zeros(model.num_topics) -for doc in corpus: - for col, val in model[doc]: - weight[col] += val - # As a reasonable alternative, we could have used the log of val: - # weight[col] += np.log(val) +topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics) +weight = topics.sum(1) max_topic = weight.argmax() + # Get the top 64 words for this topic # Without the argument, show_topic would return only 10 words words = model.show_topic(max_topic, 64) @@ -68,11 +57,12 @@ create_cloud('cloud_blei_lda.png', words) num_topics_used = [len(model[doc]) for doc in corpus] -plt.hist(num_topics_used, np.arange(42)) -plt.ylabel('Nr of documents') -plt.xlabel('Nr of topics') -plt.savefig('Figure_04_01.png') -plt.clf() +fig,ax = plt.subplots() +ax.hist(num_topics_used, np.arange(42)) +ax.set_ylabel('Nr of documents') +ax.set_xlabel('Nr of topics') +fig.tight_layout() +fig.savefig('Figure_04_01.png') # Now, repeat the same exercise using alpha=1.0 @@ -83,12 +73,14 @@ corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=ALPHA) num_topics_used1 = [len(model1[doc]) for doc in corpus] -plt.hist([num_topics_used, num_topics_used1], np.arange(42)) -plt.ylabel('Nr of documents') -plt.xlabel('Nr of topics') +fig,ax = plt.subplots() +ax.hist([num_topics_used, num_topics_used1], np.arange(42)) +ax.set_ylabel('Nr of documents') +ax.set_xlabel('Nr of topics') # The coordinates below were fit by trial and error to look good -plt.text(9, 223, r'default alpha') -plt.text(26, 156, 'alpha=1.0') -plt.savefig('Figure_04_02.png') +ax.text(9, 223, r'default alpha') +ax.text(26, 156, 'alpha=1.0') +fig.tight_layout() +fig.savefig('Figure_04_02.png') diff --git a/ch04/build_lda.py b/ch04/build_lda.py index d4dcb7f2..a0ee9c5f 100644 --- a/ch04/build_lda.py +++ b/ch04/build_lda.py @@ -79,7 +79,7 @@ def __len__(self): distances = distance.squareform(distance.pdist(thetas)) large = distances.max() + 1 -for i in xrange(len(distances)): +for i in range(len(distances)): 
distances[i, i] = large print(otexts[1]) diff --git a/ch04/data/.gitignore b/ch04/data/.gitignore index d71ac12b..91f24bbc 100644 --- a/ch04/data/.gitignore +++ b/ch04/data/.gitignore @@ -4,7 +4,9 @@ dataset-379-20news-18828_HJRZF.zip 379/ enwiki-latest-pages-articles.xml.bz2 wiki_en_output_bow.mm +wiki_en_output_bow.mm.gz wiki_en_output_bow.mm.index wiki_en_output_tfidf.mm +wiki_en_output_tfidf.mm.gz wiki_en_output_tfidf.mm.index wiki_en_output_wordids.txt.bz2 diff --git a/ch04/data/download_ap.sh b/ch04/data/download_ap.sh index 6de8ded8..da27814a 100755 --- a/ch04/data/download_ap.sh +++ b/ch04/data/download_ap.sh @@ -1,3 +1,3 @@ #!/bin/sh -wget http://www.cs.princeton.edu/~blei/lda-c/ap.tgz +wget http://www.cs.columbia.edu/~blei/lda-c/ap.tgz tar xzf ap.tgz diff --git a/ch04/wikitopics_create_hdp.py b/ch04/wikitopics_create_hdp.py new file mode 100644 index 00000000..951d1850 --- /dev/null +++ b/ch04/wikitopics_create_hdp.py @@ -0,0 +1,39 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import logging +import gensim +import numpy as np + +# Set up logging in order to get progress information as the model is being built: +logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', + level=logging.INFO) + +# Load the preprocessed corpus (id2word & mm): +id2word = gensim.corpora.Dictionary.load_from_text( + 'data/wiki_en_output_wordids.txt.bz2') +mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') + +# Calling the constructor is enough to build the model +# This call will take a few hours! +model = gensim.models.hdpmodel.HdpModel( + corpus=mm, + id2word=id2word, + chunksize=10000) + +# Save the model so we do not need to learn it again. +model.save('wiki_hdp.pkl') + +# Compute the document/topic matrix +topics = np.zeros((len(mm), model.num_topics)) +for di,doc in enumerate(mm): + doc_top = model[doc] + for ti,tv in doc_top: + topics[di,ti] += tv +np.save('topics_hdp.npy', topics) diff --git a/ch04/wordcloud.py b/ch04/wordcloud.py index 6c5302ea..accca2d6 100644 --- a/ch04/wordcloud.py +++ b/ch04/wordcloud.py @@ -24,8 +24,6 @@ def create_cloud(oname, words,maxsize=120, fontname='Lobster'): # gensim returns a weight between 0 and 1 for each word, while pytagcloud # expects an integer word count. So, we multiply by a large number and # round. For a visualization this is an adequate approximation. 
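# Note: recent gensim releases return (word, weight) pairs from show_topic,
# which is the order the blei_lda.py changes above already assume, so the
# tuples can be consumed here directly and no longer need to be flipped.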
- # We also need to flip the order as gensim returns (value, word), whilst - # pytagcloud expects (word, value): - words = [(w,int(v*10000)) for v,w in words] + words = [(w,int(v*10000)) for w,v in words] tags = make_tags(words, maxsize=maxsize) create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname) diff --git a/ch07/.gitignore b/ch07/.gitignore index d21bb3d1..e33609d2 100644 --- a/ch07/.gitignore +++ b/ch07/.gitignore @@ -1 +1 @@ -.formula_*/ +*.png diff --git a/ch07/boston1.py b/ch07/boston1.py index 1f82409f..d0b30447 100644 --- a/ch07/boston1.py +++ b/ch07/boston1.py @@ -27,11 +27,13 @@ rmse = np.sqrt(lr.residues_/len(x)) print('RMSE: {}'.format(rmse)) +fig, ax = plt.subplots() +# Plot a diagonal (for reference): +ax.plot([0, 50], [0, 50], '-', color=(.9,.3,.3), lw=4) + # Plot the prediction versus real: -plt.scatter(lr.predict(x), boston.target) +ax.scatter(lr.predict(x), boston.target) -# Plot a diagonal (for reference): -plt.plot([0, 50], [0, 50], '-', color=(.9,.3,.3), lw=4) -plt.xlabel('predicted') -plt.ylabel('real') -plt.savefig('Figure_07_08.png') +ax.set_xlabel('predicted') +ax.set_ylabel('real') +fig.savefig('Figure_07_08.png') diff --git a/ch07/data/.gitignore b/ch07/data/.gitignore new file mode 100644 index 00000000..3286ba0f --- /dev/null +++ b/ch07/data/.gitignore @@ -0,0 +1 @@ +E2006.train diff --git a/ch07/data/download.sh b/ch07/data/download.sh index 6f938e06..74753364 100755 --- a/ch07/data/download.sh +++ b/ch07/data/download.sh @@ -1,5 +1,3 @@ #!/usr/bin/env bash -wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 +curl -O http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 bunzip2 E2006.train.bz2 -wget http://www.grouplens.org/system/files/ml-100k.zip -unzip ml-100k.zip diff --git a/ch07/figure1_2.py b/ch07/figure1_2.py index ec1d31da..3f11a0c7 100644 --- a/ch07/figure1_2.py +++ b/ch07/figure1_2.py @@ -14,9 +14,10 @@ boston = load_boston() # Index number five in the number of rooms -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("Number of rooms (RM)") -plt.ylabel("House Price") +fig,ax = plt.subplots() +ax.scatter(boston.data[:, 5], boston.target) +ax.set_xlabel("Average number of rooms (RM)") +ax.set_ylabel("House Price") x = boston.data[:, 5] # fit (used below) takes a two-dimensional array as input. 
We use np.atleast_2d @@ -29,9 +30,9 @@ lr = LinearRegression(fit_intercept=False) lr.fit(x, y) -plt.plot([0, boston.data[:, 5].max() + 1], +ax.plot([0, boston.data[:, 5].max() + 1], [0, lr.predict(boston.data[:, 5].max() + 1)], '-', lw=4) -plt.savefig('Figure1.png', dpi=150) +fig.savefig('Figure1.png') mse = mean_squared_error(y, lr.predict(x)) rmse = np.sqrt(mse) @@ -42,14 +43,14 @@ lr.fit(x, y) -plt.clf() -plt.xlabel("Number of rooms (RM)") -plt.ylabel("House Price") -plt.scatter(boston.data[:, 5], boston.target) +fig,ax = plt.subplots() +ax.set_xlabel("Average number of rooms (RM)") +ax.set_ylabel("House Price") +ax.scatter(boston.data[:, 5], boston.target) xmin = x.min() xmax = x.max() -plt.plot([xmin, xmax], lr.predict([[xmin], [xmax]]) , '-', lw=4) -plt.savefig('Figure2.png', dpi=150) +ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]) , '-', lw=4) +fig.savefig('Figure2.png') mse = mean_squared_error(y, lr.predict(x)) print("Mean squared error (of training data): {:.3}".format(mse)) diff --git a/ch07/figure3.py b/ch07/figure3.py index d8c10ddc..7543c1ec 100644 --- a/ch07/figure3.py +++ b/ch07/figure3.py @@ -8,12 +8,13 @@ from sklearn.linear_model import LinearRegression, Lasso import numpy as np from sklearn.datasets import load_boston -import pylab as plt +from matplotlib import pyplot as plt boston = load_boston() -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("Number of rooms (RM)") -plt.ylabel("House Price") +fig, ax = plt.subplots() +ax.scatter(boston.data[:, 5], boston.target) +ax.set_xlabel("Number of rooms (RM)") +ax.set_ylabel("House Price") x = boston.data[:, 5] @@ -24,9 +25,9 @@ lr = LinearRegression() lr.fit(x, y) -plt.plot([xmin, xmax], lr.predict([[xmin], [xmax]]), ':', lw=4, label='OLS model') +ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]), ':', lw=4, label='OLS model') las = Lasso() las.fit(x, y) -plt.plot([xmin, xmax], las.predict([ [xmin], [xmax] ]), '-', lw=4, label='Lasso model') -plt.savefig('Figure3.png', dpi=150) +ax.plot([xmin, xmax], las.predict([ [xmin], [xmax] ]), '-', lw=4, label='Lasso model') +fig.savefig('Figure3.png') diff --git a/ch07/figure4.py b/ch07/figure4.py index 70e61c16..a24d48be 100644 --- a/ch07/figure4.py +++ b/ch07/figure4.py @@ -12,7 +12,7 @@ from sklearn.linear_model import LinearRegression from sklearn.datasets import load_boston from sklearn.metrics import mean_squared_error -import pylab as plt +from matplotlib import pyplot as plt boston = load_boston() @@ -24,9 +24,10 @@ p = lr.predict(x) print("RMSE: {:.2}.".format(np.sqrt(mean_squared_error(y, p)))) print("R2: {:.2}.".format(lr.score(x, y))) -plt.scatter(p, y) -plt.xlabel('Predicted price') -plt.ylabel('Actual price') -plt.plot([y.min(), y.max()], [y.min(), y.max()], lw=4) +fig,ax = plt.subplots() +ax.scatter(p, y) +ax.set_xlabel('Predicted price') +ax.set_ylabel('Actual price') +ax.plot([y.min(), y.max()], [y.min(), y.max()], lw=4) -plt.savefig('Figure4.png', dpi=150) +fig.savefig('Figure4.png') diff --git a/ch07/lasso_path_plot.py b/ch07/lasso_path_plot.py index f0a6f3f5..eab64c26 100644 --- a/ch07/lasso_path_plot.py +++ b/ch07/lasso_path_plot.py @@ -25,5 +25,5 @@ ax.set_xlabel('Lasso coefficient path as a function of alpha') ax.set_xlabel('Alpha') ax.set_ylabel('Coefficient weight') -fig.savefig('Figure_LassoPath.png', dpi=150) +fig.savefig('Figure_LassoPath.png') diff --git a/ch07/predict10k_en.py b/ch07/predict10k_en.py index bcadcef3..a7dd960a 100644 --- a/ch07/predict10k_en.py +++ b/ch07/predict10k_en.py @@ -69,4 +69,5 @@ ax.plot([-5,-1], [-5,-1], 'r-', 
lw=2) ax.set_xlabel('Actual value') ax.set_ylabel('Predicted value') -fig.savefig('Figure_10k_scatter_EN_l1_ratio.png', dpi=150) +fig.savefig('Figure_10k_scatter_EN_l1_ratio.png') + diff --git a/ch07/usermodel.py b/ch07/usermodel.py deleted file mode 100644 index 7b309455..00000000 --- a/ch07/usermodel.py +++ /dev/null @@ -1,61 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from scipy import sparse -from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV -from sklearn.cross_validation import KFold - -data = np.array([[int(tok) for tok in line.split('\t')[:3]] - for line in open('data/ml-100k/u.data')]) -ij = data[:, :2] -ij -= 1 # original data is in 1-based system -values = data[:, 2] -reviews = sparse.csc_matrix((values, ij.T)).astype(float) - -reg = ElasticNetCV(fit_intercept=True, alphas=[ - 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) - - -def movie_norm(xc): - '''Normalize per movie''' - xc = xc.copy().toarray() - # xpos is the mean of the positive items - xpos = np.array([xi[xi > 0].mean() for xi in xc]) - xpos = np.nan_to_num(xpos) - - for i in range(xc.shape[0]): - xc[i] -= (xc[i] > 0) * xpos[i] - return xc, xpos - - -def learn_for(i): - u = reviews[i] - us = np.delete(np.arange(reviews.shape[0]), i) - ps, = np.where(u.toarray().ravel() > 0) - x = reviews[us][:, ps].T - y = u.data - err = 0 - eb = 0 - kf = KFold(len(y), n_folds=5) - for train, test in kf: - xc, xpos = movie_norm(x[train]) - reg.fit(xc, y[train] - xpos) - - xc, xpos = movie_norm(x[test]) - p = reg.predict(xc).ravel() - e = (p + xpos) - y[test] - err += np.sum(e * e) - eb += np.sum((y[train].mean() - y[test]) ** 2) - return np.sqrt(err / float(len(y))), np.sqrt(eb / float(len(y))) - -whole_data = [] -for i in range(reviews.shape[0]): - s = learn_for(i) - print(s[0] < s[1]) - print(s) - whole_data.append(s) diff --git a/ch08/README.rst b/ch08/README.rst new file mode 100644 index 00000000..488844e6 --- /dev/null +++ b/ch08/README.rst @@ -0,0 +1,41 @@ +========= +Chapter 8 +========= + +Support code for *Chapter 8: Recommendations*. + +The code refers to the second edition of the book and this code has been +significantly refactored when compared to the first one. + +Ratings Prediction +------------------ + +Note that since the partition of the data into training and testing is random, +everytime you run the code, the results will be different. + + +load_ml100k.py + Load data & partition into test/train +norm.py + Normalize the data +corrneighbours.py + Neighbour models based on ncrroaltoin +regression.py + Regression models +stacked.py + Stacked predictions +averaged.py + Averaging of predictions (mentioned in book, but code is not shown there). 
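Each of the prediction modules above exposes a ``predict(train)`` function and
a ``main()`` that prints an R2 score on a random train/test split. Assuming the
MovieLens data has already been fetched with ``data/download.sh``, the numbers
can be reproduced with, for example::

    python norm.py
    python regression.py
    python corrneighbours.py
    python stacked.py
    python averaged.py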
+ +Association Rule Mining +----------------------- + +Check the folder ``apriori/`` + +apriori/histogram.py + Print a histogram of how many times each product was bought +apriori/apriori.py + Implementation of Apriori algorithm and association rule building +apriori/apriori_example.py + Example of Apriori algorithm in retail dataset + diff --git a/ch08/all_correlations.py b/ch08/all_correlations.py index 509e5b8e..d3817bf0 100644 --- a/ch08/all_correlations.py +++ b/ch08/all_correlations.py @@ -7,9 +7,14 @@ import numpy as np -# This is the version in the book: - +def all_correlations(y, X): + from scipy import spatial + y = np.atleast_2d(y) + sp = spatial.distance.cdist(X, y, 'correlation') + # The "correlation distance" is 1 - corr(x,y); so we invert that to obtain the correlation + return 1 - sp.ravel() +# This is the version in the book (1st Edition): def all_correlations_book_version(bait, target): ''' corrs = all_correlations(bait, target) @@ -21,9 +26,7 @@ def all_correlations_book_version(bait, target): for c in target]) # This is a faster, but harder to read, implementation: - - -def all_correlations(y, X): +def all_correlations_fast_no_scipy(y, X): ''' Cs = all_correlations(y, X) @@ -42,12 +45,4 @@ def all_correlations(y, X): return (xy - x_ * y_ * n) / n / xs_ / ys_ -# If you have scipy installed, then you can compute correlations with -# scipy.spatial.cdist: -def all_correlations_scipy(y, X): - from scipy import spatial - y = np.atleast_2d(y) - sp = spatial.distance.cdist(X, y, 'correlation') - # The "correlation distance" is 1 - corr(x,y); so we invert that to obtain the correlation - return 1 - sp.ravel() diff --git a/ch08/apriori/.gitignore b/ch08/apriori/.gitignore index 05697a95..6379e20f 100644 --- a/ch08/apriori/.gitignore +++ b/ch08/apriori/.gitignore @@ -1 +1 @@ -retail.dat +retail.dat.gz diff --git a/ch08/apriori/apriori.py b/ch08/apriori/apriori.py index 1ebb7e54..eeaf0d46 100644 --- a/ch08/apriori/apriori.py +++ b/ch08/apriori/apriori.py @@ -5,9 +5,12 @@ # # It is made available under the MIT License +from collections import namedtuple + + def apriori(dataset, minsupport, maxsize): ''' - freqsets, baskets = apriori(dataset, minsupport, maxsize) + freqsets, support = apriori(dataset, minsupport, maxsize) Parameters ---------- @@ -21,48 +24,70 @@ def apriori(dataset, minsupport, maxsize): Returns ------- freqsets : sequence of sequences - baskets : dictionary + support : dictionary + This associates each itemset (represented as a frozenset) with a float + (the support of that itemset) ''' from collections import defaultdict baskets = defaultdict(list) pointers = defaultdict(list) + for i, ds in enumerate(dataset): for ell in ds: pointers[ell].append(i) baskets[frozenset([ell])].append(i) - pointers = dict([(k, frozenset(v)) for k, v in pointers.items()]) - baskets = dict([(k, frozenset(v)) for k, v in baskets.items()]) - valid = set(list(el)[0] - for el, c in baskets.items() if (len(c) >= minsupport)) - dataset = [[el for el in ds if (el in valid)] for ds in dataset] - dataset = [ds for ds in dataset if len(ds) > 1] - dataset = map(frozenset, dataset) + # Convert pointer items to frozensets to speed up operations later + new_pointers = dict() + for k in pointers: + if len(pointers[k]) >= minsupport: + new_pointers[k] = frozenset(pointers[k]) + pointers = new_pointers + for k in baskets: + baskets[k] = frozenset(baskets[k]) + + # Valid are all elements whose support is >= minsupport + valid = set() + for el, c in baskets.items(): + if len(c) >= minsupport: + 
valid.update(el) + + # Itemsets at first iteration are simply all singleton with valid elements: itemsets = [frozenset([v]) for v in valid] freqsets = [] for i in range(maxsize - 1): - print(len(itemsets)) + print("At iteration {}, number of frequent baskets: {}".format( + i, len(itemsets))) newsets = [] - for i, ell in enumerate(itemsets): - ccounts = baskets[ell] - for v_, pv in pointers.items(): - if v_ not in ell: + for it in itemsets: + ccounts = baskets[it] + + for v, pv in pointers.items(): + if v not in it: csup = (ccounts & pv) if len(csup) >= minsupport: - new = frozenset(ell | set([v_])) + new = frozenset(it | frozenset([v])) if new not in baskets: newsets.append(new) baskets[new] = csup freqsets.extend(itemsets) itemsets = newsets - return freqsets, baskets + if not len(itemsets): + break + support = {} + for k in baskets: + support[k] = float(len(baskets[k])) + return freqsets, support -def association_rules(dataset, freqsets, baskets, minlift): +# A namedtuple to collect all values that may be interesting +AssociationRule = namedtuple('AssociationRule', ['antecendent', 'consequent', 'base', 'py_x', 'lift']) + +def association_rules(dataset, freqsets, support, minlift): ''' - for (antecendent, consequent, base, py_x, lift) in association_rules(dataset, freqsets, baskets, minlift): + for assoc_rule in association_rules(dataset, freqsets, support, minlift): ... This function takes the returns from ``apriori``. @@ -72,9 +97,13 @@ def association_rules(dataset, freqsets, baskets, minlift): dataset : sequence of sequences input dataset freqsets : sequence of sequences - baskets : dictionary + support : dictionary minlift : int minimal lift of yielded rules + + Returns + ------- + assoc_rule : sequence of AssociationRule objects ''' nr_transactions = float(len(dataset)) freqsets = [f for f in freqsets if len(f) > 1] @@ -82,8 +111,9 @@ def association_rules(dataset, freqsets, baskets, minlift): for f in fset: consequent = frozenset([f]) antecendent = fset - consequent - base = len(baskets[consequent]) / nr_transactions - py_x = len(baskets[fset]) / float(len(baskets[antecendent])) + py_x = support[fset] / support[antecendent] + base = support[consequent] / nr_transactions lift = py_x / base if lift > minlift: - yield (antecendent, consequent, base, py_x, lift) + yield AssociationRule(antecendent, consequent, base, py_x, lift) + diff --git a/ch08/apriori/apriori_example.py b/ch08/apriori/apriori_example.py index da77e96c..971ed4d5 100644 --- a/ch08/apriori/apriori_example.py +++ b/ch08/apriori/apriori_example.py @@ -7,10 +7,17 @@ from apriori import apriori, association_rules from gzip import GzipFile + +# Load dataset dataset = [[int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')] -freqsets, baskets = apriori(dataset, 80, maxsize=5) -nr_transactions = float(len(dataset)) -for ant, con, base, pyx, lift in association_rules(dataset, freqsets, baskets, 30): - print('{} | {} | {} ({:%}) | {} | {} | {}' - .format(ant, con, len(baskets[con]), len(baskets[con]) / nr_transactions, len(baskets[ant]), len(baskets[con | ant]), int(lift))) + +freqsets, support = apriori(dataset, 80, maxsize=16) +rules = list(association_rules(dataset, freqsets, support, minlift=30.0)) + +rules.sort(key=(lambda ar: -ar.lift)) +for ar in rules: + print('{} -> {} (lift = {:.4})' + .format(set(ar.antecendent), + set(ar.consequent), + ar.lift)) diff --git a/ch08/apriori/apriori_naive.py b/ch08/apriori/apriori_naive.py index cb50a8e1..03ab9130 100644 --- a/ch08/apriori/apriori_naive.py +++ 
b/ch08/apriori/apriori_naive.py @@ -5,35 +5,83 @@ # # It is made available under the MIT License -import numpy as np from collections import defaultdict from itertools import chain from gzip import GzipFile -minsupport = 44 +minsupport = 80 dataset = [[int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')] -dataset = dataset[::20] counts = defaultdict(int) for elem in chain(*dataset): counts[elem] += 1 +# Only elements that have at least minsupport should be considered. valid = set(el for el, c in counts.items() if (c >= minsupport)) + +# Filter the dataset to contain only valid elements +# (This step is not strictly necessary, but will make the rest of the code +# faster as the itemsets will be smaller): dataset = [[el for el in ds if (el in valid)] for ds in dataset] -dataset = [frozenset(ds) for ds in dataset if len(ds) > 1] +# Convert to frozenset for fast processing +dataset = [frozenset(ds) for ds in dataset] itemsets = [frozenset([v]) for v in valid] -allsets = [itemsets] +freqsets = itemsets[:] for i in range(16): - print(len(itemsets)) + print("At iteration {}, number of frequent baskets: {}".format( + i, len(itemsets))) nextsets = [] - for i, ell in enumerate(itemsets): - for v_ in valid: - if v_ not in ell: - c = (ell | set([v_])) - if sum(1 for d in dataset if d.issuperset(c)) > minsupport: + + tested = set() + for it in itemsets: + for v in valid: + if v not in it: + # Create a new candidate set by adding v to it + c = (it | frozenset([v])) + + # Check if we have tested it already: + if c in tested: + continue + tested.add(c) + + # Count support by looping over dataset + # This step is slow. + # Check `apriori.py` for a better implementation. + support_c = sum(1 for d in dataset if d.issuperset(c)) + if support_c > minsupport: nextsets.append(c) - allsets.append(nextsets) + freqsets.extend(nextsets) itemsets = nextsets + if not len(itemsets): + break +print("Finished!") + + +def rules_from_itemset(itemset, dataset, minlift=1.): + nr_transactions = float(len(dataset)) + for item in itemset: + consequent = frozenset([item]) + antecedent = itemset-consequent + base = 0.0 + # acount: antecedent count + acount = 0.0 + + # ccount : consequent count + ccount = 0.0 + for d in dataset: + if item in d: base += 1 + if d.issuperset(itemset): ccount += 1 + if d.issuperset(antecedent): acount += 1 + base /= nr_transactions + p_y_given_x = ccount/acount + lift = p_y_given_x / base + if lift > minlift: + print('Rule {0} -> {1} has lift {2}' + .format(antecedent, consequent,lift)) + +for itemset in freqsets: + if len(itemset) > 1: + rules_from_itemset(itemset, dataset, minlift=4.) 
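To make the lift computation in ``apriori_naive.py`` concrete, here is a small
self-contained sketch on a made-up four-basket dataset (illustrative only, not
part of the repository). It uses the same quantities as the loop above, namely
the antecedent count, the count of the full itemset and the consequent's base
rate::

    # Toy data: four shopping baskets with hypothetical items
    dataset = [frozenset(t) for t in (
        {'bread', 'milk'},
        {'bread', 'milk', 'eggs'},
        {'bread', 'eggs'},
        {'milk'},
    )]
    nr_transactions = float(len(dataset))

    antecedent = frozenset(['bread'])
    consequent = frozenset(['milk'])
    itemset = antecedent | consequent

    # The same counts that rules_from_itemset accumulates
    acount = sum(1. for d in dataset if d.issuperset(antecedent))  # 3 baskets
    ccount = sum(1. for d in dataset if d.issuperset(itemset))     # 2 baskets
    base = sum(1. for d in dataset if d.issuperset(consequent)) / nr_transactions  # 0.75

    p_y_given_x = ccount / acount  # P(milk | bread) = 2/3
    lift = p_y_given_x / base      # about 0.89, well below the minlift cut-off
    print('lift of bread -> milk: {:.2f}'.format(lift))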
diff --git a/ch08/averaged.py b/ch08/averaged.py new file mode 100644 index 00000000..5b19bba7 --- /dev/null +++ b/ch08/averaged.py @@ -0,0 +1,33 @@ +import numpy as np +import load_ml100k +import regression +import corrneighbours +from sklearn import metrics +import norm + +def predict(train): + predicted0 = regression.predict(train) + predicted1 = regression.predict(train.T).T + predicted2 = corrneighbours.predict(train) + predicted3 = corrneighbours.predict(train.T).T + predicted4 = norm.predict(train) + predicted5 = norm.predict(train.T).T + stack = np.array([ + predicted0, + predicted1, + predicted2, + predicted3, + predicted4, + predicted5, + ]) + return stack.mean(0) + + +def main(): + train,test = load_ml100k.get_train_test(random_state=12) + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 averaged: {:.2%}'.format(r2)) + +if __name__ == '__main__': + main() diff --git a/ch08/chapter.py b/ch08/chapter.py new file mode 100644 index 00000000..d039d93f --- /dev/null +++ b/ch08/chapter.py @@ -0,0 +1,208 @@ +import numpy as np # NOT IN BOOK +from matplotlib import pyplot as plt # NOT IN BOOK + +def load(): + import numpy as np + from scipy import sparse + + data = np.loadtxt('data/ml-100k/u.data') + ij = data[:, :2] + ij -= 1 # original data is in 1-based system + values = data[:, 2] + reviews = sparse.csc_matrix((values, ij.T)).astype(float) + return reviews.toarray() +reviews = load() +U,M = np.where(reviews) +import random +test_idxs = np.array(random.sample(range(len(U)), len(U)//10)) + +train = reviews.copy() +train[U[test_idxs], M[test_idxs]] = 0 + +test = np.zeros_like(reviews) +test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]] + +class NormalizePositive(object): + def __init__(self, axis=0): + self.axis = axis + + def fit(self, features, y=None): + if self.axis == 1: + features = features.T + # count features that are greater than zero in axis 0: + binary = (features > 0) + + count0 = binary.sum(axis=0) + + # to avoid division by zero, set zero counts to one: + count0[count0 == 0] = 1. 
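# With the zero counts clamped to one, a user or movie that has no ratings at
# all gets a mean of 0 and a regularized std of sqrt(0.1), so transform()
# simply leaves those all-zero entries at zero.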
+ + # computing the mean is easy: + self.mean = features.sum(axis=0)/count0 + + # only consider differences where binary is True: + diff = (features - self.mean) * binary + diff **= 2 + # regularize the estimate of std by adding 0.1 + self.std = np.sqrt(0.1 + diff.sum(axis=0)/count0) + return self + + + def transform(self, features): + if self.axis == 1: + features = features.T + binary = (features > 0) + features = features - self.mean + features /= self.std + features *= binary + if self.axis == 1: + features = features.T + return features + + def inverse_transform(self, features, copy=True): + if copy: + features = features.copy() + if self.axis == 1: + features = features.T + features *= self.std + features += self.mean + if self.axis == 1: + features = features.T + return features + + def fit_transform(self, features): + return self.fit(features).transform(features) + + +norm = NormalizePositive(axis=1) +binary = (train > 0) +train = norm.fit_transform(train) +# plot just 200x200 area for space reasons +plt.imshow(binary[:200, :200], interpolation='nearest') + +from scipy.spatial import distance +# compute all pair-wise distances: +dists = distance.pdist(binary, 'correlation') +# Convert to square form, so that dists[i,j] +# is distance between binary[i] and binary[j]: +dists = distance.squareform(dists) +neighbors = dists.argsort(axis=1) + +# We are going to fill this matrix with results +filled = train.copy() +for u in range(filled.shape[0]): + # n_u is neighbors of user + n_u = neighbors[u, 1:] + for m in range(filled.shape[1]): + # get relevant reviews in order! + revs = [train[neigh, m] + for neigh in n_u + if binary [neigh, m]] + if len(revs): + # n is the number of reviews for this movie + n = len(revs) + # take half of the reviews plus one into consideration: + n //= 2 + n += 1 + revs = revs[:n] + filled[u,m] = np.mean(revs) + +predicted = norm.inverse_transform(filled) +from sklearn import metrics +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (binary neighbors): {:.1%}'.format(r2)) + +reviews = reviews.T +# use same code as before +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (binary movie neighbors): {:.1%}'.format(r2)) + + +from sklearn.linear_model import ElasticNetCV # NOT IN BOOK + +reg = ElasticNetCV(alphas=[ + 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) +filled = train.copy() +# iterate over all users: +for u in range(train.shape[0]): + curtrain = np.delete(train, u, axis=0) + bu = binary[u] + reg.fit(curtrain[:,bu].T, train[u, bu]) + filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) +predicted = norm.inverse_transform(filled) +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (user regression): {:.1%}'.format(r2)) + + +# SHOPPING BASKET ANALYSIS +# This is the slow version of the code, which will take a long time to +# complete. + + +from collections import defaultdict +from itertools import chain + +# File is downloaded as a compressed file +import gzip +# file format is a line per transaction +# of the form '12 34 342 5...' 
+dataset = [[int(tok) for tok in line.strip().split()] + for line in gzip.open('data/retail.dat.gz')] +dataset = [set(d) for d in dataset] +# count how often each product was purchased: +counts = defaultdict(int) +for elem in chain(*dataset): + counts[elem] += 1 + +minsupport = 80 +valid = set(k for k,v in counts.items() if (v >= minsupport)) +itemsets = [frozenset([v]) for v in valid] +freqsets = [] +for i in range(16): + nextsets = [] + tested = set() + for it in itemsets: + for v in valid: + if v not in it: + # Create a new candidate set by adding v to it + c = (it | frozenset([v])) + # check If we have tested it already + if c in tested: + continue + tested.add(c) + + # Count support by looping over dataset + # This step is slow. + # Check `apriori.py` for a better implementation. + support_c = sum(1 for d in dataset if d.issuperset(c)) + if support_c > minsupport: + nextsets.append(c) + freqsets.extend(nextsets) + itemsets = nextsets + if not len(itemsets): + break +print("Finished!") + + +minlift = 5.0 +nr_transactions = float(len(dataset)) +for itemset in freqsets: + for item in itemset: + consequent = frozenset([item]) + antecedent = itemset-consequent + base = 0.0 + # acount: antecedent count + acount = 0.0 + + # ccount : consequent count + ccount = 0.0 + for d in dataset: + if item in d: base += 1 + if d.issuperset(itemset): ccount += 1 + if d.issuperset(antecedent): acount += 1 + base /= nr_transactions + p_y_given_x = ccount/acount + lift = p_y_given_x / base + if lift > minlift: + print('Rule {0} -> {1} has lift {2}' + .format(antecedent, consequent,lift)) diff --git a/ch08/corrneighbours.py b/ch08/corrneighbours.py index eef3881e..eb30e685 100644 --- a/ch08/corrneighbours.py +++ b/ch08/corrneighbours.py @@ -6,75 +6,53 @@ # It is made available under the MIT License from __future__ import print_function -from all_correlations import all_correlations import numpy as np -from load_ml100k import load - -def estimate_user(user, rest, num_neighbors=100): - '''Estimate ratings for user based on the binary rating matrix - - Returns - ------- - estimates: ndarray - Returns a rating estimate for each movie - ''' - - # Compute binary matrix correlations: - bu = user > 0 - br = rest > 0 - ws = all_correlations(bu, br) - - # Select top `num_neighbors`: - selected = ws.argsort()[-num_neighbors:] - - # Use these to compute estimates: - estimates = rest[selected].mean(0) - estimates /= (.1 + br[selected].mean(0)) - return estimates - - -def train_test(user, rest): - '''Train & test on a single user - - Returns both the prediction error and the null error - ''' - estimates = estimate_user(user, rest) - bu = user > 0 - br = rest > 0 - err = estimates[bu] - user[bu] - null = rest.mean(0) - null /= (.1 + br.mean(0)) - nerr = null[bu] - user[bu] - return np.dot(err, err), np.dot(nerr, nerr) - - -def all_estimates(reviews): - estimates = np.zeros_like(reviews) - for i in range(reviews.shape[0]): - estimates[i] = estimate_user(reviews[i], np.delete(reviews, i, 0)) - return estimates - -def main(): - reviews = load() - - err = [] - for i in range(reviews.shape[0]): - err.append( - train_test(reviews[i], np.delete(reviews, i, 0)) - ) - revs = (reviews > 0).sum(1) - err = np.array(err) - rmse = np.sqrt(err / revs[:, None]) - - rmse_model, rmse_null = np.mean(rmse, 0) - - print("Average of RMSE / Null-model RMSE") - print("{:.2}\t{:.2} (improvement: {:.1%}".format(rmse_model, rmse_null, (rmse_null-rmse_model)/rmse_null)) - print() - - rmse_model, rmse_null = np.mean(rmse[revs > 60], 0) - print("Average 
of RMSE / Null-model RMSE (users with more than 60 reviewed movies)") - print("{:.2}\t{:.2} (improvement: {:.1%}".format(rmse_model, rmse_null, (rmse_null-rmse_model)/rmse_null)) +from load_ml100k import get_train_test +from scipy.spatial import distance +from sklearn import metrics + +from norm import NormalizePositive + +def predict(otrain): + binary = (otrain > 0) + norm = NormalizePositive(axis=1) + train = norm.fit_transform(otrain) + + dists = distance.pdist(binary, 'correlation') + dists = distance.squareform(dists) + + neighbors = dists.argsort(axis=1) + filled = train.copy() + for u in range(filled.shape[0]): + # n_u are the neighbors of user + n_u = neighbors[u, 1:] + for m in range(filled.shape[1]): + # This code could be faster using numpy indexing trickery as the + # cost of readibility (this is left as an exercise to the reader): + revs = [train[neigh, m] + for neigh in n_u + if binary[neigh, m]] + if len(revs): + n = len(revs) + n //= 2 + n += 1 + revs = revs[:n] + filled[u,m] = np.mean(revs) + + return norm.inverse_transform(filled) + +def main(transpose_inputs=False): + train, test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 score (binary {} neighbours): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) if __name__ == '__main__': main() + main(transpose_inputs=True) diff --git a/ch08/data b/ch08/data deleted file mode 120000 index 32d37b53..00000000 --- a/ch08/data +++ /dev/null @@ -1 +0,0 @@ -../ch07/data \ No newline at end of file diff --git a/ch08/data/.gitignore b/ch08/data/.gitignore new file mode 100644 index 00000000..c391d625 --- /dev/null +++ b/ch08/data/.gitignore @@ -0,0 +1,3 @@ +retail.dat.gz +ml-100k.zip +/ml-100k/ diff --git a/ch08/data/download.sh b/ch08/data/download.sh new file mode 100755 index 00000000..b671171d --- /dev/null +++ b/ch08/data/download.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +curl -L -O http://files.grouplens.org/papers/ml-100k.zip +unzip ml-100k.zip +curl -L -O http://fimi.ua.ac.be/data/retail.dat.gz diff --git a/ch08/figure3.py b/ch08/figure3.py index ac6961f3..daafc300 100644 --- a/ch08/figure3.py +++ b/ch08/figure3.py @@ -12,4 +12,4 @@ plt.imshow(data[:200, :200], interpolation='nearest') plt.xlabel('User ID') plt.ylabel('Film ID') -plt.savefig('../1400_08_03+.png') +plt.savefig('Figure_08_03_DataMatrix.png') diff --git a/ch08/load_ml100k.py b/ch08/load_ml100k.py index da6add37..7096e75c 100644 --- a/ch08/load_ml100k.py +++ b/ch08/load_ml100k.py @@ -11,6 +11,10 @@ def load(): Returns the review matrix as a numpy array''' import numpy as np from scipy import sparse + from os import path + + if not path.exists('data/ml-100k/u.data'): + raise IOError("Data has not been downloaded.\nTry the following:\n\n\tcd data\n\t./download.sh") # The input is in the form of a CSC sparse matrix, so it's a natural fit to # load the data, but we then convert to a more traditional array before @@ -21,3 +25,35 @@ def load(): values = data[:, 2] reviews = sparse.csc_matrix((values, ij.T)).astype(float) return reviews.toarray() + +def get_train_test(reviews=None, random_state=None): + '''Split data into training & testing + + Parameters + ---------- + reviews : ndarray, optional + Input data + + Returns + ------- + train : ndarray + training data + test : ndarray + testing data + ''' + import numpy as np + import random + r = random.Random(random_state) + + if reviews is None: + reviews = 
load() + U,M = np.where(reviews) + test_idxs = np.array(r.sample(range(len(U)), len(U)//10)) + train = reviews.copy() + train[U[test_idxs], M[test_idxs]] = 0 + + test = np.zeros_like(reviews) + test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]] + + return train, test + diff --git a/ch08/norm.py b/ch08/norm.py new file mode 100644 index 00000000..2925bbca --- /dev/null +++ b/ch08/norm.py @@ -0,0 +1,75 @@ +import numpy as np + +class NormalizePositive(object): + + def __init__(self, axis=0): + self.axis = axis + + def fit(self, features, y=None): + # count features that are greater than zero in axis `self.axis`: + if self.axis == 1: + features = features.T + binary = (features > 0) + count = binary.sum(axis=0) + + # to avoid division by zero, set zero counts to one: + count[count == 0] = 1. + + self.mean = features.sum(axis=0)/count + + # Compute variance by average squared difference to the mean, but only + # consider differences where binary is True (i.e., where there was a + # true rating): + diff = (features - self.mean) * binary + diff **= 2 + # regularize the estimate of std by adding 0.1 + self.std = np.sqrt(0.1 + diff.sum(axis=0)/count) + return self + + def transform(self, features): + if self.axis == 1: + features = features.T + binary = (features > 0) + features = features - self.mean + features /= self.std + features *= binary + if self.axis == 1: + features = features.T + return features + + def inverse_transform(self, features, copy=True): + if copy: + features = features.copy() + if self.axis == 1: + features = features.T + features *= self.std + features += self.mean + if self.axis == 1: + features = features.T + return features + + def fit_transform(self, features): + return self.fit(features).transform(features) + + +def predict(train): + norm = NormalizePositive() + train = norm.fit_transform(train) + return norm.inverse_transform(train * 0.) 
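# Multiplying by zero discards the normalized ratings, so inverse_transform
# returns std * 0 + mean: every entry is filled with the regularized mean that
# NormalizePositive learned along its axis. This is the simplest of the six
# predictions that stacked.py and averaged.py combine.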
+ + +def main(transpose_inputs=False): + from load_ml100k import get_train_test + from sklearn import metrics + train,test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 score ({} normalization): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/regression.py b/ch08/regression.py new file mode 100644 index 00000000..693e99a4 --- /dev/null +++ b/ch08/regression.py @@ -0,0 +1,50 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +from sklearn.linear_model import ElasticNetCV +from norm import NormalizePositive +from sklearn import metrics + + +def predict(train): + binary = (train > 0) + reg = ElasticNetCV(fit_intercept=True, alphas=[ + 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) + norm = NormalizePositive() + train = norm.fit_transform(train) + + filled = train.copy() + # iterate over all users + for u in range(train.shape[0]): + # remove the current user for training + curtrain = np.delete(train, u, axis=0) + bu = binary[u] + if np.sum(bu) > 5: + reg.fit(curtrain[:,bu].T, train[u, bu]) + + # Fill the values that were not there already + filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) + return norm.inverse_transform(filled) + + +def main(transpose_inputs=False): + from load_ml100k import get_train_test + train,test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + filled = predict(train) + r2 = metrics.r2_score(test[test > 0], filled[test > 0]) + + print('R2 score ({} regression): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) + +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/stacked.py b/ch08/stacked.py index 76afe9fc..8fa6344e 100644 --- a/ch08/stacked.py +++ b/ch08/stacked.py @@ -1,40 +1,47 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from __future__ import print_function -from sklearn.linear_model import LinearRegression -from load_ml100k import load import numpy as np -import similar_movie -import usermodel +import load_ml100k +import regression import corrneighbours +from sklearn import linear_model, metrics +import norm + +def predict(train): + tr_train,tr_test = load_ml100k.get_train_test(train, random_state=34) + tr_predicted0 = regression.predict(tr_train) + tr_predicted1 = regression.predict(tr_train.T).T + tr_predicted2 = corrneighbours.predict(tr_train) + tr_predicted3 = corrneighbours.predict(tr_train.T).T + tr_predicted4 = norm.predict(tr_train) + tr_predicted5 = norm.predict(tr_train.T).T + stack_tr = np.array([ + tr_predicted0[tr_test > 0], + tr_predicted1[tr_test > 0], + tr_predicted2[tr_test > 0], + tr_predicted3[tr_test > 0], + tr_predicted4[tr_test > 0], + tr_predicted5[tr_test > 0], + ]).T + + lr = linear_model.LinearRegression() + lr.fit(stack_tr, tr_test[tr_test > 0]) + + stack_te = np.array([ + tr_predicted0.ravel(), + tr_predicted1.ravel(), + tr_predicted2.ravel(), + tr_predicted3.ravel(), + tr_predicted4.ravel(), + tr_predicted5.ravel(), + ]).T 
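# The linear regression fitted on the inner test split assigns one weight to
# each of the six base predictors; applying it to the stacked predictions for
# every entry and reshaping yields a single blended rating matrix with the
# same shape as train.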
+ + return lr.predict(stack_te).reshape(train.shape) -reviews = load() -reg = LinearRegression() -es = np.array([ - usermodel.all_estimates(reviews), - corrneighbours.all_estimates(reviews), - similar_movie.all_estimates(reviews), -]) -total_error = 0.0 -coefficients = [] -for u in range(reviews.shape[0]): - es0 = np.delete(es, u, 1) - r0 = np.delete(reviews, u, 0) - X, Y = np.where(r0 > 0) - X = es[:, X, Y] - y = r0[r0 > 0] - reg.fit(X.T, y) - coefficients.append(reg.coef_) +def main(): + train,test = load_ml100k.get_train_test(random_state=12) + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 stacked: {:.2%}'.format(r2)) - r0 = reviews[u] - X = np.where(r0 > 0) - p0 = reg.predict(es[:, u, X].squeeze().T) - err0 = r0[r0 > 0] - p0 - total_error += np.dot(err0, err0) - print(u) +if __name__ == '__main__': + main() diff --git a/ch08/stacked5.py b/ch08/stacked5.py deleted file mode 100644 index 7ace7d32..00000000 --- a/ch08/stacked5.py +++ /dev/null @@ -1,44 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from sklearn.linear_model import LinearRegression -from load_ml100k import load -import numpy as np -import similar_movie -import usermodel - -reviews = load() -# Collect several estimates -es = np.array([ - usermodel.all_estimates(reviews), - similar_movie.all_estimates(reviews, k=1), - similar_movie.all_estimates(reviews, k=2), - similar_movie.all_estimates(reviews, k=3), - similar_movie.all_estimates(reviews, k=4), - similar_movie.all_estimates(reviews, k=5), -]) - -total_error = 0.0 -coefficients = [] - -reg = LinearRegression() -# Iterate over all users -for u in range(reviews.shape[0]): - es0 = np.delete(es, u, axis=1) - r0 = np.delete(reviews, u, axis=0) - X, Y = np.where(r0 > 0) - X = es[:, X, Y] - y = r0[r0 > 0] - reg.fit(X.T, y) - coefficients.append(reg.coef_) - - r0 = reviews[u] - X = np.where(r0 > 0) - p0 = reg.predict(es[:, u, X].squeeze().T) - err0 = r0[r0 > 0] - p0 - total_error += np.dot(err0, err0) -coefficients = np.array(coefficients) diff --git a/ch08/usermodel.py b/ch08/usermodel.py deleted file mode 100644 index 589afca4..00000000 --- a/ch08/usermodel.py +++ /dev/null @@ -1,54 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.linear_model import ElasticNetCV -from sklearn.cross_validation import KFold - - -def learn_for(reviews, i): - reg = ElasticNetCV(fit_intercept=True, alphas=[ - 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) - nusers,nmovies = reviews.shape - u = reviews[i] - us = np.arange(reviews.shape[0]) - us = np.delete(us, i) - ps, = np.where(u.ravel() > 0) - x = reviews[us][:, ps].T - kf = KFold(len(ps), n_folds=4) - predictions = np.zeros(len(ps)) - for train, test in kf: - xc = x[train].copy() - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in range(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - - reg.fit(xc, u[train] - x1) - - xc = x[test].copy() - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in range(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - - p = reg.predict(xc).ravel() - predictions[test] = p - fill_preds = np.zeros(nmovies) - 
fill_preds[ps] = predictions - return fill_preds - - -def all_estimates(reviews): - whole_data = [] - for i in range(reviews.shape[0]): - s = learn_for(reviews, i) - whole_data.append(s) - return np.array(whole_data) - diff --git a/ch10/.gitignore b/ch10/.gitignore new file mode 100644 index 00000000..2f266195 --- /dev/null +++ b/ch10/.gitignore @@ -0,0 +1 @@ +AnimTransDistr/ diff --git a/ch10/README.rst b/ch10/README.rst index 7137582b..91e32756 100644 --- a/ch10/README.rst +++ b/ch10/README.rst @@ -17,23 +17,21 @@ Running ``download.sh`` will retrieve the other dataset into a directory Scripts ------- -threshold.py - Threshold building image with both Otsu and Ridley-Calvard threshold +chapter.py + Code as written in the book. +thresholded_figure.py + Computes the thresholded figures, including after Gaussian blurring lena-ring.py - Show Lena in a sharp ring around a soft focus image -figure5_6.py - Computes figures 5 and 6 in the book -figure9.py - Compute Sobel images + Lena image with center in focus and blurred edges figure10.py Just paste two images next to each others -figure13.py - Demonstrate salt&pepper effect features.py - Contains the ``edginess_sobel`` function from the book as well as a simple + Contains the color histogram function from the book as well as a simple wrapper around ``mahotas.texture.haralick`` simple_classification.py - Classify SimpleImageDataset with texture features + sobel feature -figure18.py + Classify SimpleImageDataset with texture features + color histogram features +large_classification.py Classify ``AnimTransDistr`` with both texture and SURF features. +neighbors.py + Computes image neighbors as well as the neighbor figure from the book. diff --git a/ch10/chapter.py b/ch10/chapter.py new file mode 100644 index 00000000..233720bb --- /dev/null +++ b/ch10/chapter.py @@ -0,0 +1,186 @@ +import numpy as np +import mahotas as mh +image = mh.imread('scene00.jpg') +from matplotlib import pyplot as plt +plt.imshow(image) +plt.show() +image = mh.colors.rgb2grey(image, dtype=np.uint8) +plt.imshow(image) # Display the image +plt.gray() +thresh = mh.thresholding.otsu(image) +print('Otsu threshold is {}.'.format(thresh)) +# Otsu threshold is 138. +plt.imshow(image > thresh) + +im16 = mh.gaussian_filter(image,16) +im = mh.demos.load('lenna') + +r,g,b = im.transpose(2,0,1) +r12 = mh.gaussian_filter(r, 12.) +g12 = mh.gaussian_filter(g, 12.) +b12 = mh.gaussian_filter(b, 12.) +im12 = mh.as_rgb(r12,g12,b12) +h, w = r.shape # height and width +Y, X = np.mgrid[:h,:w] +Y = Y-h/2. # center at h/2 +Y = Y / Y.max() # normalize to -1 .. +1 + +X = X-w/2. 
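+# X gets the same treatment as Y: centered at w/2, then normalized to -1..+1 before building the Gaussian weight C below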
+X = X / X.max() + +C = np.exp(-2.*(X**2+ Y**2)) + +# Normalize again to 0..1 +C = C - C.min() +C = C / C.ptp() +C = C[:,:,None] # This adds a dummy third dimension to C + +ringed = mh.stretch(im*C + (1-C)*im12) + +haralick_features = mh.features.haralick(image) +haralick_features_mean = np.mean(haralick_features, axis=0) +haralick_features_all = np.ravel(haralick_features) + +from glob import glob +images = glob('../SimpleImageDataset/*.jpg') +features = [] +labels = [] +for im in images: + labels.append(im[:-len('00.jpg')]) + im = mh.imread(im) + im = mh.colors.rgb2gray(im, dtype=np.uint8) + features.append(mh.features.haralick(im).ravel()) + +features = np.array(features) +labels = np.array(labels) + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + + +from sklearn import cross_validation +cv = cross_validation.LeaveOneOut(len(images)) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 81.1% + +def chist(im): + im = im // 64 + r,g,b = im.transpose((2,0,1)) + pixels = 1 * r + 4 * b + 16 * g + hist = np.bincount(pixels.ravel(), minlength=64) + hist = hist.astype(float) + hist = np.log1p(hist) + return hist + +features = [] +for im in images: + im = mh.imread(im) + features.append(chist(im)) + + + +features = [] +for im in images: + imcolor = mh.imread(im) + im = mh.colors.rgb2gray(imcolor, dtype=np.uint8) + features.append(np.concatenate([ + mh.features.haralick(im).ravel(), + chist(imcolor), + ])) + + +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 95.6% + + +features = [] +for im in images: + imcolor = mh.imread(im) + # Ignore everything in the 200 pixels close to the borders + imcolor = imcolor[200:-200, 200:-200] + im = mh.colors.rgb2gray(imcolor, dtype=np.uint8) + features.append(np.concatenate([ + mh.features.haralick(im).ravel(), + chist(imcolor), + ])) + +sc = StandardScaler() +features = sc.fit_transform(features) +from scipy.spatial import distance +dists = distance.squareform(distance.pdist(features)) + + +fig, axes = plt.subplots(2, 9) +for ci,i in enumerate(range(0,90,10)): + left = images[i] + dists_left = dists[i] + right = dists_left.argsort() + # right[0] is the same as left[i], so pick the next closest element + right = right[1] + right = images[right] + left = mh.imread(left) + right = mh.imread(right) + axes[0, ci].imshow(left) + axes[1, ci].imshow(right) + + + +from sklearn.grid_search import GridSearchCV +C_range = 10.0 ** np.arange(-4, 3) +grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range}) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', grid)]) + +cv = cross_validation.KFold(len(features), 5, + shuffle=True, random_state=123) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) + + + + +from mahotas.features import surf +image = mh.demos.load('lena') +image = mh.colors.rgb2gray(image, dtype=np.uint8) +descriptors = surf.surf(image, descriptor_only=True) + +from mahotas.features import surf +descriptors = surf.dense(image, spacing=16) +alldescriptors = [] +for im in images: + im = mh.imread(im, as_grey=True) + im = im.astype(np.uint8) + alldescriptors.append(surf.dense(image, spacing=16)) +# get all 
descriptors into a single array +concatenated = np.concatenate(alldescriptors) +print('Number of descriptors: {}'.format( + len(concatenated))) +# use only every 64th vector +concatenated = concatenated[::64] +from sklearn.cluster import KMeans # FIXME CAPITALIZATION +k = 256 +km = KMeans(k) +km.fit(concatenated) + +features = [] +for d in alldescriptors: + c = km.predict(d) + features.append( + np.array([np.sum(c == ci) for ci in range(k)]) + ) +# build single array and convert to float +features = np.array(features, dtype=float) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 62.6% + + diff --git a/ch10/download.sh b/ch10/download.sh index 93121a7f..fb623f3d 100755 --- a/ch10/download.sh +++ b/ch10/download.sh @@ -2,7 +2,7 @@ mkdir -p AnimTransDistr cd AnimTransDistr -wget http://vision.stanford.edu/Datasets/AnimTransDistr.rar +curl -O http://vision.stanford.edu/Datasets/AnimTransDistr.rar unrar x AnimTransDistr.rar # The following file is a weird file: rm Anims/104034.jpg diff --git a/ch10/features.py b/ch10/features.py index 27fd3e91..42847b30 100644 --- a/ch10/features.py +++ b/ch10/features.py @@ -34,6 +34,37 @@ def texture(im): 1-D array of features ''' im = im.astype(np.uint8) - return mh.features.haralick(im).mean(0) + return mh.features.haralick(im).ravel() +def chist(im): + '''Compute color histogram of input image + + Parameters + ---------- + im : ndarray + should be an RGB image + + Returns + ------- + c : ndarray + 1-D array of histogram values + ''' + + # Downsample pixel values: + im = im // 64 + + # We can also implement the following by using np.histogramdd + # im = im.reshape((-1,3)) + # bins = [np.arange(5), np.arange(5), np.arange(5)] + # hist = np.histogramdd(im, bins=bins)[0] + # hist = hist.ravel() + + # Separate RGB channels: + r,g,b = im.transpose((2,0,1)) + + pixels = 1 * r + 4 * g + 16 * b + hist = np.bincount(pixels.ravel(), minlength=64) + hist = hist.astype(float) + return np.log1p(hist) + diff --git a/ch10/figure10.py b/ch10/figure10.py index 058b1069..6cb45e7a 100644 --- a/ch10/figure10.py +++ b/ch10/figure10.py @@ -11,10 +11,10 @@ # This little script just builds an image with two examples, side-by-side: text = mh.imread("../SimpleImageDataset/text21.jpg") -scene = mh.imread("../SimpleImageDataset/scene00.jpg") +building = mh.imread("../SimpleImageDataset/building00.jpg") h, w, _ = text.shape canvas = np.zeros((h, 2 * w + 128, 3), np.uint8) -canvas[:, -w:] = scene +canvas[:, -w:] = building canvas[:, :w] = text canvas = canvas[::4, ::4] -mh.imsave('../1400OS_10_10+.jpg', canvas) +mh.imsave('figure10.jpg', canvas) diff --git a/ch10/figure13.py b/ch10/figure13.py deleted file mode 100644 index 89174e8e..00000000 --- a/ch10/figure13.py +++ /dev/null @@ -1,28 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import mahotas as mh -from mahotas.colors import rgb2grey -import numpy as np - -# Adds a little salt-n-pepper noise to an image - -im = mh.demos.load('lena') -im = rgb2grey(im) - -# Salt & pepper arrays -salt = np.random.random(im.shape) > .975 -pepper = np.random.random(im.shape) > .975 - -# salt is 170 & pepper is 30 -# Some playing around showed that setting these to more extreme values looks -# very artificial. 
These look nicer - -im = np.maximum(salt * 170, mh.stretch(im)) -im = np.minimum(pepper * 30 + im * (~pepper), im) - -mh.imsave('../1400OS_10_13+.jpg', im.astype(np.uint8)) diff --git a/ch10/figure18.py b/ch10/figure18.py deleted file mode 100644 index f17f9520..00000000 --- a/ch10/figure18.py +++ /dev/null @@ -1,121 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from __future__ import print_function -import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression -from matplotlib import pyplot as plt -import numpy as np -from glob import glob - -basedir = 'AnimTransDistr' -print('This script will test classification of the AnimTransDistr dataset') - - -def features_for(images): - fs = [] - for im in images: - im = mh.imread(im, as_grey=True).astype(np.uint8) - fs.append(mh.features.haralick(im).mean(0)) - return np.array(fs) - - -def features_labels(groups): - labels = np.zeros(sum(map(len, groups))) - st = 0 - for i, g in enumerate(groups): - labels[st:st + len(g)] = i - st += len(g) - return np.vstack(groups), labels - -classes = [ - 'Anims', - 'Cars', - 'Distras', - 'Trans', -] - -print('Computing whole-image texture features...') -features = [] -labels = [] -for ci, cl in enumerate(classes): - images = glob('{}/{}/*.jpg'.format(basedir, cl)) - features.extend(features_for(images)) - labels.extend([ci for _ in images]) - -features = np.array(features) -labels = np.array(labels) - -scores0 = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=10) -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: %s%%' % ( - 0.1 * round(1000 * scores0.mean()))) - -tfeatures = features - -from sklearn.cluster import KMeans -from mahotas.features import surf - -images = [] -labels = [] - -for ci, cl in enumerate(classes): - curimages = glob('{}/{}/*.jpg'.format(basedir, cl)) - images.extend(curimages) - labels.extend([ci for _ in curimages]) -labels = np.array(labels) - -print('Computing SURF descriptors...') -alldescriptors = [] -for im in images: - im = mh.imread(im, as_grey=1) - im = im.astype(np.uint8) - - # To use dense sampling, you can try the following line: - # alldescriptors.append(surf.dense(im, spacing=max(im.shape)//32)) - alldescriptors.append(surf.surf(im, descriptor_only=True)) - -print('Descriptors done') -k = 256 -km = KMeans(k) - -concatenated = np.concatenate(alldescriptors) -concatenated = concatenated[::64] -print('Clustering with K-means...') -km.fit(concatenated) -features = [] -for d in alldescriptors: - c = km.predict(d) - features.append( - np.array([np.sum(c == i) for i in range(k)]) - ) -features = np.array(features) -print('predicting...') -scoreSURFlr = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Log. Reg [SURF features]: %s%%' % ( - 0.1 * round(1000 * scoreSURFlr.mean()))) - -print('Attemping classification with all features combined...') -allfeatures = np.hstack([features, tfeatures]) -scoreSURFplr = cross_validation.cross_val_score( - LogisticRegression(), allfeatures, labels, cv=5).mean() - -print('Accuracy (5 fold x-val) with Log. 
Reg [All features]: %s%%' % ( - 0.1 * round(1000 * scoreSURFplr.mean()))) - -plt.plot([0, 1, 2], 100 * - np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), 'k-', lw=8) -plt.plot( - [0, 1, 2], 100 * np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), - 'o', mec='#cccccc', mew=12, mfc='white') -plt.xlim(-.5, 2.5) -plt.ylim(scores0.mean() * 90., scoreSURFplr * 110) -plt.xticks([0, 1, 2], ["Texture", "SURF", "combined"]) -plt.ylabel('Accuracy (%)') -plt.savefig('../1400OS_10_18+.png') diff --git a/ch10/figure5_6.py b/ch10/figure5_6.py deleted file mode 100644 index 37d52752..00000000 --- a/ch10/figure5_6.py +++ /dev/null @@ -1,37 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -import mahotas as mh -image = mh.imread('../SimpleImageDataset/building05.jpg') -image = mh.colors.rgb2gray(image) - -# Compute Gaussian filtered versions with increasing kernel widths -im8 = mh.gaussian_filter(image, 8) -im16 = mh.gaussian_filter(image, 16) -im32 = mh.gaussian_filter(image, 32) - -# We now build a composite image with three panels: -# -# [ IM8 | | IM16 | | IM32 ] - -h, w = im8.shape -canvas = np.ones((h, 3 * w + 256), np.uint8) -canvas *= 255 -canvas[:, :w] = im8 -canvas[:, w + 128:2 * w + 128] = im16 -canvas[:, -w:] = im32 -mh.imsave('../1400OS_10_05+.jpg', canvas[:, ::2]) - -# Threshold the image -# We need to first stretch it to convert to an integer image -im32 = mh.stretch(im32) -ot32 = mh.otsu(im32) - -# Convert to 255 np.uint8 to match the other images -im255 = 255 * (im32 > ot32).astype(np.uint8) -mh.imsave('../1400OS_10_06+.jpg', im255) diff --git a/ch10/figure9.py b/ch10/figure9.py deleted file mode 100644 index fbaa985f..00000000 --- a/ch10/figure9.py +++ /dev/null @@ -1,28 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -import mahotas as mh -image = mh.imread('../SimpleImageDataset/building05.jpg') -image = mh.colors.rgb2gray(image, dtype=np.uint8) - -# We need to downsample ``image`` so that the details are visibly pixelated. 
-# This exaggerates the effect so that the result is clear -image = image[::4, ::4] -thresh = mh.sobel(image) -filtered = mh.sobel(image, just_filter=True) - -thresh = mh.dilate(thresh, np.ones((7, 7))) -filtered = mh.dilate(mh.stretch(filtered), np.ones((7, 7))) - -# Paste the thresholded and non-thresholded versions of the image side-by-side -h, w = thresh.shape -canvas = 255 * np.ones((h, w * 2 + 64), np.uint8) -canvas[:, :w] = thresh * 255 -canvas[:, -w:] = filtered - -mh.imsave('../1400OS_10_09+.jpg', canvas) diff --git a/ch10/large_classification.py b/ch10/large_classification.py new file mode 100644 index 00000000..8db3571b --- /dev/null +++ b/ch10/large_classification.py @@ -0,0 +1,108 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import mahotas as mh +from glob import glob +from sklearn import cross_validation +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.grid_search import GridSearchCV +import numpy as np + +basedir = 'AnimTransDistr' +print('This script will test classification of the AnimTransDistr dataset') + +C_range = 10.0 ** np.arange(-4, 3) +grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range}) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', grid)]) + +def features_for(im): + from features import chist + im = mh.imread(im) + img = mh.colors.rgb2grey(im).astype(np.uint8) + return np.concatenate([mh.features.haralick(img).ravel(), + chist(im)]) + +def images(): + '''Iterate over all (image,label) pairs + + This function will return + ''' + for ci, cl in enumerate(classes): + images = glob('{}/{}/*.jpg'.format(basedir, cl)) + for im in sorted(images): + yield im, ci + +classes = [ + 'Anims', + 'Cars', + 'Distras', + 'Trans', +] + +print('Computing whole-image texture features...') +ifeatures = [] +labels = [] +for im, ell in images(): + ifeatures.append(features_for(im)) + labels.append(ell) + +ifeatures = np.array(ifeatures) +labels = np.array(labels) + +cv = cross_validation.KFold(len(ifeatures), 5, shuffle=True, random_state=123) +scores0 = cross_validation.cross_val_score( + clf, ifeatures, labels, cv=cv) +print('Accuracy (5 fold x-val) with Logistic Regression [image features]: {:.1%}'.format( + scores0.mean())) + + +from sklearn.cluster import KMeans +from mahotas.features import surf + + +print('Computing SURF descriptors...') +alldescriptors = [] +for im,_ in images(): + im = mh.imread(im, as_grey=True) + im = im.astype(np.uint8) + + # To use dense sampling, you can try the following line: + # alldescriptors.append(surf.dense(im, spacing=16)) + alldescriptors.append(surf.surf(im, descriptor_only=True)) + +print('Descriptor computation complete.') +k = 256 +km = KMeans(k) + +concatenated = np.concatenate(alldescriptors) +print('Number of descriptors: {}'.format( + len(concatenated))) +concatenated = concatenated[::64] +print('Clustering with K-means...') +km.fit(concatenated) +sfeatures = [] +for d in alldescriptors: + c = km.predict(d) + sfeatures.append(np.bincount(c, minlength=k)) +sfeatures = np.array(sfeatures, dtype=float) +print('predicting...') +score_SURF = cross_validation.cross_val_score( + clf, sfeatures, labels, cv=cv).mean() +print('Accuracy (5 fold x-val) with Logistic Regression [SURF features]: 
{:.1%}'.format( + score_SURF.mean())) + + +print('Performing classification with all features combined...') +allfeatures = np.hstack([sfeatures, ifeatures]) +score_SURF_global = cross_validation.cross_val_score( + clf, allfeatures, labels, cv=cv).mean() +print('Accuracy (5 fold x-val) with Logistic Regression [All features]: {:.1%}'.format( + score_SURF_global.mean())) diff --git a/ch10/neighbors.py b/ch10/neighbors.py new file mode 100644 index 00000000..1f71d0de --- /dev/null +++ b/ch10/neighbors.py @@ -0,0 +1,64 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing + +import numpy as np +import mahotas as mh +from glob import glob +from features import texture, chist +from matplotlib import pyplot as plt +from sklearn.preprocessing import StandardScaler +from scipy.spatial import distance + +basedir = '../SimpleImageDataset/' + + +haralicks = [] +chists = [] + +print('Computing features...') +# Use glob to get all the images +images = glob('{}/*.jpg'.format(basedir)) +# We sort the images to ensure that they are always processed in the same order +# Otherwise, this would introduce some variation just based on the random +# ordering that the filesystem uses +images.sort() + +for fname in images: + imc = mh.imread(fname) + imc = imc[200:-200,200:-200] + haralicks.append(texture(mh.colors.rgb2grey(imc))) + chists.append(chist(imc)) + +haralicks = np.array(haralicks) +chists = np.array(chists) +features = np.hstack([chists, haralicks]) + +print('Computing neighbors...') +sc = StandardScaler() +features = sc.fit_transform(features) +dists = distance.squareform(distance.pdist(features)) + +print('Plotting...') +fig, axes = plt.subplots(2, 9, figsize=(16,8)) + +# Remove ticks from all subplots +for ax in axes.flat: + ax.set_xticks([]) + ax.set_yticks([]) + +for ci,i in enumerate(range(0,90,10)): + left = images[i] + dists_left = dists[i] + right = dists_left.argsort() + # right[0] is the same as left[i], so pick the next closest element + right = right[1] + right = images[right] + left = mh.imread(left) + right = mh.imread(right) + axes[0, ci].imshow(left) + axes[1, ci].imshow(right) + +fig.tight_layout() +fig.savefig('figure_neighbors.png', dpi=300) diff --git a/ch10/scene00.jpg b/ch10/scene00.jpg new file mode 100644 index 00000000..ed727a50 Binary files /dev/null and b/ch10/scene00.jpg differ diff --git a/ch10/simple_classification.py b/ch10/simple_classification.py index 7f8a214d..a5a448d2 100644 --- a/ch10/simple_classification.py +++ b/ch10/simple_classification.py @@ -6,18 +6,20 @@ # It is made available under the MIT License import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression import numpy as np from glob import glob -from features import texture, edginess_sobel + +from features import texture, chist +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler basedir = '../SimpleImageDataset/' haralicks = [] -sobels = [] labels = [] +chists = [] print('This script will test (with cross-validation) classification of the simple 3 class dataset') print('Computing features...') @@ -28,9 +30,9 @@ # Otherwise, this would introduce some variation just based on the random # ordering that the filesystem uses for fname in sorted(images): - im = mh.imread(fname, as_grey=True) - haralicks.append(texture(im)) - 
sobels.append(edginess_sobel(im)) + imc = mh.imread(fname) + haralicks.append(texture(mh.colors.rgb2grey(imc))) + chists.append(chist(imc)) # Files are named like building00.jpg, scene23.jpg... labels.append(fname[:-len('xx.jpg')]) @@ -38,27 +40,31 @@ print('Finished computing features.') haralicks = np.array(haralicks) -sobels = np.array(sobels) labels = np.array(labels) +chists = np.array(chists) + +haralick_plus_chists = np.hstack([chists, haralicks]) + -# We use logistic regression because it is very fast. +# We use Logistic Regression because it achieves high accuracy on small(ish) datasets # Feel free to experiment with other classifiers -scores = cross_validation.cross_val_score( - LogisticRegression(), haralicks, labels, cv=5) -print('Accuracy (5 fold x-val) with Logistic Regression [std features]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) -haralick_plus_sobel = np.hstack([np.atleast_2d(sobels).T, haralicks]) +from sklearn import cross_validation +cv = cross_validation.LeaveOneOut(len(images)) scores = cross_validation.cross_val_score( - LogisticRegression(), haralick_plus_sobel, labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Logistic Regression [std features + sobel]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) + clf, haralicks, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [haralick features]: {:.1%}'.format( + scores.mean())) +scores = cross_validation.cross_val_score( + clf, chists, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [color histograms]: {:.1%}'.format( + scores.mean())) -# We can try to just use the sobel feature. The result is almost completely -# random. scores = cross_validation.cross_val_score( - LogisticRegression(), np.atleast_2d(sobels).T, labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Logistic Regression [only using sobel feature]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) + clf, haralick_plus_chists, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [texture features + color histograms]: {:.1%}'.format( + scores.mean())) diff --git a/ch10/thresholded_figure.py b/ch10/thresholded_figure.py new file mode 100644 index 00000000..947762e8 --- /dev/null +++ b/ch10/thresholded_figure.py @@ -0,0 +1,31 @@ +import mahotas as mh +import numpy as np +from matplotlib import pyplot as plt + +# Load image & convert to B&W +image = mh.imread('../SimpleImageDataset/scene00.jpg') +image = mh.colors.rgb2grey(image, dtype=np.uint8) +plt.imshow(image) +plt.gray() +plt.title('original image') + +thresh = mh.thresholding.otsu(image) +print('Otsu threshold is {}.'.format(thresh)) + +threshed = (image > thresh) +plt.figure() +plt.imshow(threshed) +plt.title('thresholded image') +mh.imsave('thresholded.png', threshed.astype(np.uint8)*255) + +im16 = mh.gaussian_filter(image, 16) + +# Repeat the thresholding operations with the blurred image +thresh = mh.thresholding.otsu(im16.astype(np.uint8)) +threshed = (im16 > thresh) +plt.figure() +plt.imshow(threshed) +plt.title('thresholded image (after blurring)') +print('Otsu threshold after blurring is {}.'.format(thresh)) +mh.imsave('thresholded16.png', threshed.astype(np.uint8)*255) +plt.show() diff --git a/ch12/.gitignore b/ch12/.gitignore index fb47e2ef..fa8fbcc2 100644 --- a/ch12/.gitignore +++ b/ch12/.gitignore @@ -1 +1,3 @@ *.jugdata/ +output.txt +results.image.txt diff --git a/ch12/README.rst b/ch12/README.rst new file mode
100644 index 00000000..fba88d4a --- /dev/null +++ b/ch12/README.rst @@ -0,0 +1,28 @@ +========== +Chapter 12 +========== + +Support code for *Chapter 12: Big(ger) Data* + +Data +---- + +This chapter relies only on the image dataset that is packaged with the +repository at ``../SimpleImageDataset/``. + +Scripts +------- + +chapter.py + Code as written in the book +jugfile.py + Example jugfile +image-classification.py + Jugfile implementation of image classification from Chapter 10 + +setup-aws.txt + Commands to setup Amazon WebServices machine +run-jugfile.sh + Wrapper script to run jug file on jugfile.py +run-image-classification.sh + Wrapper script to run jug file on image-classification.py diff --git a/ch12/chapter.py b/ch12/chapter.py new file mode 100644 index 00000000..15326680 --- /dev/null +++ b/ch12/chapter.py @@ -0,0 +1,95 @@ +from jug import TaskGenerator +from glob import glob +import mahotas as mh +@TaskGenerator +def compute_texture(im): + from features import texture + imc = mh.imread(im) + return texture(mh.colors.rgb2gray(imc)) + +@TaskGenerator +def chist_file(fname): + from features import chist + im = mh.imread(fname) + return chist(im) + +import numpy as np +to_array = TaskGenerator(np.array) +hstack = TaskGenerator(np.hstack) + +haralicks = [] +chists = [] +labels = [] + +# Change this variable to point to +# the location of the dataset is on disk +basedir = '../SimpleImageDataset/' +# Use glob to get all the images +images = glob('{}/*.jpg'.format(basedir)) + +for fname in sorted(images): + haralicks.append(compute_texture(fname)) + chists.append(chist_file(fname)) + # The class is encoded in the filename as xxxx00.jpg + labels.append(fname[:-len('00.jpg')]) + +haralicks = to_array(haralicks) +chists = to_array(chists) +labels = to_array(labels) + +@TaskGenerator +def accuracy(features, labels): + from sklearn.linear_model import LogisticRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn import cross_validation + + clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + cv = cross_validation.LeaveOneOut(len(features)) + scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) + return scores.mean() +scores_base = accuracy(haralicks, labels) +scores_chist = accuracy(chists, labels) + +combined = hstack([chists, haralicks]) +scores_combined = accuracy(combined, labels) + +@TaskGenerator +def print_results(scores): + with open('results.image.txt', 'w') as output: + for k,v in scores: + output.write('Accuracy [{}]: {:.1%}\n'.format( + k, v.mean())) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('combined' , scores_combined), + ]) + +@TaskGenerator +def compute_lbp(fname): + from mahotas.features import lbp + imc = mh.imread(fname) + im = mh.colors.rgb2grey(imc) + return lbp(im, radius=8, points=6) + +lbps = [] +for fname in sorted(images): + # the rest of the loop as before + lbps.append(compute_lbp(fname)) +lbps = to_array(lbps) + +scores_lbps = accuracy(lbps, labels) +combined_all = hstack([chists, haralicks, lbps]) +scores_combined_all = accuracy(combined_all, labels) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('lbps', scores_lbps), + ('combined' , scores_combined), + ('combined_all' , scores_combined_all), + ]) diff --git a/ch12/features.py b/ch12/features.py new file mode 120000 index 00000000..142a324d --- /dev/null +++ b/ch12/features.py @@ -0,0 +1 @@ +../ch10/features.py \ No newline at end 
of file diff --git a/ch12/image-classification.py b/ch12/image-classification.py index 6283a588..6f76d26d 100644 --- a/ch12/image-classification.py +++ b/ch12/image-classification.py @@ -6,11 +6,11 @@ # It is made available under the MIT License import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression import numpy as np from glob import glob from jug import TaskGenerator + +# We need to use the `features` module from chapter 10. from sys import path path.append('../ch10') @@ -20,7 +20,7 @@ basedir = '../SimpleImageDataset/' @TaskGenerator -def features_for(im): +def compute_texture(im): '''Compute features for an image Parameters @@ -33,57 +33,84 @@ def features_for(im): fs : ndarray 1-D array of features ''' - im = mh.imread(im, as_grey=True).astype(np.uint8) - return mh.features.haralick(im).mean(0) + from features import texture + imc = mh.imread(im) + return texture(mh.colors.rgb2grey(imc)) + +@TaskGenerator +def chist(fname): + from features import chist as color_histogram + im = mh.imread(fname) + return color_histogram(im) @TaskGenerator -def edginess_sobel_from_fname(fname): - from edginess import edginess_sobel - return edginess_sobel(mh.imread(fname, as_grey=True)) +def compute_lbp(fname): + from mahotas.features import lbp + imc = mh.imread(fname) + im = mh.colors.rgb2grey(imc) + return lbp(im, radius=8, points=6) + @TaskGenerator def accuracy(features, labels): + from sklearn.linear_model import LogisticRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn import cross_validation # We use logistic regression because it is very fast. # Feel free to experiment with other classifiers + clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + cv = cross_validation.LeaveOneOut(len(features)) scores = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=5) + clf, features, labels, cv=cv) return scores.mean() @TaskGenerator -def stack_features(sobels, haralicks): - return np.hstack([np.atleast_2d(sobels).T, haralicks]) - -@TaskGenerator -def print_results(scores_base, scores_combined): - output = open('results.image.txt', 'w') - output.write('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: {}%\n'.format( - 0.1 * round(1000 * scores_base.mean()))) - output.write('Accuracy (5 fold x-val) with Logistic Regrssion [std features + sobel]: {}%\n'.format( - 0.1 * round(1000 * scores_combined.mean()))) - output.close() +def print_results(scores): + with open('results.image.txt', 'w') as output: + for k,v in scores: + output.write('Accuracy (LOO x-val) with Logistic Regression [{0}]: {1:.1%}\n'.format( + k, v.mean())) to_array = TaskGenerator(np.array) +hstack = TaskGenerator(np.hstack) haralicks = [] -sobels = [] +chists = [] +lbps = [] labels = [] # Use glob to get all the images -images = glob('{}/*.jpg'.format(basedir)) +images = glob('{0}/*.jpg'.format(basedir)) for fname in sorted(images): - haralicks.append(features_for(fname)) - sobels.append(edginess_sobel_from_fname(fname)) + haralicks.append(compute_texture(fname)) + chists.append(chist(fname)) + lbps.append(compute_lbp(fname)) labels.append(fname[:-len('00.jpg')]) # The class is encoded in the filename as xxxx00.jpg haralicks = to_array(haralicks) -sobels = to_array(sobels) +chists = to_array(chists) +lbps = to_array(lbps) labels = to_array(labels) scores_base = accuracy(haralicks, labels) -haralick_plus_sobel = stack_features(sobels, 
haralicks) -scores_combined = accuracy(haralick_plus_sobel, labels) - -print_results(scores_base, scores_combined) +scores_chist = accuracy(chists, labels) +scores_lbps = accuracy(lbps, labels) + +combined = hstack([chists, haralicks]) +scores_combined = accuracy(combined, labels) + +combined_all = hstack([chists, haralicks, lbps]) +scores_combined_all = accuracy(combined_all, labels) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('lbps', scores_lbps), + ('combined' , scores_combined), + ('combined_all' , scores_combined_all), + ]) diff --git a/ch12/jugfile.py b/ch12/jugfile.py index 858c64f4..9d7e2b7a 100644 --- a/ch12/jugfile.py +++ b/ch12/jugfile.py @@ -23,7 +23,7 @@ def add(a, b): @TaskGenerator def print_final_result(oname, value): with open(oname, 'w') as output: - output.write("Final result: {}\n".format(value)) + output.write("Final result: {0}\n".format(value)) input = 2 y = double(input) diff --git a/ch12/run-image-classification.sh b/ch12/run-image-classification.sh new file mode 100755 index 00000000..868d07fa --- /dev/null +++ b/ch12/run-image-classification.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +jug execute image-classification.py diff --git a/ch12/run-jugfile.sh b/ch12/run-jugfile.sh new file mode 100755 index 00000000..0ff59131 --- /dev/null +++ b/ch12/run-jugfile.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +jug execute + diff --git a/ch12/setup-aws.txt b/ch12/setup-aws.txt new file mode 100644 index 00000000..292654eb --- /dev/null +++ b/ch12/setup-aws.txt @@ -0,0 +1,7 @@ +sudo yum update +sudo yum -y install python-devel python-pip numpy scipy python-matplotlib +sudo yum -y install gcc-c++ +sudo yum -y install git +sudo pip-python install -U pip +sudo pip install scikit-learn jug mahotas +
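For reference, once the setup-aws.txt commands above have run, the ch12 jug scripts can be driven directly from the shell. The session below is only a sketch: the clone location is an assumption, extra ``jug execute`` invocations simply attach more workers to the same task store, and results.image.txt is the file written by the print_results task.

git clone https://github.com/luispedro/BuildingMachineLearningSystemsWithPython
cd BuildingMachineLearningSystemsWithPython/ch12
jug execute image-classification.py &   # start one worker in the background
jug execute image-classification.py &   # run again to add a second worker
jug status image-classification.py      # report waiting/running/finished tasks
cat results.image.txt                   # accuracy summary from print_results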