diff --git a/README.md b/README.md index 05218427..cbcd3155 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,14 @@ Building Machine Learning Systems with Python ============================================= -Source Code for the book Building Machine Learning Systems with Python by -[Willi Richert](http://twotoreal.com) and [Luis Pedro -Coelho](http://luispedro.org). +Source Code for the book Building Machine Learning Systems with Python by [Luis +Pedro Coelho](http://luispedro.org) and [Willi Richert](http://twotoreal.com). -The book was published in 2013 by Packt Publishing and is available [from their +The book was published in 2013 (second edition in 2015) by Packt Publishing and +is available [from their website](http://www.packtpub.com/building-machine-learning-systems-with-python/book). +The code in the repository corresponds to the second edition. Code for the +first edition is available in [first\_edition +branch](https://github.com/luispedro/BuildingMachineLearningSystemsWithPython/tree/first_edition). diff --git a/SimpleImageDataset/scene01.jpg b/SimpleImageDataset/scene01.jpg index 29b327d2..e7b416d3 100644 Binary files a/SimpleImageDataset/scene01.jpg and b/SimpleImageDataset/scene01.jpg differ diff --git a/SimpleImageDataset/scene08.jpg b/SimpleImageDataset/scene08.jpg index acc89a74..7dee8860 100644 Binary files a/SimpleImageDataset/scene08.jpg and b/SimpleImageDataset/scene08.jpg differ diff --git a/ch01/analyze_webstats.py b/ch01/analyze_webstats.py index 8d3c4c41..5da892e2 100644 --- a/ch01/analyze_webstats.py +++ b/ch01/analyze_webstats.py @@ -26,8 +26,9 @@ x = x[~sp.isnan(y)] y = y[~sp.isnan(y)] -# plot input data + def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None): + ''' plot input data ''' plt.figure(num=None, figsize=(8, 6)) plt.clf() @@ -138,8 +139,8 @@ def error(f, x, y): train = sorted(shuffled[split_idx:]) fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1)) fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2)) -print("fbt2(x)= \n%s"%fbt2) -print("fbt2(x)-100,000= \n%s"%(fbt2-100000)) +print("fbt2(x)= \n%s" % fbt2) +print("fbt2(x)-100,000= \n%s" % (fbt2-100000)) fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3)) fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10)) fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100)) diff --git a/ch01/gen_webstats.py b/ch01/gen_webstats.py index fa133d76..61d0b738 100644 --- a/ch01/gen_webstats.py +++ b/ch01/gen_webstats.py @@ -17,26 +17,22 @@ sp.random.seed(3) # to reproduce the data later on -x = sp.arange(1, 31 * 24) -y = sp.array(200 * (sp.sin(2 * sp.pi * x / (7 * 24))), dtype=int) +x = sp.arange(1, 31*24) +y = sp.array(200*(sp.sin(2*sp.pi*x/(7*24))), dtype=int) y += gamma.rvs(15, loc=0, scale=100, size=len(x)) -y += 2 * sp.exp(x / 100.0) -y = sp.ma.array(y, mask=[y < 0]) -print(sum(y), sum(y < 0)) +y += 2 * sp.exp(x/100.0) +y = sp.ma.array(y, mask=[y<0]) +print(sum(y), sum(y<0)) plt.scatter(x, y) plt.title("Web traffic over the last month") plt.xlabel("Time") plt.ylabel("Hits/hour") -plt.xticks([w * 7 * 24 for w in [0, 1, 2, 3, 4]], ['week %i' % (w + 1) for w in - [0, 1, 2, 3, 4]]) - +plt.xticks([w*7*24 for w in range(5)], + ['week %i' %(w+1) for w in range(5)]) plt.autoscale(tight=True) plt.grid() plt.savefig(os.path.join(CHART_DIR, "1400_01_01.png")) -# sp.savetxt(os.path.join("..", "web_traffic.tsv"), -# zip(x[~y.mask],y[~y.mask]), delimiter="\t", fmt="%i") - -sp.savetxt(os.path.join( - DATA_DIR, "web_traffic.tsv"), list(zip(x, y)), delimiter="\t", fmt="%s") +sp.savetxt(os.path.join(DATA_DIR, 
"web_traffic.tsv"), + list(zip(x, y)), delimiter="\t", fmt="%s") diff --git a/ch02/README.rst b/ch02/README.rst index 4dc0b211..e2cb729a 100644 --- a/ch02/README.rst +++ b/ch02/README.rst @@ -6,6 +6,9 @@ Support code for *Chapter 2: Learning How to Classify with Real-world Examples*. The directory data contains the seeds dataset, originally downloaded from https://archive.ics.uci.edu/ml/datasets/seeds +chapter.py + The code as printed in the book. + figure1.py Figure 1 in the book: all 2-by-2 scatter plots diff --git a/ch02/chapter.py b/ch02/chapter.py new file mode 100644 index 00000000..c68b45ab --- /dev/null +++ b/ch02/chapter.py @@ -0,0 +1,164 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + + +from matplotlib import pyplot as plt +import numpy as np + +# We load the data with load_iris from sklearn +from sklearn.datasets import load_iris +data = load_iris() + +# load_iris returns an object with several fields +features = data.data +feature_names = data.feature_names +target = data.target +target_names = data.target_names + +for t in range(3): + if t == 0: + c = 'r' + marker = '>' + elif t == 1: + c = 'g' + marker = 'o' + elif t == 2: + c = 'b' + marker = 'x' + plt.scatter(features[target == t, 0], + features[target == t, 1], + marker=marker, + c=c) +# We use NumPy fancy indexing to get an array of strings: +labels = target_names[target] + +# The petal length is the feature at position 2 +plength = features[:, 2] + +# Build an array of booleans: +is_setosa = (labels == 'setosa') + +# This is the important step: +max_setosa =plength[is_setosa].max() +min_non_setosa = plength[~is_setosa].min() +print('Maximum of setosa: {0}.'.format(max_setosa)) + +print('Minimum of others: {0}.'.format(min_non_setosa)) + +# ~ is the boolean negation operator +features = features[~is_setosa] +labels = labels[~is_setosa] +# Build a new target variable, is_virigina +is_virginica = (labels == 'virginica') + +# Initialize best_acc to impossibly low value +best_acc = -1.0 +for fi in range(features.shape[1]): + # We are going to test all possible thresholds + thresh = features[:,fi] + for t in thresh: + + # Get the vector for feature `fi` + feature_i = features[:, fi] + # apply threshold `t` + pred = (feature_i > t) + acc = (pred == is_virginica).mean() + rev_acc = (pred == ~is_virginica).mean() + if rev_acc > acc: + reverse = True + acc = rev_acc + else: + reverse = False + + if acc > best_acc: + best_acc = acc + best_fi = fi + best_t = t + best_reverse = reverse + +print(best_fi, best_t, best_reverse, best_acc) + +def is_virginica_test(fi, t, reverse, example): + 'Apply threshold model to a new example' + test = example[fi] > t + if reverse: + test = not test + return test +from threshold import fit_model, predict + +# ning accuracy was 96.0%. +# ing accuracy was 90.0% (N = 50). 
+correct = 0.0 + +for ei in range(len(features)): + # select all but the one at position `ei`: + training = np.ones(len(features), bool) + training[ei] = False + testing = ~training + model = fit_model(features[training], is_virginica[training]) + predictions = predict(model, features[testing]) + correct += np.sum(predictions == is_virginica[testing]) +acc = correct/float(len(features)) +print('Accuracy: {0:.1%}'.format(acc)) + + +########################################### +############## SEEDS DATASET ############## +########################################### + +from load import load_dataset + +feature_names = [ + 'area', + 'perimeter', + 'compactness', + 'length of kernel', + 'width of kernel', + 'asymmetry coefficien', + 'length of kernel groove', +] +features, labels = load_dataset('seeds') + + + +from sklearn.neighbors import KNeighborsClassifier +classifier = KNeighborsClassifier(n_neighbors=1) +from sklearn.cross_validation import KFold + +kf = KFold(len(features), n_folds=5, shuffle=True) +means = [] +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns fraction + # of correct decisions for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Mean accuracy: {:.1%}'.format(np.mean(means))) + + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +classifier = KNeighborsClassifier(n_neighbors=1) +classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)]) + +means = [] +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns fraction + # of correct decisions for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Mean accuracy: {:.1%}'.format(np.mean(means))) diff --git a/ch02/figure1.py b/ch02/figure1.py index d6a4d287..4ec6fff8 100644 --- a/ch02/figure1.py +++ b/ch02/figure1.py @@ -5,9 +5,12 @@ # # It is made available under the MIT License -from sklearn.datasets import load_iris from matplotlib import pyplot as plt +# We load the data with load_iris from sklearn +from sklearn.datasets import load_iris + +# load_iris returns an object with several fields data = load_iris() features = data.data feature_names = data.feature_names @@ -16,13 +19,21 @@ fig,axes = plt.subplots(2, 3) pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)] + +# Set up 3 different pairs of (color, marker) +color_markers = [ + ('r', '>'), + ('g', 'o'), + ('b', 'x'), + ] for i, (p0, p1) in enumerate(pairs): ax = axes.flat[i] - # Use a different marker/color for each class `t` - for t, marker, c in zip(range(3), ">ox", "rgb"): + for t in range(3): + # Use a different color/marker for each class `t` + c,marker = color_markers[t] ax.scatter(features[target == t, p0], features[ - target == t, p1], marker=marker, c=c, s=40) + target == t, p1], marker=marker, c=c) ax.set_xlabel(feature_names[p0]) ax.set_ylabel(feature_names[p1]) ax.set_xticks([]) diff --git a/ch02/figure2.py b/ch02/figure2.py index e7222dd6..0b69d395 100644 --- a/ch02/figure2.py +++ b/ch02/figure2.py @@ -10,17 +10,22 @@ from matplotlib import pyplot as plt 
from sklearn.datasets import load_iris data = load_iris() -features = data['data'] -feature_names = data['feature_names'] -species = data['target_names'][data['target']] +features = data.data +feature_names = data.feature_names +target = data.target +target_names = data.target_names -is_setosa = (species == 'setosa') +# We use NumPy fancy indexing to get an array of strings: +labels = target_names[target] + +is_setosa = (labels == 'setosa') features = features[~is_setosa] -species = species[~is_setosa] -is_virginica = (species == 'virginica') +labels = labels[~is_setosa] +is_virginica = (labels == 'virginica') -# Hand fixed threshold: -t = 1.75 +# Hand fixed thresholds: +t = 1.65 +t2 = 1.75 # Features to use: 3 & 2 f0, f1 = 3, 2 @@ -45,7 +50,7 @@ ax.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c) ax.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c) ax.plot([t, t], [y0, y1], 'k--', lw=2) -ax.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2) +ax.plot([t2, t2], [y0, y1], 'k:', lw=2) ax.scatter(features[is_virginica, f0], features[is_virginica, f1], c='b', marker='o', s=40) ax.scatter(features[~is_virginica, f0], diff --git a/ch02/figure4_5_no_sklearn.py b/ch02/figure4_5_no_sklearn.py index 5f67e0d7..adc83d73 100644 --- a/ch02/figure4_5_no_sklearn.py +++ b/ch02/figure4_5_no_sklearn.py @@ -45,7 +45,7 @@ def plot_decision(features, labels): model = fit_model(1, features[:, (0, 2)], np.array(labels)) C = predict( - np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape) + model, np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape) if COLOUR_FIGURE: cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)]) else: diff --git a/ch02/figure4_5_sklearn.py b/ch02/figure4_5_sklearn.py index 9d20c7cb..55ac0c80 100644 --- a/ch02/figure4_5_sklearn.py +++ b/ch02/figure4_5_sklearn.py @@ -58,11 +58,11 @@ def plot_decision(features, labels, num_neighbors=1): ax.pcolormesh(X, Y, C, cmap=cmap) if COLOUR_FIGURE: cmap = ListedColormap([(1., .0, .0), (.1, .6, .1), (.0, .0, 1.)]) - ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap, s=40) + ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap) else: for lab, ma in zip(range(3), "Do^"): ax.plot(features[labels == lab, 0], features[ - labels == lab, 2], ma, c=(1., 1., 1.), ms=8) + labels == lab, 2], ma, c=(1., 1., 1.), ms=6) return fig,ax diff --git a/ch02/knn.py b/ch02/knn.py index e1d48ce0..89ebfdb4 100644 --- a/ch02/knn.py +++ b/ch02/knn.py @@ -26,7 +26,7 @@ def plurality(xs): return k # This function was called ``apply_model`` in the first edition -def predict(features, model): +def predict(model, features): '''Apply k-nn model''' k, train_feats, labels = model results = [] @@ -42,5 +42,5 @@ def predict(features, model): def accuracy(features, labels, model): - preds = predict(features, model) + preds = predict(model, features) return np.mean(preds == labels) diff --git a/ch02/simple_threshold.py b/ch02/simple_threshold.py index b0e4e71e..d174f283 100644 --- a/ch02/simple_threshold.py +++ b/ch02/simple_threshold.py @@ -12,9 +12,12 @@ target = data['target'] target_names = data['target_names'] labels = target_names[target] - plength = features[:, 2] + +# To use numpy operations to get setosa features, +# we build a boolean array is_setosa = (labels == 'setosa') + max_setosa = plength[is_setosa].max() min_non_setosa = plength[~is_setosa].min() diff --git a/ch02/threshold.py b/ch02/threshold.py index efbd942b..d621a350 100644 --- a/ch02/threshold.py +++ b/ch02/threshold.py @@ -40,7 +40,7 @@ def fit_model(features, labels): 
# This function was called ``apply_model`` in the first edition -def predict(features, model): +def predict(model, features): '''Apply a learned model''' # A model is a pair as returned by fit_model t, fi, reverse = model @@ -51,5 +51,5 @@ def predict(features, model): def accuracy(features, labels, model): '''Compute the accuracy of the model''' - preds = predict(features, model) + preds = predict(model, features) return np.mean(preds == labels) diff --git a/ch04/.gitignore b/ch04/.gitignore index 0a7a205c..c4c0b18a 100644 --- a/ch04/.gitignore +++ b/ch04/.gitignore @@ -1,2 +1,6 @@ wiki_lda.pkl wiki_lda.pkl.state +*.png +*.npy +*.pkl +topics.txt diff --git a/ch04/README.rst b/ch04/README.rst index e40eccbe..99a3c186 100644 --- a/ch04/README.rst +++ b/ch04/README.rst @@ -4,6 +4,16 @@ Chapter 4 Support code for *Chapter 4: Topic Modeling* + +AP Data +------- + +To download the AP data, use the ``download_ap.sh`` script inside the ``data`` +directory:: + + cd data + ./download_ap.sh + Word cloud creation ------------------- @@ -33,7 +43,9 @@ may remove the input file if you want to save disk space To generate the model, you can run the ``wikitopics_create.py`` script, while the ``wikitopics_plot.py`` script will plot the most heavily discussed topic as -well as the least heavily discussed one. +well as the least heavily discussed one. The code is split into steps as the +first one can take a very long time. Then it saves the results so that you can +later explore them at leisure. You should not expect that your results will exactly match the results in the book, for two reasons: @@ -41,3 +53,13 @@ book, for two reasons: 1. The LDA algorithm is a probabilistic algorithm and can give different results every time it is run. 2. Wikipedia keeps changing. Thus, even your input data will be different. + +Scripts +------- + +blei_lda.py + Computes LDA using the AP Corpus. +wikitopics_create.py + Create the topic model for Wikipedia using LDA (must download wikipedia database first) +wikitopics_create_hdp.py + Create the topic model for Wikipedia using HDP (must download wikipedia database first) diff --git a/ch04/blei_lda.py b/ch04/blei_lda.py index 14f18349..7f6ac2b3 100644 --- a/ch04/blei_lda.py +++ b/ch04/blei_lda.py @@ -8,19 +8,13 @@ from __future__ import print_function from wordcloud import create_cloud try: - from gensim import corpora, models + from gensim import corpora, models, matutils except: print("import gensim failed.") print() print("Please install it") raise -try: - from mpltools import style - style.use('ggplot') -except: - print("Could not import mpltools: plots will not be styled correctly") - import matplotlib.pyplot as plt import numpy as np from os import path @@ -39,27 +33,22 @@ model = models.ldamodel.LdaModel( corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=None) -ti = 0 # Iterate over all the topics in the model -for ti in xrange(model.num_topics): +for ti in range(model.num_topics): words = model.show_topic(ti, 64) - tf = sum(f for f, w in words) + tf = sum(f for _, f in words) with open('topics.txt', 'w') as output: - output.write('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for f, w in words)) + output.write('\n'.join('{}:{}'.format(w, int(1000. 
* f / tf)) for w, f in words)) output.write("\n\n\n") # We first identify the most discussed topic, i.e., the one with the # highest total weight -# First, we need to sum up the weights across all the documents -weight = np.zeros(model.num_topics) -for doc in corpus: - for col, val in model[doc]: - weight[col] += val - # As a reasonable alternative, we could have used the log of val: - # weight[col] += np.log(val) +topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics) +weight = topics.sum(1) max_topic = weight.argmax() + # Get the top 64 words for this topic # Without the argument, show_topic would return only 10 words words = model.show_topic(max_topic, 64) @@ -68,24 +57,30 @@ create_cloud('cloud_blei_lda.png', words) num_topics_used = [len(model[doc]) for doc in corpus] -plt.hist(num_topics_used, np.arange(42)) -plt.ylabel('Nr of documents') -plt.xlabel('Nr of topics') -plt.savefig('../1400OS_04_01+.png') -plt.clf() +fig,ax = plt.subplots() +ax.hist(num_topics_used, np.arange(42)) +ax.set_ylabel('Nr of documents') +ax.set_xlabel('Nr of topics') +fig.tight_layout() +fig.savefig('Figure_04_01.png') -# Now, repeat the same exercise using alpha=1: +# Now, repeat the same exercise using alpha=1.0 +# You can edit the constant below to play around with this parameter +ALPHA = 1.0 model1 = models.ldamodel.LdaModel( - corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=1.) + corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=ALPHA) num_topics_used1 = [len(model1[doc]) for doc in corpus] -plt.hist([num_topics_used, num_topics_used1], np.arange(42)) -plt.ylabel('Nr of documents') -plt.xlabel('Nr of topics') +fig,ax = plt.subplots() +ax.hist([num_topics_used, num_topics_used1], np.arange(42)) +ax.set_ylabel('Nr of documents') +ax.set_xlabel('Nr of topics') # The coordinates below were fit by trial and error to look good -plt.text(9, 223, r'default alpha') -plt.text(26, 156, 'alpha=1.0') -plt.savefig('../1400OS_04_02+.png') +ax.text(9, 223, r'default alpha') +ax.text(26, 156, 'alpha=1.0') +fig.tight_layout() +fig.savefig('Figure_04_02.png') + diff --git a/ch04/build_lda.py b/ch04/build_lda.py index d4dcb7f2..a0ee9c5f 100644 --- a/ch04/build_lda.py +++ b/ch04/build_lda.py @@ -79,7 +79,7 @@ def __len__(self): distances = distance.squareform(distance.pdist(thetas)) large = distances.max() + 1 -for i in xrange(len(distances)): +for i in range(len(distances)): distances[i, i] = large print(otexts[1]) diff --git a/ch04/data/.gitignore b/ch04/data/.gitignore index 2436b020..91f24bbc 100644 --- a/ch04/data/.gitignore +++ b/ch04/data/.gitignore @@ -1,8 +1,12 @@ ap.tgz ap/ +dataset-379-20news-18828_HJRZF.zip +379/ enwiki-latest-pages-articles.xml.bz2 wiki_en_output_bow.mm +wiki_en_output_bow.mm.gz wiki_en_output_bow.mm.index wiki_en_output_tfidf.mm +wiki_en_output_tfidf.mm.gz wiki_en_output_tfidf.mm.index wiki_en_output_wordids.txt.bz2 diff --git a/ch04/data/download_ap.sh b/ch04/data/download_ap.sh index 6de8ded8..da27814a 100755 --- a/ch04/data/download_ap.sh +++ b/ch04/data/download_ap.sh @@ -1,3 +1,3 @@ #!/bin/sh -wget http://www.cs.princeton.edu/~blei/lda-c/ap.tgz +wget http://www.cs.columbia.edu/~blei/lda-c/ap.tgz tar xzf ap.tgz diff --git a/ch04/wikitopics_create_hdp.py b/ch04/wikitopics_create_hdp.py new file mode 100644 index 00000000..951d1850 --- /dev/null +++ b/ch04/wikitopics_create_hdp.py @@ -0,0 +1,39 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by 
PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import logging +import gensim +import numpy as np + +# Set up logging in order to get progress information as the model is being built: +logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', + level=logging.INFO) + +# Load the preprocessed corpus (id2word & mm): +id2word = gensim.corpora.Dictionary.load_from_text( + 'data/wiki_en_output_wordids.txt.bz2') +mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') + +# Calling the constructor is enough to build the model +# This call will take a few hours! +model = gensim.models.hdpmodel.HdpModel( + corpus=mm, + id2word=id2word, + chunksize=10000) + +# Save the model so we do not need to learn it again. +model.save('wiki_hdp.pkl') + +# Compute the document/topic matrix +topics = np.zeros((len(mm), model.num_topics)) +for di,doc in enumerate(mm): + doc_top = model[doc] + for ti,tv in doc_top: + topics[di,ti] += tv +np.save('topics_hdp.npy', topics) diff --git a/ch04/wikitopics_plot.py b/ch04/wikitopics_plot.py index f51e6dec..04adf780 100644 --- a/ch04/wikitopics_plot.py +++ b/ch04/wikitopics_plot.py @@ -31,7 +31,7 @@ topics = np.load('topics.npy', mmap_mode='r') # Compute the number of topics mentioned in each document -lens = (topics > 0).sum(1) +lens = (topics > 0).sum(axis=1) print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens))) print('Percentage of articles mentioning less than 10 topics: {0:.1%}'.format(np.mean(lens <= 10))) @@ -42,16 +42,23 @@ words = model.show_topic(weights.argmax(), 64) # The parameter ``maxsize`` often needs some manual tuning to make it look nice. -create_cloud('Wikipedia_most.png', words, maxsize=410, fontname='Neucha') -print(words) +create_cloud('Wikipedia_most.png', words, maxsize=250, fontname='Cardo') + +fraction_mention = np.mean(topics[:,weights.argmax()] > 0) +print("The most mentioned topics is mentioned in {:.1%} of documents.".format(fraction_mention)) +total_weight = np.mean(topics[:,weights.argmax()]) +print("It represents {:.1%} of the total number of words.".format(total_weight)) print() print() print() # Retrieve the **least** heavily used topic and plot it as a word cloud: words = model.show_topic(weights.argmin(), 64) -create_cloud('Wikipedia_least.png', words, maxsize=180, fontname='Neucha') -print(words) +create_cloud('Wikipedia_least.png', words, maxsize=150, fontname='Cardo') +fraction_mention = np.mean(topics[:,weights.argmin()] > 0) +print("The least mentioned topics is mentioned in {:.1%} of documents.".format(fraction_mention)) +total_weight = np.mean(topics[:,weights.argmin()]) +print("It represents {:.1%} of the total number of words.".format(total_weight)) print() print() print() diff --git a/ch04/wordcloud.py b/ch04/wordcloud.py index 6c5302ea..accca2d6 100644 --- a/ch04/wordcloud.py +++ b/ch04/wordcloud.py @@ -24,8 +24,6 @@ def create_cloud(oname, words,maxsize=120, fontname='Lobster'): # gensim returns a weight between 0 and 1 for each word, while pytagcloud # expects an integer word count. So, we multiply by a large number and # round. For a visualization this is an adequate approximation. 
- # We also need to flip the order as gensim returns (value, word), whilst - # pytagcloud expects (word, value): - words = [(w,int(v*10000)) for v,w in words] + words = [(w,int(v*10000)) for w,v in words] tags = make_tags(words, maxsize=maxsize) create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname) diff --git a/ch05/classify.py b/ch05/classify.py index ef5af419..76699819 100644 --- a/ch05/classify.py +++ b/ch05/classify.py @@ -54,6 +54,9 @@ def prepare_sent_features(): if not text: meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0 else: + from platform import python_version + if python_version().startswith('2'): + text = text.decode('utf-8') sent_lens = [len(nltk.word_tokenize( sent)) for sent in nltk.sent_tokenize(text)] meta[pid]['AvgSentLen'] = np.mean(sent_lens) diff --git a/ch06/01_start.py b/ch06/01_start.py index 2940b0fa..0bdbdd12 100644 --- a/ch06/01_start.py +++ b/ch06/01_start.py @@ -40,7 +40,7 @@ def create_ngram_model(): def train_model(clf_factory, X, Y, name="NB ngram", plot=False): cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) train_errors = [] test_errors = [] @@ -83,7 +83,7 @@ def train_model(clf_factory, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -94,18 +94,18 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) if __name__ == "__main__": X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] @@ -113,19 +113,19 @@ def print_incorrect(clf, X, Y): train_model(create_ngram_model, X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) train_model(create_ngram_model, X, Y, name="sent vs rest", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(create_ngram_model, X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(create_ngram_model, X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) diff --git a/ch06/02_tuning.py b/ch06/02_tuning.py index a48d3edc..5b6a835f 100644 --- a/ch06/02_tuning.py +++ b/ch06/02_tuning.py @@ -45,7 +45,7 @@ def create_ngram_model(params=None): def grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], @@ -64,7 +64,7 @@ def grid_search_model(clf_factory, X, Y): verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ - print clf + print(clf) return clf @@ -114,7 +114,7 @@ def train_model(clf, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -125,9 +125,9 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) def get_best_model(): @@ -149,16 +149,16 @@ def get_best_model(): X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) @@ -166,16 +166,16 @@ def get_best_model(): # rest", plot=True) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) diff --git a/ch06/03_clean.py b/ch06/03_clean.py index f00c2c44..0170ed82 100644 --- a/ch06/03_clean.py +++ b/ch06/03_clean.py @@ -57,7 +57,7 @@ } emo_repl_order = [k for (k_len, k) in reversed( - sorted([(len(k), k) for k in emo_repl.keys()]))] + sorted([(len(k), k) for k in list(emo_repl.keys())]))] re_repl = { r"\br\b": "are", @@ -84,7 +84,7 @@ def preprocessor(tweet): for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) - for r, repl in re_repl.iteritems(): + for r, repl in re_repl.items(): tweet = re.sub(r, repl, tweet) return tweet @@ -103,7 +103,7 @@ def preprocessor(tweet): def train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) train_errors = [] test_errors = [] @@ -150,7 +150,7 @@ def train_model(clf, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -161,9 +161,9 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) def get_best_model(): @@ -185,16 +185,16 @@ def get_best_model(): X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) @@ -202,16 +202,16 @@ def get_best_model(): # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) diff --git a/ch06/04_sent.py b/ch06/04_sent.py index 87fbafc9..c09a435f 100644 --- a/ch06/04_sent.py +++ b/ch06/04_sent.py @@ -153,7 +153,7 @@ def transform(self, documents): } emo_repl_order = [k for (k_len, k) in reversed( - sorted([(len(k), k) for k in emo_repl.keys()]))] + sorted([(len(k), k) for k in list(emo_repl.keys())]))] re_repl = { r"\br\b": "are", @@ -179,7 +179,7 @@ def preprocessor(tweet): for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) - for r, repl in re_repl.iteritems(): + for r, repl in re_repl.items(): tweet = re.sub(r, repl, tweet) return tweet.replace("-", " ").replace("_", " ") @@ -202,7 +202,7 @@ def preprocessor(tweet): def __grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], @@ -220,7 +220,7 @@ def __grid_search_model(clf_factory, X, Y): verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ - print clf + print(clf) return clf @@ -228,7 +228,7 @@ def __grid_search_model(clf_factory, X, Y): def train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) train_errors = [] test_errors = [] @@ -275,7 +275,7 @@ def train_model(clf, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -286,9 +286,9 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) def get_best_model(): @@ -315,16 +315,16 @@ def get_best_model(): #Y_orig = Y_orig[:100,] classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) @@ -332,18 +332,18 @@ def get_best_model(): # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) json.dump(poscache, open(poscache_filename, "w")) diff --git a/ch06/install.py b/ch06/install.py index c07a195f..be61f485 100644 --- a/ch06/install.py +++ b/ch06/install.py @@ -11,23 +11,19 @@ # # Pulls tweet data from Twitter because ToS prevents distributing it directly. # -# Right now we use unauthenticated requests, which are rate-limited to 150/hr. -# We use 125/hr to stay safe. -# -# # - Niek Sanders # njs@sananalytics.com # October 20, 2011 # # -# Excuse the ugly code. I threw this together as quickly as possible and I -# don't normally code in Python. -# # In Sanders' original form, the code was using Twitter API 1.0. # Now that Twitter moved to 1.1, we had to make a few changes. # Cf. twitterauth.py for the details. +# Regarding rate limiting, please check +# https://dev.twitter.com/rest/public/rate-limiting + import sys import csv import json @@ -38,24 +34,21 @@ import twitter except ImportError: print("""\ -You need to install python-twitter: - pip install python-twitter +You need to ... + pip install twitter If pip is not found you might have to install it using easy_install. If it does not work on your system, you might want to follow instructions -at https://github.com/bear/python-twitter, most likely: - $ git clone https://github.com/bear/python-twitter - $ cd python-twitter +at https://github.com/sixohsix/twitter, most likely: + $ git clone https://github.com/sixohsix/twitter + $ cd twitter $ sudo python setup.py install """) sys.exit(1) from twitterauth import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN_KEY, ACCESS_TOKEN_SECRET -api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, - access_token_key=ACCESS_TOKEN_KEY, access_token_secret=ACCESS_TOKEN_SECRET) - - -MAX_TWEETS_PER_HR = 350 +api = twitter.Twitter(auth=twitter.OAuth(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, + token=ACCESS_TOKEN_KEY, token_secret=ACCESS_TOKEN_SECRET)) DATA_PATH = "data" @@ -87,16 +80,15 @@ def get_user_params(DATA_PATH): def dump_user_params(user_params): # dump user params for confirmation - print 'Input: ' + user_params['inList'] - print 'Output: ' + user_params['outList'] - print 'Raw data: ' + user_params['rawDir'] - return + print('Input: ' + user_params['inList']) + print('Output: ' + user_params['outList']) + print('Raw data: ' + user_params['rawDir']) def read_total_list(in_filename): # read total fetch list csv - fp = open(in_filename, 'rb') + fp = open(in_filename, 'rt') reader = csv.reader(fp, delimiter=',', quotechar='"') if os.path.exists(MISSING_ID_FILE): @@ -111,10 +103,12 @@ def read_total_list(in_filename): else: not_authed_ids = [] - print "We will skip %i tweets that are not available/visible any more on twitter" % (len(missing_ids) + len(not_authed_ids)) + print("We will skip %i tweets that are not available or visible any more on twitter" % ( + len(missing_ids) + len(not_authed_ids))) ignore_ids = set(missing_ids + not_authed_ids) total_list = [] + for row in reader: if row[2] not in ignore_ids: total_list.append(row) @@ -140,12 +134,12 @@ def purge_already_fetched(fetch_list, raw_dir): parse_tweet_json(tweet_file) count_done += 1 except RuntimeError: - print "Error parsing", item + print("Error parsing", item) rem_list.append(item) else: rem_list.append(item) - print "We have already downloaded %i tweets." 
% count_done + print("We have already downloaded %i tweets." % count_done) return rem_list @@ -156,66 +150,50 @@ def download_tweets(fetch_list, raw_dir): if not os.path.exists(raw_dir): os.mkdir(raw_dir) - # stay within rate limits - download_pause_sec = 3600 / MAX_TWEETS_PER_HR - # download tweets for idx in range(0, len(fetch_list)): - # stay in Twitter API rate limits - print 'Pausing %d sec to obey Twitter API rate limits' % \ - (download_pause_sec) - time.sleep(download_pause_sec) - # current item item = fetch_list[idx] - print item - - # print status - print '--> downloading tweet #%s (%d of %d)' % \ - (item[2], idx + 1, len(fetch_list)) + print(item) - # Old Twitter API 1.0 - # pull data - # url = 'https://api.twitter.com/1/statuses/show.json?id=' + item[2] - # print url - # urllib.urlretrieve(url, raw_dir + item[2] + '.json') + print('--> downloading tweet #%s (%d of %d)' % + (item[2], idx + 1, len(fetch_list))) - # New Twitter API 1.1 try: - sec = api.GetSleepTime('/statuses/show/:id') - if sec > 0: - print "Sleeping %i seconds to conform to Twitter's rate limiting" % sec - time.sleep(sec) + #import pdb;pdb.set_trace() + response = api.statuses.show(_id=item[2]) - result = api.GetStatus(item[2]) - json_data = result.AsJsonString() + if response.rate_limit_remaining <= 0: + wait_seconds = response.rate_limit_reset - time.time() + print("Rate limiting requests us to wait %f seconds" % + wait_seconds) + time.sleep(wait_seconds+5) - except twitter.TwitterError, e: + except twitter.TwitterError as e: fatal = True - for m in e.message: + print(e) + for m in json.loads(e.response_data.decode())['errors']: if m['code'] == 34: - print "Tweet missing: ", item - # [{u'message': u'Sorry, that page does not exist', u'code': 34}] - with open(MISSING_ID_FILE, "a") as f: + print("Tweet missing: ", item) + with open(MISSING_ID_FILE, "at") as f: f.write(item[2] + "\n") fatal = False break elif m['code'] == 63: - print "User of tweet '%s' has been suspended." % item - # [{u'message': u'Sorry, that page does not exist', u'code': 34}] - with open(MISSING_ID_FILE, "a") as f: + print("User of tweet '%s' has been suspended." % item) + with open(MISSING_ID_FILE, "at") as f: f.write(item[2] + "\n") fatal = False break elif m['code'] == 88: - print "Rate limit exceeded. Please lower max_tweets_per_hr." + print("Rate limit exceeded.") fatal = True break elif m['code'] == 179: - print "Not authorized to view this tweet." 
- with open(NOT_AUTHORIZED_ID_FILE, "a") as f: + print("Not authorized to view this tweet.") + with open(NOT_AUTHORIZED_ID_FILE, "at") as f: f.write(item[2] + "\n") fatal = False break @@ -225,8 +203,8 @@ def download_tweets(fetch_list, raw_dir): else: continue - with open(raw_dir + item[2] + '.json', "w") as f: - f.write(json_data + "\n") + with open(raw_dir + item[2] + '.json', "wt") as f: + f.write(json.dumps(dict(response)) + "\n") return @@ -234,12 +212,13 @@ def download_tweets(fetch_list, raw_dir): def parse_tweet_json(filename): # read tweet - fp = open(filename, 'rb') + fp = open(filename, 'r') # parse json try: tweet_json = json.load(fp) - except ValueError: + except ValueError as e: + print(e) raise RuntimeError('error parsing json') # look for twitter api error msgs @@ -281,20 +260,20 @@ def build_output_corpus(out_filename, raw_dir, total_list): writer.writerow(full_row) except RuntimeError: - print '--> bad data in tweet #' + item[2] + print('--> bad data in tweet #' + item[2]) missing_count += 1 else: - print '--> missing tweet #' + item[2] + print('--> missing tweet #' + item[2]) missing_count += 1 # indicate success if missing_count == 0: - print '\nSuccessfully downloaded corpus!' - print 'Output in: ' + out_filename + '\n' + print('\nSuccessfully downloaded corpus!') + print('Output in: ' + out_filename + '\n') else: - print '\nMissing %d of %d tweets!' % (missing_count, len(total_list)) - print 'Partial output in: ' + out_filename + '\n' + print('\nMissing %d of %d tweets!' % (missing_count, len(total_list))) + print('Partial output in: ' + out_filename + '\n') return @@ -302,7 +281,7 @@ def build_output_corpus(out_filename, raw_dir, total_list): def main(): # get user parameters user_params = get_user_params(DATA_PATH) - print user_params + print(user_params) dump_user_params(user_params) # get fetch list @@ -310,7 +289,7 @@ def main(): # remove already fetched or missing tweets fetch_list = purge_already_fetched(total_list, user_params['rawDir']) - print "Fetching %i tweets..." % len(fetch_list) + print("Fetching %i tweets..." % len(fetch_list)) if fetch_list: # start fetching data from twitter @@ -319,10 +298,11 @@ def main(): # second pass for any failed downloads fetch_list = purge_already_fetched(total_list, user_params['rawDir']) if fetch_list: - print '\nStarting second pass to retry %i failed downloads...' % len(fetch_list) + print('\nStarting second pass to retry %i failed downloads...' % + len(fetch_list)) download_tweets(fetch_list, user_params['rawDir']) else: - print "Nothing to fetch any more." + print("Nothing to fetch any more.") # build output corpus build_output_corpus(user_params['outList'], user_params['rawDir'], diff --git a/ch06/utils.py b/ch06/utils.py index ca8b2357..6757354f 100644 --- a/ch06/utils.py +++ b/ch06/utils.py @@ -6,6 +6,7 @@ # It is made available under the MIT License import os +import sys import collections import csv import json @@ -57,7 +58,7 @@ def load_sanders_data(dirname=".", line_count=-1): try: tweet = json.load(open(tweet_fn, "r")) except IOError: - print("Tweet '%s' not found. Skip."%tweet_fn) + print(("Tweet '%s' not found. Skip." 
% tweet_fn)) continue if 'text' in tweet and tweet['user']['lang'] == "en": @@ -84,14 +85,14 @@ def plot_pr(auc_score, name, phase, precision, recall, label=None): pylab.title('P/R curve (AUC=%0.2f) / %s' % (auc_score, label)) filename = name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, "pr_%s_%s.png" % - (filename, phase)), bbox_inches="tight") + (filename, phase)), bbox_inches="tight") def show_most_informative_features(vectorizer, clf, n=20): c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names())) - top = zip(c_f[:n], c_f[:-(n + 1):-1]) + top = list(zip(c_f[:n], c_f[:-(n + 1):-1])) for (c1, f1), (c2, f2) in top: - print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2) + print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2)) def plot_log(): @@ -119,7 +120,7 @@ def plot_feat_importance(feature_names, clf, name): inds = np.argsort(coef) f_imp = f_imp[inds] coef = coef[inds] - xpos = np.array(range(len(coef))) + xpos = np.array(list(range(len(coef)))) pylab.bar(xpos, coef, width=1) pylab.title('Feature importance for %s' % (name)) @@ -181,8 +182,13 @@ def plot_bias_variance(data_sizes, train_errors, test_errors, name): def load_sent_word_net(): sent_scores = collections.defaultdict(list) + sentiwordnet_path = os.path.join(DATA_DIR, "SentiWordNet_3.0.0_20130122.txt") - with open(os.path.join(DATA_DIR, "SentiWordNet_3.0.0_20130122.txt"), "r") as csvfile: + if not os.path.exists(sentiwordnet_path): + print("Please download SentiWordNet_3.0.0 from http://sentiwordnet.isti.cnr.it/download.php, extract it and put it into the data directory") + sys.exit(1) + + with open(sentiwordnet_path, 'r') as csvfile: reader = csv.reader(csvfile, delimiter='\t', quotechar='"') for line in reader: if line[0].startswith("#"): @@ -200,7 +206,7 @@ def load_sent_word_net(): term = term.replace("-", " ").replace("_", " ") key = "%s/%s" % (POS, term.split("#")[0]) sent_scores[key].append((float(PosScore), float(NegScore))) - for key, value in sent_scores.iteritems(): + for key, value in sent_scores.items(): sent_scores[key] = np.mean(value, axis=0) return sent_scores diff --git a/ch07/.gitignore b/ch07/.gitignore index d21bb3d1..e33609d2 100644 --- a/ch07/.gitignore +++ b/ch07/.gitignore @@ -1 +1 @@ -.formula_*/ +*.png diff --git a/ch07/README.rst b/ch07/README.rst new file mode 100644 index 00000000..12a7b051 --- /dev/null +++ b/ch07/README.rst @@ -0,0 +1,42 @@ +========= +Chapter 7 +========= + +Support code for *Chapter 7: Regression* + + +Boston data analysis +-------------------- + +This dataset is shipped with sklearn. Thus, no extra download is required. + + +boston1.py + Fit a linear regression model to the Boston house price data +boston1numpy.py + Version of above script using numpy operations for linear regression +boston_cv_penalized.py + Test different penalized (and OLS) regression schemes on the Boston dataset +figure1_2.py + Show the regression line for Boston data +figure3.py + Show the regression line for Boston data with OLS and Lasso +figure4.py + Scatter plot of predicted-vs-actual for multidimensional regression + +10K data analysis +----------------- + +lr10k.py + Linear regression on 10K dataset, evaluation by cross-validation +predict10k_en.py + Elastic nets (including with inner cross-validation for parameter + settings). Produces scatter plot. + + +MovieLens data analysis +----------------------- + +In this chapter, we only consider a very simple approach, which is implemented +in the ``usermodel.py`` script. 
+ diff --git a/ch07/boston1.py b/ch07/boston1.py index abfa40b3..d0b30447 100644 --- a/ch07/boston1.py +++ b/ch07/boston1.py @@ -19,6 +19,7 @@ x = boston.data y = boston.target +# Fitting a model is trivial: call the ``fit`` method in LinearRegression: lr = LinearRegression() lr.fit(x, y) @@ -26,11 +27,13 @@ rmse = np.sqrt(lr.residues_/len(x)) print('RMSE: {}'.format(rmse)) +fig, ax = plt.subplots() +# Plot a diagonal (for reference): +ax.plot([0, 50], [0, 50], '-', color=(.9,.3,.3), lw=4) + # Plot the prediction versus real: -plt.plot(lr.predict(x), boston.target, 'ro') +ax.scatter(lr.predict(x), boston.target) -# Plot a diagonal (for reference): -plt.plot([0, 50], [0, 50], 'g-') -plt.xlabel('predicted') -plt.ylabel('real') -plt.show() +ax.set_xlabel('predicted') +ax.set_ylabel('real') +fig.savefig('Figure_07_08.png') diff --git a/ch07/boston_cv10_penalized.py b/ch07/boston_cv_penalized.py similarity index 54% rename from ch07/boston_cv10_penalized.py rename to ch07/boston_cv_penalized.py index 3ce97093..c894c4fa 100644 --- a/ch07/boston_cv10_penalized.py +++ b/ch07/boston_cv_penalized.py @@ -10,40 +10,37 @@ from __future__ import print_function import numpy as np from sklearn.cross_validation import KFold -from sklearn.linear_model import ElasticNet, Lasso, Ridge +from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge +from sklearn.metrics import r2_score from sklearn.datasets import load_boston boston = load_boston() x = boston.data y = boston.target for name, met in [ - ('elastic-net(.5)', ElasticNet(fit_intercept=True, alpha=0.5)), - ('lasso(.5)', Lasso(fit_intercept=True, alpha=0.5)), - ('ridge(.5)', Ridge(fit_intercept=True, alpha=0.5)), + ('linear regression', LinearRegression()), + ('lasso()', Lasso()), + ('elastic-net(.5)', ElasticNet(alpha=0.5)), + ('lasso(.5)', Lasso(alpha=0.5)), + ('ridge(.5)', Ridge(alpha=0.5)), ]: # Fit on the whole data: met.fit(x, y) # Predict on the whole data: p = met.predict(x) - - e = p - y - # np.dot(e, e) == sum(ei**2 for ei in e) but faster - total_error = np.dot(e, e) - rmse_train = np.sqrt(total_error / len(p)) + r2_train = r2_score(y, p) # Now, we use 10 fold cross-validation to estimate generalization error - kf = KFold(len(x), n_folds=10) - err = 0 + kf = KFold(len(x), n_folds=5) + p = np.zeros_like(y) for train, test in kf: met.fit(x[train], y[train]) - p = met.predict(x[test]) - e = p - y[test] - err += np.dot(e, e) + p[test] = met.predict(x[test]) - rmse_10cv = np.sqrt(err / len(x)) + r2_cv = r2_score(y, p) print('Method: {}'.format(name)) - print('RMSE on training: {}'.format(rmse_train)) - print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) + print('R2 on training: {}'.format(r2_train)) + print('R2 on 5-fold CV: {}'.format(r2_cv)) print() print() diff --git a/ch07/cv10_lr.py b/ch07/cv10_lr.py deleted file mode 100644 index 48ebc217..00000000 --- a/ch07/cv10_lr.py +++ /dev/null @@ -1,42 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.cross_validation import KFold -from sklearn.linear_model import LinearRegression, ElasticNet -from sklearn.datasets import load_boston -boston = load_boston() -x = boston.data -y = boston.target - - -# Switch this variable to use an Elastic Net instead of OLS -FIT_EN = False - -if FIT_EN: - model = ElasticNet(fit_intercept=True, alpha=0.5) -else: - model = 
LinearRegression(fit_intercept=True) - -model.fit(x, y) -rmse_train = np.sqrt(model.residues_/len(x)) - -# Alternatively, we could have computed rmse_train using this expression: -# rmse_train = np.sqrt(np.mean( (model.predict(x) - y) ** 2)) -# The results are equivalent - -kf = KFold(len(x), n_folds=10) -err = 0 -for train, test in kf: - model.fit(x[train], y[train]) - p = model.predict(x[test]) - e = p - y[test] - err += np.dot(e, e) # This is the same as np.sum(e * e) - -rmse_10cv = np.sqrt(err / len(x)) -print('RMSE on training: {}'.format(rmse_train)) -print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) diff --git a/ch07/data/.gitignore b/ch07/data/.gitignore new file mode 100644 index 00000000..3286ba0f --- /dev/null +++ b/ch07/data/.gitignore @@ -0,0 +1 @@ +E2006.train diff --git a/ch07/data/download.sh b/ch07/data/download.sh index 6f938e06..74753364 100755 --- a/ch07/data/download.sh +++ b/ch07/data/download.sh @@ -1,5 +1,3 @@ #!/usr/bin/env bash -wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 +curl -O http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 bunzip2 E2006.train.bz2 -wget http://www.grouplens.org/system/files/ml-100k.zip -unzip ml-100k.zip diff --git a/ch07/figure1.py b/ch07/figure1.py deleted file mode 100644 index 9585488a..00000000 --- a/ch07/figure1.py +++ /dev/null @@ -1,40 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.datasets import load_boston -from sklearn.linear_model import LinearRegression -from matplotlib import pyplot as plt -from mpltools import style -style.use('ggplot') - -boston = load_boston() - -# Index number five in the number of rooms -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("Number of rooms (RM)") -plt.ylabel("House Price") - -x = boston.data[:, 5] -# fit (used below) takes a two-dimensional array as input. We use np.atleast_2d -# to convert from one to two dimensional, then transpose to make sure that the -# format matches: -x = np.transpose(np.atleast_2d(x)) - -y = boston.target - -lr = LinearRegression(fit_intercept=False) - -lr.fit(x, y) - -plt.plot([0, boston.data[:, 5].max() + 1], - [0, lr.predict(boston.data[:, 5].max() + 1)], '-', lw=4) -plt.savefig('Figure1.png', dpi=150) - -# The instance member `residues_` contains the sum of the squared residues -rmse = np.sqrt(lr.residues_ / len(x)) -print('RMSE: {}'.format(rmse)) diff --git a/ch07/figure1_2.py b/ch07/figure1_2.py new file mode 100644 index 00000000..3f11a0c7 --- /dev/null +++ b/ch07/figure1_2.py @@ -0,0 +1,63 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +from sklearn.datasets import load_boston +from sklearn.linear_model import LinearRegression +from sklearn.metrics import mean_squared_error, r2_score +from matplotlib import pyplot as plt + +boston = load_boston() + +# Index number five in the number of rooms +fig,ax = plt.subplots() +ax.scatter(boston.data[:, 5], boston.target) +ax.set_xlabel("Average number of rooms (RM)") +ax.set_ylabel("House Price") + +x = boston.data[:, 5] +# fit (used below) takes a two-dimensional array as input. 
We use np.atleast_2d +# to convert from one to two dimensional, then transpose to make sure that the +# format matches: +x = np.transpose(np.atleast_2d(x)) + +y = boston.target + +lr = LinearRegression(fit_intercept=False) +lr.fit(x, y) + +ax.plot([0, boston.data[:, 5].max() + 1], + [0, lr.predict(boston.data[:, 5].max() + 1)], '-', lw=4) +fig.savefig('Figure1.png') + +mse = mean_squared_error(y, lr.predict(x)) +rmse = np.sqrt(mse) +print('RMSE (no intercept): {}'.format(rmse)) + +# Repeat, but fitting an intercept this time: +lr = LinearRegression(fit_intercept=True) + +lr.fit(x, y) + +fig,ax = plt.subplots() +ax.set_xlabel("Average number of rooms (RM)") +ax.set_ylabel("House Price") +ax.scatter(boston.data[:, 5], boston.target) +xmin = x.min() +xmax = x.max() +ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]) , '-', lw=4) +fig.savefig('Figure2.png') + +mse = mean_squared_error(y, lr.predict(x)) +print("Mean squared error (of training data): {:.3}".format(mse)) + +rmse = np.sqrt(mse) +print("Root mean squared error (of training data): {:.3}".format(rmse)) + +cod = r2_score(y, lr.predict(x)) +print('COD (on training data): {:.2}'.format(cod)) + diff --git a/ch07/figure2.py b/ch07/figure2.py deleted file mode 100644 index c5977207..00000000 --- a/ch07/figure2.py +++ /dev/null @@ -1,31 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.datasets import load_boston -import pylab as plt -from mpltools import style -style.use('ggplot') - -boston = load_boston() -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("RM") -plt.ylabel("House Price") - - -x = boston.data[:, 5] -xmin = x.min() -xmax = x.max() -x = np.array([[v, 1] for v in x]) -y = boston.target - -(slope, bias), res, _, _ = np.linalg.lstsq(x, y) -plt.plot([xmin, xmax], [slope * xmin + bias, slope * xmax + bias], '-', lw=4) -plt.savefig('Figure2.png', dpi=150) - -rmse = np.sqrt(res[0] / len(x)) -print('Residual: {}'.format(rmse)) diff --git a/ch07/figure3.py b/ch07/figure3.py new file mode 100644 index 00000000..7543c1ec --- /dev/null +++ b/ch07/figure3.py @@ -0,0 +1,33 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from sklearn.linear_model import LinearRegression, Lasso +import numpy as np +from sklearn.datasets import load_boston +from matplotlib import pyplot as plt + +boston = load_boston() +fig, ax = plt.subplots() +ax.scatter(boston.data[:, 5], boston.target) +ax.set_xlabel("Number of rooms (RM)") +ax.set_ylabel("House Price") + + +x = boston.data[:, 5] +xmin = x.min() +xmax = x.max() +x = np.transpose(np.atleast_2d(x)) +y = boston.target + +lr = LinearRegression() +lr.fit(x, y) +ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]), ':', lw=4, label='OLS model') + +las = Lasso() +las.fit(x, y) +ax.plot([xmin, xmax], las.predict([ [xmin], [xmax] ]), '-', lw=4, label='Lasso model') +fig.savefig('Figure3.png') diff --git a/ch07/figure4.py b/ch07/figure4.py index ab71b3a8..a24d48be 100644 --- a/ch07/figure4.py +++ b/ch07/figure4.py @@ -5,31 +5,29 @@ # # It is made available under the MIT License -from sklearn.linear_model import Lasso + +# This script plots prediction-vs-actual on training set for the Boston dataset +# using OLS 
regression import numpy as np +from sklearn.linear_model import LinearRegression from sklearn.datasets import load_boston -import pylab as plt -from mpltools import style -style.use('ggplot') +from sklearn.metrics import mean_squared_error +from matplotlib import pyplot as plt boston = load_boston() -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("RM") -plt.ylabel("House Price") - -x = boston.data[:, 5] -xmin = x.min() -xmax = x.max() -x = np.array([[v, 1] for v in x]) +x = boston.data y = boston.target -(slope, bias), res, _, _ = np.linalg.lstsq(x, y) -plt.plot([xmin, xmax], [slope * xmin + bias, slope * xmax + bias], ':', lw=4) +lr = LinearRegression() +lr.fit(x, y) +p = lr.predict(x) +print("RMSE: {:.2}.".format(np.sqrt(mean_squared_error(y, p)))) +print("R2: {:.2}.".format(lr.score(x, y))) +fig,ax = plt.subplots() +ax.scatter(p, y) +ax.set_xlabel('Predicted price') +ax.set_ylabel('Actual price') +ax.plot([y.min(), y.max()], [y.min(), y.max()], lw=4) -las = Lasso() -las.fit(x, y) -y0 = las.predict([xmin, 1]) -y1 = las.predict([xmax, 1]) -plt.plot([xmin, xmax], [y0, y1], '-', lw=4) -plt.savefig('Figure3.png', dpi=150) +fig.savefig('Figure4.png') diff --git a/ch07/lasso_path_plot.py b/ch07/lasso_path_plot.py new file mode 100644 index 00000000..eab64c26 --- /dev/null +++ b/ch07/lasso_path_plot.py @@ -0,0 +1,29 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from sklearn.linear_model import Lasso +from sklearn.datasets import load_boston +from matplotlib import pyplot as plt +import numpy as np + +boston = load_boston() +x = boston.data +y = boston.target + +las = Lasso(normalize=1) +alphas = np.logspace(-5, 2, 1000) +alphas, coefs, _= las.path(x, y, alphas=alphas) + +fig,ax = plt.subplots() +ax.plot(alphas, coefs.T) +ax.set_xscale('log') +ax.set_xlim(alphas.max(), alphas.min()) +ax.set_xlabel('Lasso coefficient path as a function of alpha') +ax.set_xlabel('Alpha') +ax.set_ylabel('Coefficient weight') +fig.savefig('Figure_LassoPath.png') + diff --git a/ch07/lr10k.py b/ch07/lr10k.py index 05346c1c..831706a1 100644 --- a/ch07/lr10k.py +++ b/ch07/lr10k.py @@ -6,36 +6,33 @@ # It is made available under the MIT License import numpy as np +from sklearn.metrics import mean_squared_error, r2_score from sklearn.datasets import load_svmlight_file -from sklearn.linear_model import ElasticNet, LinearRegression +from sklearn.linear_model import LinearRegression from sklearn.cross_validation import KFold -USE_EN = False +# Whether to use Elastic nets (otherwise, ordinary linear regression is used) +# Load data: data, target = load_svmlight_file('data/E2006.train') -if USE_EN: - lr = ElasticNet(fit_intercept=True) -else: - lr = LinearRegression(fit_intercept=True) -kf = KFold(len(target), n_folds=10) -err = 0 -for train, test in kf: - lr.fit(data[train], target[train]) - p = lr.predict(data[test]) - p = np.array(p).ravel() - e = p - target[test] - err += np.dot(e, e) - -rmse_10cv = np.sqrt(err / len(target)) +lr = LinearRegression() +# Compute error on training data to demonstrate that we can obtain near perfect +# scores: lr.fit(data, target) -p = lr.predict(data) -p = p.ravel() -e = p - target -total_error = np.dot(e, e) -rmse_train = np.sqrt(total_error / len(p)) - -print('RMSE on training: {}'.format(rmse_train)) -print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) +pred = lr.predict(data) + +print('RMSE on 
training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('R2 on training, {:.2}'.format(r2_score(target, pred))) +print('') + +pred = np.zeros_like(target) +kf = KFold(len(target), n_folds=5) +for train, test in kf: + lr.fit(data[train], target[train]) + pred[test] = lr.predict(data[test]) + +print('RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) diff --git a/ch07/predict10k_en.py b/ch07/predict10k_en.py index 4b240574..a7dd960a 100644 --- a/ch07/predict10k_en.py +++ b/ch07/predict10k_en.py @@ -8,33 +8,66 @@ import numpy as np from sklearn.datasets import load_svmlight_file from sklearn.cross_validation import KFold -from sklearn.linear_model import ElasticNet, LinearRegression +from sklearn.linear_model import ElasticNetCV, ElasticNet +from sklearn.metrics import mean_squared_error, r2_score +from matplotlib import pyplot as plt data, target = load_svmlight_file('data/E2006.train') # Edit the lines below if you want to switch method: -# met = LinearRegression(fit_intercept=True) -met = ElasticNet(fit_intercept=True, alpha=.1) +# from sklearn.linear_model import Lasso +# met = Lasso(alpha=0.1) +met = ElasticNet(alpha=0.1) -kf = KFold(len(target), n_folds=10) -err = 0 +kf = KFold(len(target), n_folds=5) +pred = np.zeros_like(target) for train, test in kf: met.fit(data[train], target[train]) - p = met.predict(data[test]) - p = np.array(p).ravel() - e = p - target[test] - err += np.dot(e, e) + pred[test] = met.predict(data[test]) -rmse_10cv = np.sqrt(err / len(target)) +print('[EN 0.1] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN 0.1] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) +print('') +# Construct an ElasticNetCV object (use all available CPUs) +met = ElasticNetCV(n_jobs=-1) + +kf = KFold(len(target), n_folds=5) +pred = np.zeros_like(target) +for train, test in kf: + met.fit(data[train], target[train]) + pred[test] = met.predict(data[test]) + +print('[EN CV] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN CV] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) +print('') met.fit(data, target) -p = met.predict(data) -p = p.ravel() -e = p - target -total_error = np.dot(e, e) -rmse_train = np.sqrt(total_error / len(p)) +pred = met.predict(data) +print('[EN CV] RMSE on training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN CV] R2 on training, {:.2}'.format(r2_score(target, pred))) + + +# Construct an ElasticNetCV object (use all available CPUs) +met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99]) + +kf = KFold(len(target), n_folds=5) +pred = np.zeros_like(target) +for train, test in kf: + met.fit(data[train], target[train]) + pred[test] = met.predict(data[test]) + + +print('[EN CV l1_ratio] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN CV l1_ratio] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) +print('') + +fig, ax = plt.subplots() +y = target +ax.scatter(y, pred, c='k') +ax.plot([-5,-1], [-5,-1], 'r-', lw=2) +ax.set_xlabel('Actual value') +ax.set_ylabel('Predicted value') +fig.savefig('Figure_10k_scatter_EN_l1_ratio.png') -print('RMSE on training: {}'.format(rmse_train)) -print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) diff --git a/ch07/usermodel.py b/ch07/usermodel.py deleted file mode 100644 index 9a93b2c8..00000000 --- 
a/ch07/usermodel.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# This code is supporting material for the book
-# Building Machine Learning Systems with Python
-# by Willi Richert and Luis Pedro Coelho
-# published by PACKT Publishing
-#
-# It is made available under the MIT License
-
-import numpy as np
-from scipy import sparse
-from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
-from sklearn.cross_validation import KFold
-
-data = np.array([[int(tok) for tok in line.split('\t')[:3]]
-                 for line in open('data/ml-100k/u.data')])
-ij = data[:, :2]
-ij -= 1  # original data is in 1-based system
-values = data[:, 2]
-reviews = sparse.csc_matrix((values, ij.T)).astype(float)
-
-reg = ElasticNetCV(fit_intercept=True, alphas=[
-    0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
-
-
-def movie_norm(xc):
-    '''Normalize per movie'''
-    xc = xc.copy().toarray()
-    # x1 is the mean of the positive items
-    x1 = np.array([xi[xi > 0].mean() for xi in xc])
-    x1 = np.nan_to_num(x1)
-
-    for i in range(xc.shape[0]):
-        xc[i] -= (xc[i] > 0) * x1[i]
-    return xc, x1
-
-
-def learn_for(i):
-    u = reviews[i]
-    us = np.delete(np.arange(reviews.shape[0]), i)
-    ps, = np.where(u.toarray().ravel() > 0)
-    x = reviews[us][:, ps].T
-    y = u.data
-    err = 0
-    eb = 0
-    kf = KFold(len(y), n_folds=4)
-    for train, test in kf:
-        xc, x1 = movie_norm(x[train])
-        reg.fit(xc, y[train] - x1)
-
-        xc, x1 = movie_norm(x[test])
-        p = reg.predict(xc).ravel()
-        e = (p + x1) - y[test]
-        err += np.sum(e * e)
-        eb += np.sum((y[train].mean() - y[test]) ** 2)
-    return np.sqrt(err / float(len(y))), np.sqrt(eb / float(len(y)))
-
-whole_data = []
-for i in range(reviews.shape[0]):
-    s = learn_for(i)
-    print(s[0] < s[1])
-    print(s)
-    whole_data.append(s)
diff --git a/ch08/README.rst b/ch08/README.rst
new file mode 100644
index 00000000..488844e6
--- /dev/null
+++ b/ch08/README.rst
@@ -0,0 +1,41 @@
+=========
+Chapter 8
+=========
+
+Support code for *Chapter 8: Recommendations*.
+
+The code refers to the second edition of the book and has been significantly
+refactored relative to the first edition.
+
+Ratings Prediction
+------------------
+
+Note that the partition of the data into training and testing is random, so
+the results will differ on every run (see the driver sketch after this list).
+
+
+load_ml100k.py
+    Load data & partition into test/train
+norm.py
+    Normalize the data
+corrneighbours.py
+    Neighbour models based on correlation
+regression.py
+    Regression models
+stacked.py
+    Stacked predictions
+averaged.py
+    Averaging of predictions (mentioned in book, but code is not shown there).
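The scripts listed above are meant to be used together. As a quick orientation, here is a minimal driver sketch (an illustration, not a file in the repository) that scores each of the predict(train) functions introduced in this diff on one shared ML-100k split. It assumes it is run from ch08/ after data/download.sh has fetched the dataset, and that the module names are exactly those listed above.

```python
# Illustrative driver: compare the chapter's base predictors on the same split.
from sklearn import metrics

import load_ml100k
import norm
import regression
import corrneighbours

train, test = load_ml100k.get_train_test(random_state=12)
held_out = test > 0  # only the held-out ratings are scored

for name, module in [('normalization baseline', norm),
                     ('regression', regression),
                     ('correlation neighbours', corrneighbours)]:
    predicted = module.predict(train)
    r2 = metrics.r2_score(test[held_out], predicted[held_out])
    print('R2 ({}): {:.1%}'.format(name, r2))
```

stacked.py and averaged.py, shown later in this diff, combine these same base predictions (plus their transposed, movie-wise variants) instead of reporting them separately.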
+ +Association Rule Mining +----------------------- + +Check the folder ``apriori/`` + +apriori/histogram.py + Print a histogram of how many times each product was bought +apriori/apriori.py + Implementation of Apriori algorithm and association rule building +apriori/apriori_example.py + Example of Apriori algorithm in retail dataset + diff --git a/ch08/all_correlations.py b/ch08/all_correlations.py index 509e5b8e..d3817bf0 100644 --- a/ch08/all_correlations.py +++ b/ch08/all_correlations.py @@ -7,9 +7,14 @@ import numpy as np -# This is the version in the book: - +def all_correlations(y, X): + from scipy import spatial + y = np.atleast_2d(y) + sp = spatial.distance.cdist(X, y, 'correlation') + # The "correlation distance" is 1 - corr(x,y); so we invert that to obtain the correlation + return 1 - sp.ravel() +# This is the version in the book (1st Edition): def all_correlations_book_version(bait, target): ''' corrs = all_correlations(bait, target) @@ -21,9 +26,7 @@ def all_correlations_book_version(bait, target): for c in target]) # This is a faster, but harder to read, implementation: - - -def all_correlations(y, X): +def all_correlations_fast_no_scipy(y, X): ''' Cs = all_correlations(y, X) @@ -42,12 +45,4 @@ def all_correlations(y, X): return (xy - x_ * y_ * n) / n / xs_ / ys_ -# If you have scipy installed, then you can compute correlations with -# scipy.spatial.cdist: -def all_correlations_scipy(y, X): - from scipy import spatial - y = np.atleast_2d(y) - sp = spatial.distance.cdist(X, y, 'correlation') - # The "correlation distance" is 1 - corr(x,y); so we invert that to obtain the correlation - return 1 - sp.ravel() diff --git a/ch08/apriori/.gitignore b/ch08/apriori/.gitignore index 05697a95..6379e20f 100644 --- a/ch08/apriori/.gitignore +++ b/ch08/apriori/.gitignore @@ -1 +1 @@ -retail.dat +retail.dat.gz diff --git a/ch08/apriori/apriori.py b/ch08/apriori/apriori.py index 1ebb7e54..eeaf0d46 100644 --- a/ch08/apriori/apriori.py +++ b/ch08/apriori/apriori.py @@ -5,9 +5,12 @@ # # It is made available under the MIT License +from collections import namedtuple + + def apriori(dataset, minsupport, maxsize): ''' - freqsets, baskets = apriori(dataset, minsupport, maxsize) + freqsets, support = apriori(dataset, minsupport, maxsize) Parameters ---------- @@ -21,48 +24,70 @@ def apriori(dataset, minsupport, maxsize): Returns ------- freqsets : sequence of sequences - baskets : dictionary + support : dictionary + This associates each itemset (represented as a frozenset) with a float + (the support of that itemset) ''' from collections import defaultdict baskets = defaultdict(list) pointers = defaultdict(list) + for i, ds in enumerate(dataset): for ell in ds: pointers[ell].append(i) baskets[frozenset([ell])].append(i) - pointers = dict([(k, frozenset(v)) for k, v in pointers.items()]) - baskets = dict([(k, frozenset(v)) for k, v in baskets.items()]) - valid = set(list(el)[0] - for el, c in baskets.items() if (len(c) >= minsupport)) - dataset = [[el for el in ds if (el in valid)] for ds in dataset] - dataset = [ds for ds in dataset if len(ds) > 1] - dataset = map(frozenset, dataset) + # Convert pointer items to frozensets to speed up operations later + new_pointers = dict() + for k in pointers: + if len(pointers[k]) >= minsupport: + new_pointers[k] = frozenset(pointers[k]) + pointers = new_pointers + for k in baskets: + baskets[k] = frozenset(baskets[k]) + + # Valid are all elements whose support is >= minsupport + valid = set() + for el, c in baskets.items(): + if len(c) >= minsupport: + 
valid.update(el) + + # Itemsets at first iteration are simply all singleton with valid elements: itemsets = [frozenset([v]) for v in valid] freqsets = [] for i in range(maxsize - 1): - print(len(itemsets)) + print("At iteration {}, number of frequent baskets: {}".format( + i, len(itemsets))) newsets = [] - for i, ell in enumerate(itemsets): - ccounts = baskets[ell] - for v_, pv in pointers.items(): - if v_ not in ell: + for it in itemsets: + ccounts = baskets[it] + + for v, pv in pointers.items(): + if v not in it: csup = (ccounts & pv) if len(csup) >= minsupport: - new = frozenset(ell | set([v_])) + new = frozenset(it | frozenset([v])) if new not in baskets: newsets.append(new) baskets[new] = csup freqsets.extend(itemsets) itemsets = newsets - return freqsets, baskets + if not len(itemsets): + break + support = {} + for k in baskets: + support[k] = float(len(baskets[k])) + return freqsets, support -def association_rules(dataset, freqsets, baskets, minlift): +# A namedtuple to collect all values that may be interesting +AssociationRule = namedtuple('AssociationRule', ['antecendent', 'consequent', 'base', 'py_x', 'lift']) + +def association_rules(dataset, freqsets, support, minlift): ''' - for (antecendent, consequent, base, py_x, lift) in association_rules(dataset, freqsets, baskets, minlift): + for assoc_rule in association_rules(dataset, freqsets, support, minlift): ... This function takes the returns from ``apriori``. @@ -72,9 +97,13 @@ def association_rules(dataset, freqsets, baskets, minlift): dataset : sequence of sequences input dataset freqsets : sequence of sequences - baskets : dictionary + support : dictionary minlift : int minimal lift of yielded rules + + Returns + ------- + assoc_rule : sequence of AssociationRule objects ''' nr_transactions = float(len(dataset)) freqsets = [f for f in freqsets if len(f) > 1] @@ -82,8 +111,9 @@ def association_rules(dataset, freqsets, baskets, minlift): for f in fset: consequent = frozenset([f]) antecendent = fset - consequent - base = len(baskets[consequent]) / nr_transactions - py_x = len(baskets[fset]) / float(len(baskets[antecendent])) + py_x = support[fset] / support[antecendent] + base = support[consequent] / nr_transactions lift = py_x / base if lift > minlift: - yield (antecendent, consequent, base, py_x, lift) + yield AssociationRule(antecendent, consequent, base, py_x, lift) + diff --git a/ch08/apriori/apriori_example.py b/ch08/apriori/apriori_example.py index da77e96c..971ed4d5 100644 --- a/ch08/apriori/apriori_example.py +++ b/ch08/apriori/apriori_example.py @@ -7,10 +7,17 @@ from apriori import apriori, association_rules from gzip import GzipFile + +# Load dataset dataset = [[int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')] -freqsets, baskets = apriori(dataset, 80, maxsize=5) -nr_transactions = float(len(dataset)) -for ant, con, base, pyx, lift in association_rules(dataset, freqsets, baskets, 30): - print('{} | {} | {} ({:%}) | {} | {} | {}' - .format(ant, con, len(baskets[con]), len(baskets[con]) / nr_transactions, len(baskets[ant]), len(baskets[con | ant]), int(lift))) + +freqsets, support = apriori(dataset, 80, maxsize=16) +rules = list(association_rules(dataset, freqsets, support, minlift=30.0)) + +rules.sort(key=(lambda ar: -ar.lift)) +for ar in rules: + print('{} -> {} (lift = {:.4})' + .format(set(ar.antecendent), + set(ar.consequent), + ar.lift)) diff --git a/ch08/apriori/apriori_naive.py b/ch08/apriori/apriori_naive.py index cb50a8e1..03ab9130 100644 --- a/ch08/apriori/apriori_naive.py +++ 
b/ch08/apriori/apriori_naive.py @@ -5,35 +5,83 @@ # # It is made available under the MIT License -import numpy as np from collections import defaultdict from itertools import chain from gzip import GzipFile -minsupport = 44 +minsupport = 80 dataset = [[int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')] -dataset = dataset[::20] counts = defaultdict(int) for elem in chain(*dataset): counts[elem] += 1 +# Only elements that have at least minsupport should be considered. valid = set(el for el, c in counts.items() if (c >= minsupport)) + +# Filter the dataset to contain only valid elements +# (This step is not strictly necessary, but will make the rest of the code +# faster as the itemsets will be smaller): dataset = [[el for el in ds if (el in valid)] for ds in dataset] -dataset = [frozenset(ds) for ds in dataset if len(ds) > 1] +# Convert to frozenset for fast processing +dataset = [frozenset(ds) for ds in dataset] itemsets = [frozenset([v]) for v in valid] -allsets = [itemsets] +freqsets = itemsets[:] for i in range(16): - print(len(itemsets)) + print("At iteration {}, number of frequent baskets: {}".format( + i, len(itemsets))) nextsets = [] - for i, ell in enumerate(itemsets): - for v_ in valid: - if v_ not in ell: - c = (ell | set([v_])) - if sum(1 for d in dataset if d.issuperset(c)) > minsupport: + + tested = set() + for it in itemsets: + for v in valid: + if v not in it: + # Create a new candidate set by adding v to it + c = (it | frozenset([v])) + + # Check if we have tested it already: + if c in tested: + continue + tested.add(c) + + # Count support by looping over dataset + # This step is slow. + # Check `apriori.py` for a better implementation. + support_c = sum(1 for d in dataset if d.issuperset(c)) + if support_c > minsupport: nextsets.append(c) - allsets.append(nextsets) + freqsets.extend(nextsets) itemsets = nextsets + if not len(itemsets): + break +print("Finished!") + + +def rules_from_itemset(itemset, dataset, minlift=1.): + nr_transactions = float(len(dataset)) + for item in itemset: + consequent = frozenset([item]) + antecedent = itemset-consequent + base = 0.0 + # acount: antecedent count + acount = 0.0 + + # ccount : consequent count + ccount = 0.0 + for d in dataset: + if item in d: base += 1 + if d.issuperset(itemset): ccount += 1 + if d.issuperset(antecedent): acount += 1 + base /= nr_transactions + p_y_given_x = ccount/acount + lift = p_y_given_x / base + if lift > minlift: + print('Rule {0} -> {1} has lift {2}' + .format(antecedent, consequent,lift)) + +for itemset in freqsets: + if len(itemset) > 1: + rules_from_itemset(itemset, dataset, minlift=4.) 
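apriori_naive.py recounts support by scanning the full dataset for every candidate itemset, while the refactored ch08/apriori/apriori.py keeps per-item transaction indexes and intersects them instead. To exercise that faster module without downloading retail.dat.gz, a toy run such as the one below can be used; the transaction list and numbers are invented for illustration, minsupport is an absolute transaction count, and the field name antecendent follows the spelling used in apriori.py.

```python
# Illustrative only: a handmade transaction list instead of retail.dat.gz.
from apriori import apriori, association_rules

transactions = [
    [1, 2],
    [1, 2],
    [1, 2],
    [3],
    [3],
    [3, 4],
]

# minsupport is an absolute number of transactions, not a fraction
freqsets, support = apriori(transactions, minsupport=2, maxsize=3)

for rule in association_rules(transactions, freqsets, support, minlift=1.0):
    # the namedtuple field is spelled 'antecendent' in apriori.py
    print('{} -> {}  (lift = {:.2f})'.format(
        set(rule.antecendent), set(rule.consequent), rule.lift))
```

Items 1 and 2 only ever occur together, so P(2|1) is 1.0 against a base rate of 3/6 for item 2, giving a lift of 2.0 in both directions; item 3 is frequent on its own but never forms a frequent pair, so it generates no rules.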
diff --git a/ch08/averaged.py b/ch08/averaged.py new file mode 100644 index 00000000..5b19bba7 --- /dev/null +++ b/ch08/averaged.py @@ -0,0 +1,33 @@ +import numpy as np +import load_ml100k +import regression +import corrneighbours +from sklearn import metrics +import norm + +def predict(train): + predicted0 = regression.predict(train) + predicted1 = regression.predict(train.T).T + predicted2 = corrneighbours.predict(train) + predicted3 = corrneighbours.predict(train.T).T + predicted4 = norm.predict(train) + predicted5 = norm.predict(train.T).T + stack = np.array([ + predicted0, + predicted1, + predicted2, + predicted3, + predicted4, + predicted5, + ]) + return stack.mean(0) + + +def main(): + train,test = load_ml100k.get_train_test(random_state=12) + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 averaged: {:.2%}'.format(r2)) + +if __name__ == '__main__': + main() diff --git a/ch08/chapter.py b/ch08/chapter.py new file mode 100644 index 00000000..d039d93f --- /dev/null +++ b/ch08/chapter.py @@ -0,0 +1,208 @@ +import numpy as np # NOT IN BOOK +from matplotlib import pyplot as plt # NOT IN BOOK + +def load(): + import numpy as np + from scipy import sparse + + data = np.loadtxt('data/ml-100k/u.data') + ij = data[:, :2] + ij -= 1 # original data is in 1-based system + values = data[:, 2] + reviews = sparse.csc_matrix((values, ij.T)).astype(float) + return reviews.toarray() +reviews = load() +U,M = np.where(reviews) +import random +test_idxs = np.array(random.sample(range(len(U)), len(U)//10)) + +train = reviews.copy() +train[U[test_idxs], M[test_idxs]] = 0 + +test = np.zeros_like(reviews) +test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]] + +class NormalizePositive(object): + def __init__(self, axis=0): + self.axis = axis + + def fit(self, features, y=None): + if self.axis == 1: + features = features.T + # count features that are greater than zero in axis 0: + binary = (features > 0) + + count0 = binary.sum(axis=0) + + # to avoid division by zero, set zero counts to one: + count0[count0 == 0] = 1. 
+ + # computing the mean is easy: + self.mean = features.sum(axis=0)/count0 + + # only consider differences where binary is True: + diff = (features - self.mean) * binary + diff **= 2 + # regularize the estimate of std by adding 0.1 + self.std = np.sqrt(0.1 + diff.sum(axis=0)/count0) + return self + + + def transform(self, features): + if self.axis == 1: + features = features.T + binary = (features > 0) + features = features - self.mean + features /= self.std + features *= binary + if self.axis == 1: + features = features.T + return features + + def inverse_transform(self, features, copy=True): + if copy: + features = features.copy() + if self.axis == 1: + features = features.T + features *= self.std + features += self.mean + if self.axis == 1: + features = features.T + return features + + def fit_transform(self, features): + return self.fit(features).transform(features) + + +norm = NormalizePositive(axis=1) +binary = (train > 0) +train = norm.fit_transform(train) +# plot just 200x200 area for space reasons +plt.imshow(binary[:200, :200], interpolation='nearest') + +from scipy.spatial import distance +# compute all pair-wise distances: +dists = distance.pdist(binary, 'correlation') +# Convert to square form, so that dists[i,j] +# is distance between binary[i] and binary[j]: +dists = distance.squareform(dists) +neighbors = dists.argsort(axis=1) + +# We are going to fill this matrix with results +filled = train.copy() +for u in range(filled.shape[0]): + # n_u is neighbors of user + n_u = neighbors[u, 1:] + for m in range(filled.shape[1]): + # get relevant reviews in order! + revs = [train[neigh, m] + for neigh in n_u + if binary [neigh, m]] + if len(revs): + # n is the number of reviews for this movie + n = len(revs) + # take half of the reviews plus one into consideration: + n //= 2 + n += 1 + revs = revs[:n] + filled[u,m] = np.mean(revs) + +predicted = norm.inverse_transform(filled) +from sklearn import metrics +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (binary neighbors): {:.1%}'.format(r2)) + +reviews = reviews.T +# use same code as before +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (binary movie neighbors): {:.1%}'.format(r2)) + + +from sklearn.linear_model import ElasticNetCV # NOT IN BOOK + +reg = ElasticNetCV(alphas=[ + 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) +filled = train.copy() +# iterate over all users: +for u in range(train.shape[0]): + curtrain = np.delete(train, u, axis=0) + bu = binary[u] + reg.fit(curtrain[:,bu].T, train[u, bu]) + filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) +predicted = norm.inverse_transform(filled) +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (user regression): {:.1%}'.format(r2)) + + +# SHOPPING BASKET ANALYSIS +# This is the slow version of the code, which will take a long time to +# complete. + + +from collections import defaultdict +from itertools import chain + +# File is downloaded as a compressed file +import gzip +# file format is a line per transaction +# of the form '12 34 342 5...' 
+dataset = [[int(tok) for tok in line.strip().split()] + for line in gzip.open('data/retail.dat.gz')] +dataset = [set(d) for d in dataset] +# count how often each product was purchased: +counts = defaultdict(int) +for elem in chain(*dataset): + counts[elem] += 1 + +minsupport = 80 +valid = set(k for k,v in counts.items() if (v >= minsupport)) +itemsets = [frozenset([v]) for v in valid] +freqsets = [] +for i in range(16): + nextsets = [] + tested = set() + for it in itemsets: + for v in valid: + if v not in it: + # Create a new candidate set by adding v to it + c = (it | frozenset([v])) + # check If we have tested it already + if c in tested: + continue + tested.add(c) + + # Count support by looping over dataset + # This step is slow. + # Check `apriori.py` for a better implementation. + support_c = sum(1 for d in dataset if d.issuperset(c)) + if support_c > minsupport: + nextsets.append(c) + freqsets.extend(nextsets) + itemsets = nextsets + if not len(itemsets): + break +print("Finished!") + + +minlift = 5.0 +nr_transactions = float(len(dataset)) +for itemset in freqsets: + for item in itemset: + consequent = frozenset([item]) + antecedent = itemset-consequent + base = 0.0 + # acount: antecedent count + acount = 0.0 + + # ccount : consequent count + ccount = 0.0 + for d in dataset: + if item in d: base += 1 + if d.issuperset(itemset): ccount += 1 + if d.issuperset(antecedent): acount += 1 + base /= nr_transactions + p_y_given_x = ccount/acount + lift = p_y_given_x / base + if lift > minlift: + print('Rule {0} -> {1} has lift {2}' + .format(antecedent, consequent,lift)) diff --git a/ch08/corrneighbours.py b/ch08/corrneighbours.py index eef3881e..eb30e685 100644 --- a/ch08/corrneighbours.py +++ b/ch08/corrneighbours.py @@ -6,75 +6,53 @@ # It is made available under the MIT License from __future__ import print_function -from all_correlations import all_correlations import numpy as np -from load_ml100k import load - -def estimate_user(user, rest, num_neighbors=100): - '''Estimate ratings for user based on the binary rating matrix - - Returns - ------- - estimates: ndarray - Returns a rating estimate for each movie - ''' - - # Compute binary matrix correlations: - bu = user > 0 - br = rest > 0 - ws = all_correlations(bu, br) - - # Select top `num_neighbors`: - selected = ws.argsort()[-num_neighbors:] - - # Use these to compute estimates: - estimates = rest[selected].mean(0) - estimates /= (.1 + br[selected].mean(0)) - return estimates - - -def train_test(user, rest): - '''Train & test on a single user - - Returns both the prediction error and the null error - ''' - estimates = estimate_user(user, rest) - bu = user > 0 - br = rest > 0 - err = estimates[bu] - user[bu] - null = rest.mean(0) - null /= (.1 + br.mean(0)) - nerr = null[bu] - user[bu] - return np.dot(err, err), np.dot(nerr, nerr) - - -def all_estimates(reviews): - estimates = np.zeros_like(reviews) - for i in range(reviews.shape[0]): - estimates[i] = estimate_user(reviews[i], np.delete(reviews, i, 0)) - return estimates - -def main(): - reviews = load() - - err = [] - for i in range(reviews.shape[0]): - err.append( - train_test(reviews[i], np.delete(reviews, i, 0)) - ) - revs = (reviews > 0).sum(1) - err = np.array(err) - rmse = np.sqrt(err / revs[:, None]) - - rmse_model, rmse_null = np.mean(rmse, 0) - - print("Average of RMSE / Null-model RMSE") - print("{:.2}\t{:.2} (improvement: {:.1%}".format(rmse_model, rmse_null, (rmse_null-rmse_model)/rmse_null)) - print() - - rmse_model, rmse_null = np.mean(rmse[revs > 60], 0) - print("Average 
of RMSE / Null-model RMSE (users with more than 60 reviewed movies)") - print("{:.2}\t{:.2} (improvement: {:.1%}".format(rmse_model, rmse_null, (rmse_null-rmse_model)/rmse_null)) +from load_ml100k import get_train_test +from scipy.spatial import distance +from sklearn import metrics + +from norm import NormalizePositive + +def predict(otrain): + binary = (otrain > 0) + norm = NormalizePositive(axis=1) + train = norm.fit_transform(otrain) + + dists = distance.pdist(binary, 'correlation') + dists = distance.squareform(dists) + + neighbors = dists.argsort(axis=1) + filled = train.copy() + for u in range(filled.shape[0]): + # n_u are the neighbors of user + n_u = neighbors[u, 1:] + for m in range(filled.shape[1]): + # This code could be faster using numpy indexing trickery as the + # cost of readibility (this is left as an exercise to the reader): + revs = [train[neigh, m] + for neigh in n_u + if binary[neigh, m]] + if len(revs): + n = len(revs) + n //= 2 + n += 1 + revs = revs[:n] + filled[u,m] = np.mean(revs) + + return norm.inverse_transform(filled) + +def main(transpose_inputs=False): + train, test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 score (binary {} neighbours): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) if __name__ == '__main__': main() + main(transpose_inputs=True) diff --git a/ch08/data b/ch08/data deleted file mode 120000 index 32d37b53..00000000 --- a/ch08/data +++ /dev/null @@ -1 +0,0 @@ -../ch07/data \ No newline at end of file diff --git a/ch08/data/.gitignore b/ch08/data/.gitignore new file mode 100644 index 00000000..c391d625 --- /dev/null +++ b/ch08/data/.gitignore @@ -0,0 +1,3 @@ +retail.dat.gz +ml-100k.zip +/ml-100k/ diff --git a/ch08/data/download.sh b/ch08/data/download.sh new file mode 100755 index 00000000..b671171d --- /dev/null +++ b/ch08/data/download.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +curl -L -O http://files.grouplens.org/papers/ml-100k.zip +unzip ml-100k.zip +curl -L -O http://fimi.ua.ac.be/data/retail.dat.gz diff --git a/ch08/figure3.py b/ch08/figure3.py index ac6961f3..daafc300 100644 --- a/ch08/figure3.py +++ b/ch08/figure3.py @@ -12,4 +12,4 @@ plt.imshow(data[:200, :200], interpolation='nearest') plt.xlabel('User ID') plt.ylabel('Film ID') -plt.savefig('../1400_08_03+.png') +plt.savefig('Figure_08_03_DataMatrix.png') diff --git a/ch08/load_ml100k.py b/ch08/load_ml100k.py index b9c99b9d..7096e75c 100644 --- a/ch08/load_ml100k.py +++ b/ch08/load_ml100k.py @@ -6,15 +6,54 @@ # It is made available under the MIT License def load(): - '''Load ML-100k data - Returns a sparse matrix''' + Returns the review matrix as a numpy array''' import numpy as np from scipy import sparse + from os import path + + if not path.exists('data/ml-100k/u.data'): + raise IOError("Data has not been downloaded.\nTry the following:\n\n\tcd data\n\t./download.sh") + + # The input is in the form of a CSC sparse matrix, so it's a natural fit to + # load the data, but we then convert to a more traditional array before + # returning data = np.loadtxt('data/ml-100k/u.data') ij = data[:, :2] ij -= 1 # original data is in 1-based system values = data[:, 2] reviews = sparse.csc_matrix((values, ij.T)).astype(float) return reviews.toarray() + +def get_train_test(reviews=None, random_state=None): + '''Split data into training & testing + + Parameters + ---------- + reviews : ndarray, optional + Input data + + Returns + 
------- + train : ndarray + training data + test : ndarray + testing data + ''' + import numpy as np + import random + r = random.Random(random_state) + + if reviews is None: + reviews = load() + U,M = np.where(reviews) + test_idxs = np.array(r.sample(range(len(U)), len(U)//10)) + train = reviews.copy() + train[U[test_idxs], M[test_idxs]] = 0 + + test = np.zeros_like(reviews) + test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]] + + return train, test + diff --git a/ch08/norm.py b/ch08/norm.py new file mode 100644 index 00000000..2925bbca --- /dev/null +++ b/ch08/norm.py @@ -0,0 +1,75 @@ +import numpy as np + +class NormalizePositive(object): + + def __init__(self, axis=0): + self.axis = axis + + def fit(self, features, y=None): + # count features that are greater than zero in axis `self.axis`: + if self.axis == 1: + features = features.T + binary = (features > 0) + count = binary.sum(axis=0) + + # to avoid division by zero, set zero counts to one: + count[count == 0] = 1. + + self.mean = features.sum(axis=0)/count + + # Compute variance by average squared difference to the mean, but only + # consider differences where binary is True (i.e., where there was a + # true rating): + diff = (features - self.mean) * binary + diff **= 2 + # regularize the estimate of std by adding 0.1 + self.std = np.sqrt(0.1 + diff.sum(axis=0)/count) + return self + + def transform(self, features): + if self.axis == 1: + features = features.T + binary = (features > 0) + features = features - self.mean + features /= self.std + features *= binary + if self.axis == 1: + features = features.T + return features + + def inverse_transform(self, features, copy=True): + if copy: + features = features.copy() + if self.axis == 1: + features = features.T + features *= self.std + features += self.mean + if self.axis == 1: + features = features.T + return features + + def fit_transform(self, features): + return self.fit(features).transform(features) + + +def predict(train): + norm = NormalizePositive() + train = norm.fit_transform(train) + return norm.inverse_transform(train * 0.) 
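To make the behaviour of NormalizePositive and the predict baseline above concrete, here is a small sanity check; it is illustrative only (the matrix is invented) and assumes norm.py from this diff is importable. Zeros encode missing ratings, so they are excluded from the per-column mean and standard deviation.

```python
# Illustrative sanity check for NormalizePositive (zeros mean "not rated").
import numpy as np
from norm import NormalizePositive

ratings = np.array([[5., 0., 3.],
                    [0., 4., 4.]])

npos = NormalizePositive(axis=0)
z = npos.fit_transform(ratings)

print(npos.mean)   # per-column means over observed entries only: 5, 4, 3.5
# Inverting an all-zero matrix recovers the column-mean baseline, which is
# exactly what predict() above returns for every entry:
print(npos.inverse_transform(z * 0.))
```

Passing axis=1 computes the same statistics per row instead of per column.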
+ + +def main(transpose_inputs=False): + from load_ml100k import get_train_test + from sklearn import metrics + train,test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 score ({} normalization): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/regression.py b/ch08/regression.py new file mode 100644 index 00000000..693e99a4 --- /dev/null +++ b/ch08/regression.py @@ -0,0 +1,50 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +from sklearn.linear_model import ElasticNetCV +from norm import NormalizePositive +from sklearn import metrics + + +def predict(train): + binary = (train > 0) + reg = ElasticNetCV(fit_intercept=True, alphas=[ + 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) + norm = NormalizePositive() + train = norm.fit_transform(train) + + filled = train.copy() + # iterate over all users + for u in range(train.shape[0]): + # remove the current user for training + curtrain = np.delete(train, u, axis=0) + bu = binary[u] + if np.sum(bu) > 5: + reg.fit(curtrain[:,bu].T, train[u, bu]) + + # Fill the values that were not there already + filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) + return norm.inverse_transform(filled) + + +def main(transpose_inputs=False): + from load_ml100k import get_train_test + train,test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + filled = predict(train) + r2 = metrics.r2_score(test[test > 0], filled[test > 0]) + + print('R2 score ({} regression): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) + +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/stacked.py b/ch08/stacked.py index 76afe9fc..8fa6344e 100644 --- a/ch08/stacked.py +++ b/ch08/stacked.py @@ -1,40 +1,47 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from __future__ import print_function -from sklearn.linear_model import LinearRegression -from load_ml100k import load import numpy as np -import similar_movie -import usermodel +import load_ml100k +import regression import corrneighbours +from sklearn import linear_model, metrics +import norm + +def predict(train): + tr_train,tr_test = load_ml100k.get_train_test(train, random_state=34) + tr_predicted0 = regression.predict(tr_train) + tr_predicted1 = regression.predict(tr_train.T).T + tr_predicted2 = corrneighbours.predict(tr_train) + tr_predicted3 = corrneighbours.predict(tr_train.T).T + tr_predicted4 = norm.predict(tr_train) + tr_predicted5 = norm.predict(tr_train.T).T + stack_tr = np.array([ + tr_predicted0[tr_test > 0], + tr_predicted1[tr_test > 0], + tr_predicted2[tr_test > 0], + tr_predicted3[tr_test > 0], + tr_predicted4[tr_test > 0], + tr_predicted5[tr_test > 0], + ]).T + + lr = linear_model.LinearRegression() + lr.fit(stack_tr, tr_test[tr_test > 0]) + + stack_te = np.array([ + tr_predicted0.ravel(), + tr_predicted1.ravel(), + tr_predicted2.ravel(), + tr_predicted3.ravel(), + tr_predicted4.ravel(), + tr_predicted5.ravel(), + ]).T 
+ + return lr.predict(stack_te).reshape(train.shape) -reviews = load() -reg = LinearRegression() -es = np.array([ - usermodel.all_estimates(reviews), - corrneighbours.all_estimates(reviews), - similar_movie.all_estimates(reviews), -]) -total_error = 0.0 -coefficients = [] -for u in range(reviews.shape[0]): - es0 = np.delete(es, u, 1) - r0 = np.delete(reviews, u, 0) - X, Y = np.where(r0 > 0) - X = es[:, X, Y] - y = r0[r0 > 0] - reg.fit(X.T, y) - coefficients.append(reg.coef_) +def main(): + train,test = load_ml100k.get_train_test(random_state=12) + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 stacked: {:.2%}'.format(r2)) - r0 = reviews[u] - X = np.where(r0 > 0) - p0 = reg.predict(es[:, u, X].squeeze().T) - err0 = r0[r0 > 0] - p0 - total_error += np.dot(err0, err0) - print(u) +if __name__ == '__main__': + main() diff --git a/ch08/stacked5.py b/ch08/stacked5.py deleted file mode 100644 index 7ace7d32..00000000 --- a/ch08/stacked5.py +++ /dev/null @@ -1,44 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from sklearn.linear_model import LinearRegression -from load_ml100k import load -import numpy as np -import similar_movie -import usermodel - -reviews = load() -# Collect several estimates -es = np.array([ - usermodel.all_estimates(reviews), - similar_movie.all_estimates(reviews, k=1), - similar_movie.all_estimates(reviews, k=2), - similar_movie.all_estimates(reviews, k=3), - similar_movie.all_estimates(reviews, k=4), - similar_movie.all_estimates(reviews, k=5), -]) - -total_error = 0.0 -coefficients = [] - -reg = LinearRegression() -# Iterate over all users -for u in range(reviews.shape[0]): - es0 = np.delete(es, u, axis=1) - r0 = np.delete(reviews, u, axis=0) - X, Y = np.where(r0 > 0) - X = es[:, X, Y] - y = r0[r0 > 0] - reg.fit(X.T, y) - coefficients.append(reg.coef_) - - r0 = reviews[u] - X = np.where(r0 > 0) - p0 = reg.predict(es[:, u, X].squeeze().T) - err0 = r0[r0 > 0] - p0 - total_error += np.dot(err0, err0) -coefficients = np.array(coefficients) diff --git a/ch08/usermodel.py b/ch08/usermodel.py deleted file mode 100644 index 589afca4..00000000 --- a/ch08/usermodel.py +++ /dev/null @@ -1,54 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.linear_model import ElasticNetCV -from sklearn.cross_validation import KFold - - -def learn_for(reviews, i): - reg = ElasticNetCV(fit_intercept=True, alphas=[ - 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) - nusers,nmovies = reviews.shape - u = reviews[i] - us = np.arange(reviews.shape[0]) - us = np.delete(us, i) - ps, = np.where(u.ravel() > 0) - x = reviews[us][:, ps].T - kf = KFold(len(ps), n_folds=4) - predictions = np.zeros(len(ps)) - for train, test in kf: - xc = x[train].copy() - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in range(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - - reg.fit(xc, u[train] - x1) - - xc = x[test].copy() - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in range(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - - p = reg.predict(xc).ravel() - predictions[test] = p - fill_preds = np.zeros(nmovies) - 
fill_preds[ps] = predictions - return fill_preds - - -def all_estimates(reviews): - whole_data = [] - for i in range(reviews.shape[0]): - s = learn_for(reviews, i) - whole_data.append(s) - return np.array(whole_data) - diff --git a/ch10/.gitignore b/ch10/.gitignore new file mode 100644 index 00000000..2f266195 --- /dev/null +++ b/ch10/.gitignore @@ -0,0 +1 @@ +AnimTransDistr/ diff --git a/ch10/README.rst b/ch10/README.rst index 6aea7be0..91e32756 100644 --- a/ch10/README.rst +++ b/ch10/README.rst @@ -17,22 +17,21 @@ Running ``download.sh`` will retrieve the other dataset into a directory Scripts ------- -threshold.py - Threshold building image with both Otsu and Ridley-Calvard threshold +chapter.py + Code as written in the book. +thresholded_figure.py + Computes the thresholded figures, including after Gaussian blurring lena-ring.py - Show Lena in a sharp ring around a soft focus image -figure5_6.py - Computes figures 5 and 6 in the book -figure9.py - Compute Sobel images + Lena image with center in focus and blurred edges figure10.py Just paste two images next to each others -figure13.py - Demonstrate salt&pepper effect -edginess.py - Contains the ``edginess_sobel`` function from the book +features.py + Contains the color histogram function from the book as well as a simple + wrapper around ``mahotas.texture.haralick`` simple_classification.py - Classify SimpleImageDataset with texture features + sobel feature -figure18.py + Classify SimpleImageDataset with texture features + color histogram features +large_classification.py Classify ``AnimTransDistr`` with both texture and SURF features. +neighbors.py + Computes image neighbors as well as the neighbor figure from the book. diff --git a/ch10/chapter.py b/ch10/chapter.py new file mode 100644 index 00000000..233720bb --- /dev/null +++ b/ch10/chapter.py @@ -0,0 +1,186 @@ +import numpy as np +import mahotas as mh +image = mh.imread('scene00.jpg') +from matplotlib import pyplot as plt +plt.imshow(image) +plt.show() +image = mh.colors.rgb2grey(image, dtype=np.uint8) +plt.imshow(image) # Display the image +plt.gray() +thresh = mh.thresholding.otsu(image) +print('Otsu threshold is {}.'.format(thresh)) +# Otsu threshold is 138. +plt.imshow(image > thresh) + +im16 = mh.gaussian_filter(image,16) +im = mh.demos.load('lenna') + +r,g,b = im.transpose(2,0,1) +r12 = mh.gaussian_filter(r, 12.) +g12 = mh.gaussian_filter(g, 12.) +b12 = mh.gaussian_filter(b, 12.) +im12 = mh.as_rgb(r12,g12,b12) +h, w = r.shape # height and width +Y, X = np.mgrid[:h,:w] +Y = Y-h/2. # center at h/2 +Y = Y / Y.max() # normalize to -1 .. +1 + +X = X-w/2. 
+X = X / X.max() + +C = np.exp(-2.*(X**2+ Y**2)) + +# Normalize again to 0..1 +C = C - C.min() +C = C / C.ptp() +C = C[:,:,None] # This adds a dummy third dimension to C + +ringed = mh.stretch(im*C + (1-C)*im12) + +haralick_features = mh.features.haralick(image) +haralick_features_mean = np.mean(haralick_features, axis=0) +haralick_features_all = np.ravel(haralick_features) + +from glob import glob +images = glob('../SimpleImageDataset/*.jpg') +features = [] +labels = [] +for im in images: + labels.append(im[:-len('00.jpg')]) + im = mh.imread(im) + im = mh.colors.rgb2gray(im, dtype=np.uint8) + features.append(mh.features.haralick(im).ravel()) + +features = np.array(features) +labels = np.array(labels) + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + + +from sklearn import cross_validation +cv = cross_validation.LeaveOneOut(len(images)) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 81.1% + +def chist(im): + im = im // 64 + r,g,b = im.transpose((2,0,1)) + pixels = 1 * r + 4 * b + 16 * g + hist = np.bincount(pixels.ravel(), minlength=64) + hist = hist.astype(float) + hist = np.log1p(hist) + return hist + +features = [] +for im in images: + im = mh.imread(im) + features.append(chist(im)) + + + +features = [] +for im in images: + imcolor = mh.imread(im) + im = mh.colors.rgb2gray(imcolor, dtype=np.uint8) + features.append(np.concatenate([ + mh.features.haralick(im).ravel(), + chist(imcolor), + ])) + + +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 95.6% + + +features = [] +for im in images: + imcolor = mh.imread(im) + # Ignore everything in the 200 pixels close to the borders + imcolor = imcolor[200:-200, 200:-200] + im = mh.colors.rgb2gray(imcolor, dtype=np.uint8) + features.append(np.concatenate([ + mh.features.haralick(im).ravel(), + chist(imcolor), + ])) + +sc = StandardScaler() +features = sc.fit_transform(features) +from scipy.spatial import distance +dists = distance.squareform(distance.pdist(features)) + + +fig, axes = plt.subplots(2, 9) +for ci,i in enumerate(range(0,90,10)): + left = images[i] + dists_left = dists[i] + right = dists_left.argsort() + # right[0] is the same as left[i], so pick the next closest element + right = right[1] + right = images[right] + left = mh.imread(left) + right = mh.imread(right) + axes[0, ci].imshow(left) + axes[1, ci].imshow(right) + + + +from sklearn.grid_search import GridSearchCV +C_range = 10.0 ** np.arange(-4, 3) +grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range}) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', grid)]) + +cv = cross_validation.KFold(len(features), 5, + shuffle=True, random_state=123) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) + + + + +from mahotas.features import surf +image = mh.demos.load('lena') +image = mh.colors.rgb2gray(image, dtype=np.uint8) +descriptors = surf.surf(image, descriptor_only=True) + +from mahotas.features import surf +descriptors = surf.dense(image, spacing=16) +alldescriptors = [] +for im in images: + im = mh.imread(im, as_grey=True) + im = im.astype(np.uint8) + alldescriptors.append(surf.dense(image, spacing=16)) +# get all 
descriptors into a single array +concatenated = np.concatenate(alldescriptors) +print('Number of descriptors: {}'.format( + len(concatenated))) +# use only every 64th vector +concatenated = concatenated[::64] +from sklearn.cluster import KMeans # FIXME CAPITALIZATION +k = 256 +km = KMeans(k) +km.fit(concatenated) + +features = [] +for d in alldescriptors: + c = km.predict(d) + features.append( + np.array([np.sum(c == ci) for ci in range(k)]) + ) +# build single array and convert to float +features = np.array(features, dtype=float) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 62.6% + + diff --git a/ch10/download.sh b/ch10/download.sh index 93121a7f..fb623f3d 100755 --- a/ch10/download.sh +++ b/ch10/download.sh @@ -2,7 +2,7 @@ mkdir -p AnimTransDistr cd AnimTransDistr -wget http://vision.stanford.edu/Datasets/AnimTransDistr.rar +curl -O http://vision.stanford.edu/Datasets/AnimTransDistr.rar unrar x AnimTransDistr.rar # The following file is a weird file: rm Anims/104034.jpg diff --git a/ch10/edginess.py b/ch10/edginess.py deleted file mode 100644 index 5815d654..00000000 --- a/ch10/edginess.py +++ /dev/null @@ -1,22 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -import mahotas as mh - - -def edginess_sobel(image): - '''Measure the "edginess" of an image - - image should be a 2d numpy array (an image) - - Returns a floating point value which is higher the "edgier" the image is. - - ''' - edges = mh.sobel(image, just_filter=True) - edges = edges.ravel() - return np.sqrt(np.dot(edges, edges)) diff --git a/ch10/features.py b/ch10/features.py new file mode 100644 index 00000000..42847b30 --- /dev/null +++ b/ch10/features.py @@ -0,0 +1,70 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +import mahotas as mh + + +def edginess_sobel(image): + '''Measure the "edginess" of an image + + image should be a 2d numpy array (an image) + + Returns a floating point value which is higher the "edgier" the image is. 
+ + ''' + edges = mh.sobel(image, just_filter=True) + edges = edges.ravel() + return np.sqrt(np.dot(edges, edges)) + +def texture(im): + '''Compute features for an image + + Parameters + ---------- + im : ndarray + + Returns + ------- + fs : ndarray + 1-D array of features + ''' + im = im.astype(np.uint8) + return mh.features.haralick(im).ravel() + + +def chist(im): + '''Compute color histogram of input image + + Parameters + ---------- + im : ndarray + should be an RGB image + + Returns + ------- + c : ndarray + 1-D array of histogram values + ''' + + # Downsample pixel values: + im = im // 64 + + # We can also implement the following by using np.histogramdd + # im = im.reshape((-1,3)) + # bins = [np.arange(5), np.arange(5), np.arange(5)] + # hist = np.histogramdd(im, bins=bins)[0] + # hist = hist.ravel() + + # Separate RGB channels: + r,g,b = im.transpose((2,0,1)) + + pixels = 1 * r + 4 * g + 16 * b + hist = np.bincount(pixels.ravel(), minlength=64) + hist = hist.astype(float) + return np.log1p(hist) + diff --git a/ch10/figure10.py b/ch10/figure10.py index 058b1069..6cb45e7a 100644 --- a/ch10/figure10.py +++ b/ch10/figure10.py @@ -11,10 +11,10 @@ # This little script just builds an image with two examples, side-by-side: text = mh.imread("../SimpleImageDataset/text21.jpg") -scene = mh.imread("../SimpleImageDataset/scene00.jpg") +building = mh.imread("../SimpleImageDataset/building00.jpg") h, w, _ = text.shape canvas = np.zeros((h, 2 * w + 128, 3), np.uint8) -canvas[:, -w:] = scene +canvas[:, -w:] = building canvas[:, :w] = text canvas = canvas[::4, ::4] -mh.imsave('../1400OS_10_10+.jpg', canvas) +mh.imsave('figure10.jpg', canvas) diff --git a/ch10/figure13.py b/ch10/figure13.py deleted file mode 100644 index 89174e8e..00000000 --- a/ch10/figure13.py +++ /dev/null @@ -1,28 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import mahotas as mh -from mahotas.colors import rgb2grey -import numpy as np - -# Adds a little salt-n-pepper noise to an image - -im = mh.demos.load('lena') -im = rgb2grey(im) - -# Salt & pepper arrays -salt = np.random.random(im.shape) > .975 -pepper = np.random.random(im.shape) > .975 - -# salt is 170 & pepper is 30 -# Some playing around showed that setting these to more extreme values looks -# very artificial. 
These look nicer - -im = np.maximum(salt * 170, mh.stretch(im)) -im = np.minimum(pepper * 30 + im * (~pepper), im) - -mh.imsave('../1400OS_10_13+.jpg', im.astype(np.uint8)) diff --git a/ch10/figure18.py b/ch10/figure18.py deleted file mode 100644 index c0e80d0a..00000000 --- a/ch10/figure18.py +++ /dev/null @@ -1,121 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from __future__ import print_function -import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression -from matplotlib import pyplot as plt -import numpy as np -from glob import glob - -basedir = 'AnimTransDistr' -print('This script will test classification of the AnimTransDistr dataset') - - -def features_for(images): - fs = [] - for im in images: - im = mh.imread(im, as_grey=True).astype(np.uint8) - fs.append(mh.features.haralick(im).mean(0)) - return np.array(fs) - - -def features_labels(groups): - labels = np.zeros(sum(map(len, groups))) - st = 0 - for i, g in enumerate(groups): - labels[st:st + len(g)] = i - st += len(g) - return np.vstack(groups), labels - -classes = [ - 'Anims', - 'Cars', - 'Distras', - 'Trans', -] - -print('Computing whole-image texture features...') -features = [] -labels = [] -for ci, cl in enumerate(classes): - images = glob('{}/{}/*.jpg'.format(basedir, cl)) - features.extend(features_for(images)) - labels.extend([ci for _ in images]) - -features = np.array(features) -labels = np.array(labels) - -scores0 = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=10) -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: %s%%' % ( - 0.1 * round(1000 * scores0.mean()))) - -tfeatures = features - -from sklearn.cluster import KMeans -from mahotas.features import surf - -images = [] -labels = [] - -for ci, cl in enumerate(classes): - curimages = glob('{}/{}/*.jpg'.format(basedir, cl)) - images.extend(curimages) - labels.extend([ci for _ in curimages]) -labels = np.array(labels) - -print('Computing SURF descriptors...') -alldescriptors = [] -for im in images: - im = mh.imread(im, as_grey=1) - im = im.astype(np.uint8) - - # To use dense sampling, you can try the following line: - # alldescriptors.append(surf.dense(im, spacing=max(im.shape)//32)) - alldescriptors.append(surf.surf(im, descriptor_only=True)) - -print('Descriptors done') -k = 256 -km = KMeans(k) - -concatenated = np.concatenate(alldescriptors) -concatenated = concatenated[::64] -print('Clustering with K-means...') -km.fit(concatenated) -features = [] -for d in alldescriptors: - c = km.predict(d) - features.append( - np.array([np.sum(c == i) for i in xrange(k)]) - ) -features = np.array(features) -print('predicting...') -scoreSURFlr = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Log. Reg [SURF features]: %s%%' % ( - 0.1 * round(1000 * scoreSURFlr.mean()))) - -print('Attemping classification with all features combined...') -allfeatures = np.hstack([features, tfeatures]) -scoreSURFplr = cross_validation.cross_val_score( - LogisticRegression(), allfeatures, labels, cv=5).mean() - -print('Accuracy (5 fold x-val) with Log. 
Reg [All features]: %s%%' % ( - 0.1 * round(1000 * scoreSURFplr.mean()))) - -plt.plot([0, 1, 2], 100 * - np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), 'k-', lw=8) -plt.plot( - [0, 1, 2], 100 * np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), - 'o', mec='#cccccc', mew=12, mfc='white') -plt.xlim(-.5, 2.5) -plt.ylim(scores0.mean() * 90., scoreSURFplr * 110) -plt.xticks([0, 1, 2], ["Texture", "SURF", "combined"]) -plt.ylabel('Accuracy (%)') -plt.savefig('../1400OS_10_18+.png') diff --git a/ch10/figure5_6.py b/ch10/figure5_6.py deleted file mode 100644 index 37d52752..00000000 --- a/ch10/figure5_6.py +++ /dev/null @@ -1,37 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -import mahotas as mh -image = mh.imread('../SimpleImageDataset/building05.jpg') -image = mh.colors.rgb2gray(image) - -# Compute Gaussian filtered versions with increasing kernel widths -im8 = mh.gaussian_filter(image, 8) -im16 = mh.gaussian_filter(image, 16) -im32 = mh.gaussian_filter(image, 32) - -# We now build a composite image with three panels: -# -# [ IM8 | | IM16 | | IM32 ] - -h, w = im8.shape -canvas = np.ones((h, 3 * w + 256), np.uint8) -canvas *= 255 -canvas[:, :w] = im8 -canvas[:, w + 128:2 * w + 128] = im16 -canvas[:, -w:] = im32 -mh.imsave('../1400OS_10_05+.jpg', canvas[:, ::2]) - -# Threshold the image -# We need to first stretch it to convert to an integer image -im32 = mh.stretch(im32) -ot32 = mh.otsu(im32) - -# Convert to 255 np.uint8 to match the other images -im255 = 255 * (im32 > ot32).astype(np.uint8) -mh.imsave('../1400OS_10_06+.jpg', im255) diff --git a/ch10/figure9.py b/ch10/figure9.py deleted file mode 100644 index fbaa985f..00000000 --- a/ch10/figure9.py +++ /dev/null @@ -1,28 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -import mahotas as mh -image = mh.imread('../SimpleImageDataset/building05.jpg') -image = mh.colors.rgb2gray(image, dtype=np.uint8) - -# We need to downsample ``image`` so that the details are visibly pixelated. 
-# This exaggerates the effect so that the result is clear -image = image[::4, ::4] -thresh = mh.sobel(image) -filtered = mh.sobel(image, just_filter=True) - -thresh = mh.dilate(thresh, np.ones((7, 7))) -filtered = mh.dilate(mh.stretch(filtered), np.ones((7, 7))) - -# Paste the thresholded and non-thresholded versions of the image side-by-side -h, w = thresh.shape -canvas = 255 * np.ones((h, w * 2 + 64), np.uint8) -canvas[:, :w] = thresh * 255 -canvas[:, -w:] = filtered - -mh.imsave('../1400OS_10_09+.jpg', canvas) diff --git a/ch10/large_classification.py b/ch10/large_classification.py new file mode 100644 index 00000000..8db3571b --- /dev/null +++ b/ch10/large_classification.py @@ -0,0 +1,108 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import mahotas as mh +from glob import glob +from sklearn import cross_validation +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.grid_search import GridSearchCV +import numpy as np + +basedir = 'AnimTransDistr' +print('This script will test classification of the AnimTransDistr dataset') + +C_range = 10.0 ** np.arange(-4, 3) +grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range}) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', grid)]) + +def features_for(im): + from features import chist + im = mh.imread(im) + img = mh.colors.rgb2grey(im).astype(np.uint8) + return np.concatenate([mh.features.haralick(img).ravel(), + chist(im)]) + +def images(): + '''Iterate over all (image,label) pairs + + This function will return + ''' + for ci, cl in enumerate(classes): + images = glob('{}/{}/*.jpg'.format(basedir, cl)) + for im in sorted(images): + yield im, ci + +classes = [ + 'Anims', + 'Cars', + 'Distras', + 'Trans', +] + +print('Computing whole-image texture features...') +ifeatures = [] +labels = [] +for im, ell in images(): + ifeatures.append(features_for(im)) + labels.append(ell) + +ifeatures = np.array(ifeatures) +labels = np.array(labels) + +cv = cross_validation.KFold(len(ifeatures), 5, shuffle=True, random_state=123) +scores0 = cross_validation.cross_val_score( + clf, ifeatures, labels, cv=cv) +print('Accuracy (5 fold x-val) with Logistic Regression [image features]: {:.1%}'.format( + scores0.mean())) + + +from sklearn.cluster import KMeans +from mahotas.features import surf + + +print('Computing SURF descriptors...') +alldescriptors = [] +for im,_ in images(): + im = mh.imread(im, as_grey=True) + im = im.astype(np.uint8) + + # To use dense sampling, you can try the following line: + # alldescriptors.append(surf.dense(im, spacing=16)) + alldescriptors.append(surf.surf(im, descriptor_only=True)) + +print('Descriptor computation complete.') +k = 256 +km = KMeans(k) + +concatenated = np.concatenate(alldescriptors) +print('Number of descriptors: {}'.format( + len(concatenated))) +concatenated = concatenated[::64] +print('Clustering with K-means...') +km.fit(concatenated) +sfeatures = [] +for d in alldescriptors: + c = km.predict(d) + sfeatures.append(np.bincount(c, minlength=k)) +sfeatures = np.array(sfeatures, dtype=float) +print('predicting...') +score_SURF = cross_validation.cross_val_score( + clf, sfeatures, labels, cv=cv).mean() +print('Accuracy (5 fold x-val) with Logistic Regression [SURF features]: 
{:.1%}'.format( + score_SURF.mean())) + + +print('Performing classification with all features combined...') +allfeatures = np.hstack([sfeatures, ifeatures]) +score_SURF_global = cross_validation.cross_val_score( + clf, allfeatures, labels, cv=cv).mean() +print('Accuracy (5 fold x-val) with Logistic Regression [All features]: {:.1%}'.format( + score_SURF_global.mean())) diff --git a/ch10/neighbors.py b/ch10/neighbors.py new file mode 100644 index 00000000..1f71d0de --- /dev/null +++ b/ch10/neighbors.py @@ -0,0 +1,64 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing + +import numpy as np +import mahotas as mh +from glob import glob +from features import texture, chist +from matplotlib import pyplot as plt +from sklearn.preprocessing import StandardScaler +from scipy.spatial import distance + +basedir = '../SimpleImageDataset/' + + +haralicks = [] +chists = [] + +print('Computing features...') +# Use glob to get all the images +images = glob('{}/*.jpg'.format(basedir)) +# We sort the images to ensure that they are always processed in the same order +# Otherwise, this would introduce some variation just based on the random +# ordering that the filesystem uses +images.sort() + +for fname in images: + imc = mh.imread(fname) + imc = imc[200:-200,200:-200] + haralicks.append(texture(mh.colors.rgb2grey(imc))) + chists.append(chist(imc)) + +haralicks = np.array(haralicks) +chists = np.array(chists) +features = np.hstack([chists, haralicks]) + +print('Computing neighbors...') +sc = StandardScaler() +features = sc.fit_transform(features) +dists = distance.squareform(distance.pdist(features)) + +print('Plotting...') +fig, axes = plt.subplots(2, 9, figsize=(16,8)) + +# Remove ticks from all subplots +for ax in axes.flat: + ax.set_xticks([]) + ax.set_yticks([]) + +for ci,i in enumerate(range(0,90,10)): + left = images[i] + dists_left = dists[i] + right = dists_left.argsort() + # right[0] is the same as left[i], so pick the next closest element + right = right[1] + right = images[right] + left = mh.imread(left) + right = mh.imread(right) + axes[0, ci].imshow(left) + axes[1, ci].imshow(right) + +fig.tight_layout() +fig.savefig('figure_neighbors.png', dpi=300) diff --git a/ch10/scene00.jpg b/ch10/scene00.jpg new file mode 100644 index 00000000..ed727a50 Binary files /dev/null and b/ch10/scene00.jpg differ diff --git a/ch10/simple_classification.py b/ch10/simple_classification.py index 6a512f64..a5a448d2 100644 --- a/ch10/simple_classification.py +++ b/ch10/simple_classification.py @@ -6,42 +6,33 @@ # It is made available under the MIT License import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression import numpy as np from glob import glob -from edginess import edginess_sobel - -basedir = '../SimpleImageDataset/' +from features import texture, chist +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler -def features_for(im): - '''Compute features for an image - - Parameters - ---------- - im : str - filepath for image to process +basedir = '../SimpleImageDataset/' - Returns - ------- - fs : ndarray - 1-D array of features - ''' - im = mh.imread(im, as_grey=True).astype(np.uint8) - return mh.features.haralick(im).mean(0) haralicks = [] -sobels = [] labels = [] +chists = [] print('This script will test (with cross-validation) 
classification of the simple 3 class dataset') print('Computing features...') # Use glob to get all the images images = glob('{}/*.jpg'.format(basedir)) -for fname in images: - haralicks.append(features_for(fname)) - sobels.append(edginess_sobel(mh.imread(fname, as_grey=True))) + +# We sort the images to ensure that they are always processed in the same order +# Otherwise, this would introduce some variation just based on the random +# ordering that the filesystem uses +for fname in sorted(images): + imc = mh.imread(fname) + haralicks.append(texture(mh.colors.rgb2grey(imc))) + chists.append(chist(imc)) # Files are named like building00.jpg, scene23.jpg... labels.append(fname[:-len('xx.jpg')]) @@ -49,27 +40,31 @@ def features_for(im): print('Finished computing features.') haralicks = np.array(haralicks) -sobels = np.array(sobels) labels = np.array(labels) +chists = np.array(chists) -# We use logistic regression because it is very fast. +haralick_plus_chists = np.hstack([chists, haralicks]) + + +# We use Logistic Regression because it achieves high accuracy on small(ish) datasets # Feel free to experiment with other classifiers -scores = cross_validation.cross_val_score( - LogisticRegression(), haralicks, labels, cv=5) -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) -haralick_plus_sobel = np.hstack([np.atleast_2d(sobels).T, haralicks]) +from sklearn import cross_validation +cv = cross_validation.LeaveOneOut(len(images)) scores = cross_validation.cross_val_score( - LogisticRegression(), haralick_plus_sobel, labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features + sobel]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) + clf, haralicks, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [haralick features]: {:.1%}'.format( + scores.mean())) +scores = cross_validation.cross_val_score( + clf, chists, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [color histograms]: {:.1%}'.format( + scores.mean())) -# We can try to just use the sobel feature. The result is almost completely -# random. 
 scores = cross_validation.cross_val_score(
-    LogisticRegression(), np.atleast_2d(sobels).T, labels, cv=5).mean()
-print('Accuracy (5 fold x-val) with Logistic Regrssion [only using sobel feature]: {}%'.format(
-    0.1 * round(1000 * scores.mean())))
+    clf, haralick_plus_chists, labels, cv=cv)
+print('Accuracy (Leave-one-out) with Logistic Regression [texture features + color histograms]: {:.1%}'.format(
+    scores.mean()))
diff --git a/ch10/thresholded_figure.py b/ch10/thresholded_figure.py
new file mode 100644
index 00000000..947762e8
--- /dev/null
+++ b/ch10/thresholded_figure.py
@@ -0,0 +1,31 @@
+import mahotas as mh
+import numpy as np
+from matplotlib import pyplot as plt
+
+# Load image & convert to B&W
+image = mh.imread('../SimpleImageDataset/scene00.jpg')
+image = mh.colors.rgb2grey(image, dtype=np.uint8)
+plt.imshow(image)
+plt.gray()
+plt.title('original image')
+
+thresh = mh.thresholding.otsu(image)
+print('Otsu threshold is {}.'.format(thresh))
+
+threshed = (image > thresh)
+plt.figure()
+plt.imshow(threshed)
+plt.title('thresholded image')
+mh.imsave('thresholded.png', threshed.astype(np.uint8)*255)
+
+im16 = mh.gaussian_filter(image, 16)
+
+# Repeat the thresholding operations with the blurred image
+thresh = mh.thresholding.otsu(im16.astype(np.uint8))
+threshed = (im16 > thresh)
+plt.figure()
+plt.imshow(threshed)
+plt.title('thresholded image (after blurring)')
+print('Otsu threshold after blurring is {}.'.format(thresh))
+mh.imsave('thresholded16.png', threshed.astype(np.uint8)*255)
+plt.show()
diff --git a/ch12/.gitignore b/ch12/.gitignore
index fb47e2ef..fa8fbcc2 100644
--- a/ch12/.gitignore
+++ b/ch12/.gitignore
@@ -1 +1,3 @@
 *.jugdata/
+output.txt
+results.image.txt
diff --git a/ch12/README.rst b/ch12/README.rst
new file mode 100644
index 00000000..fba88d4a
--- /dev/null
+++ b/ch12/README.rst
@@ -0,0 +1,28 @@
+==========
+Chapter 12
+==========
+
+Support code for *Chapter 12: Big(ger) Data*
+
+Data
+----
+
+This chapter relies only on the image dataset that is packaged with the
+repository at ``../SimpleImageDataset/``.
+ +Scripts +------- + +chapter.py + Code as written in the book +jugfile.py + Example jugfile +image-classification.py + Jugfile implementation of image classification from Chapter 10 + +setup-aws.txt + Commands to setup Amazon WebServices machine +run-jugfile.sh + Wrapper script to run jug file on jugfile.py +run-image-classification.sh + Wrapper script to run jug file on image-classification.py diff --git a/ch12/chapter.py b/ch12/chapter.py new file mode 100644 index 00000000..15326680 --- /dev/null +++ b/ch12/chapter.py @@ -0,0 +1,95 @@ +from jug import TaskGenerator +from glob import glob +import mahotas as mh +@TaskGenerator +def compute_texture(im): + from features import texture + imc = mh.imread(im) + return texture(mh.colors.rgb2gray(imc)) + +@TaskGenerator +def chist_file(fname): + from features import chist + im = mh.imread(fname) + return chist(im) + +import numpy as np +to_array = TaskGenerator(np.array) +hstack = TaskGenerator(np.hstack) + +haralicks = [] +chists = [] +labels = [] + +# Change this variable to point to +# the location of the dataset is on disk +basedir = '../SimpleImageDataset/' +# Use glob to get all the images +images = glob('{}/*.jpg'.format(basedir)) + +for fname in sorted(images): + haralicks.append(compute_texture(fname)) + chists.append(chist_file(fname)) + # The class is encoded in the filename as xxxx00.jpg + labels.append(fname[:-len('00.jpg')]) + +haralicks = to_array(haralicks) +chists = to_array(chists) +labels = to_array(labels) + +@TaskGenerator +def accuracy(features, labels): + from sklearn.linear_model import LogisticRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn import cross_validation + + clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + cv = cross_validation.LeaveOneOut(len(features)) + scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) + return scores.mean() +scores_base = accuracy(haralicks, labels) +scores_chist = accuracy(chists, labels) + +combined = hstack([chists, haralicks]) +scores_combined = accuracy(combined, labels) + +@TaskGenerator +def print_results(scores): + with open('results.image.txt', 'w') as output: + for k,v in scores: + output.write('Accuracy [{}]: {:.1%}\n'.format( + k, v.mean())) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('combined' , scores_combined), + ]) + +@TaskGenerator +def compute_lbp(fname): + from mahotas.features import lbp + imc = mh.imread(fname) + im = mh.colors.rgb2grey(imc) + return lbp(im, radius=8, points=6) + +lbps = [] +for fname in sorted(images): + # the rest of the loop as before + lbps.append(compute_lbp(fname)) +lbps = to_array(lbps) + +scores_lbps = accuracy(lbps, labels) +combined_all = hstack([chists, haralicks, lbps]) +scores_combined_all = accuracy(combined_all, labels) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('lbps', scores_lbps), + ('combined' , scores_combined), + ('combined_all' , scores_combined_all), + ]) diff --git a/ch12/features.py b/ch12/features.py new file mode 120000 index 00000000..142a324d --- /dev/null +++ b/ch12/features.py @@ -0,0 +1 @@ +../ch10/features.py \ No newline at end of file diff --git a/ch12/image-classification.py b/ch12/image-classification.py index bae62d6d..6f76d26d 100644 --- a/ch12/image-classification.py +++ b/ch12/image-classification.py @@ -6,11 +6,11 @@ # It is made available under the MIT License import mahotas as mh -from sklearn import 
cross_validation -from sklearn.linear_model.logistic import LogisticRegression import numpy as np from glob import glob from jug import TaskGenerator + +# We need to use the `features` module from chapter 10. from sys import path path.append('../ch10') @@ -20,7 +20,7 @@ basedir = '../SimpleImageDataset/' @TaskGenerator -def features_for(im): +def compute_texture(im): '''Compute features for an image Parameters @@ -33,57 +33,84 @@ def features_for(im): fs : ndarray 1-D array of features ''' - im = mh.imread(im, as_grey=True).astype(np.uint8) - return mh.features.haralick(im).mean(0) + from features import texture + imc = mh.imread(im) + return texture(mh.colors.rgb2grey(imc)) + +@TaskGenerator +def chist(fname): + from features import chist as color_histogram + im = mh.imread(fname) + return color_histogram(im) @TaskGenerator -def edginess_sobel_from_fname(fname): - from edginess import edginess_sobel - return edginess_sobel(mh.imread(fname, as_grey=True)) +def compute_lbp(fname): + from mahotas.features import lbp + imc = mh.imread(fname) + im = mh.colors.rgb2grey(imc) + return lbp(im, radius=8, points=6) + @TaskGenerator def accuracy(features, labels): + from sklearn.linear_model import LogisticRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn import cross_validation # We use logistic regression because it is very fast. # Feel free to experiment with other classifiers + clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + cv = cross_validation.LeaveOneOut(len(features)) scores = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=5) + clf, features, labels, cv=cv) return scores.mean() @TaskGenerator -def stack_features(sobels, haralicks): - return np.hstack([np.atleast_2d(sobels).T, haralicks]) - -@TaskGenerator -def print_results(scores_base, scores_combined): - output = open('results.image.txt', 'w') - output.write('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: {}%\n'.format( - 0.1 * round(1000 * scores_base.mean()))) - output.write('Accuracy (5 fold x-val) with Logistic Regrssion [std features + sobel]: {}%\n'.format( - 0.1 * round(1000 * scores_combined.mean()))) - output.close() +def print_results(scores): + with open('results.image.txt', 'w') as output: + for k,v in scores: + output.write('Accuracy (LOO x-val) with Logistic Regression [{0}]: {1:.1%}\n'.format( + k, v.mean())) to_array = TaskGenerator(np.array) +hstack = TaskGenerator(np.hstack) haralicks = [] -sobels = [] +chists = [] +lbps = [] labels = [] # Use glob to get all the images -images = glob('{}/*.jpg'.format(basedir)) -for fname in images: - haralicks.append(features_for(fname)) - sobels.append(edginess_sobel_from_fname(fname)) +images = glob('{0}/*.jpg'.format(basedir)) +for fname in sorted(images): + haralicks.append(compute_texture(fname)) + chists.append(chist(fname)) + lbps.append(compute_lbp(fname)) labels.append(fname[:-len('00.jpg')]) # The class is encoded in the filename as xxxx00.jpg haralicks = to_array(haralicks) -sobels = to_array(sobels) +chists = to_array(chists) +lbps = to_array(lbps) labels = to_array(labels) scores_base = accuracy(haralicks, labels) -haralick_plus_sobel = stack_features(sobels, haralicks) -scores_combined = accuracy(haralick_plus_sobel, labels) - -print_results(scores_base, scores_combined) +scores_chist = accuracy(chists, labels) +scores_lbps = accuracy(lbps, labels) + +combined = hstack([chists, haralicks]) +scores_combined = 
accuracy(combined, labels)
+
+combined_all = hstack([chists, haralicks, lbps])
+scores_combined_all = accuracy(combined_all, labels)
+
+print_results([
+        ('base', scores_base),
+        ('chists', scores_chist),
+        ('lbps', scores_lbps),
+        ('combined' , scores_combined),
+        ('combined_all' , scores_combined_all),
+        ])
diff --git a/ch12/jugfile.py b/ch12/jugfile.py
index 858c64f4..9d7e2b7a 100644
--- a/ch12/jugfile.py
+++ b/ch12/jugfile.py
@@ -23,7 +23,7 @@ def add(a, b):
 @TaskGenerator
 def print_final_result(oname, value):
     with open(oname, 'w') as output:
-        output.write("Final result: {}\n".format(value))
+        output.write("Final result: {0}\n".format(value))
 
 input = 2
 y = double(input)
diff --git a/ch12/run-image-classification.sh b/ch12/run-image-classification.sh
new file mode 100755
index 00000000..868d07fa
--- /dev/null
+++ b/ch12/run-image-classification.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+jug execute image-classification.py
diff --git a/ch12/run-jugfile.sh b/ch12/run-jugfile.sh
new file mode 100755
index 00000000..0ff59131
--- /dev/null
+++ b/ch12/run-jugfile.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+
+jug execute
+
diff --git a/ch12/setup-aws.txt b/ch12/setup-aws.txt
new file mode 100644
index 00000000..292654eb
--- /dev/null
+++ b/ch12/setup-aws.txt
@@ -0,0 +1,7 @@
+sudo yum update
+sudo yum -y install python-devel python-pip numpy scipy python-matplotlib
+sudo yum -y install gcc-c++
+sudo yum -y install git
+sudo pip-python install -U pip
+sudo pip install scikit-learn jug mahotas
+
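Note on the shared feature helpers: several scripts added or updated above (ch10/simple_classification.py, ch10/large_classification.py, ch10/neighbors.py, ch12/chapter.py, ch12/image-classification.py) import texture and chist from a features module (ch10/features.py, symlinked into ch12/), but that module is not shown in this diff. The following is only a rough sketch of what such a module needs to provide, consistent with how the functions are called above; the 64-colour binning and log scaling in chist are assumptions, not necessarily the repository's implementation:

    # Hypothetical sketch of ch10/features.py, inferred from how texture()
    # and chist() are used in the scripts above.
    import numpy as np
    import mahotas as mh


    def texture(im):
        '''Haralick texture features, concatenated over the four 2D directions.'''
        im = im.astype(np.uint8)
        return mh.features.haralick(im).ravel()


    def chist(im):
        '''Colour histogram of `im`, quantized to 64 colours and log-scaled.'''
        im = im // 64                      # 256 levels -> 4 bins per channel
        r, g, b = im.transpose((2, 0, 1))  # split the colour channels
        pixels = 1 * r + 4 * g + 16 * b    # combine into a single 0..63 code
        hist = np.bincount(pixels.ravel(), minlength=64)
        hist = hist.astype(float)
        return np.log1p(hist)              # compress the dynamic range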
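Compatibility note: the scripts above import sklearn.cross_validation and sklearn.grid_search, which were deprecated in scikit-learn 0.18 and removed in 0.20. On a current scikit-learn release the same setup is expressed through sklearn.model_selection; below is a minimal sketch of the replacement imports and cross-validation objects (the feature/label variable names in the commented lines are the ones used in large_classification.py and simple_classification.py):

    # sklearn.model_selection equivalents for the deprecated
    # sklearn.cross_validation / sklearn.grid_search imports used above.
    import numpy as np
    from sklearn.model_selection import (GridSearchCV, KFold, LeaveOneOut,
                                         cross_val_score)
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    C_range = 10.0 ** np.arange(-4, 3)
    grid = GridSearchCV(LogisticRegression(), param_grid={'C': C_range})
    clf = Pipeline([('preproc', StandardScaler()),
                    ('classifier', grid)])

    # KFold no longer takes the number of samples; pass n_splits instead:
    cv = KFold(n_splits=5, shuffle=True, random_state=123)
    # scores = cross_val_score(clf, ifeatures, labels, cv=cv)

    # LeaveOneOut now takes no arguments:
    loo = LeaveOneOut()
    # scores = cross_val_score(clf, haralicks, labels, cv=loo)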