astitwlathe · astitwlathe · Oct 10, 2014 · Oct 10, 2014 · Oct 10, 2014 · Oct 12, 2014
diff --git a/README.md b/README.md
@@ -1,11 +1,14 @@
 Building Machine Learning Systems with Python
 =============================================
 
-Source Code for the book Building Machine Learning Systems with Python by
-[Willi Richert](http://twotoreal.com) and [Luis Pedro
-Coelho](http://luispedro.org).
+Source Code for the book Building Machine Learning Systems with Python by [Luis
+Pedro Coelho](http://luispedro.org) and [Willi Richert](http://twotoreal.com).
 
-The book was published in 2013 by Packt Publishing and is available [from their
+The book was published in 2013 (second edition in 2015) by Packt Publishing and
+is available [from their
 website](http://www.packtpub.com/building-machine-learning-systems-with-python/book).
 
+The code in the repository corresponds to the second edition. Code for the
+first edition is available in the [first\_edition
+branch](https://github.com/luispedro/BuildingMachineLearningSystemsWithPython/tree/first_edition).
 
diff --git a/ch01/analyze_webstats.py b/ch01/analyze_webstats.py
@@ -26,8 +26,9 @@
 x = x[~sp.isnan(y)]
 y = y[~sp.isnan(y)]
 
-# plot input data
+
 def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None):
+    ''' plot input data '''
 
     plt.figure(num=None, figsize=(8, 6))
     plt.clf()
@@ -138,8 +139,8 @@ def error(f, x, y):
 train = sorted(shuffled[split_idx:])
 fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1))
 fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2))
-print("fbt2(x)= \n%s"%fbt2)
-print("fbt2(x)-100,000= \n%s"%(fbt2-100000))
+print("fbt2(x)= \n%s" % fbt2)
+print("fbt2(x)-100,000= \n%s" % (fbt2-100000))
 fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3))
 fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10))
 fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100))

diff --git a/ch01/gen_webstats.py b/ch01/gen_webstats.py
@@ -17,26 +17,22 @@
 
 sp.random.seed(3)  # to reproduce the data later on
 
-x = sp.arange(1, 31 * 24)
-y = sp.array(200 * (sp.sin(2 * sp.pi * x / (7 * 24))), dtype=int)
+x = sp.arange(1, 31*24)
+y = sp.array(200*(sp.sin(2*sp.pi*x/(7*24))), dtype=int)
 y += gamma.rvs(15, loc=0, scale=100, size=len(x))
-y += 2 * sp.exp(x / 100.0)
-y = sp.ma.array(y, mask=[y < 0])
-print(sum(y), sum(y < 0))
+y += 2 * sp.exp(x/100.0)
+y = sp.ma.array(y, mask=[y<0])
+print(sum(y), sum(y<0))
 
 plt.scatter(x, y)
 plt.title("Web traffic over the last month")
 plt.xlabel("Time")
 plt.ylabel("Hits/hour")
-plt.xticks([w * 7 * 24 for w in [0, 1, 2, 3, 4]], ['week %i' % (w + 1) for w in
-                                                   [0, 1, 2, 3, 4]])
-
+plt.xticks([w*7*24 for w in range(5)], 
+           ['week %i' %(w+1) for w in range(5)])
 plt.autoscale(tight=True)
 plt.grid()
 plt.savefig(os.path.join(CHART_DIR, "1400_01_01.png"))
 
-# sp.savetxt(os.path.join("..", "web_traffic.tsv"),
-# zip(x[~y.mask],y[~y.mask]), delimiter="\t", fmt="%i")
-
-sp.savetxt(os.path.join(
-    DATA_DIR, "web_traffic.tsv"), list(zip(x, y)), delimiter="\t", fmt="%s")
+sp.savetxt(os.path.join(DATA_DIR, "web_traffic.tsv"), 
+           list(zip(x, y)), delimiter="\t", fmt="%s")
diff --git a/ch02/README.rst b/ch02/README.rst
@@ -6,6 +6,9 @@ Support code for *Chapter 2: Learning How to Classify with Real-world
 Examples*. The directory data contains the seeds dataset, originally downloaded
 from https://archive.ics.uci.edu/ml/datasets/seeds
 
+chapter.py
+    The code as printed in the book.
+
 figure1.py
     Figure 1 in the book: all 2-by-2 scatter plots
 

diff --git a/ch02/chapter.py b/ch02/chapter.py
@@ -0,0 +1,164 @@
+# This code is supporting material for the book
+# Building Machine Learning Systems with Python
+# by Willi Richert and Luis Pedro Coelho
+# published by PACKT Publishing
+#
+# It is made available under the MIT License
+
+
+from matplotlib import pyplot as plt
+import numpy as np
+
+# We load the data with load_iris from sklearn
+from sklearn.datasets import load_iris
+data = load_iris()
+
+# load_iris returns an object with several fields
+features = data.data
+feature_names = data.feature_names
+target = data.target
+target_names = data.target_names
+
+for t in range(3):
+ if t == 0:
+     c = 'r'
+     marker = '>'
+ elif t == 1:
+     c = 'g'
+     marker = 'o'
+ elif t == 2:
+     c = 'b'
+     marker = 'x'
+ plt.scatter(features[target == t, 0],
+            features[target == t, 1],
+            marker=marker,
+            c=c)
+# We use NumPy fancy indexing to get an array of strings:
+labels = target_names[target]
+
+# The petal length is the feature at position 2
+plength = features[:, 2]
+
+# Build an array of booleans:
+is_setosa = (labels == 'setosa')
+
+# This is the important step:
+max_setosa =plength[is_setosa].max()
+min_non_setosa = plength[~is_setosa].min()
+print('Maximum of setosa: {0}.'.format(max_setosa))
+
+print('Minimum of others: {0}.'.format(min_non_setosa))
+
+# ~ is the boolean negation operator
+features = features[~is_setosa]
+labels = labels[~is_setosa]
+# Build a new target variable, is_virginica
+is_virginica = (labels == 'virginica')
+
+# Initialize best_acc to an impossibly low value
+best_acc = -1.0
+for fi in range(features.shape[1]):
+    # We are going to test all possible thresholds
+    thresh = features[:,fi]
+    for t in thresh:
+
+        # Get the vector for feature `fi`
+        feature_i = features[:, fi]
+        # apply threshold `t`
+        pred = (feature_i > t)
+        acc = (pred == is_virginica).mean()
+        rev_acc = (pred == ~is_virginica).mean()
+        if rev_acc > acc:
+            reverse = True
+            acc = rev_acc
+        else:
+            reverse = False
+
+        if acc > best_acc:
+            best_acc = acc
+            best_fi = fi
+            best_t = t
+            best_reverse = reverse
+
+print(best_fi, best_t, best_reverse, best_acc)
+
+def is_virginica_test(fi, t, reverse, example):
+    'Apply threshold model to a new example'
+    test = example[fi] > t
+    if reverse:
+        test = not test
+    return test
+from threshold import fit_model, predict
+
+# Training accuracy was 96.0%.
+# Testing accuracy was 90.0% (N = 50).
+correct = 0.0
+
+for ei in range(len(features)):
+    # select all but the one at position `ei`:
+    training = np.ones(len(features), bool)
+    training[ei] = False
+    testing = ~training
+    model = fit_model(features[training], is_virginica[training])
+    predictions = predict(model, features[testing])
+    correct += np.sum(predictions == is_virginica[testing])
+acc = correct/float(len(features))
+print('Accuracy: {0:.1%}'.format(acc))
+
+
+###########################################
+############## SEEDS DATASET ##############
+###########################################
+
+from load import load_dataset
+
+feature_names = [
+    'area',
+    'perimeter',
+    'compactness',
+    'length of kernel',
+    'width of kernel',
+    'asymmetry coefficien',
+    'length of kernel groove',
+]
+features, labels = load_dataset('seeds')
+
+
+
+from sklearn.neighbors import KNeighborsClassifier
+classifier = KNeighborsClassifier(n_neighbors=1)
+from sklearn.cross_validation import KFold
+
+kf = KFold(len(features), n_folds=5, shuffle=True)
+means = []
+for training,testing in kf:
+   # We learn a model for this fold with `fit` and then apply it to the
+   # testing data with `predict`:
+   classifier.fit(features[training], labels[training])
+   prediction = classifier.predict(features[testing])
+
+   # np.mean on an array of booleans returns the fraction
+   # of correct decisions for this fold:
+   curmean = np.mean(prediction == labels[testing])
+   means.append(curmean)
+print('Mean accuracy: {:.1%}'.format(np.mean(means)))
+
+
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+classifier = KNeighborsClassifier(n_neighbors=1)
+classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])
+
+means = []
+for training,testing in kf:
+    # We learn a model for this fold with `fit` and then apply it to the
+    # testing data with `predict`:
+    classifier.fit(features[training], labels[training])
+    prediction = classifier.predict(features[testing])
+
+    # np.mean on an array of booleans returns the fraction
+    # of correct decisions for this fold:
+    curmean = np.mean(prediction == labels[testing])
+    means.append(curmean)
+print('Mean accuracy: {:.1%}'.format(np.mean(means)))
diff --git a/ch02/figure1.py b/ch02/figure1.py
@@ -19,13 +19,21 @@
 
 fig,axes = plt.subplots(2, 3)
 pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
+
+# Set up 3 different pairs of (color, marker)
+color_markers = [
+        ('r', '>'),
+        ('g', 'o'),
+        ('b', 'x'),
+        ]
 for i, (p0, p1) in enumerate(pairs):
     ax = axes.flat[i]
 
-    # Use a different marker/color for each class `t`
-    for t, marker, c in zip(range(3), ">ox", "rgb"):
+    for t in range(3):
+        # Use a different color/marker for each class `t`
+        c,marker = color_markers[t]
         ax.scatter(features[target == t, p0], features[
-                    target == t, p1], marker=marker, c=c, s=40)
+                    target == t, p1], marker=marker, c=c)
     ax.set_xlabel(feature_names[p0])
     ax.set_ylabel(feature_names[p1])
     ax.set_xticks([])

diff --git a/ch02/figure2.py b/ch02/figure2.py
@@ -23,8 +23,9 @@
 labels = labels[~is_setosa]
 is_virginica = (labels == 'virginica')
 
-# Hand fixed threshold:
-t = 1.75
+# Hand fixed thresholds:
+t = 1.65
+t2 = 1.75
 
 # Features to use: 3 & 2
 f0, f1 = 3, 2
@@ -49,7 +50,7 @@
 ax.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c)
 ax.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c)
 ax.plot([t, t], [y0, y1], 'k--', lw=2)
-ax.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2)
+ax.plot([t2, t2], [y0, y1], 'k:', lw=2)
 ax.scatter(features[is_virginica, f0],
             features[is_virginica, f1], c='b', marker='o', s=40)
 ax.scatter(features[~is_virginica, f0],

diff --git a/ch02/figure4_5_no_sklearn.py b/ch02/figure4_5_no_sklearn.py
@@ -45,7 +45,7 @@ def plot_decision(features, labels):
 
     model = fit_model(1, features[:, (0, 2)], np.array(labels))
     C = predict(
-        np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape)
+        model, np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape)
     if COLOUR_FIGURE:
         cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
     else:

diff --git a/ch02/figure4_5_sklearn.py b/ch02/figure4_5_sklearn.py
@@ -58,11 +58,11 @@ def plot_decision(features, labels, num_neighbors=1):
     ax.pcolormesh(X, Y, C, cmap=cmap)
     if COLOUR_FIGURE:
         cmap = ListedColormap([(1., .0, .0), (.1, .6, .1), (.0, .0, 1.)])
-        ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap, s=40)
+        ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
     else:
         for lab, ma in zip(range(3), "Do^"):
             ax.plot(features[labels == lab, 0], features[
-                     labels == lab, 2], ma, c=(1., 1., 1.), ms=8)
+                     labels == lab, 2], ma, c=(1., 1., 1.), ms=6)
     return fig,ax
 
 

diff --git a/ch02/knn.py b/ch02/knn.py
@@ -26,7 +26,7 @@ def plurality(xs):
             return k
 
 # This function was called ``apply_model`` in the first edition
-def predict(features, model):
+def predict(model, features):
     '''Apply k-nn model'''
     k, train_feats, labels = model
     results = []
@@ -42,5 +42,5 @@ def predict(features, model):
 
 
 def accuracy(features, labels, model):
-    preds = predict(features, model)
+    preds = predict(model, features)
     return np.mean(preds == labels)
diff --git a/ch02/threshold.py b/ch02/threshold.py
@@ -40,7 +40,7 @@ def fit_model(features, labels):
 
 
 # This function was called ``apply_model`` in the first edition
-def predict(features, model):
+def predict(model, features):
     '''Apply a learned model'''
     # A model is a pair as returned by fit_model
     t, fi, reverse = model
@@ -51,5 +51,5 @@ def predict(features, model):
 
 def accuracy(features, labels, model):
     '''Compute the accuracy of the model'''
-    preds = predict(features, model)
+    preds = predict(model, features)
     return np.mean(preds == labels)
diff --git a/ch04/.gitignore b/ch04/.gitignore
@@ -1,2 +1,6 @@
 wiki_lda.pkl
 wiki_lda.pkl.state
+*.png
+*.npy
+*.pkl
+topics.txt
diff --git a/ch04/README.rst b/ch04/README.rst
@@ -4,6 +4,16 @@ Chapter 4
 
 Support code for *Chapter 4: Topic Modeling*
 
+
+AP Data
+-------
+
+To download the AP data, use the ``download_ap.sh`` script inside the ``data``
+directory::
+
+    cd data
+    ./download_ap.sh
+
 Word cloud creation
 -------------------
 
@@ -49,3 +59,7 @@ Scripts
 
 blei_lda.py
     Computes LDA using the AP Corpus.
+wikitopics_create.py
+    Create the topic model for Wikipedia using LDA (must download the Wikipedia database first)
+wikitopics_create_hdp.py
+    Create the topic model for Wikipedia using HDP (must download the Wikipedia database first)