yenchih
diff --git a/ch02/figure1.py b/ch02/figure1.py
Lines changed: 1 addition & 1 deletion
diff --git a/ch02/figure2.py b/ch02/figure2.py
Lines changed: 2 additions & 2 deletions
diff --git a/ch02/figure4_5_sklearn.py b/ch02/figure4_5_sklearn.py
Lines changed: 4 additions & 4 deletions
diff --git a/ch02/seeds_knn_sklearn.py b/ch02/seeds_knn_sklearn.py
Lines changed: 21 additions & 0 deletions
diff --git a/ch02/stump.py b/ch02/stump.py
Lines changed: 14 additions & 4 deletions
diff --git a/ch08/all_correlations.py b/ch08/all_correlations.py
Lines changed: 10 additions & 0 deletions
diff --git a/ch08/corrneighbours.py b/ch08/corrneighbours.py
Lines changed: 10 additions & 7 deletions
diff --git a/ch08/figure3.py b/ch08/figure3.py
Lines changed: 0 additions & 1 deletion
diff --git a/ch08/load_ml100k.py b/ch08/load_ml100k.py
Lines changed: 1 addition & 1 deletion
diff --git a/ch08/similar_movie.py b/ch08/similar_movie.py
Lines changed: 3 additions & 2 deletions
@@ -22,7 +22,7 @@
     # Use a different marker/color for each class `t`
     for t, marker, c in zip(range(3), ">ox", "rgb"):
         ax.scatter(features[target == t, p0], features[
-                    target == t, p1], marker=marker, c=c)
+                    target == t, p1], marker=marker, c=c, s=40)
     ax.set_xlabel(feature_names[p0])
     ax.set_ylabel(feature_names[p1])
     ax.set_xticks([])
 
@@ -47,9 +47,9 @@
 ax.plot([t, t], [y0, y1], 'k--', lw=2)
 ax.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2)
 ax.scatter(features[is_virginica, f0],
-            features[is_virginica, f1], c='b', marker='o')
+            features[is_virginica, f1], c='b', marker='o', s=32)
 ax.scatter(features[~is_virginica, f0],
-            features[~is_virginica, f1], c='r', marker='x')
+            features[~is_virginica, f1], c='r', marker='x', s=32)
 ax.set_ylim(y0, y1)
 ax.set_xlim(x0, x1)
 ax.set_xlabel(feature_names[f0])
 
@@ -47,7 +47,7 @@ def plot_decision(features, labels, num_neighbors=1):
     model.fit(features[:, (0,2)], labels)
     C = model.predict(np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape)
     if COLOUR_FIGURE:
-        cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
+        cmap = ListedColormap([(1., .7, .7), (.7, 1., .7), (.7, .7, 1.)])
     else:
         cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
     fig,ax = plt.subplots()
@@ -57,12 +57,12 @@ def plot_decision(features, labels, num_neighbors=1):
     ax.set_ylabel(feature_names[2])
     ax.pcolormesh(X, Y, C, cmap=cmap)
     if COLOUR_FIGURE:
-        cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
-        ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
+        cmap = ListedColormap([(1., .0, .0), (.1, .6, .1), (.0, .0, 1.)])
+        ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap, s=40)
     else:
         for lab, ma in zip(range(3), "Do^"):
             ax.plot(features[labels == lab, 0], features[
-                     labels == lab, 2], ma, c=(1., 1., 1.))
+                     labels == lab, 2], ma, c=(1., 1., 1.), ms=8)
     return fig,ax
 
 
 
@@ -67,3 +67,24 @@
 crossed = cross_val_score(classifier, features, labels)
 print('Result with prescaling: {}'.format(crossed))
 
+
+# Now, generate & print a cross-validated confusion matrix for the same result
+from sklearn.metrics import confusion_matrix
+names = list(set(labels))
+labels = np.array([names.index(ell) for ell in labels])
+preds = labels.copy()
+preds[:] = -1
+for train, test in kf:
+    classifier.fit(features[train], labels[train])
+    preds[test] = classifier.predict(features[test])
+
+cmat = confusion_matrix(labels, preds)
+print()
+print('Confusion matrix: [rows represent true outcome, columns predicted outcome]')
+print(cmat)
+
+# The explicit float() conversion is necessary in Python 2
+# (otherwise integer division truncates the result to 0)
+acc = cmat.trace()/float(cmat.sum())
+print('Accuracy: {0:.1%}'.format(acc))
+
@@ -7,8 +7,8 @@
 
 from sklearn.datasets import load_iris
 data = load_iris()
-features = data['data']
-labels = data['target_names'][data['target']]
+features = data.data
+labels = data.target_names[data.target]
 
 
 is_setosa = (labels == 'setosa')
@@ -35,11 +35,21 @@
         # Accuracy is the fraction of predictions that match reality
         acc = (pred == is_virginica).mean()
 
+        # We test whether negating the prediction gives a better accuracy:
+        acc_neg = ((~pred) == is_virginica).mean()
+        if acc_neg > acc:
+            acc = acc_neg
+            negated = True
+        else:
+            negated = False
+
         # If this is better than previous best, then this is now the new best:
 
         if acc > best_acc:
             best_acc = acc
             best_fi = fi
             best_t = t
-print('Best threshold is {0} on feature {1}, which achieves accuracy of {2:.1%}.'.format(
-    best_t, best_fi, best_acc))
+            best_is_negated = negated
+
+print('Best threshold is {0} on feature {1} (index {2}), which achieves accuracy of {3:.1%}.'.format(
+    best_t, data.feature_names[best_fi], best_fi, best_acc))
@@ -41,3 +41,13 @@ def all_correlations(y, X):
     xs_ += 1e-5  # Handle zeros in x
 
     return (xy - x_ * y_ * n) / n / xs_ / ys_
+
+# If you have scipy installed, you can instead compute the correlations with
+# scipy.spatial.distance.cdist:
+
+def all_correlations_scipy(y, X):
+    from scipy import spatial
+    y = np.atleast_2d(y)
+    sp = spatial.distance.cdist(X, y, 'correlation')
+    # The "correlation distance" is 1 - corr(x, y), so we subtract it from 1 to recover the correlation
+    return 1 - sp.ravel()
@@ -10,7 +10,7 @@
 import numpy as np
 from load_ml100k import load
 
-def estimate_user(user, rest, num_neigbors=100):
+def estimate_user(user, rest, num_neighbors=100):
     '''Estimate ratings for user based on the binary rating matrix
 
     Returns
@@ -24,8 +24,8 @@ def estimate_user(user, rest, num_neigbors=100):
     br = rest > 0
     ws = all_correlations(bu, br)
 
-    # Select top `num_neigbors`:
-    selected = ws.argsort()[-num_neigbors:]
+    # Select top `num_neighbors`:
+    selected = ws.argsort()[-num_neighbors:]
 
     # Use these to compute estimates:
     estimates = rest[selected].mean(0)
@@ -49,15 +49,13 @@ def train_test(user, rest):
 
 
 def all_estimates(reviews):
-    reviews = reviews.toarray()
     estimates = np.zeros_like(reviews)
     for i in range(reviews.shape[0]):
         estimates[i] = estimate_user(reviews[i], np.delete(reviews, i, 0))
     return estimates
 
 def main():
     reviews = load()
-    reviews = reviews.toarray()
 
     err = []
     for i in range(reviews.shape[0]):
@@ -67,11 +65,16 @@ def main():
     revs = (reviews > 0).sum(1)
     err = np.array(err)
     rmse = np.sqrt(err / revs[:, None])
+
+    rmse_model, rmse_null = np.mean(rmse, 0)
+
     print("Average of RMSE / Null-model RMSE")
-    print(np.mean(rmse, 0))
+    print("{:.2}\t{:.2} (improvement: {:.1%})".format(rmse_model, rmse_null, (rmse_null-rmse_model)/rmse_null))
     print()
+
+    rmse_model, rmse_null = np.mean(rmse[revs > 60], 0)
     print("Average of RMSE / Null-model RMSE (users with more than 60 reviewed movies)")
-    print(np.mean(rmse[revs > 60], 0))
+    print("{:.2}\t{:.2} (improvement: {:.1%})".format(rmse_model, rmse_null, (rmse_null-rmse_model)/rmse_null))
 
 if __name__ == '__main__':
     main()
@@ -8,7 +8,6 @@
 from load_ml100k import load
 from matplotlib import pyplot as plt
 data = load()
-data = data.toarray()
 plt.gray()
 plt.imshow(data[:200, :200], interpolation='nearest')
 plt.xlabel('User ID')
 
@@ -17,4 +17,4 @@ def load():
     ij -= 1  # original data is in 1-based system
     values = data[:, 2]
     reviews = sparse.csc_matrix((values, ij.T)).astype(float)
-    return reviews
+    return reviews.toarray()
@@ -65,9 +65,10 @@ def all_estimates(reviews, k=1):
 
 if __name__ == '__main__':
     from load_ml100k import load
-    reviews = load().torarray()
+    reviews = load()
     estimates = all_estimates(reviews)
     error = (estimates - reviews)
     error **= 2
     error = error[reviews > 0]
-    print(np.sqrt(error).mean())
+    rmse = np.sqrt(error.mean())
+    print("RMSE is {0}.".format(rmse))