Skip to content

Commit b57108c

Browse files
committed
2 parents e885d6a + 974a2ea commit b57108c

16 files changed

+85
-45
lines changed

ch02/figure1.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@
2222
# Use a different marker/color for each class `t`
2323
for t, marker, c in zip(range(3), ">ox", "rgb"):
2424
ax.scatter(features[target == t, p0], features[
25-
target == t, p1], marker=marker, c=c)
25+
target == t, p1], marker=marker, c=c, s=40)
2626
ax.set_xlabel(feature_names[p0])
2727
ax.set_ylabel(feature_names[p1])
2828
ax.set_xticks([])

ch02/figure2.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,9 +47,9 @@
4747
ax.plot([t, t], [y0, y1], 'k--', lw=2)
4848
ax.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2)
4949
ax.scatter(features[is_virginica, f0],
50-
features[is_virginica, f1], c='b', marker='o')
50+
features[is_virginica, f1], c='b', marker='o', s=32)
5151
ax.scatter(features[~is_virginica, f0],
52-
features[~is_virginica, f1], c='r', marker='x')
52+
features[~is_virginica, f1], c='r', marker='x', s=32)
5353
ax.set_ylim(y0, y1)
5454
ax.set_xlim(x0, x1)
5555
ax.set_xlabel(feature_names[f0])

ch02/figure4_5_sklearn.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ def plot_decision(features, labels, num_neighbors=1):
4747
model.fit(features[:, (0,2)], labels)
4848
C = model.predict(np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape)
4949
if COLOUR_FIGURE:
50-
cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
50+
cmap = ListedColormap([(1., .7, .7), (.7, 1., .7), (.7, .7, 1.)])
5151
else:
5252
cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
5353
fig,ax = plt.subplots()
@@ -57,12 +57,12 @@ def plot_decision(features, labels, num_neighbors=1):
5757
ax.set_ylabel(feature_names[2])
5858
ax.pcolormesh(X, Y, C, cmap=cmap)
5959
if COLOUR_FIGURE:
60-
cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
61-
ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
60+
cmap = ListedColormap([(1., .0, .0), (.1, .6, .1), (.0, .0, 1.)])
61+
ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap, s=40)
6262
else:
6363
for lab, ma in zip(range(3), "Do^"):
6464
ax.plot(features[labels == lab, 0], features[
65-
labels == lab, 2], ma, c=(1., 1., 1.))
65+
labels == lab, 2], ma, c=(1., 1., 1.), ms=8)
6666
return fig,ax
6767

6868

ch02/seeds_knn_sklearn.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -67,3 +67,24 @@
6767
crossed = cross_val_score(classifier, features, labels)
6868
print('Result with prescaling: {}'.format(crossed))
6969

70+
71+
# Now, generate & print a cross-validated confusion matrix for the same result
72+
from sklearn.metrics import confusion_matrix
73+
names = list(set(labels))
74+
labels = np.array([names.index(ell) for ell in labels])
75+
preds = labels.copy()
76+
preds[:] = -1
77+
for train, test in kf:
78+
classifier.fit(features[train], labels[train])
79+
preds[test] = classifier.predict(features[test])
80+
81+
cmat = confusion_matrix(labels, preds)
82+
print()
83+
print('Confusion matrix: [rows represent true outcome, columns predicted outcome]')
84+
print(cmat)
85+
86+
# The explicit float() conversion is necessary in Python 2
87+
# (Otherwise, result is rounded to 0)
88+
acc = cmat.trace()/float(cmat.sum())
89+
print('Accuracy: {0:.1%}'.format(acc))
90+

ch02/stump.py

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,8 @@
77

88
from sklearn.datasets import load_iris
99
data = load_iris()
10-
features = data['data']
11-
labels = data['target_names'][data['target']]
10+
features = data.data
11+
labels = data.target_names[data.target]
1212

1313

1414
is_setosa = (labels == 'setosa')
@@ -35,11 +35,21 @@
3535
# Accuracy is the fraction of predictions that match reality
3636
acc = (pred == is_virginica).mean()
3737

38+
# We test whether negating the test is a better threshold:
39+
acc_neg = ((~pred) == is_virginica).mean()
40+
if acc_neg > acc:
41+
acc = acc_neg
42+
negated = True
43+
else:
44+
negated = False
45+
3846
# If this is better than previous best, then this is now the new best:
3947

4048
if acc > best_acc:
4149
best_acc = acc
4250
best_fi = fi
4351
best_t = t
44-
print('Best threshold is {0} on feature {1}, which achieves accuracy of {2:.1%}.'.format(
45-
best_t, best_fi, best_acc))
52+
best_is_negated = negated
53+
54+
print('Best threshold is {0} on feature {1} (index {2}), which achieves accuracy of {3:.1%}.'.format(
55+
best_t, data.feature_names[best_fi], best_fi, best_acc))

ch08/all_correlations.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,3 +41,13 @@ def all_correlations(y, X):
4141
xs_ += 1e-5 # Handle zeros in x
4242

4343
return (xy - x_ * y_ * n) / n / xs_ / ys_
44+
45+
# If you have scipy installed, then you can compute correlations with
46+
# scipy.spatial.cdist:
47+
48+
def all_correlations_scipy(y, X):
49+
from scipy import spatial
50+
y = np.atleast_2d(y)
51+
sp = spatial.distance.cdist(X, y, 'correlation')
52+
# The "correlation distance" is 1 - corr(x,y); so we invert that to obtain the correlation
53+
return 1 - sp.ravel()

ch08/corrneighbours.py

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
import numpy as np
1111
from load_ml100k import load
1212

13-
def estimate_user(user, rest, num_neigbors=100):
13+
def estimate_user(user, rest, num_neighbors=100):
1414
'''Estimate ratings for user based on the binary rating matrix
1515
1616
Returns
@@ -24,8 +24,8 @@ def estimate_user(user, rest, num_neigbors=100):
2424
br = rest > 0
2525
ws = all_correlations(bu, br)
2626

27-
# Select top `num_neigbors`:
28-
selected = ws.argsort()[-num_neigbors:]
27+
# Select top `num_neighbors`:
28+
selected = ws.argsort()[-num_neighbors:]
2929

3030
# Use these to compute estimates:
3131
estimates = rest[selected].mean(0)
@@ -49,15 +49,13 @@ def train_test(user, rest):
4949

5050

5151
def all_estimates(reviews):
52-
reviews = reviews.toarray()
5352
estimates = np.zeros_like(reviews)
5453
for i in range(reviews.shape[0]):
5554
estimates[i] = estimate_user(reviews[i], np.delete(reviews, i, 0))
5655
return estimates
5756

5857
def main():
5958
reviews = load()
60-
reviews = reviews.toarray()
6159

6260
err = []
6361
for i in range(reviews.shape[0]):
@@ -67,11 +65,16 @@ def main():
6765
revs = (reviews > 0).sum(1)
6866
err = np.array(err)
6967
rmse = np.sqrt(err / revs[:, None])
68+
69+
rmse_model, rmse_null = np.mean(rmse, 0)
70+
7071
print("Average of RMSE / Null-model RMSE")
71-
print(np.mean(rmse, 0))
72+
print("{:.2}\t{:.2} (improvement: {:.1%}".format(rmse_model, rmse_null, (rmse_null-rmse_model)/rmse_null))
7273
print()
74+
75+
rmse_model, rmse_null = np.mean(rmse[revs > 60], 0)
7376
print("Average of RMSE / Null-model RMSE (users with more than 60 reviewed movies)")
74-
print(np.mean(rmse[revs > 60], 0))
77+
print("{:.2}\t{:.2} (improvement: {:.1%}".format(rmse_model, rmse_null, (rmse_null-rmse_model)/rmse_null))
7578

7679
if __name__ == '__main__':
7780
main()

ch08/figure3.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,6 @@
88
from load_ml100k import load
99
from matplotlib import pyplot as plt
1010
data = load()
11-
data = data.toarray()
1211
plt.gray()
1312
plt.imshow(data[:200, :200], interpolation='nearest')
1413
plt.xlabel('User ID')

ch08/load_ml100k.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,4 +17,4 @@ def load():
1717
ij -= 1 # original data is in 1-based system
1818
values = data[:, 2]
1919
reviews = sparse.csc_matrix((values, ij.T)).astype(float)
20-
return reviews
20+
return reviews.toarray()

ch08/similar_movie.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -65,9 +65,10 @@ def all_estimates(reviews, k=1):
6565

6666
if __name__ == '__main__':
6767
from load_ml100k import load
68-
reviews = load().torarray()
68+
reviews = load()
6969
estimates = all_estimates(reviews)
7070
error = (estimates - reviews)
7171
error **= 2
7272
error = error[reviews > 0]
73-
print(np.sqrt(error).mean())
73+
rmse = np.sqrt(error.mean())
74+
print("RMSE is {0}.".format(rmse))

0 commit comments

Comments
 (0)