
Commit 8684a43

ENH Add code from book
1 parent: abdf43b


92 files changed: 5,651 additions & 0 deletions

ch01/analyze_webstats.py

Lines changed: 144 additions & 0 deletions
import os
import scipy as sp
import matplotlib.pyplot as plt

data_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "..", "data")
data = sp.genfromtxt(os.path.join(data_dir, "web_traffic.tsv"), delimiter="\t")
print(data[:10])

# colors and line styles used to distinguish the fitted models
colors = ['g', 'k', 'b', 'm', 'r']
linestyles = ['-', '-.', '--', ':', '-']

x = data[:, 0]
y = data[:, 1]
print("Number of invalid entries:", sp.sum(sp.isnan(y)))
x = x[~sp.isnan(y)]
y = y[~sp.isnan(y)]

# plot input data


def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None):
    plt.clf()
    plt.scatter(x, y, s=10)
    plt.title("Web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    plt.xticks(
        [w * 7 * 24 for w in range(10)], ['week %i' % w for w in range(10)])

    if models:
        if mx is None:
            mx = sp.linspace(0, x[-1], 1000)
        for model, style, color in zip(models, linestyles, colors):
            plt.plot(mx, model(mx), linestyle=style, linewidth=2, c=color)

        plt.legend(["d=%i" % m.order for m in models], loc="upper left")

    plt.autoscale(tight=True)
    plt.ylim(ymin=0)
    if ymax:
        plt.ylim(ymax=ymax)
    if xmin:
        plt.xlim(xmin=xmin)
    plt.grid(True, linestyle='-', color='0.75')
    plt.savefig(fname)

# first look at the data
plot_models(x, y, None, os.path.join("..", "1400_01_01.png"))

# create and plot models
fp1, res, rank, sv, rcond = sp.polyfit(x, y, 1, full=True)
print("Model parameters: %s" % fp1)
print("Error of the model:", res)
f1 = sp.poly1d(fp1)
f2 = sp.poly1d(sp.polyfit(x, y, 2))
f3 = sp.poly1d(sp.polyfit(x, y, 3))
f10 = sp.poly1d(sp.polyfit(x, y, 10))
f100 = sp.poly1d(sp.polyfit(x, y, 100))

plot_models(x, y, [f1], os.path.join("..", "1400_01_02.png"))
plot_models(x, y, [f1, f2], os.path.join("..", "1400_01_03.png"))
plot_models(
    x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_04.png"))

# fit and plot a model using the knowledge about the inflection point
inflection = int(3.5 * 7 * 24)  # slice indices must be integers
xa = x[:inflection]
ya = y[:inflection]
xb = x[inflection:]
yb = y[inflection:]

fa = sp.poly1d(sp.polyfit(xa, ya, 1))
fb = sp.poly1d(sp.polyfit(xb, yb, 1))

plot_models(x, y, [fa, fb], os.path.join("..", "1400_01_05.png"))


def error(f, x, y):
    return sp.sum((f(x) - y) ** 2)

print("Errors for the complete data set:")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, x, y)))

print("Errors for only the time after inflection point")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

print("Error inflection=%f" % (error(fa, xa, ya) + error(fb, xb, yb)))


# extrapolating into the future
plot_models(
    x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_06.png"),
    mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

print("Trained only on data after inflection point")
fb1 = fb
fb2 = sp.poly1d(sp.polyfit(xb, yb, 2))
fb3 = sp.poly1d(sp.polyfit(xb, yb, 3))
fb10 = sp.poly1d(sp.polyfit(xb, yb, 10))
fb100 = sp.poly1d(sp.polyfit(xb, yb, 100))

print("Errors for only the time after inflection point")
for f in [fb1, fb2, fb3, fb10, fb100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

plot_models(
    x, y, [fb1, fb2, fb3, fb10, fb100], os.path.join("..", "1400_01_07.png"),
    mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

# separating training from testing data
frac = 0.3
split_idx = int(frac * len(xb))
shuffled = sp.random.permutation(list(range(len(xb))))
test = sorted(shuffled[:split_idx])
train = sorted(shuffled[split_idx:])
fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1))
fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2))
fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3))
fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10))
fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100))

print("Test errors for only the time after inflection point")
for f in [fbt1, fbt2, fbt3, fbt10, fbt100]:
    print("Error d=%i: %f" % (f.order, error(f, xb[test], yb[test])))

plot_models(
    x, y, [fbt1, fbt2, fbt3, fbt10, fbt100],
    os.path.join("..", "1400_01_08.png"),
    mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

from scipy.optimize import fsolve
print(fbt2)
print(fbt2 - 100000)
reached_max = fsolve(fbt2 - 100000, 800) / (7 * 24)
print("100,000 hits/hour expected at week %f" % reached_max[0])

ch01/gen_webstats.py

Lines changed: 35 additions & 0 deletions
# This script generates web traffic data for our hypothetical
# web startup "MLASS" in chapter 01

import os
import scipy as sp
from scipy.stats import gamma
import matplotlib.pyplot as plt

sp.random.seed(3)  # to reproduce the data later on

x = sp.arange(1, 31 * 24)
y = sp.array(200 * (sp.sin(2 * sp.pi * x / (7 * 24))), dtype=int)
# out-of-place add so the integer array can upcast to float
# (modern NumPy rejects an in-place float add on an int array)
y = y + gamma.rvs(15, loc=0, scale=100, size=len(x))
y += 2 * sp.exp(x / 100.0)
y = sp.ma.array(y, mask=(y < 0))
print(sum(y), sum(y < 0))

plt.scatter(x, y)
plt.title("Web traffic over the last month")
plt.xlabel("Time")
plt.ylabel("Hits/hour")
plt.xticks([w * 7 * 24 for w in [0, 1, 2, 3, 4]],
           ['week %i' % (w + 1) for w in [0, 1, 2, 3, 4]])

plt.autoscale(tight=True)
plt.grid()
plt.savefig(os.path.join("..", "1400_01_01.png"))

data_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "..", "data")

# sp.savetxt(os.path.join("..", "web_traffic.tsv"),
#            zip(x[~y.mask], y[~y.mask]), delimiter="\t", fmt="%i")
sp.savetxt(os.path.join(
    data_dir, "web_traffic.tsv"), list(zip(x, y)), delimiter="\t", fmt="%s")

ch01/performance_test.py

Lines changed: 15 additions & 0 deletions
import timeit

normal_py_sec = timeit.timeit('sum(x*x for x in range(1000))',  # xrange in the Python 2 original
                              number=10000)
naive_np_sec = timeit.timeit('sum(na*na)',
                             setup="import numpy as np; na=np.arange(1000)",
                             number=10000)
good_np_sec = timeit.timeit('na.dot(na)',
                            setup="import numpy as np; na=np.arange(1000)",
                            number=10000)

print("Normal Python: %f sec" % normal_py_sec)
print("Naive NumPy: %f sec" % naive_np_sec)
print("Good NumPy: %f sec" % good_np_sec)

ch02/extra/create_tsv.py

Lines changed: 12 additions & 0 deletions
import milksets.iris
import milksets.seeds


def save_as_tsv(fname, module):
    features, labels = module.load()
    nlabels = [module.label_names[ell] for ell in labels]
    with open(fname, 'w') as ofile:
        for f, n in zip(features, nlabels):
            print("\t".join(list(map(str, f)) + [n]), file=ofile)

save_as_tsv('iris.tsv', milksets.iris)
save_as_tsv('seeds.tsv', milksets.seeds)
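
milksets is a third-party package of sample datasets (a companion to the milk machine-learning library); the script relies on each dataset module exposing load() and label_names. A quick interface check using only those attributes:

import milksets.seeds

features, labels = milksets.seeds.load()
print(features.shape)              # one row of measurements per sample
print(milksets.seeds.label_names)  # names indexed by the integer labels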

ch02/figure1.py

Lines changed: 21 additions & 0 deletions
import numpy as np
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt

data = load_iris()
features = data['data']
feature_names = data['feature_names']
target = data['target']


pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
for i, (p0, p1) in enumerate(pairs):
    plt.subplot(2, 3, i + 1)
    for t, marker, c in zip(range(3), ">ox", "rgb"):
        plt.scatter(features[target == t, p0],
                    features[target == t, p1],
                    marker=marker, c=c)
    plt.xlabel(feature_names[p0])
    plt.ylabel(feature_names[p1])
    plt.xticks([])
    plt.yticks([])
plt.savefig('../1400_02_01.png')

ch02/figure2.py

Lines changed: 38 additions & 0 deletions
COLOUR_FIGURE = False

from matplotlib import pyplot as plt
from sklearn.datasets import load_iris

data = load_iris()
features = data['data']
feature_names = data['feature_names']
species = data['target_names'][data['target']]

setosa = (species == 'setosa')
features = features[~setosa]
species = species[~setosa]
virginica = (species == 'virginica')

t = 1.75
p0, p1 = 3, 2

if COLOUR_FIGURE:
    area1c = (1., .8, .8)
    area2c = (.8, .8, 1.)
else:
    area1c = (1., 1., 1.)
    area2c = (.7, .7, .7)

x0, x1 = [features[:, p0].min() * .9, features[:, p0].max() * 1.1]
y0, y1 = [features[:, p1].min() * .9, features[:, p1].max() * 1.1]

plt.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c)
plt.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c)
plt.plot([t, t], [y0, y1], 'k--', lw=2)
plt.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2)
plt.scatter(features[virginica, p0], features[virginica, p1], c='b', marker='o')
plt.scatter(features[~virginica, p0], features[~virginica, p1], c='r', marker='x')
plt.ylim(y0, y1)
plt.xlim(x0, x1)
plt.xlabel(feature_names[p0])
plt.ylabel(feature_names[p1])
plt.savefig('../1400_02_02.png')
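
The dashed line at t = 1.75 is itself a one-feature classifier: predict virginica whenever petal width (feature 3) exceeds the cut. A quick sketch of how well that cut alone classifies (an addition, not in the commit):

import numpy as np
from sklearn.datasets import load_iris

data = load_iris()
features = data['data']
species = data['target_names'][data['target']]
features = features[species != 'setosa']
species = species[species != 'setosa']

pred = features[:, 3] > 1.75                      # the cut drawn above
print(np.mean(pred == (species == 'virginica')))  # roughly 0.94 on these 100 samples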

ch02/figure4_5.py

Lines changed: 56 additions & 0 deletions
COLOUR_FIGURE = False

from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from load import load_dataset
import numpy as np
from knn import learn_model, apply_model, accuracy

feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',
    'length of kernel groove',
]


def train_plot(features, labels):
    y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1
    x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1
    X = np.linspace(x0, x1, 100)
    Y = np.linspace(y0, y1, 100)
    X, Y = np.meshgrid(X, Y)

    model = learn_model(1, features[:, (0, 2)], np.array(labels))
    C = apply_model(np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape)
    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
    else:
        cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
    plt.xlim(x0, x1)
    plt.ylim(y0, y1)
    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[2])
    plt.pcolormesh(X, Y, C, cmap=cmap)
    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
        plt.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
    else:
        for lab, ma in zip(range(3), "Do^"):
            plt.plot(features[labels == lab, 0],
                     features[labels == lab, 2], ma, c=(1., 1., 1.))


features, labels = load_dataset('seeds')
names = sorted(set(labels))
labels = np.array([names.index(ell) for ell in labels])

train_plot(features, labels)
plt.savefig('../1400_02_04.png')

features -= features.mean(0)
features /= features.std(0)
train_plot(features, labels)
plt.savefig('../1400_02_05.png')
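
The final block z-scores every feature (zero mean, unit variance) before re-plotting. Without it, k-NN's Euclidean distance is dominated by whichever feature happens to have the largest numeric range; a toy illustration with made-up numbers:

import numpy as np

a = np.array([[1.0, 100.0], [2.0, 900.0]])
print(np.linalg.norm(a[0] - a[1]))  # ~800.0: the second feature dominates
z = (a - a.mean(0)) / a.std(0)
print(np.linalg.norm(z[0] - z[1]))  # ~2.83: both features contribute equally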

ch02/heldout.py

Lines changed: 27 additions & 0 deletions
from matplotlib import pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from threshold import learn_model, apply_model, accuracy

data = load_iris()
features = data['data']
labels = data['target_names'][data['target']]


setosa = (labels == 'setosa')
features = features[~setosa]
labels = labels[~setosa]
virginica = (labels == 'virginica')

# hold out every other example for testing
testing = np.tile([True, False], 50)
training = ~testing

model = learn_model(features[training], virginica[training])
train_accuracy = accuracy(features[training], virginica[training], model)
test_accuracy = accuracy(features[testing], virginica[testing], model)

print('''\
Training accuracy was {0:.1%}.
Testing accuracy was {1:.1%} (N = {2}).
'''.format(train_accuracy, test_accuracy, testing.sum()))
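
np.tile simply repeats the pattern, so the mask alternates test/train across the 100 non-setosa examples and each half gets exactly 50. For illustration (not in the commit):

import numpy as np

testing = np.tile([True, False], 50)
print(testing[:6])                      # [ True False  True False  True False]
print(testing.sum(), (~testing).sum())  # 50 50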

ch02/knn.py

Lines changed: 29 additions & 0 deletions
import numpy as np


def learn_model(k, features, labels):
    # a k-NN "model" is just k plus a copy of the training data
    return k, features.copy(), labels.copy()


def plurality(xs):
    # return the most common element of xs
    from collections import defaultdict
    counts = defaultdict(int)
    for x in xs:
        counts[x] += 1
    maxv = max(counts.values())
    for k, v in counts.items():
        if v == maxv:
            return k


def apply_model(features, model):
    k, train_feats, labels = model
    results = []
    for f in features:
        # rank the training points by Euclidean distance to f
        label_dist = []
        for t, ell in zip(train_feats, labels):
            label_dist.append((np.linalg.norm(f - t), ell))
        label_dist.sort(key=lambda d_ell: d_ell[0])
        label_dist = label_dist[:k]
        # predict the plurality label among the k nearest neighbours
        results.append(plurality([ell for _, ell in label_dist]))
    return np.array(results)


def accuracy(features, labels, model):
    preds = apply_model(features, model)
    return np.mean(preds == labels)
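
A minimal usage sketch for these helpers (assumed toy data, not part of the commit):

import numpy as np
from knn import learn_model, apply_model, accuracy

features = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
labels = np.array([0, 0, 1, 1])

model = learn_model(1, features, labels)           # 1-nearest neighbour
print(apply_model(np.array([[0.9, 0.8]]), model))  # [1]: closest point is (1, 1)
print(accuracy(features, labels, model))           # 1.0: 1-NN memorises its training set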
