
Commit 8684a43

ENH Add code from book
1 parent: abdf43b


92 files changed: 5,651 additions & 0 deletions

ch01/analyze_webstats.py

Lines changed: 144 additions & 0 deletions
import os
import scipy as sp
import matplotlib.pyplot as plt

data_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "..", "data")
data = sp.genfromtxt(os.path.join(data_dir, "web_traffic.tsv"), delimiter="\t")
print(data[:10])

# colors and line styles used to distinguish the fitted models
colors = ['g', 'k', 'b', 'm', 'r']
linestyles = ['-', '-.', '--', ':', '-']

x = data[:, 0]
y = data[:, 1]
print("Number of invalid entries:", sp.sum(sp.isnan(y)))
x = x[~sp.isnan(y)]
y = y[~sp.isnan(y)]

# plot input data


def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None):
    plt.clf()
    plt.scatter(x, y, s=10)
    plt.title("Web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    plt.xticks(
        [w * 7 * 24 for w in range(10)], ['week %i' % w for w in range(10)])

    if models:
        if mx is None:
            mx = sp.linspace(0, x[-1], 1000)
        for model, style, color in zip(models, linestyles, colors):
            plt.plot(mx, model(mx), linestyle=style, linewidth=2, c=color)

        plt.legend(["d=%i" % m.order for m in models], loc="upper left")

    plt.autoscale(tight=True)
    plt.ylim(ymin=0)
    if ymax:
        plt.ylim(ymax=ymax)
    if xmin:
        plt.xlim(xmin=xmin)
    plt.grid(True, linestyle='-', color='0.75')
    plt.savefig(fname)

# first look at the data
plot_models(x, y, None, os.path.join("..", "1400_01_01.png"))

# create and plot models
fp1, res, rank, sv, rcond = sp.polyfit(x, y, 1, full=True)
print("Model parameters: %s" % fp1)
print("Error of the model:", res)
f1 = sp.poly1d(fp1)
f2 = sp.poly1d(sp.polyfit(x, y, 2))
f3 = sp.poly1d(sp.polyfit(x, y, 3))
f10 = sp.poly1d(sp.polyfit(x, y, 10))
f100 = sp.poly1d(sp.polyfit(x, y, 100))

plot_models(x, y, [f1], os.path.join("..", "1400_01_02.png"))
plot_models(x, y, [f1, f2], os.path.join("..", "1400_01_03.png"))
plot_models(
    x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_04.png"))

# fit and plot a model using the knowledge about the inflection point
inflection = int(3.5 * 7 * 24)  # slice indices must be integers
xa = x[:inflection]
ya = y[:inflection]
xb = x[inflection:]
yb = y[inflection:]

fa = sp.poly1d(sp.polyfit(xa, ya, 1))
fb = sp.poly1d(sp.polyfit(xb, yb, 1))

plot_models(x, y, [fa, fb], os.path.join("..", "1400_01_05.png"))


def error(f, x, y):
    return sp.sum((f(x) - y) ** 2)

print("Errors for the complete data set:")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, x, y)))

print("Errors for only the time after inflection point")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

print("Error inflection=%f" % (error(fa, xa, ya) + error(fb, xb, yb)))


# extrapolating into the future
plot_models(
    x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_06.png"),
    mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

print("Trained only on data after inflection point")
fb1 = fb
fb2 = sp.poly1d(sp.polyfit(xb, yb, 2))
fb3 = sp.poly1d(sp.polyfit(xb, yb, 3))
fb10 = sp.poly1d(sp.polyfit(xb, yb, 10))
fb100 = sp.poly1d(sp.polyfit(xb, yb, 100))

print("Errors for only the time after inflection point")
for f in [fb1, fb2, fb3, fb10, fb100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

plot_models(
    x, y, [fb1, fb2, fb3, fb10, fb100], os.path.join("..", "1400_01_07.png"),
    mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

# separating training from testing data
frac = 0.3
split_idx = int(frac * len(xb))
shuffled = sp.random.permutation(list(range(len(xb))))
test = sorted(shuffled[:split_idx])
train = sorted(shuffled[split_idx:])
fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1))
fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2))
fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3))
fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10))
fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100))

print("Test errors for only the time after inflection point")
for f in [fbt1, fbt2, fbt3, fbt10, fbt100]:
    print("Error d=%i: %f" % (f.order, error(f, xb[test], yb[test])))

plot_models(
    x, y, [fbt1, fbt2, fbt3, fbt10, fbt100],
    os.path.join("..", "1400_01_08.png"),
    mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

from scipy.optimize import fsolve
print(fbt2)
print(fbt2 - 100000)
reached_max = fsolve(fbt2 - 100000, 800) / (7 * 24)
print("100,000 hits/hour expected at week %f" % reached_max[0])

ch01/gen_webstats.py

Lines changed: 35 additions & 0 deletions
# This script generates web traffic data for our hypothetical
# web startup "MLASS" in chapter 01

import os
import scipy as sp
from scipy.stats import gamma
import matplotlib.pyplot as plt

sp.random.seed(3)  # to reproduce the data later on

x = sp.arange(1, 31 * 24)
y = sp.array(200 * (sp.sin(2 * sp.pi * x / (7 * 24))), dtype=int)
# out-of-place add so the integer array can upcast to float
# (modern NumPy rejects an in-place float add on an int array)
y = y + gamma.rvs(15, loc=0, scale=100, size=len(x))
y += 2 * sp.exp(x / 100.0)
y = sp.ma.array(y, mask=(y < 0))
print(sum(y), sum(y < 0))

plt.scatter(x, y)
plt.title("Web traffic over the last month")
plt.xlabel("Time")
plt.ylabel("Hits/hour")
plt.xticks([w * 7 * 24 for w in [0, 1, 2, 3, 4]],
           ['week %i' % (w + 1) for w in [0, 1, 2, 3, 4]])

plt.autoscale(tight=True)
plt.grid()
plt.savefig(os.path.join("..", "1400_01_01.png"))

data_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "..", "data")

# sp.savetxt(os.path.join("..", "web_traffic.tsv"),
#            zip(x[~y.mask], y[~y.mask]), delimiter="\t", fmt="%i")
sp.savetxt(os.path.join(
    data_dir, "web_traffic.tsv"), list(zip(x, y)), delimiter="\t", fmt="%s")

ch01/performance_test.py

Lines changed: 15 additions & 0 deletions
import timeit

normal_py_sec = timeit.timeit('sum(x*x for x in range(1000))',  # xrange in the Python 2 original
                              number=10000)
naive_np_sec = timeit.timeit('sum(na*na)',
                             setup="import numpy as np; na=np.arange(1000)",
                             number=10000)
good_np_sec = timeit.timeit('na.dot(na)',
                            setup="import numpy as np; na=np.arange(1000)",
                            number=10000)

print("Normal Python: %f sec" % normal_py_sec)
print("Naive NumPy: %f sec" % naive_np_sec)
print("Good NumPy: %f sec" % good_np_sec)

ch02/extra/create_tsv.py

Lines changed: 12 additions & 0 deletions
import milksets.iris
import milksets.seeds


def save_as_tsv(fname, module):
    features, labels = module.load()
    nlabels = [module.label_names[ell] for ell in labels]
    with open(fname, 'w') as ofile:
        for f, n in zip(features, nlabels):
            print("\t".join(list(map(str, f)) + [n]), file=ofile)

save_as_tsv('iris.tsv', milksets.iris)
save_as_tsv('seeds.tsv', milksets.seeds)
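
milksets is a third-party package of sample datasets (a companion to the milk machine-learning library); the script relies on each dataset module exposing load() and label_names. A quick interface check using only those attributes:

import milksets.seeds

features, labels = milksets.seeds.load()
print(features.shape)              # one row of measurements per sample
print(milksets.seeds.label_names)  # names indexed by the integer labels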

ch02/figure1.py

Lines changed: 21 additions & 0 deletions
import numpy as np
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt

data = load_iris()
features = data['data']
feature_names = data['feature_names']
target = data['target']


pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
for i, (p0, p1) in enumerate(pairs):
    plt.subplot(2, 3, i + 1)
    for t, marker, c in zip(range(3), ">ox", "rgb"):
        plt.scatter(features[target == t, p0],
                    features[target == t, p1],
                    marker=marker, c=c)
    plt.xlabel(feature_names[p0])
    plt.ylabel(feature_names[p1])
    plt.xticks([])
    plt.yticks([])
plt.savefig('../1400_02_01.png')

ch02/figure2.py

Lines changed: 38 additions & 0 deletions
COLOUR_FIGURE = False

from matplotlib import pyplot as plt
from sklearn.datasets import load_iris

data = load_iris()
features = data['data']
feature_names = data['feature_names']
species = data['target_names'][data['target']]

setosa = (species == 'setosa')
features = features[~setosa]
species = species[~setosa]
virginica = (species == 'virginica')

t = 1.75
p0, p1 = 3, 2

if COLOUR_FIGURE:
    area1c = (1., .8, .8)
    area2c = (.8, .8, 1.)
else:
    area1c = (1., 1., 1.)
    area2c = (.7, .7, .7)

x0, x1 = [features[:, p0].min() * .9, features[:, p0].max() * 1.1]
y0, y1 = [features[:, p1].min() * .9, features[:, p1].max() * 1.1]

plt.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c)
plt.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c)
plt.plot([t, t], [y0, y1], 'k--', lw=2)
plt.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2)
plt.scatter(features[virginica, p0], features[virginica, p1], c='b', marker='o')
plt.scatter(features[~virginica, p0], features[~virginica, p1], c='r', marker='x')
plt.ylim(y0, y1)
plt.xlim(x0, x1)
plt.xlabel(feature_names[p0])
plt.ylabel(feature_names[p1])
plt.savefig('../1400_02_02.png')
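
The dashed line at t = 1.75 is itself a one-feature classifier: predict virginica whenever petal width (feature 3) exceeds the cut. A quick sketch of how well that cut alone classifies (an addition, not in the commit):

import numpy as np
from sklearn.datasets import load_iris

data = load_iris()
features = data['data']
species = data['target_names'][data['target']]
features = features[species != 'setosa']
species = species[species != 'setosa']

pred = features[:, 3] > 1.75                      # the cut drawn above
print(np.mean(pred == (species == 'virginica')))  # roughly 0.94 on these 100 samples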

ch02/figure4_5.py

Lines changed: 56 additions & 0 deletions
COLOUR_FIGURE = False

from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from load import load_dataset
import numpy as np
from knn import learn_model, apply_model, accuracy

feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',
    'length of kernel groove',
]


def train_plot(features, labels):
    y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1
    x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1
    X = np.linspace(x0, x1, 100)
    Y = np.linspace(y0, y1, 100)
    X, Y = np.meshgrid(X, Y)

    model = learn_model(1, features[:, (0, 2)], np.array(labels))
    C = apply_model(np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape)
    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
    else:
        cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
    plt.xlim(x0, x1)
    plt.ylim(y0, y1)
    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[2])
    plt.pcolormesh(X, Y, C, cmap=cmap)
    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
        plt.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
    else:
        for lab, ma in zip(range(3), "Do^"):
            plt.plot(features[labels == lab, 0],
                     features[labels == lab, 2], ma, c=(1., 1., 1.))


features, labels = load_dataset('seeds')
names = sorted(set(labels))
labels = np.array([names.index(ell) for ell in labels])

train_plot(features, labels)
plt.savefig('../1400_02_04.png')

features -= features.mean(0)
features /= features.std(0)
train_plot(features, labels)
plt.savefig('../1400_02_05.png')
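
The final block z-scores every feature (zero mean, unit variance) before re-plotting. Without it, k-NN's Euclidean distance is dominated by whichever feature happens to have the largest numeric range; a toy illustration with made-up numbers:

import numpy as np

a = np.array([[1.0, 100.0], [2.0, 900.0]])
print(np.linalg.norm(a[0] - a[1]))  # ~800.0: the second feature dominates
z = (a - a.mean(0)) / a.std(0)
print(np.linalg.norm(z[0] - z[1]))  # ~2.83: both features contribute equally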

ch02/heldout.py

Lines changed: 27 additions & 0 deletions
from matplotlib import pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from threshold import learn_model, apply_model, accuracy

data = load_iris()
features = data['data']
labels = data['target_names'][data['target']]


setosa = (labels == 'setosa')
features = features[~setosa]
labels = labels[~setosa]
virginica = (labels == 'virginica')

# hold out every other example for testing
testing = np.tile([True, False], 50)
training = ~testing

model = learn_model(features[training], virginica[training])
train_accuracy = accuracy(features[training], virginica[training], model)
test_accuracy = accuracy(features[testing], virginica[testing], model)

print('''\
Training accuracy was {0:.1%}.
Testing accuracy was {1:.1%} (N = {2}).
'''.format(train_accuracy, test_accuracy, testing.sum()))
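
np.tile simply repeats the pattern, so the mask alternates test/train across the 100 non-setosa examples and each half gets exactly 50. For illustration (not in the commit):

import numpy as np

testing = np.tile([True, False], 50)
print(testing[:6])                      # [ True False  True False  True False]
print(testing.sum(), (~testing).sum())  # 50 50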

ch02/knn.py

Lines changed: 29 additions & 0 deletions
import numpy as np


def learn_model(k, features, labels):
    # a k-NN "model" is just k plus a copy of the training data
    return k, features.copy(), labels.copy()


def plurality(xs):
    # return the most common element of xs
    from collections import defaultdict
    counts = defaultdict(int)
    for x in xs:
        counts[x] += 1
    maxv = max(counts.values())
    for k, v in counts.items():
        if v == maxv:
            return k


def apply_model(features, model):
    k, train_feats, labels = model
    results = []
    for f in features:
        # rank the training points by Euclidean distance to f
        label_dist = []
        for t, ell in zip(train_feats, labels):
            label_dist.append((np.linalg.norm(f - t), ell))
        label_dist.sort(key=lambda d_ell: d_ell[0])
        label_dist = label_dist[:k]
        # predict the plurality label among the k nearest neighbours
        results.append(plurality([ell for _, ell in label_dist]))
    return np.array(results)


def accuracy(features, labels, model):
    preds = apply_model(features, model)
    return np.mean(preds == labels)
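
A minimal usage sketch for these helpers (assumed toy data, not part of the commit):

import numpy as np
from knn import learn_model, apply_model, accuracy

features = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
labels = np.array([0, 0, 1, 1])

model = learn_model(1, features, labels)           # 1-nearest neighbour
print(apply_model(np.array([[0.9, 0.8]]), model))  # [1]: closest point is (1, 1)
print(accuracy(features, labels, model))           # 1.0: 1-NN memorises its training set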
