diff --git a/README.md b/README.md index 05218427..cbcd3155 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,14 @@ Building Machine Learning Systems with Python ============================================= -Source Code for the book Building Machine Learning Systems with Python by -[Willi Richert](http://twotoreal.com) and [Luis Pedro -Coelho](http://luispedro.org). +Source Code for the book Building Machine Learning Systems with Python by [Luis +Pedro Coelho](http://luispedro.org) and [Willi Richert](http://twotoreal.com). -The book was published in 2013 by Packt Publishing and is available [from their +The book was published in 2013 (second edition in 2015) by Packt Publishing and +is available [from their website](http://www.packtpub.com/building-machine-learning-systems-with-python/book). +The code in the repository corresponds to the second edition. Code for the +first edition is available in [first\_edition +branch](https://github.com/luispedro/BuildingMachineLearningSystemsWithPython/tree/first_edition). diff --git a/SimpleImageDataset/building02.jpg b/SimpleImageDataset/building02.jpg new file mode 100644 index 00000000..343c5242 Binary files /dev/null and b/SimpleImageDataset/building02.jpg differ diff --git a/SimpleImageDataset/scene01.jpg b/SimpleImageDataset/scene01.jpg index 29b327d2..e7b416d3 100644 Binary files a/SimpleImageDataset/scene01.jpg and b/SimpleImageDataset/scene01.jpg differ diff --git a/SimpleImageDataset/scene02.jpg b/SimpleImageDataset/scene02.jpg new file mode 100644 index 00000000..89fd6b27 Binary files /dev/null and b/SimpleImageDataset/scene02.jpg differ diff --git a/SimpleImageDataset/scene08.jpg b/SimpleImageDataset/scene08.jpg new file mode 100644 index 00000000..7dee8860 Binary files /dev/null and b/SimpleImageDataset/scene08.jpg differ diff --git a/ch01/analyze_webstats.py b/ch01/analyze_webstats.py index a053cec4..5da892e2 100644 --- a/ch01/analyze_webstats.py +++ b/ch01/analyze_webstats.py @@ -6,13 +6,15 @@ # It is made available under the MIT License import os +from utils import DATA_DIR, CHART_DIR import scipy as sp import matplotlib.pyplot as plt -from utils import DATA_DIR, CHART_DIR +sp.random.seed(3) # to reproduce the data later on data = sp.genfromtxt(os.path.join(DATA_DIR, "web_traffic.tsv"), delimiter="\t") print(data[:10]) +print(data.shape) # all examples will have three classes in this file colors = ['g', 'k', 'b', 'm', 'r'] @@ -24,10 +26,11 @@ x = x[~sp.isnan(y)] y = y[~sp.isnan(y)] -# plot input data - def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None): + ''' plot input data ''' + + plt.figure(num=None, figsize=(8, 6)) plt.clf() plt.scatter(x, y, s=10) plt.title("Web traffic over the last month") @@ -59,11 +62,15 @@ def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None): plot_models(x, y, None, os.path.join(CHART_DIR, "1400_01_01.png")) # create and plot models -fp1, res, rank, sv, rcond = sp.polyfit(x, y, 1, full=True) -print("Model parameters: %s" % fp1) -print("Error of the model:", res) +fp1, res1, rank1, sv1, rcond1 = sp.polyfit(x, y, 1, full=True) +print("Model parameters of fp1: %s" % fp1) +print("Error of the model of fp1:", res1) f1 = sp.poly1d(fp1) -f2 = sp.poly1d(sp.polyfit(x, y, 2)) + +fp2, res2, rank2, sv2, rcond2 = sp.polyfit(x, y, 2, full=True) +print("Model parameters of fp2: %s" % fp2) +print("Error of the model of fp2:", res2) +f2 = sp.poly1d(fp2) f3 = sp.poly1d(sp.polyfit(x, y, 3)) f10 = sp.poly1d(sp.polyfit(x, y, 10)) f100 = sp.poly1d(sp.polyfit(x, y, 100)) @@ -102,7 +109,8 @@ def 
error(f, x, y): # extrapolating into the future plot_models( - x, y, [f1, f2, f3, f10, f100], os.path.join(CHART_DIR, "1400_01_06.png"), + x, y, [f1, f2, f3, f10, f100], + os.path.join(CHART_DIR, "1400_01_06.png"), mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100), ymax=10000, xmin=0 * 7 * 24) @@ -118,7 +126,8 @@ def error(f, x, y): print("Error d=%i: %f" % (f.order, error(f, xb, yb))) plot_models( - x, y, [fb1, fb2, fb3, fb10, fb100], os.path.join(CHART_DIR, "1400_01_07.png"), + x, y, [fb1, fb2, fb3, fb10, fb100], + os.path.join(CHART_DIR, "1400_01_07.png"), mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100), ymax=10000, xmin=0 * 7 * 24) @@ -130,6 +139,8 @@ def error(f, x, y): train = sorted(shuffled[split_idx:]) fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1)) fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2)) +print("fbt2(x)= \n%s" % fbt2) +print("fbt2(x)-100,000= \n%s" % (fbt2-100000)) fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3)) fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10)) fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100)) @@ -139,13 +150,13 @@ def error(f, x, y): print("Error d=%i: %f" % (f.order, error(f, xb[test], yb[test]))) plot_models( - x, y, [fbt1, fbt2, fbt3, fbt10, fbt100], os.path.join(CHART_DIR, - "1400_01_08.png"), + x, y, [fbt1, fbt2, fbt3, fbt10, fbt100], + os.path.join(CHART_DIR, "1400_01_08.png"), mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100), ymax=10000, xmin=0 * 7 * 24) from scipy.optimize import fsolve print(fbt2) print(fbt2 - 100000) -reached_max = fsolve(fbt2 - 100000, 800) / (7 * 24) +reached_max = fsolve(fbt2 - 100000, x0=800) / (7 * 24) print("100,000 hits/hour expected at week %f" % reached_max[0]) diff --git a/ch01/gen_webstats.py b/ch01/gen_webstats.py index 2aea928a..61d0b738 100644 --- a/ch01/gen_webstats.py +++ b/ch01/gen_webstats.py @@ -17,26 +17,22 @@ sp.random.seed(3) # to reproduce the data later on -x = sp.arange(1, 31 * 24) -y = sp.array(200 * (sp.sin(2 * sp.pi * x / (7 * 24))), dtype=int) +x = sp.arange(1, 31*24) +y = sp.array(200*(sp.sin(2*sp.pi*x/(7*24))), dtype=int) y += gamma.rvs(15, loc=0, scale=100, size=len(x)) -y += 2 * sp.exp(x / 100.0) -y = sp.ma.array(y, mask=[y < 0]) -print(sum(y), sum(y < 0)) +y += 2 * sp.exp(x/100.0) +y = sp.ma.array(y, mask=[y<0]) +print(sum(y), sum(y<0)) plt.scatter(x, y) plt.title("Web traffic over the last month") plt.xlabel("Time") plt.ylabel("Hits/hour") -plt.xticks([w * 7 * 24 for w in [0, 1, 2, 3, 4]], ['week %i' % (w + 1) for w in [ - 0, 1, 2, 3, 4]]) - +plt.xticks([w*7*24 for w in range(5)], + ['week %i' %(w+1) for w in range(5)]) plt.autoscale(tight=True) plt.grid() plt.savefig(os.path.join(CHART_DIR, "1400_01_01.png")) -# sp.savetxt(os.path.join("..", "web_traffic.tsv"), -# zip(x[~y.mask],y[~y.mask]), delimiter="\t", fmt="%i") - -sp.savetxt(os.path.join( - DATA_DIR, "web_traffic.tsv"), list(zip(x, y)), delimiter="\t", fmt="%s") +sp.savetxt(os.path.join(DATA_DIR, "web_traffic.tsv"), + list(zip(x, y)), delimiter="\t", fmt="%s") diff --git a/ch01/performance_test.py b/ch01/performance_test.py index 16f3792d..f2111732 100644 --- a/ch01/performance_test.py +++ b/ch01/performance_test.py @@ -8,7 +8,7 @@ import timeit -normal_py_sec = timeit.timeit('sum(x*x for x in xrange(1000))', +normal_py_sec = timeit.timeit('sum(x*x for x in range(1000))', number=10000) naive_np_sec = timeit.timeit('sum(na*na)', setup="import numpy as np; na=np.arange(1000)", diff --git a/ch02/README.rst b/ch02/README.rst new file mode 100644 index 00000000..e2cb729a --- /dev/null +++ b/ch02/README.rst @@ -0,0 +1,55 @@ 
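(A note on the ch01 scripts above: the fitting-and-extrapolation workflow of analyze_webstats.py can be tried in isolation. The sketch below is not part of the repository; it uses made-up synthetic data and the same scipy-as-numpy aliasing the book's code relies on.)

import scipy as sp
from scipy.optimize import fsolve

sp.random.seed(3)
x = sp.arange(1, 31 * 24)                              # hours over one month
y = 100 + 0.5 * x + sp.random.normal(0, 20, len(x))    # invented hits/hour

# Least-squares fits of degree 1 and 2; full=True also returns the residuals
fp1, res1, _, _, _ = sp.polyfit(x, y, 1, full=True)
fp2, res2, _, _, _ = sp.polyfit(x, y, 2, full=True)
f1, f2 = sp.poly1d(fp1), sp.poly1d(fp2)
print("Residual d=1: %s, residual d=2: %s" % (res1, res2))

# When does the degree-2 model reach 1,000 hits/hour?
# fsolve finds a root of the polynomial f2(x) - 1000, starting from the guess x0=800.
reached = fsolve(f2 - 1000, x0=800) / (7 * 24)
print("1,000 hits/hour expected at week %f" % reached[0])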
+========= +Chapter 2 +========= + +Support code for *Chapter 2: Learning How to Classify with Real-world +Examples*. The directory data contains the seeds dataset, originally downloaded +from https://archive.ics.uci.edu/ml/datasets/seeds + +chapter.py + The code as printed in the book. + +figure1.py + Figure 1 in the book: all 2-by-2 scatter plots + +figure2.py + Figure 2 in the book: threshold & decision area + +figure4_5_sklearn.py + Figures 4 and 5 in the book: kNN decision borders before and after feature + normalization. This also produces a version of the figure using 11 + neighbors (not in the book), which shows that the result is smoother, not + as sensitive to exact positions of each datapoint. + +figure4_5_no_sklearn.py + Alternative code for Figures 4 and 5 without using scikit-learn + +load.py + Code to load the seeds data + +simple_threshold.py + Code from the book: finds the first partition, between Setosa and the other classes. + +stump.py + Code from the book: finds the second partition, between Virginica and Versicolor. + +threshold.py + Functional implementation of a threshold classifier + +heldout.py + Evaluate the threshold model on held-out data + +seeds_knn_sklearn.py + Demonstrate cross-validation and feature normalization using scikit-learn + +seeds_threshold.py + Test the thresholding model on the seeds dataset (result mentioned in the book, but no code) + +seeds_knn_increasing_k.py + Test effect of increasing num_neighbors on accuracy. + +knn.py + Implementation of k-nearest neighbors without using scikit-learn. + +seeds_knn.py + Demonstrate cross-validation (without scikit-learn) diff --git a/ch02/chapter.py b/ch02/chapter.py new file mode 100644 index 00000000..c68b45ab --- /dev/null +++ b/ch02/chapter.py @@ -0,0 +1,164 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + + +from matplotlib import pyplot as plt +import numpy as np + +# We load the data with load_iris from sklearn +from sklearn.datasets import load_iris +data = load_iris() + +# load_iris returns an object with several fields +features = data.data +feature_names = data.feature_names +target = data.target +target_names = data.target_names + +for t in range(3): + if t == 0: + c = 'r' + marker = '>' + elif t == 1: + c = 'g' + marker = 'o' + elif t == 2: + c = 'b' + marker = 'x' + plt.scatter(features[target == t, 0], + features[target == t, 1], + marker=marker, + c=c) +# We use NumPy fancy indexing to get an array of strings: +labels = target_names[target] + +# The petal length is the feature at position 2 +plength = features[:, 2] + +# Build an array of booleans: +is_setosa = (labels == 'setosa') + +# This is the important step: +max_setosa = plength[is_setosa].max() +min_non_setosa = plength[~is_setosa].min() +print('Maximum of setosa: {0}.'.format(max_setosa)) + +print('Minimum of others: {0}.'.format(min_non_setosa)) + +# ~ is the boolean negation operator +features = features[~is_setosa] +labels = labels[~is_setosa] +# Build a new target variable, is_virginica +is_virginica = (labels == 'virginica') + +# Initialize best_acc to an impossibly low value +best_acc = -1.0 +for fi in range(features.shape[1]): + # We are going to test all possible thresholds + thresh = features[:,fi] + for t in thresh: + + # Get the vector for feature `fi` + feature_i = features[:, fi] + # apply threshold `t` + pred = (feature_i > t) + acc = (pred == 
is_virginica).mean() + rev_acc = (pred == ~is_virginica).mean() + if rev_acc > acc: + reverse = True + acc = rev_acc + else: + reverse = False + + if acc > best_acc: + best_acc = acc + best_fi = fi + best_t = t + best_reverse = reverse + +print(best_fi, best_t, best_reverse, best_acc) + +def is_virginica_test(fi, t, reverse, example): + 'Apply threshold model to a new example' + test = example[fi] > t + if reverse: + test = not test + return test +from threshold import fit_model, predict + +# ning accuracy was 96.0%. +# ing accuracy was 90.0% (N = 50). +correct = 0.0 + +for ei in range(len(features)): + # select all but the one at position `ei`: + training = np.ones(len(features), bool) + training[ei] = False + testing = ~training + model = fit_model(features[training], is_virginica[training]) + predictions = predict(model, features[testing]) + correct += np.sum(predictions == is_virginica[testing]) +acc = correct/float(len(features)) +print('Accuracy: {0:.1%}'.format(acc)) + + +########################################### +############## SEEDS DATASET ############## +########################################### + +from load import load_dataset + +feature_names = [ + 'area', + 'perimeter', + 'compactness', + 'length of kernel', + 'width of kernel', + 'asymmetry coefficien', + 'length of kernel groove', +] +features, labels = load_dataset('seeds') + + + +from sklearn.neighbors import KNeighborsClassifier +classifier = KNeighborsClassifier(n_neighbors=1) +from sklearn.cross_validation import KFold + +kf = KFold(len(features), n_folds=5, shuffle=True) +means = [] +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns fraction + # of correct decisions for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Mean accuracy: {:.1%}'.format(np.mean(means))) + + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +classifier = KNeighborsClassifier(n_neighbors=1) +classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)]) + +means = [] +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns fraction + # of correct decisions for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Mean accuracy: {:.1%}'.format(np.mean(means))) diff --git a/ch02/extra/create_tsv.py b/ch02/extra/create_tsv.py index b0ddee89..e6d7b4fd 100644 --- a/ch02/extra/create_tsv.py +++ b/ch02/extra/create_tsv.py @@ -5,7 +5,6 @@ # # It is made available under the MIT License -import milksets.iris import milksets.seeds @@ -16,5 +15,4 @@ def save_as_tsv(fname, module): for f, n in zip(features, nlabels): print >>ofile, "\t".join(map(str, f) + [n]) -save_as_tsv('iris.tsv', milksets.iris) save_as_tsv('seeds.tsv', milksets.seeds) diff --git a/ch02/figure1.py b/ch02/figure1.py index 922b8ce1..4ec6fff8 100644 --- a/ch02/figure1.py +++ b/ch02/figure1.py @@ -5,24 +5,38 @@ # # It is made available under the MIT License -import numpy as np -from sklearn.datasets import load_iris from matplotlib import pyplot as plt -data = load_iris() -features = 
data['data'] -feature_names = data['feature_names'] -target = data['target'] +# We load the data with load_iris from sklearn +from sklearn.datasets import load_iris +# load_iris returns an object with several fields +data = load_iris() +features = data.data +feature_names = data.feature_names +target = data.target +target_names = data.target_names +fig,axes = plt.subplots(2, 3) pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)] + +# Set up 3 different pairs of (color, marker) +color_markers = [ + ('r', '>'), + ('g', 'o'), + ('b', 'x'), + ] for i, (p0, p1) in enumerate(pairs): - plt.subplot(2, 3, i + 1) - for t, marker, c in zip(range(3), ">ox", "rgb"): - plt.scatter(features[target == t, p0], features[ + ax = axes.flat[i] + + for t in range(3): + # Use a different color/marker for each class `t` + c,marker = color_markers[t] + ax.scatter(features[target == t, p0], features[ target == t, p1], marker=marker, c=c) - plt.xlabel(feature_names[p0]) - plt.ylabel(feature_names[p1]) - plt.xticks([]) - plt.yticks([]) -plt.savefig('figure1.png') + ax.set_xlabel(feature_names[p0]) + ax.set_ylabel(feature_names[p1]) + ax.set_xticks([]) + ax.set_yticks([]) +fig.tight_layout() +fig.savefig('figure1.png') diff --git a/ch02/figure2.py b/ch02/figure2.py index a04562f8..0b69d395 100644 --- a/ch02/figure2.py +++ b/ch02/figure2.py @@ -10,17 +10,25 @@ from matplotlib import pyplot as plt from sklearn.datasets import load_iris data = load_iris() -features = data['data'] -feature_names = data['feature_names'] -species = data['target_names'][data['target']] +features = data.data +feature_names = data.feature_names +target = data.target +target_names = data.target_names -setosa = (species == 'setosa') -features = features[~setosa] -species = species[~setosa] -virginica = species == 'virginica' +# We use NumPy fancy indexing to get an array of strings: +labels = target_names[target] -t = 1.75 -p0, p1 = 3, 2 +is_setosa = (labels == 'setosa') +features = features[~is_setosa] +labels = labels[~is_setosa] +is_virginica = (labels == 'virginica') + +# Hand fixed thresholds: +t = 1.65 +t2 = 1.75 + +# Features to use: 3 & 2 +f0, f1 = 3, 2 if COLOUR_FIGURE: area1c = (1., .8, .8) @@ -29,19 +37,27 @@ area1c = (1., 1, 1) area2c = (.7, .7, .7) -x0, x1 = [features[:, p0].min() * .9, features[:, p0].max() * 1.1] -y0, y1 = [features[:, p1].min() * .9, features[:, p1].max() * 1.1] - -plt.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c) -plt.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c) -plt.plot([t, t], [y0, y1], 'k--', lw=2) -plt.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2) -plt.scatter(features[virginica, p0], - features[virginica, p1], c='b', marker='o') -plt.scatter(features[~virginica, p0], - features[~virginica, p1], c='r', marker='x') -plt.ylim(y0, y1) -plt.xlim(x0, x1) -plt.xlabel(feature_names[p0]) -plt.ylabel(feature_names[p1]) -plt.savefig('figure2.png') +# Plot from 90% of smallest value to 110% of largest value +# (all feature values are positive, otherwise this would not work very well) + +x0 = features[:, f0].min() * .9 +x1 = features[:, f0].max() * 1.1 + +y0 = features[:, f1].min() * .9 +y1 = features[:, f1].max() * 1.1 + +fig,ax = plt.subplots() +ax.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c) +ax.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c) +ax.plot([t, t], [y0, y1], 'k--', lw=2) +ax.plot([t2, t2], [y0, y1], 'k:', lw=2) +ax.scatter(features[is_virginica, f0], + features[is_virginica, f1], c='b', marker='o', s=40) +ax.scatter(features[~is_virginica, f0], + 
features[~is_virginica, f1], c='r', marker='x', s=40) +ax.set_ylim(y0, y1) +ax.set_xlim(x0, x1) +ax.set_xlabel(feature_names[f0]) +ax.set_ylabel(feature_names[f1]) +fig.tight_layout() +fig.savefig('figure2.png') diff --git a/ch02/figure4_5.py b/ch02/figure4_5_no_sklearn.py similarity index 60% rename from ch02/figure4_5.py rename to ch02/figure4_5_no_sklearn.py index 3ddece0c..adc83d73 100644 --- a/ch02/figure4_5.py +++ b/ch02/figure4_5_no_sklearn.py @@ -11,7 +11,7 @@ from matplotlib.colors import ListedColormap from load import load_dataset import numpy as np -from knn import learn_model, apply_model, accuracy +from knn import fit_model, predict feature_names = [ 'area', @@ -24,42 +24,56 @@ ] -def train_plot(features, labels): +def plot_decision(features, labels): + '''Plots decision boundary for KNN + + Parameters + ---------- + features : ndarray + labels : sequence + + Returns + ------- + fig : Matplotlib Figure + ax : Matplotlib Axes + ''' y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1 x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1 X = np.linspace(x0, x1, 100) Y = np.linspace(y0, y1, 100) X, Y = np.meshgrid(X, Y) - model = learn_model(1, features[:, (0, 2)], np.array(labels)) - C = apply_model( - np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape) + model = fit_model(1, features[:, (0, 2)], np.array(labels)) + C = predict( + model, np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape) if COLOUR_FIGURE: cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)]) else: cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)]) - plt.xlim(x0, x1) - plt.ylim(y0, y1) - plt.xlabel(feature_names[0]) - plt.ylabel(feature_names[2]) - plt.pcolormesh(X, Y, C, cmap=cmap) + fig,ax = plt.subplots() + ax.set_xlim(x0, x1) + ax.set_ylim(y0, y1) + ax.set_xlabel(feature_names[0]) + ax.set_ylabel(feature_names[2]) + ax.pcolormesh(X, Y, C, cmap=cmap) if COLOUR_FIGURE: cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)]) - plt.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap) + ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap) else: for lab, ma in zip(range(3), "Do^"): - plt.plot(features[labels == lab, 0], features[ + ax.plot(features[labels == lab, 0], features[ labels == lab, 2], ma, c=(1., 1., 1.)) + return fig,ax features, labels = load_dataset('seeds') names = sorted(set(labels)) labels = np.array([names.index(ell) for ell in labels]) -train_plot(features, labels) -plt.savefig('figure4.png') +fig,ax = plot_decision(features, labels) +fig.savefig('figure4.png') features -= features.mean(0) features /= features.std(0) -train_plot(features, labels) -plt.savefig('figure5.png') +fig,ax = plot_decision(features, labels) +fig.savefig('figure5.png') diff --git a/ch02/figure4_5_sklearn.py b/ch02/figure4_5_sklearn.py new file mode 100644 index 00000000..55ac0c80 --- /dev/null +++ b/ch02/figure4_5_sklearn.py @@ -0,0 +1,85 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +COLOUR_FIGURE = False + +from matplotlib import pyplot as plt +from matplotlib.colors import ListedColormap +from load import load_dataset +import numpy as np +from sklearn.neighbors import KNeighborsClassifier + +feature_names = [ + 'area', + 'perimeter', + 'compactness', + 'length of kernel', + 'width of kernel', + 'asymmetry coefficien', + 'length of kernel groove', +] + + 
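(The figure4_5_no_sklearn.py rename above switches to the new fit_model/predict names from ch02/knn.py. The repository's knn.py internals are not reproduced in this diff, so the following is my own minimal 1-nearest-neighbor sketch of the same idea; function names and data are illustrative only.)

# Minimal 1-NN sketch (illustration only, not the code in ch02/knn.py):
# the "model" is just a copy of the training data; prediction copies the
# label of the closest stored point.
import numpy as np

def fit_model_1nn(features, labels):
    return features.copy(), np.asarray(labels).copy()

def predict_1nn(model, features):
    train_feats, train_labels = model
    preds = []
    for f in features:
        dists = ((train_feats - f) ** 2).sum(axis=1)   # squared Euclidean distance
        preds.append(train_labels[dists.argmin()])
    return np.array(preds)

# Usage on made-up points:
model = fit_model_1nn(np.array([[0., 0.], [1., 1.]]), ['a', 'b'])
print(predict_1nn(model, np.array([[0.1, 0.2], [0.9, 0.8]])))   # -> ['a' 'b']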
+def plot_decision(features, labels, num_neighbors=1): + '''Plots decision boundary for KNN + + Parameters + ---------- + features : ndarray + labels : sequence + + Returns + ------- + fig : Matplotlib Figure + ax : Matplotlib Axes + ''' + y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1 + x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1 + X = np.linspace(x0, x1, 1000) + Y = np.linspace(y0, y1, 1000) + X, Y = np.meshgrid(X, Y) + + model = KNeighborsClassifier(num_neighbors) + model.fit(features[:, (0,2)], labels) + C = model.predict(np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape) + if COLOUR_FIGURE: + cmap = ListedColormap([(1., .7, .7), (.7, 1., .7), (.7, .7, 1.)]) + else: + cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)]) + fig,ax = plt.subplots() + ax.set_xlim(x0, x1) + ax.set_ylim(y0, y1) + ax.set_xlabel(feature_names[0]) + ax.set_ylabel(feature_names[2]) + ax.pcolormesh(X, Y, C, cmap=cmap) + if COLOUR_FIGURE: + cmap = ListedColormap([(1., .0, .0), (.1, .6, .1), (.0, .0, 1.)]) + ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap) + else: + for lab, ma in zip(range(3), "Do^"): + ax.plot(features[labels == lab, 0], features[ + labels == lab, 2], ma, c=(1., 1., 1.), ms=6) + return fig,ax + + +features, labels = load_dataset('seeds') +names = sorted(set(labels)) +labels = np.array([names.index(ell) for ell in labels]) + +fig,ax = plot_decision(features, labels) +fig.tight_layout() +fig.savefig('figure4sklearn.png') + +features -= features.mean(0) +features /= features.std(0) +fig,ax = plot_decision(features, labels) +fig.tight_layout() +fig.savefig('figure5sklearn.png') + +fig,ax = plot_decision(features, labels, 11) +fig.tight_layout() +fig.savefig('figure5sklearn_with_11_neighbors.png') diff --git a/ch02/heldout.py b/ch02/heldout.py index a14b2df3..e381e706 100644 --- a/ch02/heldout.py +++ b/ch02/heldout.py @@ -8,30 +8,32 @@ # This script demonstrates the difference between the training accuracy and # testing (held-out) accuracy. -from matplotlib import pyplot as plt import numpy as np from sklearn.datasets import load_iris -from threshold import learn_model, apply_model, accuracy +from threshold import fit_model, accuracy data = load_iris() features = data['data'] labels = data['target_names'][data['target']] # We are going to remove the setosa examples as they are too easy: -setosa = (labels == 'setosa') -features = features[~setosa] -labels = labels[~setosa] +is_setosa = (labels == 'setosa') +features = features[~is_setosa] +labels = labels[~is_setosa] # Now we classify virginica vs non-virginica -virginica = (labels == 'virginica') +is_virginica = (labels == 'virginica') # Split the data in two: testing and training testing = np.tile([True, False], 50) # testing = [True,False,True,False,True,False...] + +# Training is the negation of testing: i.e., datapoints not used for testing, +# will be used for training training = ~testing -model = learn_model(features[training], virginica[training]) -train_accuracy = accuracy(features[training], virginica[training], model) -test_accuracy = accuracy(features[testing], virginica[testing], model) +model = fit_model(features[training], is_virginica[training]) +train_accuracy = accuracy(features[training], is_virginica[training], model) +test_accuracy = accuracy(features[testing], is_virginica[testing], model) print('''\ Training accuracy was {0:.1%}. 
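(heldout.py above keeps the book's alternating train/test mask. The following self-contained sketch shows the same evaluation pattern, with scikit-learn's KNeighborsClassifier standing in for the chapter's threshold model, so the accuracies will not match the book's numbers.)

# Sketch: training vs. held-out accuracy with an alternating boolean mask.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

data = load_iris()
features = data.data
labels = data.target_names[data.target]

# Drop the easy setosa examples and classify virginica vs. the rest
is_setosa = (labels == 'setosa')
features = features[~is_setosa]
is_virginica = (labels[~is_setosa] == 'virginica')

# Every other example goes to testing; the rest is used for training
testing = np.tile([True, False], 50)
training = ~testing

clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(features[training], is_virginica[training])
print('Training accuracy: {0:.1%}'.format(
    np.mean(clf.predict(features[training]) == is_virginica[training])))
print('Testing accuracy:  {0:.1%}'.format(
    np.mean(clf.predict(features[testing]) == is_virginica[testing])))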
diff --git a/ch02/knn.py b/ch02/knn.py index 0c9151c9..89ebfdb4 100644 --- a/ch02/knn.py +++ b/ch02/knn.py @@ -7,8 +7,8 @@ import numpy as np - -def learn_model(k, features, labels): +# This function was called ``learn_model`` in the first edition +def fit_model(k, features, labels): '''Learn a k-nn model''' # There is no model in k-nn, just a copy of the inputs return k, features.copy(), labels.copy() @@ -25,8 +25,8 @@ def plurality(xs): if v == maxv: return k - -def apply_model(features, model): +# This function was called ``apply_model`` in the first edition +def predict(model, features): '''Apply k-nn model''' k, train_feats, labels = model results = [] @@ -42,5 +42,5 @@ def apply_model(features, model): def accuracy(features, labels, model): - preds = apply_model(features, model) + preds = predict(model, features) return np.mean(preds == labels) diff --git a/ch02/seeds_knn.py b/ch02/seeds_knn.py index 158b21b2..c18d9592 100644 --- a/ch02/seeds_knn.py +++ b/ch02/seeds_knn.py @@ -7,7 +7,7 @@ from load import load_dataset import numpy as np -from knn import learn_model, apply_model, accuracy +from knn import fit_model, accuracy features, labels = load_dataset('seeds') @@ -19,7 +19,7 @@ def cross_validate(features, labels): training = np.ones(len(features), bool) training[fold::10] = 0 testing = ~training - model = learn_model(1, features[training], labels[training]) + model = fit_model(1, features[training], labels[training]) test_error = accuracy(features[testing], labels[testing], model) error += test_error diff --git a/ch02/seeds_knn_increasing_k.py b/ch02/seeds_knn_increasing_k.py new file mode 100644 index 00000000..7cd8b3f9 --- /dev/null +++ b/ch02/seeds_knn_increasing_k.py @@ -0,0 +1,48 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +# Basic imports +from __future__ import print_function +import numpy as np +from matplotlib import pyplot as plt +from load import load_dataset + + +from sklearn.neighbors import KNeighborsClassifier + +from sklearn.cross_validation import cross_val_score +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + + +features, labels = load_dataset('seeds') + +# Values of k to consider: all in 1 .. 160 +ks = np.arange(1,161) + +# We build a classifier object here with the default number of neighbors +# (It happens to be 5, but it does not matter as we will be changing it below +classifier = KNeighborsClassifier() +classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)]) + +# accuracies will hold our results +accuracies = [] +for k in ks: + # set the classifier parameter + classifier.set_params(knn__n_neighbors=k) + crossed = cross_val_score(classifier, features, labels) + + # Save only the average + accuracies.append(crossed.mean()) + +accuracies = np.array(accuracies) + +# Scale the accuracies by 100 to plot as a percentage instead of as a fraction +plt.plot(ks, accuracies*100) +plt.xlabel('Value for k (nr. 
of neighbors)') +plt.ylabel('Accuracy (%)') +plt.savefig('figure6.png') diff --git a/ch02/seeds_knn_sklearn.py b/ch02/seeds_knn_sklearn.py new file mode 100644 index 00000000..ac89bb59 --- /dev/null +++ b/ch02/seeds_knn_sklearn.py @@ -0,0 +1,90 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +# Basic imports +from __future__ import print_function +import numpy as np +from load import load_dataset + + +# Import sklearn implementation of KNN +from sklearn.neighbors import KNeighborsClassifier + +features, labels = load_dataset('seeds') +classifier = KNeighborsClassifier(n_neighbors=4) + + +n = len(features) +correct = 0.0 +for ei in range(n): + training = np.ones(n, bool) + training[ei] = 0 + testing = ~training + classifier.fit(features[training], labels[training]) + pred = classifier.predict(features[ei]) + correct += (pred == labels[ei]) +print('Result of leave-one-out: {}'.format(correct/n)) + +# Import KFold object +from sklearn.cross_validation import KFold + +# means will hold the mean for each fold +means = [] + +# kf is a generator of pairs (training,testing) so that each iteration +# implements a separate fold. +kf = KFold(len(features), n_folds=3, shuffle=True) +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns the fraction of correct decisions + # for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Result of cross-validation using KFold: {}'.format(means)) + +# The function cross_val_score does the same thing as the loop above with a +# single function call + +from sklearn.cross_validation import cross_val_score +crossed = cross_val_score(classifier, features, labels) +print('Result of cross-validation using cross_val_score: {}'.format(crossed)) + +# The results above use the features as is, which we learned was not optimal +# except if the features happen to all be in the same scale. 
We can pre-scale +# the features as explained in the main text: + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)]) +crossed = cross_val_score(classifier, features, labels) +print('Result with prescaling: {}'.format(crossed)) + + +# Now, generate & print a cross-validated confusion matrix for the same result +from sklearn.metrics import confusion_matrix +names = list(set(labels)) +labels = np.array([names.index(ell) for ell in labels]) +preds = labels.copy() +preds[:] = -1 +for train, test in kf: + classifier.fit(features[train], labels[train]) + preds[test] = classifier.predict(features[test]) + +cmat = confusion_matrix(labels, preds) +print() +print('Confusion matrix: [rows represent true outcome, columns predicted outcome]') +print(cmat) + +# The explicit float() conversion is necessary in Python 2 +# (Otherwise, result is rounded to 0) +acc = cmat.trace()/float(cmat.sum()) +print('Accuracy: {0:.1%}'.format(acc)) + diff --git a/ch02/seeds_threshold.py b/ch02/seeds_threshold.py index cd184107..6b1f87d0 100644 --- a/ch02/seeds_threshold.py +++ b/ch02/seeds_threshold.py @@ -7,7 +7,7 @@ from load import load_dataset import numpy as np -from threshold import learn_model, apply_model, accuracy +from threshold import fit_model, accuracy features, labels = load_dataset('seeds') @@ -24,7 +24,7 @@ # whatever is not training is for testing testing = ~training - model = learn_model(features[training], labels[training]) + model = fit_model(features[training], labels[training]) test_error = accuracy(features[testing], labels[testing], model) error += test_error diff --git a/ch02/simple_threshold.py b/ch02/simple_threshold.py index 3f432512..d174f283 100644 --- a/ch02/simple_threshold.py +++ b/ch02/simple_threshold.py @@ -5,7 +5,6 @@ # # It is made available under the MIT License -import numpy as np from sklearn.datasets import load_iris data = load_iris() @@ -13,8 +12,14 @@ target = data['target'] target_names = data['target_names'] labels = target_names[target] - plength = features[:, 2] + +# To use numpy operations to get setosa features, +# we build a boolean array is_setosa = (labels == 'setosa') -print('Maximum of setosa: {0}.'.format(plength[is_setosa].max())) -print('Minimum of others: {0}.'.format(plength[~is_setosa].min())) + +max_setosa = plength[is_setosa].max() +min_non_setosa = plength[~is_setosa].min() + +print('Maximum of setosa: {0}.'.format(max_setosa)) +print('Minimum of others: {0}.'.format(min_non_setosa)) diff --git a/ch02/stump.py b/ch02/stump.py index 841ce983..0dfaec85 100644 --- a/ch02/stump.py +++ b/ch02/stump.py @@ -5,29 +5,51 @@ # # It is made available under the MIT License -from matplotlib import pyplot as plt from sklearn.datasets import load_iris data = load_iris() -features = data['data'] -labels = data['target_names'][data['target']] +features = data.data +labels = data.target_names[data.target] -setosa = (labels == 'setosa') -features = features[~setosa] -labels = labels[~setosa] -virginica = (labels == 'virginica') +is_setosa = (labels == 'setosa') +features = features[~is_setosa] +labels = labels[~is_setosa] +is_virginica = (labels == 'virginica') +# Initialize to a value that is worse than any possible test best_acc = -1.0 + +# Loop over all the features for fi in range(features.shape[1]): + # Test every possible threshold value for feature fi thresh = features[:, fi].copy() + + # Test them in order thresh.sort() for t in thresh: + + # Generate 
predictions using t as a threshold pred = (features[:, fi] > t) - acc = (pred == virginica).mean() + + # Accuracy is the fraction of predictions that match reality + acc = (pred == is_virginica).mean() + + # We test whether negating the test is a better threshold: + acc_neg = ((~pred) == is_virginica).mean() + if acc_neg > acc: + acc = acc_neg + negated = True + else: + negated = False + + # If this is better than previous best, then this is now the new best: + if acc > best_acc: best_acc = acc best_fi = fi best_t = t -print('Best cut is {0} on feature {1}, which achieves accuracy of {2:.1%}.'.format( - best_t, best_fi, best_acc)) + best_is_negated = negated + +print('Best threshold is {0} on feature {1} (index {2}), which achieves accuracy of {3:.1%}.'.format( + best_t, data.feature_names[best_fi], best_fi, best_acc)) diff --git a/ch02/threshold.py b/ch02/threshold.py index 40337433..d621a350 100644 --- a/ch02/threshold.py +++ b/ch02/threshold.py @@ -8,7 +8,8 @@ import numpy as np -def learn_model(features, labels): +# This function was called ``learn_model`` in the first edition +def fit_model(features, labels): '''Learn a simple threshold model''' best_acc = -1.0 # Loop over all the features: @@ -21,22 +22,34 @@ def learn_model(features, labels): # Measure the accuracy of this acc = (pred == labels).mean() + + rev_acc = (pred == ~labels).mean() + if rev_acc > acc: + acc = rev_acc + reverse = True + else: + reverse = False if acc > best_acc: best_acc = acc best_fi = fi best_t = t + best_reverse = reverse # A model is a threshold and an index - return best_t, best_fi + return best_t, best_fi, best_reverse -def apply_model(features, model): +# This function was called ``apply_model`` in the first edition +def predict(model, features): '''Apply a learned model''' - # A model is a pair as returned by learn_model - t, fi = model - return features[:, fi] > t + # A model is a pair as returned by fit_model + t, fi, reverse = model + if reverse: + return features[:, fi] <= t + else: + return features[:, fi] > t def accuracy(features, labels, model): '''Compute the accuracy of the model''' - preds = apply_model(features, model) + preds = predict(model, features) return np.mean(preds == labels) diff --git a/ch03/data/toy/03.txt b/ch03/data/toy/03.txt index 7ca1f741..c5e03e90 100644 --- a/ch03/data/toy/03.txt +++ b/ch03/data/toy/03.txt @@ -1 +1 @@ -Most imaging databases safe images permanently. \ No newline at end of file +Most imaging databases save images permanently. 
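(Usage sketch for the refactored ch02/threshold.py API above: fit_model now returns a (threshold, feature index, reverse) triple and predict takes the model as its first argument. The toy arrays below are invented; only the imported names come from the repository.)

# Toy usage of the second-edition threshold module API.
import numpy as np
from threshold import fit_model, predict, accuracy

# Two features; only the first one separates the two classes
features = np.array([[1.0, 5.0],
                     [1.2, 3.0],
                     [3.1, 4.0],
                     [3.5, 2.0]])
labels = np.array([False, False, True, True])

model = fit_model(features, labels)   # (best_t, best_fi, best_reverse)
print('Threshold {0} on feature {1} (reversed: {2})'.format(*model))
print('Predictions:', predict(model, features))
print('Training accuracy: {0:.1%}'.format(accuracy(features, labels, model)))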
diff --git a/ch03/noise_analysis.py b/ch03/noise_analysis.py new file mode 100644 index 00000000..11ca13f4 --- /dev/null +++ b/ch03/noise_analysis.py @@ -0,0 +1,69 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import sklearn.datasets + +groups = [ + 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', + 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space'] +train_data = sklearn.datasets.fetch_20newsgroups(subset="train", + categories=groups) + +labels = train_data.target +num_clusters = 50 # sp.unique(labels).shape[0] + +import nltk.stem +english_stemmer = nltk.stem.SnowballStemmer('english') + +from sklearn.feature_extraction.text import TfidfVectorizer + + +class StemmedTfidfVectorizer(TfidfVectorizer): + + def build_analyzer(self): + analyzer = super(TfidfVectorizer, self).build_analyzer() + return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc)) + +vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, + stop_words='english', decode_error='ignore' + ) +vectorized = vectorizer.fit_transform(train_data.data) + +post_group = zip(train_data.data, train_data.target) +# Create a list of tuples that can be sorted by +# the length of the posts +all = [(len(post[0]), post[0], train_data.target_names[post[1]]) + for post in post_group] +graphics = sorted([post for post in all if post[2] == 'comp.graphics']) +print(graphics[5]) +# (245, 'From: SITUNAYA@IBM3090.BHAM.AC.UK\nSubject: test....(sorry)\nOrganization: +# The University of Birmingham, United Kingdom\nLines: 1\nNNTP-Posting-Host: ibm3090.bham.ac.uk +# \n\n==============================================================================\n', +# 'comp.graphics') + +noise_post = graphics[5][1] + +analyzer = vectorizer.build_analyzer() +print(list(analyzer(noise_post))) + +useful = set(analyzer(noise_post)).intersection(vectorizer.get_feature_names()) +print(sorted(useful)) +# ['ac', 'birmingham', 'host', 'kingdom', 'nntp', 'sorri', 'test', 'uk', 'unit', 'univers'] + +for term in sorted(useful): + print('IDF(%s)=%.2f' % (term, + vectorizer._tfidf.idf_[vectorizer.vocabulary_[term]])) +# IDF(ac)=3.51 +# IDF(birmingham)=6.77 +# IDF(host)=1.74 +# IDF(kingdom)=6.68 +# IDF(nntp)=1.77 +# IDF(sorri)=4.14 +# IDF(test)=3.83 +# IDF(uk)=3.70 +# IDF(unit)=4.42 +# IDF(univers)=1.91 diff --git a/ch03/plot_kmeans_example.py b/ch03/plot_kmeans_example.py index c012894c..43b9b71b 100644 --- a/ch03/plot_kmeans_example.py +++ b/ch03/plot_kmeans_example.py @@ -15,7 +15,7 @@ from matplotlib import pylab from sklearn.cluster import KMeans -from utils import DATA_DIR, CHART_DIR +from utils import CHART_DIR seed = 2 sp.random.seed(seed) # to reproduce the data later on @@ -33,7 +33,6 @@ def plot_clustering(x, y, title, mx=None, ymax=None, xmin=None, km=None): pylab.title(title) pylab.xlabel("Occurrence word 1") pylab.ylabel("Occurrence word 2") - # pylab.xticks([w*7*24 for w in range(10)], ['week %i'%w for w in range(10)]) pylab.autoscale(tight=True) pylab.ylim(ymin=0, ymax=1) diff --git a/ch03/rel_post_01.py b/ch03/rel_post_01.py index 2f807f3d..9c516180 100644 --- a/ch03/rel_post_01.py +++ b/ch03/rel_post_01.py @@ -43,8 +43,7 @@ def build_analyzer(self): return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc)) vectorizer = StemmedTfidfVectorizer( - min_df=1, stop_words='english', charset_error='ignore') -print(vectorizer) + min_df=1, 
stop_words='english', decode_error='ignore') X_train = vectorizer.fit_transform(posts) diff --git a/ch03/rel_post_mlcomp_01.py b/ch03/rel_post_20news.py similarity index 57% rename from ch03/rel_post_mlcomp_01.py rename to ch03/rel_post_20news.py index cc738f2c..6dad9431 100644 --- a/ch03/rel_post_mlcomp_01.py +++ b/ch03/rel_post_20news.py @@ -5,19 +5,9 @@ # # It is made available under the MIT License -import os -import sys import sklearn.datasets import scipy as sp -from utils import DATA_DIR - -if not os.path.exists(DATA_DIR): - print("""\ -It seems that you have not yet downloaded the MLCOMP data set. -Please do so and place it into %s."""%DATA_DIR) - sys.exit(1) - new_post = \ """Disk drive problems. Hi, I have a problem with my hard disk. After 1 year it is working only sporadically now. @@ -25,15 +15,28 @@ Any ideas? Thanks. """ +print("""\ +Dear reader of the 1st edition of 'Building Machine Learning Systems with Python'! +For the 2nd edition we introduced a couple of changes that will result into +results that differ from the results in the 1st edition. +E.g. we now fully rely on scikit's fetch_20newsgroups() instead of requiring +you to download the data manually from MLCOMP. +If you have any questions, please ask at http://www.twotoreal.com +""") + +all_data = sklearn.datasets.fetch_20newsgroups(subset="all") +print("Number of total posts: %i" % len(all_data.filenames)) +# Number of total posts: 18846 + groups = [ 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', - 'comp.sys.ma c.hardware', 'comp.windows.x', 'sci.space'] -dataset = sklearn.datasets.load_mlcomp("20news-18828", "train", - mlcomp_root=DATA_DIR, - categories=groups) -print("Number of posts:", len(dataset.filenames)) + 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space'] +train_data = sklearn.datasets.fetch_20newsgroups(subset="train", + categories=groups) +print("Number of training posts in tech groups:", len(train_data.filenames)) +# Number of training posts in tech groups: 3529 -labels = dataset.target +labels = train_data.target num_clusters = 50 # sp.unique(labels).shape[0] import nltk.stem @@ -49,31 +52,41 @@ def build_analyzer(self): return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc)) vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, - # max_features=1000, - stop_words='english', charset_error='ignore' + stop_words='english', decode_error='ignore' ) -vectorized = vectorizer.fit_transform(dataset.data) + +vectorized = vectorizer.fit_transform(train_data.data) num_samples, num_features = vectorized.shape print("#samples: %d, #features: %d" % (num_samples, num_features)) - +# samples: 3529, #features: 4712 from sklearn.cluster import KMeans -km = KMeans(n_clusters=num_clusters, init='k-means++', n_init=1, - verbose=1) - +km = KMeans(n_clusters=num_clusters, n_init=1, verbose=1, random_state=3) clustered = km.fit(vectorized) +print("km.labels_=%s" % km.labels_) +# km.labels_=[ 6 34 22 ..., 2 21 26] + +print("km.labels_.shape=%s" % km.labels_.shape) +# km.labels_.shape=3529 + from sklearn import metrics print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)) +# Homogeneity: 0.400 print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)) +# Completeness: 0.206 print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)) +# V-measure: 0.272 print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels, km.labels_)) +# Adjusted Rand Index: 0.064 print("Adjusted Mutual Information: %0.3f" % 
metrics.adjusted_mutual_info_score(labels, km.labels_)) +# Adjusted Mutual Information: 0.197 print(("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(vectorized, labels, sample_size=1000))) +# Silhouette Coefficient: 0.006 new_post_vec = vectorizer.transform([new_post]) new_post_label = km.predict(new_post_vec)[0] @@ -83,13 +96,14 @@ def build_analyzer(self): similar = [] for i in similar_indices: dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray()) - similar.append((dist, dataset.data[i])) + similar.append((dist, train_data.data[i])) similar = sorted(similar) +print("Count similar: %i" % len(similar)) show_at_1 = similar[0] -show_at_2 = similar[len(similar) / 2] -show_at_3 = similar[-1] +show_at_2 = similar[int(len(similar) / 10)] +show_at_3 = similar[int(len(similar) / 2)] print("=== #1 ===") print(show_at_1) diff --git a/ch03/utils.py b/ch03/utils.py index ffb20929..29d46836 100644 --- a/ch03/utils.py +++ b/ch03/utils.py @@ -15,3 +15,8 @@ print("Uh, we were expecting a data directory, which contains the toy data") sys.exit(1) +CHART_DIR = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "charts") +if not os.path.exists(CHART_DIR): + os.mkdir(CHART_DIR) + diff --git a/ch04/.gitignore b/ch04/.gitignore new file mode 100644 index 00000000..c4c0b18a --- /dev/null +++ b/ch04/.gitignore @@ -0,0 +1,6 @@ +wiki_lda.pkl +wiki_lda.pkl.state +*.png +*.npy +*.pkl +topics.txt diff --git a/ch04/README.rst b/ch04/README.rst new file mode 100644 index 00000000..99a3c186 --- /dev/null +++ b/ch04/README.rst @@ -0,0 +1,65 @@ +========= +Chapter 4 +========= + +Support code for *Chapter 4: Topic Modeling* + + +AP Data +------- + +To download the AP data, use the ``download_ap.sh`` script inside the ``data`` +directory:: + + cd data + ./download_ap.sh + +Word cloud creation +------------------- + +Word cloud creation requires that ``pytagcloud`` be installed (in turn, this +requires ``pygame``). Since this is not an essential part of the chapter, the +code will work even if you have not installed it (naturally, the cloud image +will not be generated and a warning will be printed). + + +Wikipedia processing +-------------------- + +You will need **a lot of disk space**. The download of the Wikipedia text is +11GB and preprocessing it takes another 24GB to save it in the intermediate +format that gensim uses for a total of 34GB! + +Run the following two commands inside the ``data/`` directory:: + + ./download_wp.sh + ./preprocess-wikidata.sh + +As the filenames indicate, the first step will download the data and the second +one will preprocess it. Preprocessing can take several hours, but it is +feasible to run it on a modern laptop. Once the second step is finished, you +may remove the input file if you want to save disk space +(``data/enwiki-latest-pages-articles.xml.bz2``). + +To generate the model, you can run the ``wikitopics_create.py`` script, while +the ``wikitopics_plot.py`` script will plot the most heavily discussed topic as +well as the least heavily discussed one. The code is split into steps as the +first one can take a very long time. Then it saves the results so that you can +later explore them at leisure. + +You should not expect that your results will exactly match the results in the +book, for two reasons: + +1. The LDA algorithm is a probabilistic algorithm and can give different + results every time it is run. +2. Wikipedia keeps changing. Thus, even your input data will be different. + +Scripts +------- + +blei_lda.py + Computes LDA using the AP Corpus. 
+wikitopics_create.py + Create the topic model for Wikipedia using LDA (must download wikipedia database first) +wikitopics_create_hdp.py + Create the topic model for Wikipedia using HDP (must download wikipedia database first) diff --git a/ch04/blei_lda.py b/ch04/blei_lda.py index 6c4a9f54..7f6ac2b3 100644 --- a/ch04/blei_lda.py +++ b/ch04/blei_lda.py @@ -6,56 +6,81 @@ # It is made available under the MIT License from __future__ import print_function +from wordcloud import create_cloud try: - from gensim import corpora, models, similarities + from gensim import corpora, models, matutils except: print("import gensim failed.") print() print("Please install it") raise -try: - from mpltools import style - style.use('ggplot') -except: - print("Could not import mpltools: plots will not be styled correctly") - import matplotlib.pyplot as plt import numpy as np from os import path +NUM_TOPICS = 100 + +# Check that data exists if not path.exists('./data/ap/ap.dat'): print('Error: Expected data to be present at data/ap/') print('Please cd into ./data & run ./download_ap.sh') +# Load the data corpus = corpora.BleiCorpus('./data/ap/ap.dat', './data/ap/vocab.txt') + +# Build the topic model model = models.ldamodel.LdaModel( - corpus, num_topics=100, id2word=corpus.id2word, alpha=None) + corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=None) -for ti in xrange(84): +# Iterate over all the topics in the model +for ti in range(model.num_topics): words = model.show_topic(ti, 64) - tf = sum(f for f, w in words) - print('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for f, w in words)) - print() - print() - print() + tf = sum(f for _, f in words) + with open('topics.txt', 'w') as output: + output.write('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for w, f in words)) + output.write("\n\n\n") + +# We first identify the most discussed topic, i.e., the one with the +# highest total weight -thetas = [model[c] for c in corpus] -plt.hist([len(t) for t in thetas], np.arange(42)) -plt.ylabel('Nr of documents') -plt.xlabel('Nr of topics') -plt.savefig('../1400OS_04_01+.png') +topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics) +weight = topics.sum(1) +max_topic = weight.argmax() + + +# Get the top 64 words for this topic +# Without the argument, show_topic would return only 10 words +words = model.show_topic(max_topic, 64) + +# This function will actually check for the presence of pytagcloud and is otherwise a no-op +create_cloud('cloud_blei_lda.png', words) + +num_topics_used = [len(model[doc]) for doc in corpus] +fig,ax = plt.subplots() +ax.hist(num_topics_used, np.arange(42)) +ax.set_ylabel('Nr of documents') +ax.set_xlabel('Nr of topics') +fig.tight_layout() +fig.savefig('Figure_04_01.png') + + +# Now, repeat the same exercise using alpha=1.0 +# You can edit the constant below to play around with this parameter +ALPHA = 1.0 model1 = models.ldamodel.LdaModel( - corpus, num_topics=100, id2word=corpus.id2word, alpha=1.) 
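(The updated blei_lda.py above counts how many topics each document uses via len(model[doc]). The sketch below inspects that sparse representation directly; it assumes the AP corpus has already been fetched with data/download_ap.sh and that training a fresh 100-topic model is acceptable.)

# Sketch: inspect the sparse document/topic representation used in blei_lda.py.
from gensim import corpora, models

corpus = corpora.BleiCorpus('./data/ap/ap.dat', './data/ap/vocab.txt')
model = models.ldamodel.LdaModel(
    corpus, num_topics=100, id2word=corpus.id2word)

doc = next(iter(corpus))
doc_topics = model[doc]   # list of (topic_id, weight) pairs; tiny weights are omitted
print('Topics used by the first document: {}'.format(len(doc_topics)))
for topic_id, weight in sorted(doc_topics, key=lambda tw: -tw[1])[:5]:
    # show_topic returns (word, weight) pairs in the gensim versions this diff targets
    top_words = ', '.join(w for w, _ in model.show_topic(topic_id, 5))
    print('  topic {:3d} (weight {:.2f}): {}'.format(topic_id, weight, top_words))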
-thetas1 = [model1[c] for c in corpus] - -#model8 = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=corpus.id2word, alpha=1.e-8) -#thetas8 = [model8[c] for c in corpus] -plt.clf() -plt.hist([[len(t) for t in thetas], [len(t) for t in thetas1]], np.arange(42)) -plt.ylabel('Nr of documents') -plt.xlabel('Nr of topics') -plt.text(9, 223, r'default alpha') -plt.text(26, 156, 'alpha=1.0') -plt.savefig('../1400OS_04_02+.png') + corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=ALPHA) +num_topics_used1 = [len(model1[doc]) for doc in corpus] + +fig,ax = plt.subplots() +ax.hist([num_topics_used, num_topics_used1], np.arange(42)) +ax.set_ylabel('Nr of documents') +ax.set_xlabel('Nr of topics') + +# The coordinates below were fit by trial and error to look good +ax.text(9, 223, r'default alpha') +ax.text(26, 156, 'alpha=1.0') +fig.tight_layout() +fig.savefig('Figure_04_02.png') + diff --git a/ch04/build_lda.py b/ch04/build_lda.py index 192e3718..a0ee9c5f 100644 --- a/ch04/build_lda.py +++ b/ch04/build_lda.py @@ -14,8 +14,7 @@ raise from scipy.spatial import distance import numpy as np -import string -from gensim import corpora, models, similarities +from gensim import corpora, models import sklearn.datasets import nltk.stem from collections import defaultdict @@ -80,7 +79,7 @@ def __len__(self): distances = distance.squareform(distance.pdist(thetas)) large = distances.max() + 1 -for i in xrange(len(distances)): +for i in range(len(distances)): distances[i, i] = large print(otexts[1]) diff --git a/ch04/data/.gitignore b/ch04/data/.gitignore index 8eb48cbf..91f24bbc 100644 --- a/ch04/data/.gitignore +++ b/ch04/data/.gitignore @@ -1,8 +1,12 @@ ap.tgz ap/ +dataset-379-20news-18828_HJRZF.zip +379/ enwiki-latest-pages-articles.xml.bz2 wiki_en_output_bow.mm +wiki_en_output_bow.mm.gz wiki_en_output_bow.mm.index wiki_en_output_tfidf.mm +wiki_en_output_tfidf.mm.gz wiki_en_output_tfidf.mm.index -wiki_en_output_wordids.txt +wiki_en_output_wordids.txt.bz2 diff --git a/ch04/data/download_ap.sh b/ch04/data/download_ap.sh index 6de8ded8..da27814a 100755 --- a/ch04/data/download_ap.sh +++ b/ch04/data/download_ap.sh @@ -1,3 +1,3 @@ #!/bin/sh -wget http://www.cs.princeton.edu/~blei/lda-c/ap.tgz +wget http://www.cs.columbia.edu/~blei/lda-c/ap.tgz tar xzf ap.tgz diff --git a/ch04/data/preprocess-wikidata.sh b/ch04/data/preprocess-wikidata.sh new file mode 100755 index 00000000..d9a6ebb3 --- /dev/null +++ b/ch04/data/preprocess-wikidata.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +python -m gensim.scripts.make_wiki enwiki-latest-pages-articles.xml.bz2 wiki_en_output diff --git a/ch04/wikitopics.py b/ch04/wikitopics.py deleted file mode 100644 index f04698dc..00000000 --- a/ch04/wikitopics.py +++ /dev/null @@ -1,49 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from __future__ import print_function -import numpy as np -import logging -import gensim -logging.basicConfig( - format='%(asctime)s : %(levelname)s : %(message)s', - level=logging.INFO) -id2word = gensim.corpora.Dictionary.load_from_text( - 'data/wiki_en_output_wordids.txt') -mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') -model = gensim.models.ldamodel.LdaModel( - corpus=mm, - id2word=id2word, - num_topics=100, - update_every=1, - chunksize=10000, - passes=1) -model.save('wiki_lda.pkl') -topics = [model[doc] for doc in mm] -lens = np.array([len(t) for t in 
topics]) -print(np.mean(lens <= 10)) -print(np.mean(lens)) - -counts = np.zeros(100) -for doc_top in topics: - for ti, _ in doc_toc: - counts[ti] += 1 - -for doc_top in topics: - for ti, _ in doc_top: - counts[ti] += 1 - -words = model.show_topic(counts.argmax(), 64) -print(words) -print() -print() -print() -words = model.show_topic(counts.argmin(), 64) -print(words) -print() -print() -print() diff --git a/ch04/wikitopics_create.py b/ch04/wikitopics_create.py new file mode 100644 index 00000000..d1aff278 --- /dev/null +++ b/ch04/wikitopics_create.py @@ -0,0 +1,51 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import logging +import gensim +import numpy as np + +NR_OF_TOPICS = 100 + +# Set up logging in order to get progress information as the model is being built: +logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', + level=logging.INFO) + +# Load the preprocessed corpus (id2word & mm): +id2word = gensim.corpora.Dictionary.load_from_text( + 'data/wiki_en_output_wordids.txt.bz2') +mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') + +# Calling the constructor is enough to build the model +# This call will take a few hours! +model = gensim.models.ldamodel.LdaModel( + corpus=mm, + id2word=id2word, + num_topics=NR_OF_TOPICS, + update_every=1, + chunksize=10000, + passes=1) + +# Save the model so we do not need to learn it again. +model.save('wiki_lda.pkl') + +# Compute the document/topic matrix +topics = np.zeros((len(mm), model.num_topics)) +for di,doc in enumerate(mm): + doc_top = model[doc] + for ti,tv in doc_top: + topics[di,ti] += tv +np.save('topics.npy', topics) + +# Alternatively, we create a sparse matrix and save that. This alternative +# saves disk space, at the cost of slightly more complex code: + +## from scipy import sparse, io +## sp = sparse.csr_matrix(topics) +## io.savemat('topics.mat', {'topics': sp}) diff --git a/ch04/wikitopics_create_hdp.py b/ch04/wikitopics_create_hdp.py new file mode 100644 index 00000000..951d1850 --- /dev/null +++ b/ch04/wikitopics_create_hdp.py @@ -0,0 +1,39 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import logging +import gensim +import numpy as np + +# Set up logging in order to get progress information as the model is being built: +logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', + level=logging.INFO) + +# Load the preprocessed corpus (id2word & mm): +id2word = gensim.corpora.Dictionary.load_from_text( + 'data/wiki_en_output_wordids.txt.bz2') +mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') + +# Calling the constructor is enough to build the model +# This call will take a few hours! +model = gensim.models.hdpmodel.HdpModel( + corpus=mm, + id2word=id2word, + chunksize=10000) + +# Save the model so we do not need to learn it again. 
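(Back to wikitopics_create.py above: its closing comments mention a sparse alternative to the dense topics.npy file. Written out, that alternative might look like the following sketch; the file names are the ones the script itself uses.)

# Sketch: store the document/topic matrix sparsely instead of as a dense .npy file.
import numpy as np
from scipy import sparse, io

topics = np.load('topics.npy')          # dense matrix written by wikitopics_create.py

sp_topics = sparse.csr_matrix(topics)   # most entries are zero, so CSR is much smaller
io.savemat('topics.mat', {'topics': sp_topics})

# Loading it back: scipy.io.loadmat returns a dict and the matrix stays sparse
loaded = io.loadmat('topics.mat')['topics']
dense_again = np.asarray(loaded.todense())
assert dense_again.shape == topics.shape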
+model.save('wiki_hdp.pkl') + +# Compute the document/topic matrix +topics = np.zeros((len(mm), model.num_topics)) +for di,doc in enumerate(mm): + doc_top = model[doc] + for ti,tv in doc_top: + topics[di,ti] += tv +np.save('topics_hdp.npy', topics) diff --git a/ch04/wikitopics_plot.py b/ch04/wikitopics_plot.py new file mode 100644 index 00000000..04adf780 --- /dev/null +++ b/ch04/wikitopics_plot.py @@ -0,0 +1,64 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import numpy as np +import gensim +from os import path +from wordcloud import create_cloud + +if not path.exists('wiki_lda.pkl'): + import sys + sys.stderr.write('''\ +This script must be run after wikitopics_create.py! + +That script creates and saves the LDA model (this must only be done once). +This script is responsible for the analysis.''') + sys.exit(1) + +# Load the preprocessed Wikipedia corpus (id2word and mm) +id2word = gensim.corpora.Dictionary.load_from_text( + 'data/wiki_en_output_wordids.txt.bz2') +mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') + +# Load the precomputed model +model = gensim.models.ldamodel.LdaModel.load('wiki_lda.pkl') + +topics = np.load('topics.npy', mmap_mode='r') + +# Compute the number of topics mentioned in each document +lens = (topics > 0).sum(axis=1) +print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens))) +print('Percentage of articles mentioning less than 10 topics: {0:.1%}'.format(np.mean(lens <= 10))) + +# Weights will be the total weight of each topic +weights = topics.sum(0) + +# Retrieve the most heavily used topic and plot it as a word cloud: +words = model.show_topic(weights.argmax(), 64) + +# The parameter ``maxsize`` often needs some manual tuning to make it look nice. +create_cloud('Wikipedia_most.png', words, maxsize=250, fontname='Cardo') + +fraction_mention = np.mean(topics[:,weights.argmax()] > 0) +print("The most mentioned topic is mentioned in {:.1%} of documents.".format(fraction_mention)) +total_weight = np.mean(topics[:,weights.argmax()]) +print("It represents {:.1%} of the total number of words.".format(total_weight)) +print() +print() +print() + +# Retrieve the **least** heavily used topic and plot it as a word cloud: +words = model.show_topic(weights.argmin(), 64) +create_cloud('Wikipedia_least.png', words, maxsize=150, fontname='Cardo') +fraction_mention = np.mean(topics[:,weights.argmin()] > 0) +print("The least mentioned topic is mentioned in {:.1%} of documents.".format(fraction_mention)) +total_weight = np.mean(topics[:,weights.argmin()]) +print("It represents {:.1%} of the total number of words.".format(total_weight)) +print() +print() +print() diff --git a/ch04/wordcloud.py b/ch04/wordcloud.py new file mode 100644 index 00000000..accca2d6 --- /dev/null +++ b/ch04/wordcloud.py @@ -0,0 +1,29 @@ +from __future__ import print_function +warned_of_error = False + +def create_cloud(oname, words, maxsize=120, fontname='Lobster'): + '''Creates a word cloud (when pytagcloud is installed) + + Parameters + ---------- + oname : output filename + words : list of (str, value) + maxsize : int, optional + Size of maximum word. The best setting for this parameter will often + require some manual tuning for each input. + fontname : str, optional + Font to use. 
+ ''' + try: + from pytagcloud import create_tag_image, make_tags + except ImportError: + if not warned_of_error: + print("Could not import pytagcloud. Skipping cloud generation") + return + + # gensim returns a weight between 0 and 1 for each word, while pytagcloud + # expects an integer word count. So, we multiply by a large number and + # round. For a visualization this is an adequate approximation. + words = [(w,int(v*10000)) for w,v in words] + tags = make_tags(words, maxsize=maxsize) + create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname) diff --git a/ch05/README.md b/ch05/README.md index 776c90bc..39253f22 100644 --- a/ch05/README.md +++ b/ch05/README.md @@ -1,11 +1,10 @@ Chapter 5 - Classification - Detecting Poor Answers =================================================== -The book chapter is based on StackExchange's data blob from August 2012: -[http://www.clearbits.net/get/2076-aug-2012.torrent](http://www.clearbits.net/get/2076-aug-2012.torrent) +The book chapter is based on StackExchange's data blob from August 2012 for the first edition. -After publishing the book, StackExchange stayed as awesome as it always has been and released an updated version: -[https://archive.org/download/stackexchange/stackexchange_archive.torrent](https://archive.org/download/stackexchange/stackexchange_archive.torrent) +After publishing the book, StackExchange released the May 2014 version at +[https://archive.org/download/stackexchange/stackexchange_archive.torrent](https://archive.org/download/stackexchange/stackexchange_archive.torrent). Note that using the latest version, you will get slightly different results. diff --git a/ch05/chose_instances.py b/ch05/chose_instances.py index e997e0fb..0614cd32 100644 --- a/ch05/chose_instances.py +++ b/ch05/chose_instances.py @@ -23,9 +23,12 @@ will not be used in the chapter. If, however, you want to experiment with them (highly encouraged!), you can get the library from http://packages.python.org/pyenchant/. """) + class EnchantMock: + def __init__(self): pass + def check(self, word): return True speller = EnchantMock() @@ -35,7 +38,6 @@ def check(self, word): filtered_meta = json.load(open(filtered_meta, "r")) - def misspelled_fraction(p): tokens = p.split() if not tokens: diff --git a/ch05/classify.py b/ch05/classify.py index 69832b56..76699819 100644 --- a/ch05/classify.py +++ b/ch05/classify.py @@ -16,7 +16,7 @@ from sklearn import neighbors from data import chosen, chosen_meta -from utils import plot_roc, plot_pr +from utils import plot_pr from utils import plot_feat_importance from utils import load_meta from utils import fetch_posts @@ -30,11 +30,12 @@ import nltk -# splitting questions into train (70%) and test(30%) and then take their -# answers -all_posts = list(meta.keys()) -all_questions = [q for q, v in meta.items() if v['ParentId'] == -1] -all_answers = [q for q, v in meta.items() if v['ParentId'] != -1] # [:500] +# The sorting below is only to ensure reproducable numbers. Further down +# we will occasionally skip a fold when it contains instances of only +# one label. The two lines below ensure that the behavior is exactly the +# same for different runs. 
+all_questions = sorted([q for q, v in meta.items() if v['ParentId'] == -1]) +all_answers = sorted([q for q, v in meta.items() if v['ParentId'] != -1]) feature_names = np.array(( 'NumTextTokens', @@ -47,20 +48,15 @@ 'NumImages' )) -# activate the following for reduced feature space -""" -feature_names = np.array(( - 'NumTextTokens', - 'LinkCount', -)) -""" - def prepare_sent_features(): for pid, text in fetch_posts(chosen, with_index=True): if not text: meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0 else: + from platform import python_version + if python_version().startswith('2'): + text = text.decode('utf-8') sent_lens = [len(nltk.word_tokenize( sent)) for sent in nltk.sent_tokenize(text)] meta[pid]['AvgSentLen'] = np.mean(sent_lens) @@ -80,17 +76,26 @@ def get_features(aid): return tuple(meta[aid][fn] for fn in feature_names) qa_X = np.asarray([get_features(aid) for aid in all_answers]) -# Score > 0 tests => positive class is good answer -# Score <= 0 tests => positive class is poor answer -qa_Y = np.asarray([meta[aid]['Score'] > 0 for aid in all_answers]) + classifying_answer = "good" +#classifying_answer = "poor" + +if classifying_answer == "good": + # Score > 0 tests => positive class is good answer + qa_Y = np.asarray([meta[aid]['Score'] > 0 for aid in all_answers]) +elif classifying_answer == "poor": + # Score <= 0 tests => positive class is poor answer + qa_Y = np.asarray([meta[aid]['Score'] <= 0 for aid in all_answers]) +else: + raise Exception("classifying_answer='%s' is not supported" % + classifying_answer) for idx, feat in enumerate(feature_names): plot_feat_hist([(qa_X[:, idx], feat)]) -""" -plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [1,0]], 'feat_hist_two.png') -plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [3,4,5,6]], 'feat_hist_four.png') -""" + +#plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [1,0]], 'feat_hist_two.png') +#plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [3,4,5,6]], 'feat_hist_four.png') + avg_scores_summary = [] @@ -115,10 +120,16 @@ def measure(clf_class, parameters, name, data_size=None, plot=False): pr_scores = [] precisions, recalls, thresholds = [], [], [] - for train, test in cv: + for fold_idx, (train, test) in enumerate(cv): X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] + only_one_class_in_train = len(set(y_train)) == 1 + only_one_class_in_test = len(set(y_test)) == 1 + if only_one_class_in_train or only_one_class_in_test: + # this would pose problems later on + continue + clf = clf_class(**parameters) clf.fit(X_train, y_train) @@ -145,12 +156,20 @@ def measure(clf_class, parameters, name, data_size=None, plot=False): precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) + + # This threshold is determined at the end of the chapter 5, + # where we find conditions such that precision is in the area of + # about 80%. With it we trade off recall for precision. 
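A hedged sketch of how such a cut-off can be derived, assuming precision and pr_thresholds are the per-fold arrays appended above (i.e. the output of sklearn's precision_recall_curve); the fixed value 0.59 used below is the one the authors settled on:

# Sketch only: smallest threshold whose precision already reaches ~80%,
# which keeps as much recall as possible at that precision level.
good_enough = precision[:-1] >= 0.8   # precision has one more entry than pr_thresholds
candidates = pr_thresholds[good_enough]
if len(candidates) > 0:
    suggested_threshold = float(candidates.min())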
+ threshold_for_detecting_good_answers = 0.59 + + print("Clone #%i" % fold_idx) print(classification_report(y_test, proba[:, label_idx] > - 0.63, target_names=['not accepted', 'accepted'])) + threshold_for_detecting_good_answers, target_names=['not accepted', 'accepted'])) # get medium clone scores_to_sort = pr_scores # roc_scores medium = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] + print("Medium clone is #%i" % medium) if plot: #plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium]) @@ -178,6 +197,7 @@ def measure(clf_class, parameters, name, data_size=None, plot=False): def bias_variance_analysis(clf_class, parameters, name): + #import ipdb;ipdb.set_trace() data_sizes = np.arange(60, 2000, 4) train_errors = [] @@ -208,16 +228,16 @@ def k_complexity_analysis(clf_class, parameters): plot_k_complexity(ks, train_errors, test_errors) -for k in [5]: # [5, 10, 40, 90]: +for k in [5]: +# for k in [5, 10, 40]: + #measure(neighbors.KNeighborsClassifier, {'n_neighbors': k}, "%iNN" % k) bias_variance_analysis(neighbors.KNeighborsClassifier, { - 'n_neighbors': k, 'warn_on_equidistant': False}, "%iNN" % k) - k_complexity_analysis(neighbors.KNeighborsClassifier, {'n_neighbors': k, - 'warn_on_equidistant': False}) - # measure(neighbors.KNeighborsClassifier, {'n_neighbors': k, 'p': 2, - #'warn_on_equidistant': False}, "%iNN" % k) + 'n_neighbors': k}, "%iNN" % k) + k_complexity_analysis(neighbors.KNeighborsClassifier, {'n_neighbors': k}) from sklearn.linear_model import LogisticRegression -for C in [0.1]: # [0.01, 0.1, 1.0, 10.0]: +for C in [0.1]: +# for C in [0.01, 0.1, 1.0, 10.0]: name = "LogReg C=%.2f" % C bias_variance_analysis(LogisticRegression, {'penalty': 'l2', 'C': C}, name) measure(LogisticRegression, {'penalty': 'l2', 'C': C}, name, plot=True) diff --git a/ch05/data.py b/ch05/data.py index 62ddbc6d..e3b7e4bb 100644 --- a/ch05/data.py +++ b/ch05/data.py @@ -7,7 +7,7 @@ import os -DATA_DIR = "data" # put your posts-2011-12.xml into this directory +DATA_DIR = "data" # put your posts-2012.xml into this directory CHART_DIR = "charts" filtered = os.path.join(DATA_DIR, "filtered.tsv") diff --git a/ch05/log_reg_example.py b/ch05/log_reg_example.py index 7efc0eb1..bfbbd227 100644 --- a/ch05/log_reg_example.py +++ b/ch05/log_reg_example.py @@ -38,7 +38,8 @@ def lr_model(clf, X): pyplot.xlabel("feature value") pyplot.ylabel("class") pyplot.grid(True, linestyle='-', color='0.75') -pyplot.savefig(os.path.join(CHART_DIR, "log_reg_example_data.png"), bbox_inches="tight") +pyplot.savefig( + os.path.join(CHART_DIR, "log_reg_example_data.png"), bbox_inches="tight") def lin_model(clf, X): @@ -69,7 +70,8 @@ def lin_model(clf, X): pyplot.ylabel("class") pyplot.title("linear fit on additional data") pyplot.grid(True, linestyle='-', color='0.75') -pyplot.savefig(os.path.join(CHART_DIR, "log_reg_log_linear_fit.png"), bbox_inches="tight") +pyplot.savefig( + os.path.join(CHART_DIR, "log_reg_log_linear_fit.png"), bbox_inches="tight") pyplot.figure(figsize=(10, 4)) pyplot.xlim((-5, 20)) @@ -79,7 +81,8 @@ def lin_model(clf, X): pyplot.xlabel("feature value") pyplot.ylabel("class") pyplot.grid(True, linestyle='-', color='0.75') -pyplot.savefig(os.path.join(CHART_DIR, "log_reg_example_fitted.png"), bbox_inches="tight") +pyplot.savefig( + os.path.join(CHART_DIR, "log_reg_example_fitted.png"), bbox_inches="tight") X = np.arange(0, 1, 0.001) pyplot.figure(figsize=(10, 4)) @@ -97,4 +100,5 @@ def lin_model(clf, X): pyplot.xlabel("P") pyplot.ylabel("log(odds) = log(P / (1-P))") pyplot.grid(True, linestyle='-', 
color='0.75') -pyplot.savefig(os.path.join(CHART_DIR, "log_reg_log_odds.png"), bbox_inches="tight") +pyplot.savefig( + os.path.join(CHART_DIR, "log_reg_log_odds.png"), bbox_inches="tight") diff --git a/ch05/so_xml_to_tsv.py b/ch05/so_xml_to_tsv.py index f425a2e5..7ce5b150 100644 --- a/ch05/so_xml_to_tsv.py +++ b/ch05/so_xml_to_tsv.py @@ -10,6 +10,7 @@ # to a question that has been asked in 2011 or 2012. # +import sys import os import re try: @@ -24,8 +25,13 @@ from data import DATA_DIR -filename = os.path.join(DATA_DIR, "posts-2011-12.xml") +#filename = os.path.join(DATA_DIR, "posts-2011-12.xml") +filename = os.path.join(DATA_DIR, "posts-2012.xml") +print("Reading from xml %s" % filename) filename_filtered = os.path.join(DATA_DIR, "filtered.tsv") +print("Filtered: %s" % filename_filtered) +filename_filtered_meta = os.path.join(DATA_DIR, "filtered-meta.json") +print("Meta: %s" % filename_filtered_meta) q_creation = {} # creation datetimes of questions q_accepted = {} # id of accepted answer @@ -56,20 +62,19 @@ def filter_html(s): # sometimes source code contain links, which we don't want to count link_count_in_code += len(link_match.findall(match_str)) - anchors = link_match.findall(s) - link_count = len(anchors) + links = link_match.findall(s) + link_count = len(links) link_count -= link_count_in_code - html_free_s = re.sub( + link_free_s = re.sub( " +", " ", tag_match.sub('', code_free_s)).replace("\n", "") - link_free_s = html_free_s - for anchor in anchors: - if anchor.lower().startswith("http://"): - link_free_s = link_free_s.replace(anchor, '') + for link in links: + if link.lower().startswith("http://"): + link_free_s = link_free_s.replace(link, '') - num_text_tokens = html_free_s.count(" ") + num_text_tokens = link_free_s.count(" ") return link_free_s, num_text_tokens, num_code_lines, link_count, num_images @@ -77,21 +82,26 @@ def filter_html(s): num_questions = 0 num_answers = 0 -from itertools import imap +if sys.version_info.major < 3: + # Python 2, map() returns a list, which will lead to out of memory errors. + # The following import ensures that the script behaves like being executed + # with Python 3. 
+ from itertools import imap as map + def parsexml(filename): global num_questions, num_answers counter = 0 - it = imap(itemgetter(1), + it = map(itemgetter(1), iter(etree.iterparse(filename, events=('start',)))) root = next(it) # get posts element for elem in it: if counter % 100000 == 0: - print(counter) + print("Processed %i elements" % counter) counter += 1 @@ -147,12 +157,12 @@ def parsexml(filename): root.clear() # preserve memory -with open(os.path.join(DATA_DIR, filename_filtered), "w") as f: +with open(filename_filtered, "w") as f: for values in parsexml(filename): line = "\t".join(map(str, values)) f.write(line + "\n") -with open(os.path.join(DATA_DIR, "filtered-meta.json"), "w") as f: +with open(filename_filtered_meta, "w") as f: json.dump(meta, f) print("years:", years) diff --git a/ch05/utils.py b/ch05/utils.py index 03429cef..c6e497a5 100644 --- a/ch05/utils.py +++ b/ch05/utils.py @@ -171,8 +171,8 @@ def plot_feat_hist(data_name_list, filename=None): assert filename is not None pylab.figure(num=None, figsize=(8, 6)) - num_rows = 1 + (len(data_name_list) - 1) / 2 - num_cols = 1 if len(data_name_list) == 1 else 2 + num_rows = int(1 + (len(data_name_list) - 1) / 2) + num_cols = int(1 if len(data_name_list) == 1 else 2) pylab.figure(figsize=(5 * num_cols, 4 * num_rows)) for i in range(num_rows): @@ -191,7 +191,7 @@ def plot_feat_hist(data_name_list, filename=None): else: bins = max_val n, bins, patches = pylab.hist( - x, bins=bins, normed=1, facecolor='blue', alpha=0.75) + x, bins=bins, normed=1, alpha=0.75) pylab.grid(True) @@ -209,7 +209,7 @@ def plot_bias_variance(data_sizes, train_errors, test_errors, name, title): pylab.title("Bias-Variance for '%s'" % name) pylab.plot( data_sizes, test_errors, "--", data_sizes, train_errors, "b-", lw=1) - pylab.legend(["train error", "test error"], loc="upper right") + pylab.legend(["test error", "train error"], loc="upper right") pylab.grid(True, linestyle='-', color='0.75') pylab.savefig( os.path.join(CHART_DIR, "bv_" + name.replace(" ", "_") + ".png"), bbox_inches="tight") @@ -220,10 +220,10 @@ def plot_k_complexity(ks, train_errors, test_errors): pylab.ylim([0.0, 1.0]) pylab.xlabel('k') pylab.ylabel('Error') - pylab.title('Errors for for different values of k') + pylab.title('Errors for for different values of $k$') pylab.plot( ks, test_errors, "--", ks, train_errors, "-", lw=1) - pylab.legend(["train error", "test error"], loc="upper right") + pylab.legend(["test error", "train error"], loc="upper right") pylab.grid(True, linestyle='-', color='0.75') pylab.savefig( os.path.join(CHART_DIR, "kcomplexity.png"), bbox_inches="tight") diff --git a/ch06/01_start.py b/ch06/01_start.py index 2940b0fa..0bdbdd12 100644 --- a/ch06/01_start.py +++ b/ch06/01_start.py @@ -40,7 +40,7 @@ def create_ngram_model(): def train_model(clf_factory, X, Y, name="NB ngram", plot=False): cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) train_errors = [] test_errors = [] @@ -83,7 +83,7 @@ def train_model(clf_factory, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -94,18 +94,18 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of 
%i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) if __name__ == "__main__": X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] @@ -113,19 +113,19 @@ def print_incorrect(clf, X, Y): train_model(create_ngram_model, X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) train_model(create_ngram_model, X, Y, name="sent vs rest", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(create_ngram_model, X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(create_ngram_model, X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) diff --git a/ch06/02_tuning.py b/ch06/02_tuning.py index a48d3edc..5b6a835f 100644 --- a/ch06/02_tuning.py +++ b/ch06/02_tuning.py @@ -45,7 +45,7 @@ def create_ngram_model(params=None): def grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], @@ -64,7 +64,7 @@ def grid_search_model(clf_factory, X, Y): verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ - print clf + print(clf) return clf @@ -114,7 +114,7 @@ def train_model(clf, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -125,9 +125,9 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) def get_best_model(): @@ -149,16 +149,16 @@ def get_best_model(): X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) @@ -166,16 +166,16 @@ def get_best_model(): # rest", plot=True) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) diff --git a/ch06/03_clean.py b/ch06/03_clean.py index f00c2c44..0170ed82 100644 --- a/ch06/03_clean.py +++ b/ch06/03_clean.py @@ -57,7 +57,7 @@ } emo_repl_order = [k for (k_len, k) in reversed( - sorted([(len(k), k) for k in emo_repl.keys()]))] + sorted([(len(k), k) for k in list(emo_repl.keys())]))] re_repl = { r"\br\b": "are", @@ -84,7 +84,7 @@ def preprocessor(tweet): for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) - for r, repl in re_repl.iteritems(): + for r, repl in re_repl.items(): tweet = re.sub(r, repl, tweet) return tweet @@ -103,7 +103,7 @@ def preprocessor(tweet): def train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) train_errors = [] test_errors = [] @@ -150,7 +150,7 @@ def train_model(clf, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -161,9 +161,9 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) def get_best_model(): @@ -185,16 +185,16 @@ def get_best_model(): X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) @@ -202,16 +202,16 @@ def get_best_model(): # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) diff --git a/ch06/04_sent.py b/ch06/04_sent.py index 87fbafc9..c09a435f 100644 --- a/ch06/04_sent.py +++ b/ch06/04_sent.py @@ -153,7 +153,7 @@ def transform(self, documents): } emo_repl_order = [k for (k_len, k) in reversed( - sorted([(len(k), k) for k in emo_repl.keys()]))] + sorted([(len(k), k) for k in list(emo_repl.keys())]))] re_repl = { r"\br\b": "are", @@ -179,7 +179,7 @@ def preprocessor(tweet): for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) - for r, repl in re_repl.iteritems(): + for r, repl in re_repl.items(): tweet = re.sub(r, repl, tweet) return tweet.replace("-", " ").replace("_", " ") @@ -202,7 +202,7 @@ def preprocessor(tweet): def __grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], @@ -220,7 +220,7 @@ def __grid_search_model(clf_factory, X, Y): verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ - print clf + print(clf) return clf @@ -228,7 +228,7 @@ def __grid_search_model(clf_factory, X, Y): def train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) train_errors = [] test_errors = [] @@ -275,7 +275,7 @@ def train_model(clf, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -286,9 +286,9 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) def get_best_model(): @@ -315,16 +315,16 @@ def get_best_model(): #Y_orig = Y_orig[:100,] classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) @@ -332,18 +332,18 @@ def get_best_model(): # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) json.dump(poscache, open(poscache_filename, "w")) diff --git a/ch06/install.py b/ch06/install.py index c07a195f..be61f485 100644 --- a/ch06/install.py +++ b/ch06/install.py @@ -11,23 +11,19 @@ # # Pulls tweet data from Twitter because ToS prevents distributing it directly. # -# Right now we use unauthenticated requests, which are rate-limited to 150/hr. -# We use 125/hr to stay safe. -# -# # - Niek Sanders # njs@sananalytics.com # October 20, 2011 # # -# Excuse the ugly code. I threw this together as quickly as possible and I -# don't normally code in Python. -# # In Sanders' original form, the code was using Twitter API 1.0. # Now that Twitter moved to 1.1, we had to make a few changes. # Cf. twitterauth.py for the details. +# Regarding rate limiting, please check +# https://dev.twitter.com/rest/public/rate-limiting + import sys import csv import json @@ -38,24 +34,21 @@ import twitter except ImportError: print("""\ -You need to install python-twitter: - pip install python-twitter +You need to ... + pip install twitter If pip is not found you might have to install it using easy_install. If it does not work on your system, you might want to follow instructions -at https://github.com/bear/python-twitter, most likely: - $ git clone https://github.com/bear/python-twitter - $ cd python-twitter +at https://github.com/sixohsix/twitter, most likely: + $ git clone https://github.com/sixohsix/twitter + $ cd twitter $ sudo python setup.py install """) sys.exit(1) from twitterauth import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN_KEY, ACCESS_TOKEN_SECRET -api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, - access_token_key=ACCESS_TOKEN_KEY, access_token_secret=ACCESS_TOKEN_SECRET) - - -MAX_TWEETS_PER_HR = 350 +api = twitter.Twitter(auth=twitter.OAuth(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, + token=ACCESS_TOKEN_KEY, token_secret=ACCESS_TOKEN_SECRET)) DATA_PATH = "data" @@ -87,16 +80,15 @@ def get_user_params(DATA_PATH): def dump_user_params(user_params): # dump user params for confirmation - print 'Input: ' + user_params['inList'] - print 'Output: ' + user_params['outList'] - print 'Raw data: ' + user_params['rawDir'] - return + print('Input: ' + user_params['inList']) + print('Output: ' + user_params['outList']) + print('Raw data: ' + user_params['rawDir']) def read_total_list(in_filename): # read total fetch list csv - fp = open(in_filename, 'rb') + fp = open(in_filename, 'rt') reader = csv.reader(fp, delimiter=',', quotechar='"') if os.path.exists(MISSING_ID_FILE): @@ -111,10 +103,12 @@ def read_total_list(in_filename): else: not_authed_ids = [] - print "We will skip %i tweets that are not available/visible any more on twitter" % (len(missing_ids) + len(not_authed_ids)) + print("We will skip %i tweets that are not available or visible any more on twitter" % ( + len(missing_ids) + len(not_authed_ids))) ignore_ids = set(missing_ids + not_authed_ids) total_list = [] + for row in reader: if row[2] not in ignore_ids: total_list.append(row) @@ -140,12 +134,12 @@ def purge_already_fetched(fetch_list, raw_dir): parse_tweet_json(tweet_file) count_done += 1 except RuntimeError: - print "Error parsing", item + print("Error parsing", item) rem_list.append(item) else: rem_list.append(item) - print "We have already downloaded %i tweets." 
% count_done + print("We have already downloaded %i tweets." % count_done) return rem_list @@ -156,66 +150,50 @@ def download_tweets(fetch_list, raw_dir): if not os.path.exists(raw_dir): os.mkdir(raw_dir) - # stay within rate limits - download_pause_sec = 3600 / MAX_TWEETS_PER_HR - # download tweets for idx in range(0, len(fetch_list)): - # stay in Twitter API rate limits - print 'Pausing %d sec to obey Twitter API rate limits' % \ - (download_pause_sec) - time.sleep(download_pause_sec) - # current item item = fetch_list[idx] - print item - - # print status - print '--> downloading tweet #%s (%d of %d)' % \ - (item[2], idx + 1, len(fetch_list)) + print(item) - # Old Twitter API 1.0 - # pull data - # url = 'https://api.twitter.com/1/statuses/show.json?id=' + item[2] - # print url - # urllib.urlretrieve(url, raw_dir + item[2] + '.json') + print('--> downloading tweet #%s (%d of %d)' % + (item[2], idx + 1, len(fetch_list))) - # New Twitter API 1.1 try: - sec = api.GetSleepTime('/statuses/show/:id') - if sec > 0: - print "Sleeping %i seconds to conform to Twitter's rate limiting" % sec - time.sleep(sec) + #import pdb;pdb.set_trace() + response = api.statuses.show(_id=item[2]) - result = api.GetStatus(item[2]) - json_data = result.AsJsonString() + if response.rate_limit_remaining <= 0: + wait_seconds = response.rate_limit_reset - time.time() + print("Rate limiting requests us to wait %f seconds" % + wait_seconds) + time.sleep(wait_seconds+5) - except twitter.TwitterError, e: + except twitter.TwitterError as e: fatal = True - for m in e.message: + print(e) + for m in json.loads(e.response_data.decode())['errors']: if m['code'] == 34: - print "Tweet missing: ", item - # [{u'message': u'Sorry, that page does not exist', u'code': 34}] - with open(MISSING_ID_FILE, "a") as f: + print("Tweet missing: ", item) + with open(MISSING_ID_FILE, "at") as f: f.write(item[2] + "\n") fatal = False break elif m['code'] == 63: - print "User of tweet '%s' has been suspended." % item - # [{u'message': u'Sorry, that page does not exist', u'code': 34}] - with open(MISSING_ID_FILE, "a") as f: + print("User of tweet '%s' has been suspended." % item) + with open(MISSING_ID_FILE, "at") as f: f.write(item[2] + "\n") fatal = False break elif m['code'] == 88: - print "Rate limit exceeded. Please lower max_tweets_per_hr." + print("Rate limit exceeded.") fatal = True break elif m['code'] == 179: - print "Not authorized to view this tweet." 
- with open(NOT_AUTHORIZED_ID_FILE, "a") as f: + print("Not authorized to view this tweet.") + with open(NOT_AUTHORIZED_ID_FILE, "at") as f: f.write(item[2] + "\n") fatal = False break @@ -225,8 +203,8 @@ def download_tweets(fetch_list, raw_dir): else: continue - with open(raw_dir + item[2] + '.json', "w") as f: - f.write(json_data + "\n") + with open(raw_dir + item[2] + '.json', "wt") as f: + f.write(json.dumps(dict(response)) + "\n") return @@ -234,12 +212,13 @@ def download_tweets(fetch_list, raw_dir): def parse_tweet_json(filename): # read tweet - fp = open(filename, 'rb') + fp = open(filename, 'r') # parse json try: tweet_json = json.load(fp) - except ValueError: + except ValueError as e: + print(e) raise RuntimeError('error parsing json') # look for twitter api error msgs @@ -281,20 +260,20 @@ def build_output_corpus(out_filename, raw_dir, total_list): writer.writerow(full_row) except RuntimeError: - print '--> bad data in tweet #' + item[2] + print('--> bad data in tweet #' + item[2]) missing_count += 1 else: - print '--> missing tweet #' + item[2] + print('--> missing tweet #' + item[2]) missing_count += 1 # indicate success if missing_count == 0: - print '\nSuccessfully downloaded corpus!' - print 'Output in: ' + out_filename + '\n' + print('\nSuccessfully downloaded corpus!') + print('Output in: ' + out_filename + '\n') else: - print '\nMissing %d of %d tweets!' % (missing_count, len(total_list)) - print 'Partial output in: ' + out_filename + '\n' + print('\nMissing %d of %d tweets!' % (missing_count, len(total_list))) + print('Partial output in: ' + out_filename + '\n') return @@ -302,7 +281,7 @@ def build_output_corpus(out_filename, raw_dir, total_list): def main(): # get user parameters user_params = get_user_params(DATA_PATH) - print user_params + print(user_params) dump_user_params(user_params) # get fetch list @@ -310,7 +289,7 @@ def main(): # remove already fetched or missing tweets fetch_list = purge_already_fetched(total_list, user_params['rawDir']) - print "Fetching %i tweets..." % len(fetch_list) + print("Fetching %i tweets..." % len(fetch_list)) if fetch_list: # start fetching data from twitter @@ -319,10 +298,11 @@ def main(): # second pass for any failed downloads fetch_list = purge_already_fetched(total_list, user_params['rawDir']) if fetch_list: - print '\nStarting second pass to retry %i failed downloads...' % len(fetch_list) + print('\nStarting second pass to retry %i failed downloads...' % + len(fetch_list)) download_tweets(fetch_list, user_params['rawDir']) else: - print "Nothing to fetch any more." + print("Nothing to fetch any more.") # build output corpus build_output_corpus(user_params['outList'], user_params['rawDir'], diff --git a/ch06/utils.py b/ch06/utils.py index ca8b2357..6757354f 100644 --- a/ch06/utils.py +++ b/ch06/utils.py @@ -6,6 +6,7 @@ # It is made available under the MIT License import os +import sys import collections import csv import json @@ -57,7 +58,7 @@ def load_sanders_data(dirname=".", line_count=-1): try: tweet = json.load(open(tweet_fn, "r")) except IOError: - print("Tweet '%s' not found. Skip."%tweet_fn) + print(("Tweet '%s' not found. Skip." 
% tweet_fn)) continue if 'text' in tweet and tweet['user']['lang'] == "en": @@ -84,14 +85,14 @@ def plot_pr(auc_score, name, phase, precision, recall, label=None): pylab.title('P/R curve (AUC=%0.2f) / %s' % (auc_score, label)) filename = name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, "pr_%s_%s.png" % - (filename, phase)), bbox_inches="tight") + (filename, phase)), bbox_inches="tight") def show_most_informative_features(vectorizer, clf, n=20): c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names())) - top = zip(c_f[:n], c_f[:-(n + 1):-1]) + top = list(zip(c_f[:n], c_f[:-(n + 1):-1])) for (c1, f1), (c2, f2) in top: - print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2) + print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2)) def plot_log(): @@ -119,7 +120,7 @@ def plot_feat_importance(feature_names, clf, name): inds = np.argsort(coef) f_imp = f_imp[inds] coef = coef[inds] - xpos = np.array(range(len(coef))) + xpos = np.array(list(range(len(coef)))) pylab.bar(xpos, coef, width=1) pylab.title('Feature importance for %s' % (name)) @@ -181,8 +182,13 @@ def plot_bias_variance(data_sizes, train_errors, test_errors, name): def load_sent_word_net(): sent_scores = collections.defaultdict(list) + sentiwordnet_path = os.path.join(DATA_DIR, "SentiWordNet_3.0.0_20130122.txt") - with open(os.path.join(DATA_DIR, "SentiWordNet_3.0.0_20130122.txt"), "r") as csvfile: + if not os.path.exists(sentiwordnet_path): + print("Please download SentiWordNet_3.0.0 from http://sentiwordnet.isti.cnr.it/download.php, extract it and put it into the data directory") + sys.exit(1) + + with open(sentiwordnet_path, 'r') as csvfile: reader = csv.reader(csvfile, delimiter='\t', quotechar='"') for line in reader: if line[0].startswith("#"): @@ -200,7 +206,7 @@ def load_sent_word_net(): term = term.replace("-", " ").replace("_", " ") key = "%s/%s" % (POS, term.split("#")[0]) sent_scores[key].append((float(PosScore), float(NegScore))) - for key, value in sent_scores.iteritems(): + for key, value in sent_scores.items(): sent_scores[key] = np.mean(value, axis=0) return sent_scores diff --git a/ch07/.gitignore b/ch07/.gitignore index d21bb3d1..e33609d2 100644 --- a/ch07/.gitignore +++ b/ch07/.gitignore @@ -1 +1 @@ -.formula_*/ +*.png diff --git a/ch07/README.rst b/ch07/README.rst new file mode 100644 index 00000000..12a7b051 --- /dev/null +++ b/ch07/README.rst @@ -0,0 +1,42 @@ +========= +Chapter 7 +========= + +Support code for *Chapter 7: Regression* + + +Boston data analysis +-------------------- + +This dataset is shipped with sklearn. Thus, no extra download is required. + + +boston1.py + Fit a linear regression model to the Boston house price data +boston1numpy.py + Version of above script using numpy operations for linear regression +boston_cv_penalized.py + Test different penalized (and OLS) regression schemes on the Boston dataset +figure1_2.py + Show the regression line for Boston data +figure3.py + Show the regression line for Boston data with OLS and Lasso +figure4.py + Scatter plot of predicted-vs-actual for multidimensional regression + +10K data analysis +----------------- + +lr10k.py + Linear regression on 10K dataset, evaluation by cross-validation +predict10k_en.py + Elastic nets (including with inner cross-validation for parameter + settings). Produces scatter plot. + + +MovieLens data analysis +----------------------- + +In this chapter, we only consider a very simple approach, which is implemented +in the ``usermodel.py`` script. 
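As the README above notes, the Boston data ships with scikit-learn. A short sketch of what that provides (standard scikit-learn calls, not code from this patch); column 5 is the RM feature that the figure scripts below plot:

from sklearn.datasets import load_boston

boston = load_boston()
print(boston.data.shape, boston.target.shape)  # (506, 13) features, (506,) house prices
print(boston.feature_names[5])                 # 'RM': average number of rooms per dwelling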
+ diff --git a/ch07/boston1.py b/ch07/boston1.py index 0074f927..d0b30447 100644 --- a/ch07/boston1.py +++ b/ch07/boston1.py @@ -7,25 +7,33 @@ # This script shows an example of simple (ordinary) linear regression +# The first edition of the book NumPy functions only for this operation. See +# the file boston1numpy.py for that version. + import numpy as np from sklearn.datasets import load_boston -import pylab as plt +from sklearn.linear_model import LinearRegression +from matplotlib import pyplot as plt boston = load_boston() -x = np.array([np.concatenate((v, [1])) for v in boston.data]) +x = boston.data y = boston.target -# np.linal.lstsq implements least-squares linear regression -s, total_error, _, _ = np.linalg.lstsq(x, y) +# Fitting a model is trivial: call the ``fit`` method in LinearRegression: +lr = LinearRegression() +lr.fit(x, y) + +# The instance member `residues_` contains the sum of the squared residues +rmse = np.sqrt(lr.residues_/len(x)) +print('RMSE: {}'.format(rmse)) -rmse = np.sqrt(total_error[0] / len(x)) -print('Residual: {}'.format(rmse)) +fig, ax = plt.subplots() +# Plot a diagonal (for reference): +ax.plot([0, 50], [0, 50], '-', color=(.9,.3,.3), lw=4) # Plot the prediction versus real: -plt.plot(np.dot(x, s), boston.target, 'ro') +ax.scatter(lr.predict(x), boston.target) -# Plot a diagonal (for reference): -plt.plot([0, 50], [0, 50], 'g-') -plt.xlabel('predicted') -plt.ylabel('real') -plt.show() +ax.set_xlabel('predicted') +ax.set_ylabel('real') +fig.savefig('Figure_07_08.png') diff --git a/ch07/boston1numpy.py b/ch07/boston1numpy.py new file mode 100644 index 00000000..0074f927 --- /dev/null +++ b/ch07/boston1numpy.py @@ -0,0 +1,31 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +# This script shows an example of simple (ordinary) linear regression + +import numpy as np +from sklearn.datasets import load_boston +import pylab as plt + +boston = load_boston() +x = np.array([np.concatenate((v, [1])) for v in boston.data]) +y = boston.target + +# np.linal.lstsq implements least-squares linear regression +s, total_error, _, _ = np.linalg.lstsq(x, y) + +rmse = np.sqrt(total_error[0] / len(x)) +print('Residual: {}'.format(rmse)) + +# Plot the prediction versus real: +plt.plot(np.dot(x, s), boston.target, 'ro') + +# Plot a diagonal (for reference): +plt.plot([0, 50], [0, 50], 'g-') +plt.xlabel('predicted') +plt.ylabel('real') +plt.show() diff --git a/ch07/boston_cv10_penalized.py b/ch07/boston_cv10_penalized.py deleted file mode 100644 index c9f28636..00000000 --- a/ch07/boston_cv10_penalized.py +++ /dev/null @@ -1,50 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -# This script fits several forms of penalized regression - -from __future__ import print_function -from sklearn.cross_validation import KFold -from sklearn.linear_model import ElasticNet, Lasso, Ridge -from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV -import numpy as np -from sklearn.datasets import load_boston -boston = load_boston() -x = np.array([np.concatenate((v, [1])) for v in boston.data]) -y = boston.target - -for name, met in [ - ('elastic-net(.5)', ElasticNet(fit_intercept=True, alpha=0.5)), - ('lasso(.5)', 
Lasso(fit_intercept=True, alpha=0.5)), - ('ridge(.5)', Ridge(fit_intercept=True, alpha=0.5)), -]: - # Fit on the whole data: - met.fit(x, y) - - # Predict on the whole data: - p = np.array([met.predict(xi) for xi in x]) - - e = p - y - # np.dot(e, e) == sum(ei**2 for ei in e) but faster - total_error = np.dot(e, e) - rmse_train = np.sqrt(total_error / len(p)) - - # Now, we use 10 fold cross-validation to estimate generalization error - kf = KFold(len(x), n_folds=10) - err = 0 - for train, test in kf: - met.fit(x[train], y[train]) - p = np.array([met.predict(xi) for xi in x[test]]) - e = p - y[test] - err += np.dot(e, e) - - rmse_10cv = np.sqrt(err / len(x)) - print('Method: {}'.format(name)) - print('RMSE on training: {}'.format(rmse_train)) - print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) - print() - print() diff --git a/ch07/boston_cv_penalized.py b/ch07/boston_cv_penalized.py new file mode 100644 index 00000000..c894c4fa --- /dev/null +++ b/ch07/boston_cv_penalized.py @@ -0,0 +1,46 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +# This script fits several forms of penalized regression + +from __future__ import print_function +import numpy as np +from sklearn.cross_validation import KFold +from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge +from sklearn.metrics import r2_score +from sklearn.datasets import load_boston +boston = load_boston() +x = boston.data +y = boston.target + +for name, met in [ + ('linear regression', LinearRegression()), + ('lasso()', Lasso()), + ('elastic-net(.5)', ElasticNet(alpha=0.5)), + ('lasso(.5)', Lasso(alpha=0.5)), + ('ridge(.5)', Ridge(alpha=0.5)), +]: + # Fit on the whole data: + met.fit(x, y) + + # Predict on the whole data: + p = met.predict(x) + r2_train = r2_score(y, p) + + # Now, we use 10 fold cross-validation to estimate generalization error + kf = KFold(len(x), n_folds=5) + p = np.zeros_like(y) + for train, test in kf: + met.fit(x[train], y[train]) + p[test] = met.predict(x[test]) + + r2_cv = r2_score(y, p) + print('Method: {}'.format(name)) + print('R2 on training: {}'.format(r2_train)) + print('R2 on 5-fold CV: {}'.format(r2_cv)) + print() + print() diff --git a/ch07/cv10_lr.py b/ch07/cv10_lr.py deleted file mode 100644 index 2ae52e02..00000000 --- a/ch07/cv10_lr.py +++ /dev/null @@ -1,37 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from sklearn.cross_validation import KFold -from sklearn.linear_model import LinearRegression, ElasticNet -import numpy as np -from sklearn.datasets import load_boston -boston = load_boston() -x = np.array([np.concatenate((v, [1])) for v in boston.data]) -y = boston.target -FIT_EN = False - -if FIT_EN: - model = ElasticNet(fit_intercept=True, alpha=0.5) -else: - model = LinearRegression(fit_intercept=True) -model.fit(x, y) -p = np.array([model.predict(xi) for xi in x]) -e = p - y -total_error = np.dot(e, e) -rmse_train = np.sqrt(total_error / len(p)) - -kf = KFold(len(x), n_folds=10) -err = 0 -for train, test in kf: - model.fit(x[train], y[train]) - p = np.array([model.predict(xi) for xi in x[test]]) - e = p - y[test] - err += np.dot(e, e) - -rmse_10cv = np.sqrt(err / len(x)) -print('RMSE on training: {}'.format(rmse_train)) 
-print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) diff --git a/ch07/data/.gitignore b/ch07/data/.gitignore new file mode 100644 index 00000000..3286ba0f --- /dev/null +++ b/ch07/data/.gitignore @@ -0,0 +1 @@ +E2006.train diff --git a/ch07/data/download.sh b/ch07/data/download.sh old mode 100644 new mode 100755 index 6f938e06..74753364 --- a/ch07/data/download.sh +++ b/ch07/data/download.sh @@ -1,5 +1,3 @@ #!/usr/bin/env bash -wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 +curl -O http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 bunzip2 E2006.train.bz2 -wget http://www.grouplens.org/system/files/ml-100k.zip -unzip ml-100k.zip diff --git a/ch07/figure1.py b/ch07/figure1.py deleted file mode 100644 index a9c23af7..00000000 --- a/ch07/figure1.py +++ /dev/null @@ -1,30 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.datasets import load_boston -import pylab as plt -from mpltools import style -style.use('ggplot') - -boston = load_boston() -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("RM") -plt.ylabel("House Price") - - -x = boston.data[:, 5] -x = np.array([[v] for v in x]) -y = boston.target - -slope, res, _, _ = np.linalg.lstsq(x, y) -plt.plot([0, boston.data[:, 5].max() + 1], - [0, slope * (boston.data[:, 5].max() + 1)], '-', lw=4) -plt.savefig('Figure1.png', dpi=150) - -rmse = np.sqrt(res[0] / len(x)) -print('Residual: {}'.format(rmse)) diff --git a/ch07/figure1_2.py b/ch07/figure1_2.py new file mode 100644 index 00000000..3f11a0c7 --- /dev/null +++ b/ch07/figure1_2.py @@ -0,0 +1,63 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +from sklearn.datasets import load_boston +from sklearn.linear_model import LinearRegression +from sklearn.metrics import mean_squared_error, r2_score +from matplotlib import pyplot as plt + +boston = load_boston() + +# Index number five in the number of rooms +fig,ax = plt.subplots() +ax.scatter(boston.data[:, 5], boston.target) +ax.set_xlabel("Average number of rooms (RM)") +ax.set_ylabel("House Price") + +x = boston.data[:, 5] +# fit (used below) takes a two-dimensional array as input. 
We use np.atleast_2d +# to convert from one to two dimensional, then transpose to make sure that the +# format matches: +x = np.transpose(np.atleast_2d(x)) + +y = boston.target + +lr = LinearRegression(fit_intercept=False) +lr.fit(x, y) + +ax.plot([0, boston.data[:, 5].max() + 1], + [0, lr.predict(boston.data[:, 5].max() + 1)], '-', lw=4) +fig.savefig('Figure1.png') + +mse = mean_squared_error(y, lr.predict(x)) +rmse = np.sqrt(mse) +print('RMSE (no intercept): {}'.format(rmse)) + +# Repeat, but fitting an intercept this time: +lr = LinearRegression(fit_intercept=True) + +lr.fit(x, y) + +fig,ax = plt.subplots() +ax.set_xlabel("Average number of rooms (RM)") +ax.set_ylabel("House Price") +ax.scatter(boston.data[:, 5], boston.target) +xmin = x.min() +xmax = x.max() +ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]) , '-', lw=4) +fig.savefig('Figure2.png') + +mse = mean_squared_error(y, lr.predict(x)) +print("Mean squared error (of training data): {:.3}".format(mse)) + +rmse = np.sqrt(mse) +print("Root mean squared error (of training data): {:.3}".format(rmse)) + +cod = r2_score(y, lr.predict(x)) +print('COD (on training data): {:.2}'.format(cod)) + diff --git a/ch07/figure2.py b/ch07/figure2.py deleted file mode 100644 index c5977207..00000000 --- a/ch07/figure2.py +++ /dev/null @@ -1,31 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.datasets import load_boston -import pylab as plt -from mpltools import style -style.use('ggplot') - -boston = load_boston() -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("RM") -plt.ylabel("House Price") - - -x = boston.data[:, 5] -xmin = x.min() -xmax = x.max() -x = np.array([[v, 1] for v in x]) -y = boston.target - -(slope, bias), res, _, _ = np.linalg.lstsq(x, y) -plt.plot([xmin, xmax], [slope * xmin + bias, slope * xmax + bias], '-', lw=4) -plt.savefig('Figure2.png', dpi=150) - -rmse = np.sqrt(res[0] / len(x)) -print('Residual: {}'.format(rmse)) diff --git a/ch07/figure3.py b/ch07/figure3.py new file mode 100644 index 00000000..7543c1ec --- /dev/null +++ b/ch07/figure3.py @@ -0,0 +1,33 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from sklearn.linear_model import LinearRegression, Lasso +import numpy as np +from sklearn.datasets import load_boston +from matplotlib import pyplot as plt + +boston = load_boston() +fig, ax = plt.subplots() +ax.scatter(boston.data[:, 5], boston.target) +ax.set_xlabel("Number of rooms (RM)") +ax.set_ylabel("House Price") + + +x = boston.data[:, 5] +xmin = x.min() +xmax = x.max() +x = np.transpose(np.atleast_2d(x)) +y = boston.target + +lr = LinearRegression() +lr.fit(x, y) +ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]), ':', lw=4, label='OLS model') + +las = Lasso() +las.fit(x, y) +ax.plot([xmin, xmax], las.predict([ [xmin], [xmax] ]), '-', lw=4, label='Lasso model') +fig.savefig('Figure3.png') diff --git a/ch07/figure4.py b/ch07/figure4.py index ab71b3a8..a24d48be 100644 --- a/ch07/figure4.py +++ b/ch07/figure4.py @@ -5,31 +5,29 @@ # # It is made available under the MIT License -from sklearn.linear_model import Lasso + +# This script plots prediction-vs-actual on training set for the Boston dataset +# using OLS 
regression import numpy as np +from sklearn.linear_model import LinearRegression from sklearn.datasets import load_boston -import pylab as plt -from mpltools import style -style.use('ggplot') +from sklearn.metrics import mean_squared_error +from matplotlib import pyplot as plt boston = load_boston() -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("RM") -plt.ylabel("House Price") - -x = boston.data[:, 5] -xmin = x.min() -xmax = x.max() -x = np.array([[v, 1] for v in x]) +x = boston.data y = boston.target -(slope, bias), res, _, _ = np.linalg.lstsq(x, y) -plt.plot([xmin, xmax], [slope * xmin + bias, slope * xmax + bias], ':', lw=4) +lr = LinearRegression() +lr.fit(x, y) +p = lr.predict(x) +print("RMSE: {:.2}.".format(np.sqrt(mean_squared_error(y, p)))) +print("R2: {:.2}.".format(lr.score(x, y))) +fig,ax = plt.subplots() +ax.scatter(p, y) +ax.set_xlabel('Predicted price') +ax.set_ylabel('Actual price') +ax.plot([y.min(), y.max()], [y.min(), y.max()], lw=4) -las = Lasso() -las.fit(x, y) -y0 = las.predict([xmin, 1]) -y1 = las.predict([xmax, 1]) -plt.plot([xmin, xmax], [y0, y1], '-', lw=4) -plt.savefig('Figure3.png', dpi=150) +fig.savefig('Figure4.png') diff --git a/ch07/lasso_path_plot.py b/ch07/lasso_path_plot.py new file mode 100644 index 00000000..eab64c26 --- /dev/null +++ b/ch07/lasso_path_plot.py @@ -0,0 +1,29 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from sklearn.linear_model import Lasso +from sklearn.datasets import load_boston +from matplotlib import pyplot as plt +import numpy as np + +boston = load_boston() +x = boston.data +y = boston.target + +las = Lasso(normalize=1) +alphas = np.logspace(-5, 2, 1000) +alphas, coefs, _= las.path(x, y, alphas=alphas) + +fig,ax = plt.subplots() +ax.plot(alphas, coefs.T) +ax.set_xscale('log') +ax.set_xlim(alphas.max(), alphas.min()) +ax.set_xlabel('Lasso coefficient path as a function of alpha') +ax.set_xlabel('Alpha') +ax.set_ylabel('Coefficient weight') +fig.savefig('Figure_LassoPath.png') + diff --git a/ch07/lr10k.py b/ch07/lr10k.py index 71afd412..831706a1 100644 --- a/ch07/lr10k.py +++ b/ch07/lr10k.py @@ -6,31 +6,33 @@ # It is made available under the MIT License import numpy as np +from sklearn.metrics import mean_squared_error, r2_score from sklearn.datasets import load_svmlight_file -from sklearn.linear_model import ElasticNet, LinearRegression -data, target = load_svmlight_file('E2006.train') -lr = LinearRegression(fit_intercept=True) - +from sklearn.linear_model import LinearRegression from sklearn.cross_validation import KFold -kf = KFold(len(target), n_folds=10) -err = 0 -for train, test in kf: - lr.fit(data[train], target[train]) - p = map(lr.predict, data[test]) - p = np.array(p).ravel() - e = p - target[test] - err += np.dot(e, e) -rmse_10cv = np.sqrt(err / len(target)) +# Whether to use Elastic nets (otherwise, ordinary linear regression is used) + +# Load data: +data, target = load_svmlight_file('data/E2006.train') +lr = LinearRegression() + +# Compute error on training data to demonstrate that we can obtain near perfect +# scores: lr.fit(data, target) -p = np.array(map(lr.predict, data)) -p = p.ravel() -e = p - target -total_error = np.dot(e, e) -rmse_train = np.sqrt(total_error / len(p)) +pred = lr.predict(data) + +print('RMSE on training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('R2 on training, 
{:.2}'.format(r2_score(target, pred))) +print('') +pred = np.zeros_like(target) +kf = KFold(len(target), n_folds=5) +for train, test in kf: + lr.fit(data[train], target[train]) + pred[test] = lr.predict(data[test]) -print('RMSE on training: {}'.format(rmse_train)) -print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) +print('RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) diff --git a/ch07/predict10k_en.py b/ch07/predict10k_en.py index 35372e81..a7dd960a 100644 --- a/ch07/predict10k_en.py +++ b/ch07/predict10k_en.py @@ -8,33 +8,66 @@ import numpy as np from sklearn.datasets import load_svmlight_file from sklearn.cross_validation import KFold -from sklearn.linear_model import ElasticNet, LinearRegression +from sklearn.linear_model import ElasticNetCV, ElasticNet +from sklearn.metrics import mean_squared_error, r2_score +from matplotlib import pyplot as plt data, target = load_svmlight_file('data/E2006.train') # Edit the lines below if you want to switch method: -# met = LinearRegression(fit_intercept=True) -met = ElasticNet(fit_intercept=True, alpha=.1) +# from sklearn.linear_model import Lasso +# met = Lasso(alpha=0.1) +met = ElasticNet(alpha=0.1) -kf = KFold(len(target), n_folds=10) -err = 0 +kf = KFold(len(target), n_folds=5) +pred = np.zeros_like(target) for train, test in kf: met.fit(data[train], target[train]) - p = map(met.predict, data[test]) - p = np.array(p).ravel() - e = p - target[test] - err += np.dot(e, e) + pred[test] = met.predict(data[test]) -rmse_10cv = np.sqrt(err / len(target)) +print('[EN 0.1] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN 0.1] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) +print('') +# Construct an ElasticNetCV object (use all available CPUs) +met = ElasticNetCV(n_jobs=-1) + +kf = KFold(len(target), n_folds=5) +pred = np.zeros_like(target) +for train, test in kf: + met.fit(data[train], target[train]) + pred[test] = met.predict(data[test]) + +print('[EN CV] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN CV] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) +print('') met.fit(data, target) -p = np.array(map(met.predict, data)) -p = p.ravel() -e = p - target -total_error = np.dot(e, e) -rmse_train = np.sqrt(total_error / len(p)) +pred = met.predict(data) +print('[EN CV] RMSE on training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN CV] R2 on training, {:.2}'.format(r2_score(target, pred))) + + +# Construct an ElasticNetCV object (use all available CPUs) +met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99]) + +kf = KFold(len(target), n_folds=5) +pred = np.zeros_like(target) +for train, test in kf: + met.fit(data[train], target[train]) + pred[test] = met.predict(data[test]) + + +print('[EN CV l1_ratio] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN CV l1_ratio] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) +print('') + +fig, ax = plt.subplots() +y = target +ax.scatter(y, pred, c='k') +ax.plot([-5,-1], [-5,-1], 'r-', lw=2) +ax.set_xlabel('Actual value') +ax.set_ylabel('Predicted value') +fig.savefig('Figure_10k_scatter_EN_l1_ratio.png') -print('RMSE on training: {}'.format(rmse_train)) -print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) diff --git a/ch07/usermodel.py b/ch07/usermodel.py deleted file mode 100644 index 
cae220fb..00000000 --- a/ch07/usermodel.py +++ /dev/null @@ -1,61 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from scipy import sparse -from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV -from sklearn.cross_validation import KFold - -data = np.array([[int(tok) for tok in line.split('\t')[:3]] - for line in open('data/ml-100k/u.data')]) -ij = data[:, :2] -ij -= 1 # original data is in 1-based system -values = data[:, 2] -reviews = sparse.csc_matrix((values, ij.T)).astype(float) - -reg = ElasticNetCV(fit_intercept=True, alphas=[ - 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) - - -def movie_norm(xc): - '''Normalize per movie''' - xc = xc.copy().toarray() - # x1 is the mean of the positive items - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in range(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - return xc, x1 - - -def learn_for(i): - u = reviews[i] - us = np.delete(np.arange(reviews.shape[0]), i) - ps, = np.where(u.toarray().ravel() > 0) - x = reviews[us][:, ps].T - y = u.data - err = 0 - eb = 0 - kf = KFold(len(y), n_folds=4) - for train, test in kf: - xc, x1 = movie_norm(x[train]) - reg.fit(xc, y[train] - x1) - - xc, x1 = movie_norm(x[test]) - p = np.array([reg.predict(xi) for xi in xc]).ravel() - e = (p + x1) - y[test] - err += np.sum(e * e) - eb += np.sum((y[train].mean() - y[test]) ** 2) - return np.sqrt(err / float(len(y))), np.sqrt(eb / float(len(y))) - -whole_data = [] -for i in range(reviews.shape[0]): - s = learn_for(i) - print(s[0] < s[1]) - print(s) - whole_data.append(s) diff --git a/ch08/README.rst b/ch08/README.rst new file mode 100644 index 00000000..488844e6 --- /dev/null +++ b/ch08/README.rst @@ -0,0 +1,41 @@ +========= +Chapter 8 +========= + +Support code for *Chapter 8: Recommendations*. + +The code refers to the second edition of the book and has been +significantly refactored compared to the first edition. + +Ratings Prediction +------------------ + +Note that since the partition of the data into training and testing is random, +every time you run the code, the results will be different. A minimal usage +sketch follows the list below. + + +load_ml100k.py + Load data & partition into test/train +norm.py + Normalize the data +corrneighbours.py + Neighbour models based on correlation +regression.py + Regression models +stacked.py + Stacked predictions +averaged.py + Averaging of predictions (mentioned in book, but code is not shown there).
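The sketch below shows how these scripts fit together (it is an illustration, not repository code; it assumes the MovieLens data has been fetched with ``data/download.sh`` and that it is run from ``ch08/``)::

    from sklearn import metrics
    import load_ml100k
    import norm

    # A fixed random_state makes the random train/test partition repeatable
    train, test = load_ml100k.get_train_test(random_state=12)
    predicted = norm.predict(train)
    r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
    print('R2 (normalization baseline): {:.1%}'.format(r2))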
+ +Association Rule Mining +----------------------- + +Check the folder ``apriori/`` + +apriori/histogram.py + Print a histogram of how many times each product was bought +apriori/apriori.py + Implementation of Apriori algorithm and association rule building +apriori/apriori_example.py + Example of Apriori algorithm in retail dataset + diff --git a/ch08/all_correlations.py b/ch08/all_correlations.py index df9c30bc..d3817bf0 100644 --- a/ch08/all_correlations.py +++ b/ch08/all_correlations.py @@ -7,10 +7,15 @@ import numpy as np -# This is the version in the book: - - -def all_correlations(bait, target): +def all_correlations(y, X): + from scipy import spatial + y = np.atleast_2d(y) + sp = spatial.distance.cdist(X, y, 'correlation') + # The "correlation distance" is 1 - corr(x,y); so we invert that to obtain the correlation + return 1 - sp.ravel() + +# This is the version in the book (1st Edition): +def all_correlations_book_version(bait, target): ''' corrs = all_correlations(bait, target) @@ -21,9 +26,7 @@ def all_correlations(bait, target): for c in target]) # This is a faster, but harder to read, implementation: - - -def all_correlations(y, X): +def all_correlations_fast_no_scipy(y, X): ''' Cs = all_correlations(y, X) @@ -41,3 +44,5 @@ def all_correlations(y, X): xs_ += 1e-5 # Handle zeros in x return (xy - x_ * y_ * n) / n / xs_ / ys_ + + diff --git a/ch08/apriori/.gitignore b/ch08/apriori/.gitignore index 05697a95..6379e20f 100644 --- a/ch08/apriori/.gitignore +++ b/ch08/apriori/.gitignore @@ -1 +1 @@ -retail.dat +retail.dat.gz diff --git a/ch08/apriori/apriori.py b/ch08/apriori/apriori.py index 1ebb7e54..eeaf0d46 100644 --- a/ch08/apriori/apriori.py +++ b/ch08/apriori/apriori.py @@ -5,9 +5,12 @@ # # It is made available under the MIT License +from collections import namedtuple + + def apriori(dataset, minsupport, maxsize): ''' - freqsets, baskets = apriori(dataset, minsupport, maxsize) + freqsets, support = apriori(dataset, minsupport, maxsize) Parameters ---------- @@ -21,48 +24,70 @@ def apriori(dataset, minsupport, maxsize): Returns ------- freqsets : sequence of sequences - baskets : dictionary + support : dictionary + This associates each itemset (represented as a frozenset) with a float + (the support of that itemset) ''' from collections import defaultdict baskets = defaultdict(list) pointers = defaultdict(list) + for i, ds in enumerate(dataset): for ell in ds: pointers[ell].append(i) baskets[frozenset([ell])].append(i) - pointers = dict([(k, frozenset(v)) for k, v in pointers.items()]) - baskets = dict([(k, frozenset(v)) for k, v in baskets.items()]) - valid = set(list(el)[0] - for el, c in baskets.items() if (len(c) >= minsupport)) - dataset = [[el for el in ds if (el in valid)] for ds in dataset] - dataset = [ds for ds in dataset if len(ds) > 1] - dataset = map(frozenset, dataset) + # Convert pointer items to frozensets to speed up operations later + new_pointers = dict() + for k in pointers: + if len(pointers[k]) >= minsupport: + new_pointers[k] = frozenset(pointers[k]) + pointers = new_pointers + for k in baskets: + baskets[k] = frozenset(baskets[k]) + + # Valid are all elements whose support is >= minsupport + valid = set() + for el, c in baskets.items(): + if len(c) >= minsupport: + valid.update(el) + + # Itemsets at first iteration are simply all singleton with valid elements: itemsets = [frozenset([v]) for v in valid] freqsets = [] for i in range(maxsize - 1): - print(len(itemsets)) + print("At iteration {}, number of frequent baskets: {}".format( + i, len(itemsets))) 
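+        # Candidate generation: try to extend each current itemset by one
+        # element; the transactions supporting the extension are the
+        # intersection of those supporting the itemset and those containing
+        # the new element, and the candidate is kept only if that
+        # intersection still reaches minsupport.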
newsets = [] - for i, ell in enumerate(itemsets): - ccounts = baskets[ell] - for v_, pv in pointers.items(): - if v_ not in ell: + for it in itemsets: + ccounts = baskets[it] + + for v, pv in pointers.items(): + if v not in it: csup = (ccounts & pv) if len(csup) >= minsupport: - new = frozenset(ell | set([v_])) + new = frozenset(it | frozenset([v])) if new not in baskets: newsets.append(new) baskets[new] = csup freqsets.extend(itemsets) itemsets = newsets - return freqsets, baskets + if not len(itemsets): + break + support = {} + for k in baskets: + support[k] = float(len(baskets[k])) + return freqsets, support -def association_rules(dataset, freqsets, baskets, minlift): +# A namedtuple to collect all values that may be interesting +AssociationRule = namedtuple('AssociationRule', ['antecendent', 'consequent', 'base', 'py_x', 'lift']) + +def association_rules(dataset, freqsets, support, minlift): ''' - for (antecendent, consequent, base, py_x, lift) in association_rules(dataset, freqsets, baskets, minlift): + for assoc_rule in association_rules(dataset, freqsets, support, minlift): ... This function takes the returns from ``apriori``. @@ -72,9 +97,13 @@ def association_rules(dataset, freqsets, baskets, minlift): dataset : sequence of sequences input dataset freqsets : sequence of sequences - baskets : dictionary + support : dictionary minlift : int minimal lift of yielded rules + + Returns + ------- + assoc_rule : sequence of AssociationRule objects ''' nr_transactions = float(len(dataset)) freqsets = [f for f in freqsets if len(f) > 1] @@ -82,8 +111,9 @@ def association_rules(dataset, freqsets, baskets, minlift): for f in fset: consequent = frozenset([f]) antecendent = fset - consequent - base = len(baskets[consequent]) / nr_transactions - py_x = len(baskets[fset]) / float(len(baskets[antecendent])) + py_x = support[fset] / support[antecendent] + base = support[consequent] / nr_transactions lift = py_x / base if lift > minlift: - yield (antecendent, consequent, base, py_x, lift) + yield AssociationRule(antecendent, consequent, base, py_x, lift) + diff --git a/ch08/apriori/apriori_example.py b/ch08/apriori/apriori_example.py index da77e96c..971ed4d5 100644 --- a/ch08/apriori/apriori_example.py +++ b/ch08/apriori/apriori_example.py @@ -7,10 +7,17 @@ from apriori import apriori, association_rules from gzip import GzipFile + +# Load dataset dataset = [[int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')] -freqsets, baskets = apriori(dataset, 80, maxsize=5) -nr_transactions = float(len(dataset)) -for ant, con, base, pyx, lift in association_rules(dataset, freqsets, baskets, 30): - print('{} | {} | {} ({:%}) | {} | {} | {}' - .format(ant, con, len(baskets[con]), len(baskets[con]) / nr_transactions, len(baskets[ant]), len(baskets[con | ant]), int(lift))) + +freqsets, support = apriori(dataset, 80, maxsize=16) +rules = list(association_rules(dataset, freqsets, support, minlift=30.0)) + +rules.sort(key=(lambda ar: -ar.lift)) +for ar in rules: + print('{} -> {} (lift = {:.4})' + .format(set(ar.antecendent), + set(ar.consequent), + ar.lift)) diff --git a/ch08/apriori/apriori_naive.py b/ch08/apriori/apriori_naive.py index cb50a8e1..03ab9130 100644 --- a/ch08/apriori/apriori_naive.py +++ b/ch08/apriori/apriori_naive.py @@ -5,35 +5,83 @@ # # It is made available under the MIT License -import numpy as np from collections import defaultdict from itertools import chain from gzip import GzipFile -minsupport = 44 +minsupport = 80 dataset = [[int(tok) for tok in 
line.strip().split()] for line in GzipFile('retail.dat.gz')] -dataset = dataset[::20] counts = defaultdict(int) for elem in chain(*dataset): counts[elem] += 1 +# Only elements that have at least minsupport should be considered. valid = set(el for el, c in counts.items() if (c >= minsupport)) + +# Filter the dataset to contain only valid elements +# (This step is not strictly necessary, but will make the rest of the code +# faster as the itemsets will be smaller): dataset = [[el for el in ds if (el in valid)] for ds in dataset] -dataset = [frozenset(ds) for ds in dataset if len(ds) > 1] +# Convert to frozenset for fast processing +dataset = [frozenset(ds) for ds in dataset] itemsets = [frozenset([v]) for v in valid] -allsets = [itemsets] +freqsets = itemsets[:] for i in range(16): - print(len(itemsets)) + print("At iteration {}, number of frequent baskets: {}".format( + i, len(itemsets))) nextsets = [] - for i, ell in enumerate(itemsets): - for v_ in valid: - if v_ not in ell: - c = (ell | set([v_])) - if sum(1 for d in dataset if d.issuperset(c)) > minsupport: + + tested = set() + for it in itemsets: + for v in valid: + if v not in it: + # Create a new candidate set by adding v to it + c = (it | frozenset([v])) + + # Check if we have tested it already: + if c in tested: + continue + tested.add(c) + + # Count support by looping over dataset + # This step is slow. + # Check `apriori.py` for a better implementation. + support_c = sum(1 for d in dataset if d.issuperset(c)) + if support_c > minsupport: nextsets.append(c) - allsets.append(nextsets) + freqsets.extend(nextsets) itemsets = nextsets + if not len(itemsets): + break +print("Finished!") + + +def rules_from_itemset(itemset, dataset, minlift=1.): + nr_transactions = float(len(dataset)) + for item in itemset: + consequent = frozenset([item]) + antecedent = itemset-consequent + base = 0.0 + # acount: antecedent count + acount = 0.0 + + # ccount : consequent count + ccount = 0.0 + for d in dataset: + if item in d: base += 1 + if d.issuperset(itemset): ccount += 1 + if d.issuperset(antecedent): acount += 1 + base /= nr_transactions + p_y_given_x = ccount/acount + lift = p_y_given_x / base + if lift > minlift: + print('Rule {0} -> {1} has lift {2}' + .format(antecedent, consequent,lift)) + +for itemset in freqsets: + if len(itemset) > 1: + rules_from_itemset(itemset, dataset, minlift=4.) 
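As a sanity check on the confidence/lift arithmetic in ``rules_from_itemset`` above, here is a tiny worked example on a made-up five-transaction dataset (illustration only, not repository code)::

    toy = [frozenset(t) for t in ({1, 2}, {1, 2, 3}, {2, 3}, {1, 3}, {2})]
    nr_transactions = float(len(toy))
    antecedent, consequent = frozenset([1]), frozenset([2])
    itemset = antecedent | consequent

    base = sum(1 for d in toy if d.issuperset(consequent)) / nr_transactions  # P({2}) = 4/5
    acount = sum(1 for d in toy if d.issuperset(antecedent))  # 3 baskets contain {1}
    ccount = sum(1 for d in toy if d.issuperset(itemset))     # 2 baskets contain {1, 2}
    p_y_given_x = ccount / float(acount)                      # confidence = 2/3
    lift = p_y_given_x / base                                 # (2/3) / (4/5) ~ 0.83
    print('confidence = {:.2f}, lift = {:.2f}'.format(p_y_given_x, lift))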
diff --git a/ch08/averaged.py b/ch08/averaged.py new file mode 100644 index 00000000..5b19bba7 --- /dev/null +++ b/ch08/averaged.py @@ -0,0 +1,33 @@ +import numpy as np +import load_ml100k +import regression +import corrneighbours +from sklearn import metrics +import norm + +def predict(train): + predicted0 = regression.predict(train) + predicted1 = regression.predict(train.T).T + predicted2 = corrneighbours.predict(train) + predicted3 = corrneighbours.predict(train.T).T + predicted4 = norm.predict(train) + predicted5 = norm.predict(train.T).T + stack = np.array([ + predicted0, + predicted1, + predicted2, + predicted3, + predicted4, + predicted5, + ]) + return stack.mean(0) + + +def main(): + train,test = load_ml100k.get_train_test(random_state=12) + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 averaged: {:.2%}'.format(r2)) + +if __name__ == '__main__': + main() diff --git a/ch08/chapter.py b/ch08/chapter.py new file mode 100644 index 00000000..d039d93f --- /dev/null +++ b/ch08/chapter.py @@ -0,0 +1,208 @@ +import numpy as np # NOT IN BOOK +from matplotlib import pyplot as plt # NOT IN BOOK + +def load(): + import numpy as np + from scipy import sparse + + data = np.loadtxt('data/ml-100k/u.data') + ij = data[:, :2] + ij -= 1 # original data is in 1-based system + values = data[:, 2] + reviews = sparse.csc_matrix((values, ij.T)).astype(float) + return reviews.toarray() +reviews = load() +U,M = np.where(reviews) +import random +test_idxs = np.array(random.sample(range(len(U)), len(U)//10)) + +train = reviews.copy() +train[U[test_idxs], M[test_idxs]] = 0 + +test = np.zeros_like(reviews) +test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]] + +class NormalizePositive(object): + def __init__(self, axis=0): + self.axis = axis + + def fit(self, features, y=None): + if self.axis == 1: + features = features.T + # count features that are greater than zero in axis 0: + binary = (features > 0) + + count0 = binary.sum(axis=0) + + # to avoid division by zero, set zero counts to one: + count0[count0 == 0] = 1. 
+ + # computing the mean is easy: + self.mean = features.sum(axis=0)/count0 + + # only consider differences where binary is True: + diff = (features - self.mean) * binary + diff **= 2 + # regularize the estimate of std by adding 0.1 + self.std = np.sqrt(0.1 + diff.sum(axis=0)/count0) + return self + + + def transform(self, features): + if self.axis == 1: + features = features.T + binary = (features > 0) + features = features - self.mean + features /= self.std + features *= binary + if self.axis == 1: + features = features.T + return features + + def inverse_transform(self, features, copy=True): + if copy: + features = features.copy() + if self.axis == 1: + features = features.T + features *= self.std + features += self.mean + if self.axis == 1: + features = features.T + return features + + def fit_transform(self, features): + return self.fit(features).transform(features) + + +norm = NormalizePositive(axis=1) +binary = (train > 0) +train = norm.fit_transform(train) +# plot just 200x200 area for space reasons +plt.imshow(binary[:200, :200], interpolation='nearest') + +from scipy.spatial import distance +# compute all pair-wise distances: +dists = distance.pdist(binary, 'correlation') +# Convert to square form, so that dists[i,j] +# is distance between binary[i] and binary[j]: +dists = distance.squareform(dists) +neighbors = dists.argsort(axis=1) + +# We are going to fill this matrix with results +filled = train.copy() +for u in range(filled.shape[0]): + # n_u is neighbors of user + n_u = neighbors[u, 1:] + for m in range(filled.shape[1]): + # get relevant reviews in order! + revs = [train[neigh, m] + for neigh in n_u + if binary [neigh, m]] + if len(revs): + # n is the number of reviews for this movie + n = len(revs) + # take half of the reviews plus one into consideration: + n //= 2 + n += 1 + revs = revs[:n] + filled[u,m] = np.mean(revs) + +predicted = norm.inverse_transform(filled) +from sklearn import metrics +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (binary neighbors): {:.1%}'.format(r2)) + +reviews = reviews.T +# use same code as before +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (binary movie neighbors): {:.1%}'.format(r2)) + + +from sklearn.linear_model import ElasticNetCV # NOT IN BOOK + +reg = ElasticNetCV(alphas=[ + 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) +filled = train.copy() +# iterate over all users: +for u in range(train.shape[0]): + curtrain = np.delete(train, u, axis=0) + bu = binary[u] + reg.fit(curtrain[:,bu].T, train[u, bu]) + filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) +predicted = norm.inverse_transform(filled) +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (user regression): {:.1%}'.format(r2)) + + +# SHOPPING BASKET ANALYSIS +# This is the slow version of the code, which will take a long time to +# complete. + + +from collections import defaultdict +from itertools import chain + +# File is downloaded as a compressed file +import gzip +# file format is a line per transaction +# of the form '12 34 342 5...' 
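+# e.g. the line '12 34 342' is parsed below into [12, 34, 342] and then
+# converted to the set {12, 34, 342} (these token values are only examples)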
+dataset = [[int(tok) for tok in line.strip().split()] + for line in gzip.open('data/retail.dat.gz')] +dataset = [set(d) for d in dataset] +# count how often each product was purchased: +counts = defaultdict(int) +for elem in chain(*dataset): + counts[elem] += 1 + +minsupport = 80 +valid = set(k for k,v in counts.items() if (v >= minsupport)) +itemsets = [frozenset([v]) for v in valid] +freqsets = [] +for i in range(16): + nextsets = [] + tested = set() + for it in itemsets: + for v in valid: + if v not in it: + # Create a new candidate set by adding v to it + c = (it | frozenset([v])) + # check If we have tested it already + if c in tested: + continue + tested.add(c) + + # Count support by looping over dataset + # This step is slow. + # Check `apriori.py` for a better implementation. + support_c = sum(1 for d in dataset if d.issuperset(c)) + if support_c > minsupport: + nextsets.append(c) + freqsets.extend(nextsets) + itemsets = nextsets + if not len(itemsets): + break +print("Finished!") + + +minlift = 5.0 +nr_transactions = float(len(dataset)) +for itemset in freqsets: + for item in itemset: + consequent = frozenset([item]) + antecedent = itemset-consequent + base = 0.0 + # acount: antecedent count + acount = 0.0 + + # ccount : consequent count + ccount = 0.0 + for d in dataset: + if item in d: base += 1 + if d.issuperset(itemset): ccount += 1 + if d.issuperset(antecedent): acount += 1 + base /= nr_transactions + p_y_given_x = ccount/acount + lift = p_y_given_x / base + if lift > minlift: + print('Rule {0} -> {1} has lift {2}' + .format(antecedent, consequent,lift)) diff --git a/ch08/corrneighbours.py b/ch08/corrneighbours.py index 6f9b2e4e..eb30e685 100644 --- a/ch08/corrneighbours.py +++ b/ch08/corrneighbours.py @@ -6,50 +6,53 @@ # It is made available under the MIT License from __future__ import print_function -from all_correlations import all_correlations import numpy as np -from scipy import sparse -from load_ml100k import load -reviews = load() - - -def estimate_user(user, rest): - bu = user > 0 - br = rest > 0 - ws = all_correlations(bu, br) - selected = ws.argsort()[-100:] - estimates = rest[selected].mean(0) - estimates /= (.1 + br[selected].mean(0)) - return estimates - - -def train_test(user, rest): - estimates = estimate_user(user, rest) - bu = user > 0 - br = rest > 0 - err = estimates[bu] - user[bu] - null = rest.mean(0) - null /= (.1 + br.mean(0)) - nerr = null[bu] - user[bu] - return np.dot(err, err), np.dot(nerr, nerr) - - -def cross_validate_all(): - err = [] - for i in xrange(reviews.shape[0]): - err.append( - train_test(reviews[i], np.delete(reviews, i, 0)) - ) - revs = (reviews > 0).sum(1) - err = np.array(err) - rmse = np.sqrt(err / revs[:, None]) - print(np.mean(rmse, 0)) - print(np.mean(rmse[revs > 60], 0)) - - -def all_estimates(reviews): - reviews = reviews.toarray() - estimates = np.zeros_like(reviews) - for i in xrange(reviews.shape[0]): - estimates[i] = estimate_user(reviews[i], np.delete(reviews, i, 0)) - return estimates +from load_ml100k import get_train_test +from scipy.spatial import distance +from sklearn import metrics + +from norm import NormalizePositive + +def predict(otrain): + binary = (otrain > 0) + norm = NormalizePositive(axis=1) + train = norm.fit_transform(otrain) + + dists = distance.pdist(binary, 'correlation') + dists = distance.squareform(dists) + + neighbors = dists.argsort(axis=1) + filled = train.copy() + for u in range(filled.shape[0]): + # n_u are the neighbors of user + n_u = neighbors[u, 1:] + for m in range(filled.shape[1]): + # 
This code could be faster using numpy indexing trickery as the + # cost of readibility (this is left as an exercise to the reader): + revs = [train[neigh, m] + for neigh in n_u + if binary[neigh, m]] + if len(revs): + n = len(revs) + n //= 2 + n += 1 + revs = revs[:n] + filled[u,m] = np.mean(revs) + + return norm.inverse_transform(filled) + +def main(transpose_inputs=False): + train, test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 score (binary {} neighbours): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) + +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/data/.gitignore b/ch08/data/.gitignore new file mode 100644 index 00000000..c391d625 --- /dev/null +++ b/ch08/data/.gitignore @@ -0,0 +1,3 @@ +retail.dat.gz +ml-100k.zip +/ml-100k/ diff --git a/ch08/data/download.sh b/ch08/data/download.sh new file mode 100755 index 00000000..b671171d --- /dev/null +++ b/ch08/data/download.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +curl -L -O http://files.grouplens.org/papers/ml-100k.zip +unzip ml-100k.zip +curl -L -O http://fimi.ua.ac.be/data/retail.dat.gz diff --git a/ch08/figure3.py b/ch08/figure3.py index da699902..daafc300 100644 --- a/ch08/figure3.py +++ b/ch08/figure3.py @@ -8,9 +8,8 @@ from load_ml100k import load from matplotlib import pyplot as plt data = load() -data = data.toarray() plt.gray() plt.imshow(data[:200, :200], interpolation='nearest') plt.xlabel('User ID') plt.ylabel('Film ID') -plt.savefig('../1400_08_03+.png') +plt.savefig('Figure_08_03_DataMatrix.png') diff --git a/ch08/load_ml100k.py b/ch08/load_ml100k.py index e636a59c..7096e75c 100644 --- a/ch08/load_ml100k.py +++ b/ch08/load_ml100k.py @@ -5,15 +5,55 @@ # # It is made available under the MIT License -import numpy as np -from scipy import sparse +def load(): + '''Load ML-100k data + Returns the review matrix as a numpy array''' + import numpy as np + from scipy import sparse + from os import path -def load(): - data = np.array([[int(t) for t in line.split('\t')[:3]] - for line in open('data/ml-100k/u.data')]) + if not path.exists('data/ml-100k/u.data'): + raise IOError("Data has not been downloaded.\nTry the following:\n\n\tcd data\n\t./download.sh") + + # The input is in the form of a CSC sparse matrix, so it's a natural fit to + # load the data, but we then convert to a more traditional array before + # returning + data = np.loadtxt('data/ml-100k/u.data') ij = data[:, :2] ij -= 1 # original data is in 1-based system values = data[:, 2] reviews = sparse.csc_matrix((values, ij.T)).astype(float) - return reviews + return reviews.toarray() + +def get_train_test(reviews=None, random_state=None): + '''Split data into training & testing + + Parameters + ---------- + reviews : ndarray, optional + Input data + + Returns + ------- + train : ndarray + training data + test : ndarray + testing data + ''' + import numpy as np + import random + r = random.Random(random_state) + + if reviews is None: + reviews = load() + U,M = np.where(reviews) + test_idxs = np.array(r.sample(range(len(U)), len(U)//10)) + train = reviews.copy() + train[U[test_idxs], M[test_idxs]] = 0 + + test = np.zeros_like(reviews) + test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]] + + return train, test + diff --git a/ch08/norm.py b/ch08/norm.py new file mode 100644 index 00000000..2925bbca --- /dev/null +++ b/ch08/norm.py @@ -0,0 +1,75 @@ 
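+# NormalizePositive treats zero entries as "missing" ratings: the mean and std
+# along the chosen axis are estimated from the positive entries only;
+# transform() re-zeroes the missing entries and inverse_transform() undoes the
+# scaling so that predictions come back on the original rating scale.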
+import numpy as np + +class NormalizePositive(object): + + def __init__(self, axis=0): + self.axis = axis + + def fit(self, features, y=None): + # count features that are greater than zero in axis `self.axis`: + if self.axis == 1: + features = features.T + binary = (features > 0) + count = binary.sum(axis=0) + + # to avoid division by zero, set zero counts to one: + count[count == 0] = 1. + + self.mean = features.sum(axis=0)/count + + # Compute variance by average squared difference to the mean, but only + # consider differences where binary is True (i.e., where there was a + # true rating): + diff = (features - self.mean) * binary + diff **= 2 + # regularize the estimate of std by adding 0.1 + self.std = np.sqrt(0.1 + diff.sum(axis=0)/count) + return self + + def transform(self, features): + if self.axis == 1: + features = features.T + binary = (features > 0) + features = features - self.mean + features /= self.std + features *= binary + if self.axis == 1: + features = features.T + return features + + def inverse_transform(self, features, copy=True): + if copy: + features = features.copy() + if self.axis == 1: + features = features.T + features *= self.std + features += self.mean + if self.axis == 1: + features = features.T + return features + + def fit_transform(self, features): + return self.fit(features).transform(features) + + +def predict(train): + norm = NormalizePositive() + train = norm.fit_transform(train) + return norm.inverse_transform(train * 0.) + + +def main(transpose_inputs=False): + from load_ml100k import get_train_test + from sklearn import metrics + train,test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 score ({} normalization): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/regression.py b/ch08/regression.py new file mode 100644 index 00000000..693e99a4 --- /dev/null +++ b/ch08/regression.py @@ -0,0 +1,50 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +from sklearn.linear_model import ElasticNetCV +from norm import NormalizePositive +from sklearn import metrics + + +def predict(train): + binary = (train > 0) + reg = ElasticNetCV(fit_intercept=True, alphas=[ + 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) + norm = NormalizePositive() + train = norm.fit_transform(train) + + filled = train.copy() + # iterate over all users + for u in range(train.shape[0]): + # remove the current user for training + curtrain = np.delete(train, u, axis=0) + bu = binary[u] + if np.sum(bu) > 5: + reg.fit(curtrain[:,bu].T, train[u, bu]) + + # Fill the values that were not there already + filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) + return norm.inverse_transform(filled) + + +def main(transpose_inputs=False): + from load_ml100k import get_train_test + train,test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + filled = predict(train) + r2 = metrics.r2_score(test[test > 0], filled[test > 0]) + + print('R2 score ({} regression): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) + +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/similar_movie.py 
b/ch08/similar_movie.py index bbb38ee6..cd49a162 100644 --- a/ch08/similar_movie.py +++ b/ch08/similar_movie.py @@ -7,11 +7,26 @@ from __future__ import print_function import numpy as np -from load_ml100k import load -from all_correlations import all_correlations def nn_movie(ureviews, reviews, uid, mid, k=1): + '''Movie neighbor based classifier + + Parameters + ---------- + ureviews : ndarray + reviews : ndarray + uid : int + index of user + mid : int + index of movie + k : int + index of neighbor to return + + Returns + ------- + pred : float + ''' X = ureviews y = ureviews[mid].copy() y -= y.mean() @@ -33,23 +48,27 @@ def nn_movie(ureviews, reviews, uid, mid, k=1): def all_estimates(reviews, k=1): + '''Estimate all review ratings + ''' reviews = reviews.astype(float) k -= 1 nusers, nmovies = reviews.shape estimates = np.zeros_like(reviews) for u in range(nusers): - ureviews = np.delete(reviews, u, 0) + ureviews = np.delete(reviews, u, axis=0) ureviews -= ureviews.mean(0) - ureviews /= (ureviews.std(0) + 1e-4) + ureviews /= (ureviews.std(0) + 1e-5) ureviews = ureviews.T.copy() for m in np.where(reviews[u] > 0)[0]: estimates[u, m] = nn_movie(ureviews, reviews, u, m, k) return estimates if __name__ == '__main__': - reviews = load().toarray() + from load_ml100k import load + reviews = load() estimates = all_estimates(reviews) error = (estimates - reviews) error **= 2 error = error[reviews > 0] - print(np.sqrt(error).mean()) + rmse = np.sqrt(error.mean()) + print("RMSE is {0}.".format(rmse)) diff --git a/ch08/stacked.py b/ch08/stacked.py index 1d6e9fc7..8fa6344e 100644 --- a/ch08/stacked.py +++ b/ch08/stacked.py @@ -1,43 +1,47 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from __future__ import print_function -from sklearn.linear_model import LinearRegression -from load_ml100k import load import numpy as np -import similar_movie -import usermodel +import load_ml100k +import regression import corrneighbours +from sklearn import linear_model, metrics +import norm + +def predict(train): + tr_train,tr_test = load_ml100k.get_train_test(train, random_state=34) + tr_predicted0 = regression.predict(tr_train) + tr_predicted1 = regression.predict(tr_train.T).T + tr_predicted2 = corrneighbours.predict(tr_train) + tr_predicted3 = corrneighbours.predict(tr_train.T).T + tr_predicted4 = norm.predict(tr_train) + tr_predicted5 = norm.predict(tr_train.T).T + stack_tr = np.array([ + tr_predicted0[tr_test > 0], + tr_predicted1[tr_test > 0], + tr_predicted2[tr_test > 0], + tr_predicted3[tr_test > 0], + tr_predicted4[tr_test > 0], + tr_predicted5[tr_test > 0], + ]).T + + lr = linear_model.LinearRegression() + lr.fit(stack_tr, tr_test[tr_test > 0]) -reviews = load() -reg = LinearRegression() -es = np.array([ - usermodel.all_estimates(reviews), - corrneighbours.all_estimates(reviews), - similar_movies.all_estimates(reviews), -]) + stack_te = np.array([ + tr_predicted0.ravel(), + tr_predicted1.ravel(), + tr_predicted2.ravel(), + tr_predicted3.ravel(), + tr_predicted4.ravel(), + tr_predicted5.ravel(), + ]).T -reviews = reviews.toarray() + return lr.predict(stack_te).reshape(train.shape) -total_error = 0.0 -coefficients = [] -for u in xrange(reviews.shape[0]): - es0 = np.delete(es, u, 1) - r0 = np.delete(reviews, u, 0) - X, Y = np.where(r0 > 0) - X = es[:, X, Y] - y = r0[r0 > 0] - reg.fit(X.T, y) - coefficients.append(reg.coef_) +def 
main(): + train,test = load_ml100k.get_train_test(random_state=12) + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 stacked: {:.2%}'.format(r2)) - r0 = reviews[u] - X = np.where(r0 > 0) - p0 = reg.predict(es[:, u, X].squeeze().T) - err0 = r0[r0 > 0] - p0 - total_error += np.dot(err0, err0) - print(u) +if __name__ == '__main__': + main() diff --git a/ch08/stacked5.py b/ch08/stacked5.py deleted file mode 100644 index 12f3eacd..00000000 --- a/ch08/stacked5.py +++ /dev/null @@ -1,43 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from sklearn.linear_model import LinearRegression -from load_ml100k import load -import numpy as np -import similar_movie -import usermodel -import corrneighbours - -sreviews = load() -reviews = sreviews.toarray() -reg = LinearRegression() -es = np.array([ - usermodel.all_estimates(sreviews), - similar_movie.all_estimates(reviews, k=1), - similar_movie.all_estimates(reviews, k=2), - similar_movie.all_estimates(reviews, k=3), - similar_movie.all_estimates(reviews, k=4), - similar_movie.all_estimates(reviews, k=5), -]) - -total_error = 0.0 -coefficients = [] -for u in xrange(reviews.shape[0]): - es0 = np.delete(es, u, 1) - r0 = np.delete(reviews, u, 0) - X, Y = np.where(r0 > 0) - X = es[:, X, Y] - y = r0[r0 > 0] - reg.fit(X.T, y) - coefficients.append(reg.coef_) - - r0 = reviews[u] - X = np.where(r0 > 0) - p0 = reg.predict(es[:, u, X].squeeze().T) - err0 = r0[r0 > 0] - p0 - total_error += np.dot(err0, err0) -coefficients = np.array(coefficients) diff --git a/ch08/usermodel.py b/ch08/usermodel.py deleted file mode 100644 index 40a90a2a..00000000 --- a/ch08/usermodel.py +++ /dev/null @@ -1,52 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV -from sklearn.cross_validation import KFold -from load_ml100k import load - - -def learn_for(reviews, i): - reg = ElasticNetCV(fit_intercept=True, alphas=[ - 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) - u = reviews[i] - us = range(reviews.shape[0]) - del us[i] - ps, = np.where(u.toarray().ravel() > 0) - x = reviews[us][:, ps].T - y = u.data - kf = KFold(len(y), n_folds=4) - predictions = np.zeros(len(ps)) - for train, test in kf: - xc = x[train].copy().toarray() - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in xrange(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - - reg.fit(xc, y[train] - x1) - - xc = x[test].copy().toarray() - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in xrange(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - - p = np.array(map(reg.predict, xc)).ravel() - predictions[test] = p - return predictions - - -def all_estimates(reviews): - whole_data = [] - for i in xrange(reviews.shape[0]): - s = learn_for(reviews, i) - whole_data.append(s) - return np.array(whole_data) diff --git a/ch09/01_fft_based_classifier.py b/ch09/01_fft_based_classifier.py index 2a3f1bd8..84249724 100644 --- a/ch09/01_fft_based_classifier.py +++ b/ch09/01_fft_based_classifier.py @@ -14,7 +14,7 @@ from sklearn.metrics import confusion_matrix -from utils import plot_pr, 
plot_roc, plot_confusion_matrix, GENRE_LIST, TEST_DIR +from utils import plot_pr, plot_roc, plot_confusion_matrix, GENRE_LIST from fft import read_fft @@ -81,7 +81,7 @@ def train_model(clf_factory, X, Y, name, plot=False): if plot: for label in labels: - print("Plotting %s"%genre_list[label]) + print("Plotting %s" % genre_list[label]) scores_to_sort = roc_scores[label] median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] diff --git a/ch09/02_ceps_based_classifier.py b/ch09/02_ceps_based_classifier.py index 3b9e7237..2791267f 100644 --- a/ch09/02_ceps_based_classifier.py +++ b/ch09/02_ceps_based_classifier.py @@ -14,7 +14,7 @@ from sklearn.metrics import confusion_matrix -from utils import plot_roc, plot_confusion_matrix, GENRE_LIST, TEST_DIR +from utils import plot_roc, plot_confusion_matrix, GENRE_LIST from ceps import read_ceps @@ -82,7 +82,7 @@ def train_model(clf_factory, X, Y, name, plot=False): if plot: for label in labels: - print("Plotting %s"%genre_list[label]) + print("Plotting %s" % genre_list[label]) scores_to_sort = roc_scores[label] median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] diff --git a/ch09/ceps.py b/ch09/ceps.py index 239b0885..30a6f1cf 100644 --- a/ch09/ceps.py +++ b/ch09/ceps.py @@ -24,7 +24,7 @@ def write_ceps(ceps, fn): base_fn, ext = os.path.splitext(fn) data_fn = base_fn + ".ceps" np.save(data_fn, ceps) - print "Written", data_fn + print("Written %s" % data_fn) def create_ceps(fn): @@ -51,6 +51,6 @@ def read_ceps(genre_list, base_dir=GENRE_DIR): if __name__ == "__main__": os.chdir(GENRE_DIR) glob_wav = os.path.join(sys.argv[1], "*.wav") - print glob_wav + print(glob_wav) for fn in glob.glob(glob_wav): create_ceps(fn) diff --git a/ch09/fft.py b/ch09/fft.py index 5ff5a96f..0754c3ad 100644 --- a/ch09/fft.py +++ b/ch09/fft.py @@ -27,7 +27,7 @@ def write_fft(fft_features, fn): data_fn = base_fn + ".fft" np.save(data_fn, fft_features) - print "Written", data_fn + print("Written %s" % data_fn) def create_fft(fn): diff --git a/ch09/utils.py b/ch09/utils.py index 3fea02f7..d74f2169 100644 --- a/ch09/utils.py +++ b/ch09/utils.py @@ -29,11 +29,10 @@ TEST_DIR = None if GENRE_DIR is None or TEST_DIR is None: - print("Please set GENRE_DIR and TEST_DIR in utils.py") + print("Please set GENRE_DIR and TEST_DIR in utils.py") sys.exit(1) - def plot_confusion_matrix(cm, genre_list, name, title): pylab.clf() pylab.matshow(cm, fignum=False, cmap='Blues', vmin=0, vmax=1.0) @@ -93,7 +92,7 @@ def show_most_informative_features(vectorizer, clf, n=20): c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names())) top = zip(c_f[:n], c_f[:-(n + 1):-1]) for (c1, f1), (c2, f2) in top: - print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2) + print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2)) def plot_log(): diff --git a/ch10/.gitignore b/ch10/.gitignore new file mode 100644 index 00000000..2f266195 --- /dev/null +++ b/ch10/.gitignore @@ -0,0 +1 @@ +AnimTransDistr/ diff --git a/ch10/README.rst b/ch10/README.rst new file mode 100644 index 00000000..91e32756 --- /dev/null +++ b/ch10/README.rst @@ -0,0 +1,37 @@ +========== +Chapter 10 +========== + +Support code for *Chapter 10: Pattern Recognition & Computer Vision* + +Data +---- + +This chapter relies on a publicly available dataset (which can be downloaded +using the ``download.sh`` script inside the ``data/`` directory) as well as the +dataset that is packaged with the repository at ``../SimpleImageDataset/``. + +Running ``download.sh`` will retrieve the other dataset into a directory +``AnimTransDistr/``.
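As a quick orientation (a sketch that assumes ``mahotas`` is installed and that the ``texture``/``chist`` helpers from ``features.py``, described below, are importable), the per-image representation used throughout this chapter concatenates Haralick texture descriptors with a colour histogram::

    import numpy as np
    import mahotas as mh
    from features import chist, texture

    im = mh.imread('../SimpleImageDataset/building00.jpg')
    grey = mh.colors.rgb2grey(im, dtype=np.uint8)
    # ravelled Haralick texture features plus a 64-bin colour histogram
    fs = np.concatenate([texture(grey), chist(im)])
    print('feature vector of length {}'.format(len(fs)))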
+ +Scripts +------- + +chapter.py + Code as written in the book. +thresholded_figure.py + Computes the thresholded figures, including after Gaussian blurring +lena-ring.py + Lena image with center in focus and blurred edges +figure10.py + Just paste two images next to each others +features.py + Contains the color histogram function from the book as well as a simple + wrapper around ``mahotas.texture.haralick`` +simple_classification.py + Classify SimpleImageDataset with texture features + color histogram features +large_classification.py + Classify ``AnimTransDistr`` with both texture and SURF features. +neighbors.py + Computes image neighbors as well as the neighbor figure from the book. + diff --git a/ch10/chapter.py b/ch10/chapter.py new file mode 100644 index 00000000..233720bb --- /dev/null +++ b/ch10/chapter.py @@ -0,0 +1,186 @@ +import numpy as np +import mahotas as mh +image = mh.imread('scene00.jpg') +from matplotlib import pyplot as plt +plt.imshow(image) +plt.show() +image = mh.colors.rgb2grey(image, dtype=np.uint8) +plt.imshow(image) # Display the image +plt.gray() +thresh = mh.thresholding.otsu(image) +print('Otsu threshold is {}.'.format(thresh)) +# Otsu threshold is 138. +plt.imshow(image > thresh) + +im16 = mh.gaussian_filter(image,16) +im = mh.demos.load('lenna') + +r,g,b = im.transpose(2,0,1) +r12 = mh.gaussian_filter(r, 12.) +g12 = mh.gaussian_filter(g, 12.) +b12 = mh.gaussian_filter(b, 12.) +im12 = mh.as_rgb(r12,g12,b12) +h, w = r.shape # height and width +Y, X = np.mgrid[:h,:w] +Y = Y-h/2. # center at h/2 +Y = Y / Y.max() # normalize to -1 .. +1 + +X = X-w/2. +X = X / X.max() + +C = np.exp(-2.*(X**2+ Y**2)) + +# Normalize again to 0..1 +C = C - C.min() +C = C / C.ptp() +C = C[:,:,None] # This adds a dummy third dimension to C + +ringed = mh.stretch(im*C + (1-C)*im12) + +haralick_features = mh.features.haralick(image) +haralick_features_mean = np.mean(haralick_features, axis=0) +haralick_features_all = np.ravel(haralick_features) + +from glob import glob +images = glob('../SimpleImageDataset/*.jpg') +features = [] +labels = [] +for im in images: + labels.append(im[:-len('00.jpg')]) + im = mh.imread(im) + im = mh.colors.rgb2gray(im, dtype=np.uint8) + features.append(mh.features.haralick(im).ravel()) + +features = np.array(features) +labels = np.array(labels) + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + + +from sklearn import cross_validation +cv = cross_validation.LeaveOneOut(len(images)) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 81.1% + +def chist(im): + im = im // 64 + r,g,b = im.transpose((2,0,1)) + pixels = 1 * r + 4 * b + 16 * g + hist = np.bincount(pixels.ravel(), minlength=64) + hist = hist.astype(float) + hist = np.log1p(hist) + return hist + +features = [] +for im in images: + im = mh.imread(im) + features.append(chist(im)) + + + +features = [] +for im in images: + imcolor = mh.imread(im) + im = mh.colors.rgb2gray(imcolor, dtype=np.uint8) + features.append(np.concatenate([ + mh.features.haralick(im).ravel(), + chist(imcolor), + ])) + + +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 95.6% + + +features = [] +for im in images: + imcolor = mh.imread(im) + # Ignore everything in the 200 pixels 
close to the borders + imcolor = imcolor[200:-200, 200:-200] + im = mh.colors.rgb2gray(imcolor, dtype=np.uint8) + features.append(np.concatenate([ + mh.features.haralick(im).ravel(), + chist(imcolor), + ])) + +sc = StandardScaler() +features = sc.fit_transform(features) +from scipy.spatial import distance +dists = distance.squareform(distance.pdist(features)) + + +fig, axes = plt.subplots(2, 9) +for ci,i in enumerate(range(0,90,10)): + left = images[i] + dists_left = dists[i] + right = dists_left.argsort() + # right[0] is the same as left[i], so pick the next closest element + right = right[1] + right = images[right] + left = mh.imread(left) + right = mh.imread(right) + axes[0, ci].imshow(left) + axes[1, ci].imshow(right) + + + +from sklearn.grid_search import GridSearchCV +C_range = 10.0 ** np.arange(-4, 3) +grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range}) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', grid)]) + +cv = cross_validation.KFold(len(features), 5, + shuffle=True, random_state=123) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) + + + + +from mahotas.features import surf +image = mh.demos.load('lena') +image = mh.colors.rgb2gray(image, dtype=np.uint8) +descriptors = surf.surf(image, descriptor_only=True) + +from mahotas.features import surf +descriptors = surf.dense(image, spacing=16) +alldescriptors = [] +for im in images: + im = mh.imread(im, as_grey=True) + im = im.astype(np.uint8) + # compute descriptors for each dataset image `im` (not the Lena `image`) + alldescriptors.append(surf.dense(im, spacing=16)) +# get all descriptors into a single array +concatenated = np.concatenate(alldescriptors) +print('Number of descriptors: {}'.format( + len(concatenated))) +# use only every 64th vector +concatenated = concatenated[::64] +from sklearn.cluster import KMeans # FIXME CAPITALIZATION +k = 256 +km = KMeans(k) +km.fit(concatenated) + +features = [] +for d in alldescriptors: + c = km.predict(d) + features.append( + np.array([np.sum(c == ci) for ci in range(k)]) + ) +# build single array and convert to float +features = np.array(features, dtype=float) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 62.6% + + diff --git a/ch10/get-anim-dataset.sh b/ch10/download.sh similarity index 70% rename from ch10/get-anim-dataset.sh rename to ch10/download.sh index 93121a7f..fb623f3d 100755 --- a/ch10/get-anim-dataset.sh +++ b/ch10/download.sh @@ -2,7 +2,7 @@ mkdir -p AnimTransDistr cd AnimTransDistr -wget http://vision.stanford.edu/Datasets/AnimTransDistr.rar +curl -O http://vision.stanford.edu/Datasets/AnimTransDistr.rar unrar x AnimTransDistr.rar # The following file is a weird file: rm Anims/104034.jpg diff --git a/ch10/edginess.py b/ch10/edginess.py deleted file mode 100644 index 5815d654..00000000 --- a/ch10/edginess.py +++ /dev/null @@ -1,22 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -import mahotas as mh - - -def edginess_sobel(image): - '''Measure the "edginess" of an image - - image should be a 2d numpy array (an image) - - Returns a floating point value which is higher the "edgier" the image is.
- - ''' - edges = mh.sobel(image, just_filter=True) - edges = edges.ravel() - return np.sqrt(np.dot(edges, edges)) diff --git a/ch10/features.py b/ch10/features.py new file mode 100644 index 00000000..42847b30 --- /dev/null +++ b/ch10/features.py @@ -0,0 +1,70 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +import mahotas as mh + + +def edginess_sobel(image): + '''Measure the "edginess" of an image + + image should be a 2d numpy array (an image) + + Returns a floating point value which is higher the "edgier" the image is. + + ''' + edges = mh.sobel(image, just_filter=True) + edges = edges.ravel() + return np.sqrt(np.dot(edges, edges)) + +def texture(im): + '''Compute features for an image + + Parameters + ---------- + im : ndarray + + Returns + ------- + fs : ndarray + 1-D array of features + ''' + im = im.astype(np.uint8) + return mh.features.haralick(im).ravel() + + +def chist(im): + '''Compute color histogram of input image + + Parameters + ---------- + im : ndarray + should be an RGB image + + Returns + ------- + c : ndarray + 1-D array of histogram values + ''' + + # Downsample pixel values: + im = im // 64 + + # We can also implement the following by using np.histogramdd + # im = im.reshape((-1,3)) + # bins = [np.arange(5), np.arange(5), np.arange(5)] + # hist = np.histogramdd(im, bins=bins)[0] + # hist = hist.ravel() + + # Separate RGB channels: + r,g,b = im.transpose((2,0,1)) + + pixels = 1 * r + 4 * g + 16 * b + hist = np.bincount(pixels.ravel(), minlength=64) + hist = hist.astype(float) + return np.log1p(hist) + diff --git a/ch10/figure10.py b/ch10/figure10.py index 0f5a1b5d..6cb45e7a 100644 --- a/ch10/figure10.py +++ b/ch10/figure10.py @@ -10,11 +10,11 @@ # This little script just builds an image with two examples, side-by-side: -text = mh.imread("simple-dataset/text21.jpg") -scene = mh.imread("simple-dataset/scene00.jpg") +text = mh.imread("../SimpleImageDataset/text21.jpg") +building = mh.imread("../SimpleImageDataset/building00.jpg") h, w, _ = text.shape canvas = np.zeros((h, 2 * w + 128, 3), np.uint8) -canvas[:, -w:] = scene +canvas[:, -w:] = building canvas[:, :w] = text canvas = canvas[::4, ::4] -mh.imsave('../1400OS_10_10+.jpg', canvas) +mh.imsave('figure10.jpg', canvas) diff --git a/ch10/figure13.py b/ch10/figure13.py deleted file mode 100644 index ce635ef9..00000000 --- a/ch10/figure13.py +++ /dev/null @@ -1,28 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import mahotas as mh -from mahotas.colors import rgb2grey -import numpy as np - -# Adds a little salt-n-pepper noise to an image - -im = mh.imread('lenna.jpg') -im = rgb2grey(im) - -# Salt & pepper arrays -salt = np.random.random(im.shape) > .975 -pepper = np.random.random(im.shape) > .975 - -# salt is 170 & pepper is 30 -# Some playing around showed that setting these to more extreme values looks -# very artificial. 
These look nicer - -im = np.maximum(salt * 170, mh.stretch(im)) -im = np.minimum(pepper * 30 + im * (~pepper), im) - -mh.imsave('../1400OS_10_13+.jpg', im.astype(np.uint8)) diff --git a/ch10/figure18.py b/ch10/figure18.py deleted file mode 100644 index ee11cf05..00000000 --- a/ch10/figure18.py +++ /dev/null @@ -1,118 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression -from mpltools import style -from matplotlib import pyplot as plt -import numpy as np -from glob import glob - -basedir = 'AnimTransDistr' - - -def features_for(images): - fs = [] - for im in images: - im = mh.imread(im, as_grey=True).astype(np.uint8) - fs.append(mh.features.haralick(im).mean(0)) - return np.array(fs) - - -def features_labels(groups): - labels = np.zeros(sum(map(len, groups))) - st = 0 - for i, g in enumerate(groups): - labels[st:st + len(g)] = i - st += len(g) - return np.vstack(groups), labels - -classes = [ - 'Anims', - 'Cars', - 'Distras', - 'Trans', -] - -features = [] -labels = [] -for ci, cl in enumerate(classes): - images = glob('{}/{}/*.jpg'.format(basedir, cl)) - features.extend(features_for(images)) - labels.extend([ci for _ in images]) - -features = np.array(features) -labels = np.array(labels) - -scores0 = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=10) -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: %s%%' % ( - 0.1 * round(1000 * scores0.mean()))) - -tfeatures = features - -from sklearn.cluster import KMeans -from mahotas.features import surf - -images = [] -labels = [] - -for ci, cl in enumerate(classes): - curimages = glob('{}/{}/*.jpg'.format(basedir, cl)) - images.extend(curimages) - labels.extend([ci for _ in curimages]) -labels = np.array(labels) - -alldescriptors = [] -for im in images: - im = mh.imread(im, as_grey=1) - im = im.astype(np.uint8) - - #alldescriptors.append(surf.dense(im, spacing=max(im.shape)//32)) - alldescriptors.append(surf.surf(im, descriptor_only=True)) - -print('Descriptors done') -k = 256 -km = KMeans(k) - -concatenated = np.concatenate(alldescriptors) -concatenated = concatenated[::64] -print('k-meaning...') -km.fit(concatenated) -features = [] -for d in alldescriptors: - c = km.predict(d) - features.append( - np.array([np.sum(c == i) for i in xrange(k)]) - ) -features = np.array(features) -print('predicting...') -scoreSURFlr = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Log. Reg [SURF features]: %s%%' % ( - 0.1 * round(1000 * scoreSURFlr.mean()))) - -print('combined...') -allfeatures = np.hstack([features, tfeatures]) -scoreSURFplr = cross_validation.cross_val_score( - LogisticRegression(), allfeatures, labels, cv=5).mean() - -print('Accuracy (5 fold x-val) with Log. 
Reg [All features]: %s%%' % ( - 0.1 * round(1000 * scoreSURFplr.mean()))) - -style.use('ggplot') -plt.plot([0, 1, 2], 100 * - np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), 'k-', lw=8) -plt.plot( - [0, 1, 2], 100 * np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), - 'o', mec='#cccccc', mew=12, mfc='white') -plt.xlim(-.5, 2.5) -plt.ylim(scores0.mean() * 90., scoreSURFplr * 110) -plt.xticks([0, 1, 2], ["baseline", "SURF", "combined"]) -plt.ylabel('Accuracy (%)') -plt.savefig('../1400OS_10_18+.png') diff --git a/ch10/figure5_6.py b/ch10/figure5_6.py deleted file mode 100644 index 100c84a7..00000000 --- a/ch10/figure5_6.py +++ /dev/null @@ -1,26 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from matplotlib import pyplot as plt -import numpy as np -import mahotas as mh -image = mh.imread('../1400OS_10_01.jpeg') -image = mh.colors.rgb2gray(image) -im8 = mh.gaussian_filter(image, 8) -im16 = mh.gaussian_filter(image, 16) -im32 = mh.gaussian_filter(image, 32) -h, w = im8.shape -canvas = np.ones((h, 3 * w + 256), np.uint8) -canvas *= 255 -canvas[:, :w] = im8 -canvas[:, w + 128:2 * w + 128] = im16 -canvas[:, -w:] = im32 -mh.imsave('../1400OS_10_05+.jpg', canvas[:, ::2]) - -im32 = mh.stretch(im32) -ot32 = mh.otsu(im32) -mh.imsave('../1400OS_10_06+.jpg', (im32 > ot32).astype(np.uint8) * 255) diff --git a/ch10/figure9.py b/ch10/figure9.py deleted file mode 100644 index 6d3db1a4..00000000 --- a/ch10/figure9.py +++ /dev/null @@ -1,26 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from matplotlib import pyplot as plt -import numpy as np -import mahotas as mh -image = mh.imread('../1400OS_10_01.jpeg') -image = mh.colors.rgb2gray(image, dtype=np.uint8) -image = image[::4, ::4] -thresh = mh.sobel(image) -filtered = mh.sobel(image, just_filter=True) - -thresh = mh.dilate(thresh, np.ones((7, 7))) -filtered = mh.dilate(mh.stretch(filtered), np.ones((7, 7))) - - -h, w = thresh.shape -canvas = 255 * np.ones((h, w * 2 + 64), np.uint8) -canvas[:, :w] = thresh * 255 -canvas[:, -w:] = filtered - -mh.imsave('../1400OS_10_09+.jpg', canvas) diff --git a/ch10/large_classification.py b/ch10/large_classification.py new file mode 100644 index 00000000..8db3571b --- /dev/null +++ b/ch10/large_classification.py @@ -0,0 +1,108 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import mahotas as mh +from glob import glob +from sklearn import cross_validation +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.grid_search import GridSearchCV +import numpy as np + +basedir = 'AnimTransDistr' +print('This script will test classification of the AnimTransDistr dataset') + +C_range = 10.0 ** np.arange(-4, 3) +grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range}) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', grid)]) + +def features_for(im): + from features import chist + im = mh.imread(im) + img = 
mh.colors.rgb2grey(im).astype(np.uint8) + return np.concatenate([mh.features.haralick(img).ravel(), + chist(im)]) + +def images(): + '''Iterate over all (image,label) pairs + + This function will return + ''' + for ci, cl in enumerate(classes): + images = glob('{}/{}/*.jpg'.format(basedir, cl)) + for im in sorted(images): + yield im, ci + +classes = [ + 'Anims', + 'Cars', + 'Distras', + 'Trans', +] + +print('Computing whole-image texture features...') +ifeatures = [] +labels = [] +for im, ell in images(): + ifeatures.append(features_for(im)) + labels.append(ell) + +ifeatures = np.array(ifeatures) +labels = np.array(labels) + +cv = cross_validation.KFold(len(ifeatures), 5, shuffle=True, random_state=123) +scores0 = cross_validation.cross_val_score( + clf, ifeatures, labels, cv=cv) +print('Accuracy (5 fold x-val) with Logistic Regression [image features]: {:.1%}'.format( + scores0.mean())) + + +from sklearn.cluster import KMeans +from mahotas.features import surf + + +print('Computing SURF descriptors...') +alldescriptors = [] +for im,_ in images(): + im = mh.imread(im, as_grey=True) + im = im.astype(np.uint8) + + # To use dense sampling, you can try the following line: + # alldescriptors.append(surf.dense(im, spacing=16)) + alldescriptors.append(surf.surf(im, descriptor_only=True)) + +print('Descriptor computation complete.') +k = 256 +km = KMeans(k) + +concatenated = np.concatenate(alldescriptors) +print('Number of descriptors: {}'.format( + len(concatenated))) +concatenated = concatenated[::64] +print('Clustering with K-means...') +km.fit(concatenated) +sfeatures = [] +for d in alldescriptors: + c = km.predict(d) + sfeatures.append(np.bincount(c, minlength=k)) +sfeatures = np.array(sfeatures, dtype=float) +print('predicting...') +score_SURF = cross_validation.cross_val_score( + clf, sfeatures, labels, cv=cv).mean() +print('Accuracy (5 fold x-val) with Logistic Regression [SURF features]: {:.1%}'.format( + score_SURF.mean())) + + +print('Performing classification with all features combined...') +allfeatures = np.hstack([sfeatures, ifeatures]) +score_SURF_global = cross_validation.cross_val_score( + clf, allfeatures, labels, cv=cv).mean() +print('Accuracy (5 fold x-val) with Logistic Regression [All features]: {:.1%}'.format( + score_SURF_global.mean())) diff --git a/ch10/lenna-ring.py b/ch10/lena-ring.py similarity index 93% rename from ch10/lenna-ring.py rename to ch10/lena-ring.py index 9fa9eff3..fb28b53d 100644 --- a/ch10/lenna-ring.py +++ b/ch10/lena-ring.py @@ -9,7 +9,7 @@ import numpy as np # Read in the image -im = mh.imread('lenna.jpg') +im = mh.demos.load('lena') # This breaks up the image into RGB channels r, g, b = im.transpose(2, 0, 1) @@ -38,4 +38,4 @@ # The final result is sharp in the centre and smooths out to the borders: ring = mh.stretch(im * C + (1 - C) * im12) -mh.imsave('lenna-ring.jpg', ring) +mh.imsave('lena-ring.jpg', ring) diff --git a/ch10/neighbors.py b/ch10/neighbors.py new file mode 100644 index 00000000..1f71d0de --- /dev/null +++ b/ch10/neighbors.py @@ -0,0 +1,64 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing + +import numpy as np +import mahotas as mh +from glob import glob +from features import texture, chist +from matplotlib import pyplot as plt +from sklearn.preprocessing import StandardScaler +from scipy.spatial import distance + +basedir = '../SimpleImageDataset/' + + +haralicks = [] +chists = [] + +print('Computing 
features...') +# Use glob to get all the images +images = glob('{}/*.jpg'.format(basedir)) +# We sort the images to ensure that they are always processed in the same order +# Otherwise, this would introduce some variation just based on the random +# ordering that the filesystem uses +images.sort() + +for fname in images: + imc = mh.imread(fname) + imc = imc[200:-200,200:-200] + haralicks.append(texture(mh.colors.rgb2grey(imc))) + chists.append(chist(imc)) + +haralicks = np.array(haralicks) +chists = np.array(chists) +features = np.hstack([chists, haralicks]) + +print('Computing neighbors...') +sc = StandardScaler() +features = sc.fit_transform(features) +dists = distance.squareform(distance.pdist(features)) + +print('Plotting...') +fig, axes = plt.subplots(2, 9, figsize=(16,8)) + +# Remove ticks from all subplots +for ax in axes.flat: + ax.set_xticks([]) + ax.set_yticks([]) + +for ci,i in enumerate(range(0,90,10)): + left = images[i] + dists_left = dists[i] + right = dists_left.argsort() + # right[0] is the query image itself (at distance zero), so pick the next closest element + right = right[1] + right = images[right] + left = mh.imread(left) + right = mh.imread(right) + axes[0, ci].imshow(left) + axes[1, ci].imshow(right) + +fig.tight_layout() +fig.savefig('figure_neighbors.png', dpi=300) diff --git a/ch10/scene00.jpg b/ch10/scene00.jpg new file mode 100644 index 00000000..ed727a50 Binary files /dev/null and b/ch10/scene00.jpg differ diff --git a/ch10/simple_classification.py b/ch10/simple_classification.py index 2f9ebfa4..a5a448d2 100644 --- a/ch10/simple_classification.py +++ b/ch10/simple_classification.py @@ -6,37 +6,65 @@ # It is made available under the MIT License import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression import numpy as np from glob import glob -from edginess import edginess_sobel -basedir = 'simple-dataset' +from features import texture, chist +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +basedir = '../SimpleImageDataset/' -def features_for(im): - im = mh.imread(im, as_grey=True).astype(np.uint8) - return mh.features.haralick(im).mean(0) -features = [] -sobels = [] +haralicks = [] labels = [] +chists = [] + +print('This script will test (with cross-validation) classification of the simple 3-class dataset') +print('Computing features...') +# Use glob to get all the images images = glob('{}/*.jpg'.format(basedir)) -for im in images: - features.append(features_for(im)) - sobels.append(edginess_sobel(mh.imread(im, as_grey=True))) - labels.append(im[:-len('00.jpg')]) -features = np.array(features) +# We sort the images to ensure that they are always processed in the same order +# Otherwise, this would introduce some variation just based on the random +# ordering that the filesystem uses +for fname in sorted(images): + imc = mh.imread(fname) + haralicks.append(texture(mh.colors.rgb2grey(imc))) + chists.append(chist(imc)) + + # Files are named like building00.jpg, scene23.jpg... 
+ labels.append(fname[:-len('xx.jpg')]) + +print('Finished computing features.') + +haralicks = np.array(haralicks) labels = np.array(labels) +chists = np.array(chists) + +haralick_plus_chists = np.hstack([chists, haralicks]) + +# We use Logistic Regression because it achieves high accuracy on small(ish) datasets +# Feel free to experiment with other classifiers +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + +from sklearn import cross_validation +cv = cross_validation.LeaveOneOut(len(images)) scores = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=5) -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) + clf, haralicks, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [haralick features]: {:.1%}'.format( + scores.mean())) scores = cross_validation.cross_val_score( - LogisticRegression(), np.hstack([np.atleast_2d(sobels).T, features]), labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features + sobel]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) + clf, chists, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [color histograms]: {:.1%}'.format( + scores.mean())) + +scores = cross_validation.cross_val_score( + clf, haralick_plus_chists, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [texture features + color histograms]: {:.1%}'.format( + scores.mean())) + diff --git a/ch10/threshold.py b/ch10/threshold.py index 51f32861..d7361bfb 100644 --- a/ch10/threshold.py +++ b/ch10/threshold.py @@ -7,15 +7,28 @@ import numpy as np import mahotas as mh -image = mh.imread('../1400OS_10_01.jpeg') + +# Load our example image: +image = mh.imread('../SimpleImageDataset/building05.jpg') + +# Convert to greyscale image = mh.colors.rgb2gray(image, dtype=np.uint8) + +# Compute a threshold value: thresh = mh.thresholding.otsu(image) -print(thresh) +print('Otsu threshold is {0}'.format(thresh)) + +# Compute the thresholded image otsubin = (image > thresh) +print('Saving thresholded image (with Otsu threshold) to otsu-threshold.jpeg') mh.imsave('otsu-threshold.jpeg', otsubin.astype(np.uint8) * 255) -otsubin = ~ mh.close(~otsubin, np.ones((15, 15))) + +# Execute morphological opening to smooth out the edges +otsubin = mh.open(otsubin, np.ones((15, 15))) mh.imsave('otsu-closed.jpeg', otsubin.astype(np.uint8) * 255) +# An alternative thresholding method: thresh = mh.thresholding.rc(image) -print(thresh) +print('Ridley-Calvard threshold is {0}'.format(thresh)) +print('Saving thresholded image (with Ridley-Calvard threshold) to rc-threshold.jpeg') mh.imsave('rc-threshold.jpeg', (image > thresh).astype(np.uint8) * 255) diff --git a/ch10/thresholded_figure.py b/ch10/thresholded_figure.py new file mode 100644 index 00000000..947762e8 --- /dev/null +++ b/ch10/thresholded_figure.py @@ -0,0 +1,31 @@ +import mahotas as mh +import numpy as np +from matplotlib import pyplot as plt + +# Load image & convert to B&W +image = mh.imread('../SimpleImageDataset/scene00.jpg') +image = mh.colors.rgb2grey(image, dtype=np.uint8) +plt.imshow(image) +plt.gray() +plt.title('original image') + +thresh = mh.thresholding.otsu(image) +print('Otsu threshold is {}.'.format(thresh)) + +threshed = (image > thresh) +plt.figure() +plt.imshow(threshed) +plt.title('thresholded image') +mh.imsave('thresholded.png', threshed.astype(np.uint8)*255) + +im16 = mh.gaussian_filter(image, 16) + +# Repeat the 
thresholding operations with the blurred image +thresh = mh.thresholding.otsu(im16.astype(np.uint8)) +threshed = (im16 > thresh) +plt.figure() +plt.imshow(threshed) +plt.title('thresholded image (after blurring)') +print('Otsu threshold after blurring is {}.'.format(thresh)) +mh.imsave('thresholded16.png', threshed.astype(np.uint8)*255) +plt.show() diff --git a/ch12/.gitignore b/ch12/.gitignore new file mode 100644 index 00000000..fa8fbcc2 --- /dev/null +++ b/ch12/.gitignore @@ -0,0 +1,3 @@ +*.jugdata/ +output.txt +results.image.txt diff --git a/ch12/README.rst b/ch12/README.rst new file mode 100644 index 00000000..fba88d4a --- /dev/null +++ b/ch12/README.rst @@ -0,0 +1,28 @@ +========== +Chapter 12 +========== + +Support code for *Chapter 12: Big(ger) Data* + +Data +---- + +This chapter relies only on the image dataset that is packaged with the +repository at ``../SimpleImageDataset/``. + +Scripts +------- + +chapter.py + Code as written in the book +jugfile.py + Example jugfile +image-classification.py + Jugfile implementation of image classification from Chapter 10 + +setup-aws.txt + Commands to set up an Amazon Web Services machine +run-jugfile.sh + Wrapper script to run jug on jugfile.py +run-image-classification.sh + Wrapper script to run jug on image-classification.py diff --git a/ch12/chapter.py b/ch12/chapter.py new file mode 100644 index 00000000..15326680 --- /dev/null +++ b/ch12/chapter.py @@ -0,0 +1,95 @@ +from jug import TaskGenerator +from glob import glob +import mahotas as mh +@TaskGenerator +def compute_texture(im): + from features import texture + imc = mh.imread(im) + return texture(mh.colors.rgb2gray(imc)) + +@TaskGenerator +def chist_file(fname): + from features import chist + im = mh.imread(fname) + return chist(im) + +import numpy as np +to_array = TaskGenerator(np.array) +hstack = TaskGenerator(np.hstack) + +haralicks = [] +chists = [] +labels = [] + +# Change this variable to point to +# the location of the dataset on disk +basedir = '../SimpleImageDataset/' +# Use glob to get all the images +images = glob('{}/*.jpg'.format(basedir)) + +for fname in sorted(images): + haralicks.append(compute_texture(fname)) + chists.append(chist_file(fname)) + # The class is encoded in the filename as xxxx00.jpg + labels.append(fname[:-len('00.jpg')]) + +haralicks = to_array(haralicks) +chists = to_array(chists) +labels = to_array(labels) + +@TaskGenerator +def accuracy(features, labels): + from sklearn.linear_model import LogisticRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn import cross_validation + + clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + cv = cross_validation.LeaveOneOut(len(features)) + scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) + return scores.mean() +scores_base = accuracy(haralicks, labels) +scores_chist = accuracy(chists, labels) + +combined = hstack([chists, haralicks]) +scores_combined = accuracy(combined, labels) + +@TaskGenerator +def print_results(scores): + with open('results.image.txt', 'w') as output: + for k,v in scores: + output.write('Accuracy [{}]: {:.1%}\n'.format( + k, v.mean())) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('combined' , scores_combined), + ]) + +@TaskGenerator +def compute_lbp(fname): + from mahotas.features import lbp + imc = mh.imread(fname) + im = mh.colors.rgb2grey(imc) + return lbp(im, radius=8, points=6) + +lbps = [] +for fname in sorted(images): + 
# the rest of the loop as before + lbps.append(compute_lbp(fname)) +lbps = to_array(lbps) + +scores_lbps = accuracy(lbps, labels) +combined_all = hstack([chists, haralicks, lbps]) +scores_combined_all = accuracy(combined_all, labels) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('lbps', scores_lbps), + ('combined' , scores_combined), + ('combined_all' , scores_combined_all), + ]) diff --git a/ch12/features.py b/ch12/features.py new file mode 120000 index 00000000..142a324d --- /dev/null +++ b/ch12/features.py @@ -0,0 +1 @@ +../ch10/features.py \ No newline at end of file diff --git a/ch12/image-classification.py b/ch12/image-classification.py new file mode 100644 index 00000000..6f76d26d --- /dev/null +++ b/ch12/image-classification.py @@ -0,0 +1,116 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import mahotas as mh +import numpy as np +from glob import glob +from jug import TaskGenerator + +# We need to use the `features` module from chapter 10. +from sys import path +path.append('../ch10') + + +# This is the jug-enabled version of the script ``figure18.py`` in Chapter 10 + +basedir = '../SimpleImageDataset/' + +@TaskGenerator +def compute_texture(im): + '''Compute features for an image + + Parameters + ---------- + im : str + filepath for image to process + + Returns + ------- + fs : ndarray + 1-D array of features + ''' + from features import texture + imc = mh.imread(im) + return texture(mh.colors.rgb2grey(imc)) + +@TaskGenerator +def chist(fname): + from features import chist as color_histogram + im = mh.imread(fname) + return color_histogram(im) + +@TaskGenerator +def compute_lbp(fname): + from mahotas.features import lbp + imc = mh.imread(fname) + im = mh.colors.rgb2grey(imc) + return lbp(im, radius=8, points=6) + + +@TaskGenerator +def accuracy(features, labels): + from sklearn.linear_model import LogisticRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn import cross_validation + # We use logistic regression because it is very fast. 
+ # Feel free to experiment with other classifiers + clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + cv = cross_validation.LeaveOneOut(len(features)) + scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) + return scores.mean() + + +@TaskGenerator +def print_results(scores): + with open('results.image.txt', 'w') as output: + for k,v in scores: + output.write('Accuracy (LOO x-val) with Logistic Regression [{0}]: {1:.1%}\n'.format( + k, v.mean())) + + +to_array = TaskGenerator(np.array) +hstack = TaskGenerator(np.hstack) + +haralicks = [] +chists = [] +lbps = [] +labels = [] + +# Use glob to get all the images +images = glob('{0}/*.jpg'.format(basedir)) +for fname in sorted(images): + haralicks.append(compute_texture(fname)) + chists.append(chist(fname)) + lbps.append(compute_lbp(fname)) + labels.append(fname[:-len('00.jpg')]) # The class is encoded in the filename as xxxx00.jpg + +haralicks = to_array(haralicks) +chists = to_array(chists) +lbps = to_array(lbps) +labels = to_array(labels) + +scores_base = accuracy(haralicks, labels) +scores_chist = accuracy(chists, labels) +scores_lbps = accuracy(lbps, labels) + +combined = hstack([chists, haralicks]) +scores_combined = accuracy(combined, labels) + +combined_all = hstack([chists, haralicks, lbps]) +scores_combined_all = accuracy(combined_all, labels) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('lbps', scores_lbps), + ('combined' , scores_combined), + ('combined_all' , scores_combined_all), + ]) + diff --git a/ch12/jugfile.py b/ch12/jugfile.py index 91306ccb..9d7e2b7a 100644 --- a/ch12/jugfile.py +++ b/ch12/jugfile.py @@ -23,7 +23,7 @@ def add(a, b): @TaskGenerator def print_final_result(oname, value): with open(oname, 'w') as output: - print >>output, "Final result:", value + output.write("Final result: {0}\n".format(value)) input = 2 y = double(input) diff --git a/ch12/run-image-classification.sh b/ch12/run-image-classification.sh new file mode 100755 index 00000000..868d07fa --- /dev/null +++ b/ch12/run-image-classification.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +jug execute image-classification.py diff --git a/ch12/run-jugfile.sh b/ch12/run-jugfile.sh new file mode 100755 index 00000000..0ff59131 --- /dev/null +++ b/ch12/run-jugfile.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +jug execute + diff --git a/ch12/setup-aws.txt b/ch12/setup-aws.txt new file mode 100644 index 00000000..292654eb --- /dev/null +++ b/ch12/setup-aws.txt @@ -0,0 +1,7 @@ +sudo yum update +sudo yum -y install python-devel python-pip numpy scipy python-matplotlib +sudo yum -y install gcc-c++ +sudo yum -y install git +sudo pip-python install -U pip +sudo pip install scikit-learn jug mahotas +
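A minimal usage sketch for the jug-based scripts above, assuming the commands are issued from the ch12/ directory after installing the dependencies listed in setup-aws.txt: jug's standard subcommands `jug execute` and `jug status` drive the pipeline, jug coordinates workers through the `.jugdata/` store listed in the chapter's .gitignore, and results.image.txt is the file written by print_results.

    # Start two workers in parallel; they share work through the .jugdata/ directory
    jug execute image-classification.py &
    jug execute image-classification.py &

    # Check how many tasks are still waiting, ready, or already finished
    jug status image-classification.py

    # Once all tasks are done, inspect the accuracies written by print_results
    wait
    cat results.image.txt

The run-jugfile.sh and run-image-classification.sh wrappers issue the corresponding `jug execute` calls for jugfile.py and image-classification.py.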