diff --git a/README.md b/README.md index 05218427..cbcd3155 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,14 @@ Building Machine Learning Systems with Python ============================================= -Source Code for the book Building Machine Learning Systems with Python by -[Willi Richert](http://twotoreal.com) and [Luis Pedro -Coelho](http://luispedro.org). +Source Code for the book Building Machine Learning Systems with Python by [Luis +Pedro Coelho](http://luispedro.org) and [Willi Richert](http://twotoreal.com). -The book was published in 2013 by Packt Publishing and is available [from their +The book was published in 2013 (second edition in 2015) by Packt Publishing and +is available [from their website](http://www.packtpub.com/building-machine-learning-systems-with-python/book). +The code in the repository corresponds to the second edition. Code for the +first edition is available in [first\_edition +branch](https://github.com/luispedro/BuildingMachineLearningSystemsWithPython/tree/first_edition). diff --git a/ch01/analyze_webstats.py b/ch01/analyze_webstats.py index 8d3c4c41..5da892e2 100644 --- a/ch01/analyze_webstats.py +++ b/ch01/analyze_webstats.py @@ -26,8 +26,9 @@ x = x[~sp.isnan(y)] y = y[~sp.isnan(y)] -# plot input data + def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None): + ''' plot input data ''' plt.figure(num=None, figsize=(8, 6)) plt.clf() @@ -138,8 +139,8 @@ def error(f, x, y): train = sorted(shuffled[split_idx:]) fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1)) fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2)) -print("fbt2(x)= \n%s"%fbt2) -print("fbt2(x)-100,000= \n%s"%(fbt2-100000)) +print("fbt2(x)= \n%s" % fbt2) +print("fbt2(x)-100,000= \n%s" % (fbt2-100000)) fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3)) fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10)) fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100)) diff --git a/ch01/gen_webstats.py b/ch01/gen_webstats.py index fa133d76..61d0b738 100644 --- a/ch01/gen_webstats.py +++ b/ch01/gen_webstats.py @@ -17,26 +17,22 @@ sp.random.seed(3) # to reproduce the data later on -x = sp.arange(1, 31 * 24) -y = sp.array(200 * (sp.sin(2 * sp.pi * x / (7 * 24))), dtype=int) +x = sp.arange(1, 31*24) +y = sp.array(200*(sp.sin(2*sp.pi*x/(7*24))), dtype=int) y += gamma.rvs(15, loc=0, scale=100, size=len(x)) -y += 2 * sp.exp(x / 100.0) -y = sp.ma.array(y, mask=[y < 0]) -print(sum(y), sum(y < 0)) +y += 2 * sp.exp(x/100.0) +y = sp.ma.array(y, mask=[y<0]) +print(sum(y), sum(y<0)) plt.scatter(x, y) plt.title("Web traffic over the last month") plt.xlabel("Time") plt.ylabel("Hits/hour") -plt.xticks([w * 7 * 24 for w in [0, 1, 2, 3, 4]], ['week %i' % (w + 1) for w in - [0, 1, 2, 3, 4]]) - +plt.xticks([w*7*24 for w in range(5)], + ['week %i' %(w+1) for w in range(5)]) plt.autoscale(tight=True) plt.grid() plt.savefig(os.path.join(CHART_DIR, "1400_01_01.png")) -# sp.savetxt(os.path.join("..", "web_traffic.tsv"), -# zip(x[~y.mask],y[~y.mask]), delimiter="\t", fmt="%i") - -sp.savetxt(os.path.join( - DATA_DIR, "web_traffic.tsv"), list(zip(x, y)), delimiter="\t", fmt="%s") +sp.savetxt(os.path.join(DATA_DIR, "web_traffic.tsv"), + list(zip(x, y)), delimiter="\t", fmt="%s") diff --git a/ch02/README.rst b/ch02/README.rst index 4dc0b211..e2cb729a 100644 --- a/ch02/README.rst +++ b/ch02/README.rst @@ -6,6 +6,9 @@ Support code for *Chapter 2: Learning How to Classify with Real-world Examples*. 
The directory data contains the seeds dataset, originally downloaded from https://archive.ics.uci.edu/ml/datasets/seeds +chapter.py + The code as printed in the book. + figure1.py Figure 1 in the book: all 2-by-2 scatter plots diff --git a/ch02/chapter.py b/ch02/chapter.py new file mode 100644 index 00000000..c68b45ab --- /dev/null +++ b/ch02/chapter.py @@ -0,0 +1,164 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + + +from matplotlib import pyplot as plt +import numpy as np + +# We load the data with load_iris from sklearn +from sklearn.datasets import load_iris +data = load_iris() + +# load_iris returns an object with several fields +features = data.data +feature_names = data.feature_names +target = data.target +target_names = data.target_names + +for t in range(3): + if t == 0: + c = 'r' + marker = '>' + elif t == 1: + c = 'g' + marker = 'o' + elif t == 2: + c = 'b' + marker = 'x' + plt.scatter(features[target == t, 0], + features[target == t, 1], + marker=marker, + c=c) +# We use NumPy fancy indexing to get an array of strings: +labels = target_names[target] + +# The petal length is the feature at position 2 +plength = features[:, 2] + +# Build an array of booleans: +is_setosa = (labels == 'setosa') + +# This is the important step: +max_setosa =plength[is_setosa].max() +min_non_setosa = plength[~is_setosa].min() +print('Maximum of setosa: {0}.'.format(max_setosa)) + +print('Minimum of others: {0}.'.format(min_non_setosa)) + +# ~ is the boolean negation operator +features = features[~is_setosa] +labels = labels[~is_setosa] +# Build a new target variable, is_virigina +is_virginica = (labels == 'virginica') + +# Initialize best_acc to impossibly low value +best_acc = -1.0 +for fi in range(features.shape[1]): + # We are going to test all possible thresholds + thresh = features[:,fi] + for t in thresh: + + # Get the vector for feature `fi` + feature_i = features[:, fi] + # apply threshold `t` + pred = (feature_i > t) + acc = (pred == is_virginica).mean() + rev_acc = (pred == ~is_virginica).mean() + if rev_acc > acc: + reverse = True + acc = rev_acc + else: + reverse = False + + if acc > best_acc: + best_acc = acc + best_fi = fi + best_t = t + best_reverse = reverse + +print(best_fi, best_t, best_reverse, best_acc) + +def is_virginica_test(fi, t, reverse, example): + 'Apply threshold model to a new example' + test = example[fi] > t + if reverse: + test = not test + return test +from threshold import fit_model, predict + +# ning accuracy was 96.0%. +# ing accuracy was 90.0% (N = 50). 
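# Leave-one-out evaluation of the threshold model imported above:
# fit_model(features, labels) from ch02/threshold.py returns the learned model
# (which predict() unpacks as a threshold, a feature index and a reverse flag),
# and predict(model, features) applies it, taking the model as its first
# argument. Each round of the loop below holds out one example, fits on the
# remaining ones and counts whether the held-out example is classified
# correctly; dividing by the number of examples gives the printed accuracy.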
+correct = 0.0 + +for ei in range(len(features)): + # select all but the one at position `ei`: + training = np.ones(len(features), bool) + training[ei] = False + testing = ~training + model = fit_model(features[training], is_virginica[training]) + predictions = predict(model, features[testing]) + correct += np.sum(predictions == is_virginica[testing]) +acc = correct/float(len(features)) +print('Accuracy: {0:.1%}'.format(acc)) + + +########################################### +############## SEEDS DATASET ############## +########################################### + +from load import load_dataset + +feature_names = [ + 'area', + 'perimeter', + 'compactness', + 'length of kernel', + 'width of kernel', + 'asymmetry coefficien', + 'length of kernel groove', +] +features, labels = load_dataset('seeds') + + + +from sklearn.neighbors import KNeighborsClassifier +classifier = KNeighborsClassifier(n_neighbors=1) +from sklearn.cross_validation import KFold + +kf = KFold(len(features), n_folds=5, shuffle=True) +means = [] +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns fraction + # of correct decisions for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Mean accuracy: {:.1%}'.format(np.mean(means))) + + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +classifier = KNeighborsClassifier(n_neighbors=1) +classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)]) + +means = [] +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns fraction + # of correct decisions for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Mean accuracy: {:.1%}'.format(np.mean(means))) diff --git a/ch02/figure1.py b/ch02/figure1.py index 2bbdfb9c..4ec6fff8 100644 --- a/ch02/figure1.py +++ b/ch02/figure1.py @@ -19,13 +19,21 @@ fig,axes = plt.subplots(2, 3) pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)] + +# Set up 3 different pairs of (color, marker) +color_markers = [ + ('r', '>'), + ('g', 'o'), + ('b', 'x'), + ] for i, (p0, p1) in enumerate(pairs): ax = axes.flat[i] - # Use a different marker/color for each class `t` - for t, marker, c in zip(range(3), ">ox", "rgb"): + for t in range(3): + # Use a different color/marker for each class `t` + c,marker = color_markers[t] ax.scatter(features[target == t, p0], features[ - target == t, p1], marker=marker, c=c, s=40) + target == t, p1], marker=marker, c=c) ax.set_xlabel(feature_names[p0]) ax.set_ylabel(feature_names[p1]) ax.set_xticks([]) diff --git a/ch02/figure2.py b/ch02/figure2.py index 29e6d96e..0b69d395 100644 --- a/ch02/figure2.py +++ b/ch02/figure2.py @@ -23,8 +23,9 @@ labels = labels[~is_setosa] is_virginica = (labels == 'virginica') -# Hand fixed threshold: -t = 1.75 +# Hand fixed thresholds: +t = 1.65 +t2 = 1.75 # Features to use: 3 & 2 f0, f1 = 3, 2 @@ -49,7 +50,7 @@ ax.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c) ax.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c) ax.plot([t, t], [y0, y1], 'k--', lw=2) -ax.plot([t - .1, t - .1], [y0, y1], 
'k:', lw=2) +ax.plot([t2, t2], [y0, y1], 'k:', lw=2) ax.scatter(features[is_virginica, f0], features[is_virginica, f1], c='b', marker='o', s=40) ax.scatter(features[~is_virginica, f0], diff --git a/ch02/figure4_5_no_sklearn.py b/ch02/figure4_5_no_sklearn.py index 5f67e0d7..adc83d73 100644 --- a/ch02/figure4_5_no_sklearn.py +++ b/ch02/figure4_5_no_sklearn.py @@ -45,7 +45,7 @@ def plot_decision(features, labels): model = fit_model(1, features[:, (0, 2)], np.array(labels)) C = predict( - np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape) + model, np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape) if COLOUR_FIGURE: cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)]) else: diff --git a/ch02/figure4_5_sklearn.py b/ch02/figure4_5_sklearn.py index 9d20c7cb..55ac0c80 100644 --- a/ch02/figure4_5_sklearn.py +++ b/ch02/figure4_5_sklearn.py @@ -58,11 +58,11 @@ def plot_decision(features, labels, num_neighbors=1): ax.pcolormesh(X, Y, C, cmap=cmap) if COLOUR_FIGURE: cmap = ListedColormap([(1., .0, .0), (.1, .6, .1), (.0, .0, 1.)]) - ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap, s=40) + ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap) else: for lab, ma in zip(range(3), "Do^"): ax.plot(features[labels == lab, 0], features[ - labels == lab, 2], ma, c=(1., 1., 1.), ms=8) + labels == lab, 2], ma, c=(1., 1., 1.), ms=6) return fig,ax diff --git a/ch02/knn.py b/ch02/knn.py index e1d48ce0..89ebfdb4 100644 --- a/ch02/knn.py +++ b/ch02/knn.py @@ -26,7 +26,7 @@ def plurality(xs): return k # This function was called ``apply_model`` in the first edition -def predict(features, model): +def predict(model, features): '''Apply k-nn model''' k, train_feats, labels = model results = [] @@ -42,5 +42,5 @@ def predict(features, model): def accuracy(features, labels, model): - preds = predict(features, model) + preds = predict(model, features) return np.mean(preds == labels) diff --git a/ch02/threshold.py b/ch02/threshold.py index efbd942b..d621a350 100644 --- a/ch02/threshold.py +++ b/ch02/threshold.py @@ -40,7 +40,7 @@ def fit_model(features, labels): # This function was called ``apply_model`` in the first edition -def predict(features, model): +def predict(model, features): '''Apply a learned model''' # A model is a pair as returned by fit_model t, fi, reverse = model @@ -51,5 +51,5 @@ def predict(features, model): def accuracy(features, labels, model): '''Compute the accuracy of the model''' - preds = predict(features, model) + preds = predict(model, features) return np.mean(preds == labels) diff --git a/ch04/.gitignore b/ch04/.gitignore index 0a7a205c..c4c0b18a 100644 --- a/ch04/.gitignore +++ b/ch04/.gitignore @@ -1,2 +1,6 @@ wiki_lda.pkl wiki_lda.pkl.state +*.png +*.npy +*.pkl +topics.txt diff --git a/ch04/README.rst b/ch04/README.rst index 7582ba54..99a3c186 100644 --- a/ch04/README.rst +++ b/ch04/README.rst @@ -4,6 +4,16 @@ Chapter 4 Support code for *Chapter 4: Topic Modeling* + +AP Data +------- + +To download the AP data, use the ``download_ap.sh`` script inside the ``data`` +directory:: + + cd data + ./download_ap.sh + Word cloud creation ------------------- @@ -49,3 +59,7 @@ Scripts blei_lda.py Computes LDA using the AP Corpus. 
+wikitopics_create.py + Create the topic model for Wikipedia using LDA (must download wikipedia database first) +wikitopics_create_hdp.py + Create the topic model for Wikipedia using HDP (must download wikipedia database first) diff --git a/ch04/blei_lda.py b/ch04/blei_lda.py index d6cc37b7..7f6ac2b3 100644 --- a/ch04/blei_lda.py +++ b/ch04/blei_lda.py @@ -8,19 +8,13 @@ from __future__ import print_function from wordcloud import create_cloud try: - from gensim import corpora, models + from gensim import corpora, models, matutils except: print("import gensim failed.") print() print("Please install it") raise -try: - from mpltools import style - style.use('ggplot') -except: - print("Could not import mpltools: plots will not be styled correctly") - import matplotlib.pyplot as plt import numpy as np from os import path @@ -39,27 +33,22 @@ model = models.ldamodel.LdaModel( corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=None) -ti = 0 # Iterate over all the topics in the model -for ti in xrange(model.num_topics): +for ti in range(model.num_topics): words = model.show_topic(ti, 64) - tf = sum(f for f, w in words) + tf = sum(f for _, f in words) with open('topics.txt', 'w') as output: - output.write('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for f, w in words)) + output.write('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for w, f in words)) output.write("\n\n\n") # We first identify the most discussed topic, i.e., the one with the # highest total weight -# First, we need to sum up the weights across all the documents -weight = np.zeros(model.num_topics) -for doc in corpus: - for col, val in model[doc]: - weight[col] += val - # As a reasonable alternative, we could have used the log of val: - # weight[col] += np.log(val) +topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics) +weight = topics.sum(1) max_topic = weight.argmax() + # Get the top 64 words for this topic # Without the argument, show_topic would return only 10 words words = model.show_topic(max_topic, 64) @@ -68,11 +57,12 @@ create_cloud('cloud_blei_lda.png', words) num_topics_used = [len(model[doc]) for doc in corpus] -plt.hist(num_topics_used, np.arange(42)) -plt.ylabel('Nr of documents') -plt.xlabel('Nr of topics') -plt.savefig('Figure_04_01.png') -plt.clf() +fig,ax = plt.subplots() +ax.hist(num_topics_used, np.arange(42)) +ax.set_ylabel('Nr of documents') +ax.set_xlabel('Nr of topics') +fig.tight_layout() +fig.savefig('Figure_04_01.png') # Now, repeat the same exercise using alpha=1.0 @@ -83,12 +73,14 @@ corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=ALPHA) num_topics_used1 = [len(model1[doc]) for doc in corpus] -plt.hist([num_topics_used, num_topics_used1], np.arange(42)) -plt.ylabel('Nr of documents') -plt.xlabel('Nr of topics') +fig,ax = plt.subplots() +ax.hist([num_topics_used, num_topics_used1], np.arange(42)) +ax.set_ylabel('Nr of documents') +ax.set_xlabel('Nr of topics') # The coordinates below were fit by trial and error to look good -plt.text(9, 223, r'default alpha') -plt.text(26, 156, 'alpha=1.0') -plt.savefig('Figure_04_02.png') +ax.text(9, 223, r'default alpha') +ax.text(26, 156, 'alpha=1.0') +fig.tight_layout() +fig.savefig('Figure_04_02.png') diff --git a/ch04/build_lda.py b/ch04/build_lda.py index d4dcb7f2..a0ee9c5f 100644 --- a/ch04/build_lda.py +++ b/ch04/build_lda.py @@ -79,7 +79,7 @@ def __len__(self): distances = distance.squareform(distance.pdist(thetas)) large = distances.max() + 1 -for i in xrange(len(distances)): +for i in range(len(distances)): 
distances[i, i] = large print(otexts[1]) diff --git a/ch04/data/.gitignore b/ch04/data/.gitignore index d71ac12b..91f24bbc 100644 --- a/ch04/data/.gitignore +++ b/ch04/data/.gitignore @@ -4,7 +4,9 @@ dataset-379-20news-18828_HJRZF.zip 379/ enwiki-latest-pages-articles.xml.bz2 wiki_en_output_bow.mm +wiki_en_output_bow.mm.gz wiki_en_output_bow.mm.index wiki_en_output_tfidf.mm +wiki_en_output_tfidf.mm.gz wiki_en_output_tfidf.mm.index wiki_en_output_wordids.txt.bz2 diff --git a/ch04/data/download_ap.sh b/ch04/data/download_ap.sh index 6de8ded8..da27814a 100755 --- a/ch04/data/download_ap.sh +++ b/ch04/data/download_ap.sh @@ -1,3 +1,3 @@ #!/bin/sh -wget http://www.cs.princeton.edu/~blei/lda-c/ap.tgz +wget http://www.cs.columbia.edu/~blei/lda-c/ap.tgz tar xzf ap.tgz diff --git a/ch04/wikitopics_create_hdp.py b/ch04/wikitopics_create_hdp.py new file mode 100644 index 00000000..951d1850 --- /dev/null +++ b/ch04/wikitopics_create_hdp.py @@ -0,0 +1,39 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import logging +import gensim +import numpy as np + +# Set up logging in order to get progress information as the model is being built: +logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', + level=logging.INFO) + +# Load the preprocessed corpus (id2word & mm): +id2word = gensim.corpora.Dictionary.load_from_text( + 'data/wiki_en_output_wordids.txt.bz2') +mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') + +# Calling the constructor is enough to build the model +# This call will take a few hours! +model = gensim.models.hdpmodel.HdpModel( + corpus=mm, + id2word=id2word, + chunksize=10000) + +# Save the model so we do not need to learn it again. +model.save('wiki_hdp.pkl') + +# Compute the document/topic matrix +topics = np.zeros((len(mm), model.num_topics)) +for di,doc in enumerate(mm): + doc_top = model[doc] + for ti,tv in doc_top: + topics[di,ti] += tv +np.save('topics_hdp.npy', topics) diff --git a/ch04/wordcloud.py b/ch04/wordcloud.py index 6c5302ea..accca2d6 100644 --- a/ch04/wordcloud.py +++ b/ch04/wordcloud.py @@ -24,8 +24,6 @@ def create_cloud(oname, words,maxsize=120, fontname='Lobster'): # gensim returns a weight between 0 and 1 for each word, while pytagcloud # expects an integer word count. So, we multiply by a large number and # round. For a visualization this is an adequate approximation. 
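# Note: recent gensim releases return (word, weight) pairs from show_topic,
# which is the order the blei_lda.py changes above already assume, so the
# tuples can be consumed here directly and no longer need to be flipped.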
- # We also need to flip the order as gensim returns (value, word), whilst - # pytagcloud expects (word, value): - words = [(w,int(v*10000)) for v,w in words] + words = [(w,int(v*10000)) for w,v in words] tags = make_tags(words, maxsize=maxsize) create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname) diff --git a/ch07/.gitignore b/ch07/.gitignore index d21bb3d1..e33609d2 100644 --- a/ch07/.gitignore +++ b/ch07/.gitignore @@ -1 +1 @@ -.formula_*/ +*.png diff --git a/ch07/boston1.py b/ch07/boston1.py index 1f82409f..d0b30447 100644 --- a/ch07/boston1.py +++ b/ch07/boston1.py @@ -27,11 +27,13 @@ rmse = np.sqrt(lr.residues_/len(x)) print('RMSE: {}'.format(rmse)) +fig, ax = plt.subplots() +# Plot a diagonal (for reference): +ax.plot([0, 50], [0, 50], '-', color=(.9,.3,.3), lw=4) + # Plot the prediction versus real: -plt.scatter(lr.predict(x), boston.target) +ax.scatter(lr.predict(x), boston.target) -# Plot a diagonal (for reference): -plt.plot([0, 50], [0, 50], '-', color=(.9,.3,.3), lw=4) -plt.xlabel('predicted') -plt.ylabel('real') -plt.savefig('Figure_07_08.png') +ax.set_xlabel('predicted') +ax.set_ylabel('real') +fig.savefig('Figure_07_08.png') diff --git a/ch07/data/.gitignore b/ch07/data/.gitignore new file mode 100644 index 00000000..3286ba0f --- /dev/null +++ b/ch07/data/.gitignore @@ -0,0 +1 @@ +E2006.train diff --git a/ch07/data/download.sh b/ch07/data/download.sh index 6f938e06..74753364 100755 --- a/ch07/data/download.sh +++ b/ch07/data/download.sh @@ -1,5 +1,3 @@ #!/usr/bin/env bash -wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 +curl -O http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 bunzip2 E2006.train.bz2 -wget http://www.grouplens.org/system/files/ml-100k.zip -unzip ml-100k.zip diff --git a/ch07/figure1_2.py b/ch07/figure1_2.py index ec1d31da..3f11a0c7 100644 --- a/ch07/figure1_2.py +++ b/ch07/figure1_2.py @@ -14,9 +14,10 @@ boston = load_boston() # Index number five in the number of rooms -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("Number of rooms (RM)") -plt.ylabel("House Price") +fig,ax = plt.subplots() +ax.scatter(boston.data[:, 5], boston.target) +ax.set_xlabel("Average number of rooms (RM)") +ax.set_ylabel("House Price") x = boston.data[:, 5] # fit (used below) takes a two-dimensional array as input. 
We use np.atleast_2d @@ -29,9 +30,9 @@ lr = LinearRegression(fit_intercept=False) lr.fit(x, y) -plt.plot([0, boston.data[:, 5].max() + 1], +ax.plot([0, boston.data[:, 5].max() + 1], [0, lr.predict(boston.data[:, 5].max() + 1)], '-', lw=4) -plt.savefig('Figure1.png', dpi=150) +fig.savefig('Figure1.png') mse = mean_squared_error(y, lr.predict(x)) rmse = np.sqrt(mse) @@ -42,14 +43,14 @@ lr.fit(x, y) -plt.clf() -plt.xlabel("Number of rooms (RM)") -plt.ylabel("House Price") -plt.scatter(boston.data[:, 5], boston.target) +fig,ax = plt.subplots() +ax.set_xlabel("Average number of rooms (RM)") +ax.set_ylabel("House Price") +ax.scatter(boston.data[:, 5], boston.target) xmin = x.min() xmax = x.max() -plt.plot([xmin, xmax], lr.predict([[xmin], [xmax]]) , '-', lw=4) -plt.savefig('Figure2.png', dpi=150) +ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]) , '-', lw=4) +fig.savefig('Figure2.png') mse = mean_squared_error(y, lr.predict(x)) print("Mean squared error (of training data): {:.3}".format(mse)) diff --git a/ch07/figure3.py b/ch07/figure3.py index d8c10ddc..7543c1ec 100644 --- a/ch07/figure3.py +++ b/ch07/figure3.py @@ -8,12 +8,13 @@ from sklearn.linear_model import LinearRegression, Lasso import numpy as np from sklearn.datasets import load_boston -import pylab as plt +from matplotlib import pyplot as plt boston = load_boston() -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("Number of rooms (RM)") -plt.ylabel("House Price") +fig, ax = plt.subplots() +ax.scatter(boston.data[:, 5], boston.target) +ax.set_xlabel("Number of rooms (RM)") +ax.set_ylabel("House Price") x = boston.data[:, 5] @@ -24,9 +25,9 @@ lr = LinearRegression() lr.fit(x, y) -plt.plot([xmin, xmax], lr.predict([[xmin], [xmax]]), ':', lw=4, label='OLS model') +ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]), ':', lw=4, label='OLS model') las = Lasso() las.fit(x, y) -plt.plot([xmin, xmax], las.predict([ [xmin], [xmax] ]), '-', lw=4, label='Lasso model') -plt.savefig('Figure3.png', dpi=150) +ax.plot([xmin, xmax], las.predict([ [xmin], [xmax] ]), '-', lw=4, label='Lasso model') +fig.savefig('Figure3.png') diff --git a/ch07/figure4.py b/ch07/figure4.py index 70e61c16..a24d48be 100644 --- a/ch07/figure4.py +++ b/ch07/figure4.py @@ -12,7 +12,7 @@ from sklearn.linear_model import LinearRegression from sklearn.datasets import load_boston from sklearn.metrics import mean_squared_error -import pylab as plt +from matplotlib import pyplot as plt boston = load_boston() @@ -24,9 +24,10 @@ p = lr.predict(x) print("RMSE: {:.2}.".format(np.sqrt(mean_squared_error(y, p)))) print("R2: {:.2}.".format(lr.score(x, y))) -plt.scatter(p, y) -plt.xlabel('Predicted price') -plt.ylabel('Actual price') -plt.plot([y.min(), y.max()], [y.min(), y.max()], lw=4) +fig,ax = plt.subplots() +ax.scatter(p, y) +ax.set_xlabel('Predicted price') +ax.set_ylabel('Actual price') +ax.plot([y.min(), y.max()], [y.min(), y.max()], lw=4) -plt.savefig('Figure4.png', dpi=150) +fig.savefig('Figure4.png') diff --git a/ch07/lasso_path_plot.py b/ch07/lasso_path_plot.py index f0a6f3f5..eab64c26 100644 --- a/ch07/lasso_path_plot.py +++ b/ch07/lasso_path_plot.py @@ -25,5 +25,5 @@ ax.set_xlabel('Lasso coefficient path as a function of alpha') ax.set_xlabel('Alpha') ax.set_ylabel('Coefficient weight') -fig.savefig('Figure_LassoPath.png', dpi=150) +fig.savefig('Figure_LassoPath.png') diff --git a/ch07/predict10k_en.py b/ch07/predict10k_en.py index bcadcef3..a7dd960a 100644 --- a/ch07/predict10k_en.py +++ b/ch07/predict10k_en.py @@ -69,4 +69,5 @@ ax.plot([-5,-1], [-5,-1], 'r-', 
lw=2) ax.set_xlabel('Actual value') ax.set_ylabel('Predicted value') -fig.savefig('Figure_10k_scatter_EN_l1_ratio.png', dpi=150) +fig.savefig('Figure_10k_scatter_EN_l1_ratio.png') + diff --git a/ch07/usermodel.py b/ch07/usermodel.py deleted file mode 100644 index 7b309455..00000000 --- a/ch07/usermodel.py +++ /dev/null @@ -1,61 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from scipy import sparse -from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV -from sklearn.cross_validation import KFold - -data = np.array([[int(tok) for tok in line.split('\t')[:3]] - for line in open('data/ml-100k/u.data')]) -ij = data[:, :2] -ij -= 1 # original data is in 1-based system -values = data[:, 2] -reviews = sparse.csc_matrix((values, ij.T)).astype(float) - -reg = ElasticNetCV(fit_intercept=True, alphas=[ - 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) - - -def movie_norm(xc): - '''Normalize per movie''' - xc = xc.copy().toarray() - # xpos is the mean of the positive items - xpos = np.array([xi[xi > 0].mean() for xi in xc]) - xpos = np.nan_to_num(xpos) - - for i in range(xc.shape[0]): - xc[i] -= (xc[i] > 0) * xpos[i] - return xc, xpos - - -def learn_for(i): - u = reviews[i] - us = np.delete(np.arange(reviews.shape[0]), i) - ps, = np.where(u.toarray().ravel() > 0) - x = reviews[us][:, ps].T - y = u.data - err = 0 - eb = 0 - kf = KFold(len(y), n_folds=5) - for train, test in kf: - xc, xpos = movie_norm(x[train]) - reg.fit(xc, y[train] - xpos) - - xc, xpos = movie_norm(x[test]) - p = reg.predict(xc).ravel() - e = (p + xpos) - y[test] - err += np.sum(e * e) - eb += np.sum((y[train].mean() - y[test]) ** 2) - return np.sqrt(err / float(len(y))), np.sqrt(eb / float(len(y))) - -whole_data = [] -for i in range(reviews.shape[0]): - s = learn_for(i) - print(s[0] < s[1]) - print(s) - whole_data.append(s) diff --git a/ch08/README.rst b/ch08/README.rst new file mode 100644 index 00000000..488844e6 --- /dev/null +++ b/ch08/README.rst @@ -0,0 +1,41 @@ +========= +Chapter 8 +========= + +Support code for *Chapter 8: Recommendations*. + +The code refers to the second edition of the book and this code has been +significantly refactored when compared to the first one. + +Ratings Prediction +------------------ + +Note that since the partition of the data into training and testing is random, +everytime you run the code, the results will be different. + + +load_ml100k.py + Load data & partition into test/train +norm.py + Normalize the data +corrneighbours.py + Neighbour models based on ncrroaltoin +regression.py + Regression models +stacked.py + Stacked predictions +averaged.py + Averaging of predictions (mentioned in book, but code is not shown there). 
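Each of the prediction modules above exposes a ``predict(train)`` function and
a ``main()`` that prints an R2 score on a random train/test split. Assuming the
MovieLens data has already been fetched with ``data/download.sh``, the numbers
can be reproduced with, for example::

    python norm.py
    python regression.py
    python corrneighbours.py
    python stacked.py
    python averaged.py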
+ +Association Rule Mining +----------------------- + +Check the folder ``apriori/`` + +apriori/histogram.py + Print a histogram of how many times each product was bought +apriori/apriori.py + Implementation of Apriori algorithm and association rule building +apriori/apriori_example.py + Example of Apriori algorithm in retail dataset + diff --git a/ch08/all_correlations.py b/ch08/all_correlations.py index 509e5b8e..d3817bf0 100644 --- a/ch08/all_correlations.py +++ b/ch08/all_correlations.py @@ -7,9 +7,14 @@ import numpy as np -# This is the version in the book: - +def all_correlations(y, X): + from scipy import spatial + y = np.atleast_2d(y) + sp = spatial.distance.cdist(X, y, 'correlation') + # The "correlation distance" is 1 - corr(x,y); so we invert that to obtain the correlation + return 1 - sp.ravel() +# This is the version in the book (1st Edition): def all_correlations_book_version(bait, target): ''' corrs = all_correlations(bait, target) @@ -21,9 +26,7 @@ def all_correlations_book_version(bait, target): for c in target]) # This is a faster, but harder to read, implementation: - - -def all_correlations(y, X): +def all_correlations_fast_no_scipy(y, X): ''' Cs = all_correlations(y, X) @@ -42,12 +45,4 @@ def all_correlations(y, X): return (xy - x_ * y_ * n) / n / xs_ / ys_ -# If you have scipy installed, then you can compute correlations with -# scipy.spatial.cdist: -def all_correlations_scipy(y, X): - from scipy import spatial - y = np.atleast_2d(y) - sp = spatial.distance.cdist(X, y, 'correlation') - # The "correlation distance" is 1 - corr(x,y); so we invert that to obtain the correlation - return 1 - sp.ravel() diff --git a/ch08/apriori/.gitignore b/ch08/apriori/.gitignore index 05697a95..6379e20f 100644 --- a/ch08/apriori/.gitignore +++ b/ch08/apriori/.gitignore @@ -1 +1 @@ -retail.dat +retail.dat.gz diff --git a/ch08/apriori/apriori.py b/ch08/apriori/apriori.py index 1ebb7e54..eeaf0d46 100644 --- a/ch08/apriori/apriori.py +++ b/ch08/apriori/apriori.py @@ -5,9 +5,12 @@ # # It is made available under the MIT License +from collections import namedtuple + + def apriori(dataset, minsupport, maxsize): ''' - freqsets, baskets = apriori(dataset, minsupport, maxsize) + freqsets, support = apriori(dataset, minsupport, maxsize) Parameters ---------- @@ -21,48 +24,70 @@ def apriori(dataset, minsupport, maxsize): Returns ------- freqsets : sequence of sequences - baskets : dictionary + support : dictionary + This associates each itemset (represented as a frozenset) with a float + (the support of that itemset) ''' from collections import defaultdict baskets = defaultdict(list) pointers = defaultdict(list) + for i, ds in enumerate(dataset): for ell in ds: pointers[ell].append(i) baskets[frozenset([ell])].append(i) - pointers = dict([(k, frozenset(v)) for k, v in pointers.items()]) - baskets = dict([(k, frozenset(v)) for k, v in baskets.items()]) - valid = set(list(el)[0] - for el, c in baskets.items() if (len(c) >= minsupport)) - dataset = [[el for el in ds if (el in valid)] for ds in dataset] - dataset = [ds for ds in dataset if len(ds) > 1] - dataset = map(frozenset, dataset) + # Convert pointer items to frozensets to speed up operations later + new_pointers = dict() + for k in pointers: + if len(pointers[k]) >= minsupport: + new_pointers[k] = frozenset(pointers[k]) + pointers = new_pointers + for k in baskets: + baskets[k] = frozenset(baskets[k]) + + # Valid are all elements whose support is >= minsupport + valid = set() + for el, c in baskets.items(): + if len(c) >= minsupport: + 
valid.update(el) + + # Itemsets at first iteration are simply all singleton with valid elements: itemsets = [frozenset([v]) for v in valid] freqsets = [] for i in range(maxsize - 1): - print(len(itemsets)) + print("At iteration {}, number of frequent baskets: {}".format( + i, len(itemsets))) newsets = [] - for i, ell in enumerate(itemsets): - ccounts = baskets[ell] - for v_, pv in pointers.items(): - if v_ not in ell: + for it in itemsets: + ccounts = baskets[it] + + for v, pv in pointers.items(): + if v not in it: csup = (ccounts & pv) if len(csup) >= minsupport: - new = frozenset(ell | set([v_])) + new = frozenset(it | frozenset([v])) if new not in baskets: newsets.append(new) baskets[new] = csup freqsets.extend(itemsets) itemsets = newsets - return freqsets, baskets + if not len(itemsets): + break + support = {} + for k in baskets: + support[k] = float(len(baskets[k])) + return freqsets, support -def association_rules(dataset, freqsets, baskets, minlift): +# A namedtuple to collect all values that may be interesting +AssociationRule = namedtuple('AssociationRule', ['antecendent', 'consequent', 'base', 'py_x', 'lift']) + +def association_rules(dataset, freqsets, support, minlift): ''' - for (antecendent, consequent, base, py_x, lift) in association_rules(dataset, freqsets, baskets, minlift): + for assoc_rule in association_rules(dataset, freqsets, support, minlift): ... This function takes the returns from ``apriori``. @@ -72,9 +97,13 @@ def association_rules(dataset, freqsets, baskets, minlift): dataset : sequence of sequences input dataset freqsets : sequence of sequences - baskets : dictionary + support : dictionary minlift : int minimal lift of yielded rules + + Returns + ------- + assoc_rule : sequence of AssociationRule objects ''' nr_transactions = float(len(dataset)) freqsets = [f for f in freqsets if len(f) > 1] @@ -82,8 +111,9 @@ def association_rules(dataset, freqsets, baskets, minlift): for f in fset: consequent = frozenset([f]) antecendent = fset - consequent - base = len(baskets[consequent]) / nr_transactions - py_x = len(baskets[fset]) / float(len(baskets[antecendent])) + py_x = support[fset] / support[antecendent] + base = support[consequent] / nr_transactions lift = py_x / base if lift > minlift: - yield (antecendent, consequent, base, py_x, lift) + yield AssociationRule(antecendent, consequent, base, py_x, lift) + diff --git a/ch08/apriori/apriori_example.py b/ch08/apriori/apriori_example.py index da77e96c..971ed4d5 100644 --- a/ch08/apriori/apriori_example.py +++ b/ch08/apriori/apriori_example.py @@ -7,10 +7,17 @@ from apriori import apriori, association_rules from gzip import GzipFile + +# Load dataset dataset = [[int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')] -freqsets, baskets = apriori(dataset, 80, maxsize=5) -nr_transactions = float(len(dataset)) -for ant, con, base, pyx, lift in association_rules(dataset, freqsets, baskets, 30): - print('{} | {} | {} ({:%}) | {} | {} | {}' - .format(ant, con, len(baskets[con]), len(baskets[con]) / nr_transactions, len(baskets[ant]), len(baskets[con | ant]), int(lift))) + +freqsets, support = apriori(dataset, 80, maxsize=16) +rules = list(association_rules(dataset, freqsets, support, minlift=30.0)) + +rules.sort(key=(lambda ar: -ar.lift)) +for ar in rules: + print('{} -> {} (lift = {:.4})' + .format(set(ar.antecendent), + set(ar.consequent), + ar.lift)) diff --git a/ch08/apriori/apriori_naive.py b/ch08/apriori/apriori_naive.py index cb50a8e1..03ab9130 100644 --- a/ch08/apriori/apriori_naive.py +++ 
b/ch08/apriori/apriori_naive.py @@ -5,35 +5,83 @@ # # It is made available under the MIT License -import numpy as np from collections import defaultdict from itertools import chain from gzip import GzipFile -minsupport = 44 +minsupport = 80 dataset = [[int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')] -dataset = dataset[::20] counts = defaultdict(int) for elem in chain(*dataset): counts[elem] += 1 +# Only elements that have at least minsupport should be considered. valid = set(el for el, c in counts.items() if (c >= minsupport)) + +# Filter the dataset to contain only valid elements +# (This step is not strictly necessary, but will make the rest of the code +# faster as the itemsets will be smaller): dataset = [[el for el in ds if (el in valid)] for ds in dataset] -dataset = [frozenset(ds) for ds in dataset if len(ds) > 1] +# Convert to frozenset for fast processing +dataset = [frozenset(ds) for ds in dataset] itemsets = [frozenset([v]) for v in valid] -allsets = [itemsets] +freqsets = itemsets[:] for i in range(16): - print(len(itemsets)) + print("At iteration {}, number of frequent baskets: {}".format( + i, len(itemsets))) nextsets = [] - for i, ell in enumerate(itemsets): - for v_ in valid: - if v_ not in ell: - c = (ell | set([v_])) - if sum(1 for d in dataset if d.issuperset(c)) > minsupport: + + tested = set() + for it in itemsets: + for v in valid: + if v not in it: + # Create a new candidate set by adding v to it + c = (it | frozenset([v])) + + # Check if we have tested it already: + if c in tested: + continue + tested.add(c) + + # Count support by looping over dataset + # This step is slow. + # Check `apriori.py` for a better implementation. + support_c = sum(1 for d in dataset if d.issuperset(c)) + if support_c > minsupport: nextsets.append(c) - allsets.append(nextsets) + freqsets.extend(nextsets) itemsets = nextsets + if not len(itemsets): + break +print("Finished!") + + +def rules_from_itemset(itemset, dataset, minlift=1.): + nr_transactions = float(len(dataset)) + for item in itemset: + consequent = frozenset([item]) + antecedent = itemset-consequent + base = 0.0 + # acount: antecedent count + acount = 0.0 + + # ccount : consequent count + ccount = 0.0 + for d in dataset: + if item in d: base += 1 + if d.issuperset(itemset): ccount += 1 + if d.issuperset(antecedent): acount += 1 + base /= nr_transactions + p_y_given_x = ccount/acount + lift = p_y_given_x / base + if lift > minlift: + print('Rule {0} -> {1} has lift {2}' + .format(antecedent, consequent,lift)) + +for itemset in freqsets: + if len(itemset) > 1: + rules_from_itemset(itemset, dataset, minlift=4.) 
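To make the lift computation in ``apriori_naive.py`` concrete, here is a small
self-contained sketch on a made-up four-basket dataset (illustrative only, not
part of the repository). It uses the same quantities as the loop above, namely
the antecedent count, the count of the full itemset and the consequent's base
rate::

    # Toy data: four shopping baskets with hypothetical items
    dataset = [frozenset(t) for t in (
        {'bread', 'milk'},
        {'bread', 'milk', 'eggs'},
        {'bread', 'eggs'},
        {'milk'},
    )]
    nr_transactions = float(len(dataset))

    antecedent = frozenset(['bread'])
    consequent = frozenset(['milk'])
    itemset = antecedent | consequent

    # The same counts that rules_from_itemset accumulates
    acount = sum(1. for d in dataset if d.issuperset(antecedent))  # 3 baskets
    ccount = sum(1. for d in dataset if d.issuperset(itemset))     # 2 baskets
    base = sum(1. for d in dataset if d.issuperset(consequent)) / nr_transactions  # 0.75

    p_y_given_x = ccount / acount  # P(milk | bread) = 2/3
    lift = p_y_given_x / base      # about 0.89, well below the minlift cut-off
    print('lift of bread -> milk: {:.2f}'.format(lift))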
diff --git a/ch08/averaged.py b/ch08/averaged.py new file mode 100644 index 00000000..5b19bba7 --- /dev/null +++ b/ch08/averaged.py @@ -0,0 +1,33 @@ +import numpy as np +import load_ml100k +import regression +import corrneighbours +from sklearn import metrics +import norm + +def predict(train): + predicted0 = regression.predict(train) + predicted1 = regression.predict(train.T).T + predicted2 = corrneighbours.predict(train) + predicted3 = corrneighbours.predict(train.T).T + predicted4 = norm.predict(train) + predicted5 = norm.predict(train.T).T + stack = np.array([ + predicted0, + predicted1, + predicted2, + predicted3, + predicted4, + predicted5, + ]) + return stack.mean(0) + + +def main(): + train,test = load_ml100k.get_train_test(random_state=12) + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 averaged: {:.2%}'.format(r2)) + +if __name__ == '__main__': + main() diff --git a/ch08/chapter.py b/ch08/chapter.py new file mode 100644 index 00000000..d039d93f --- /dev/null +++ b/ch08/chapter.py @@ -0,0 +1,208 @@ +import numpy as np # NOT IN BOOK +from matplotlib import pyplot as plt # NOT IN BOOK + +def load(): + import numpy as np + from scipy import sparse + + data = np.loadtxt('data/ml-100k/u.data') + ij = data[:, :2] + ij -= 1 # original data is in 1-based system + values = data[:, 2] + reviews = sparse.csc_matrix((values, ij.T)).astype(float) + return reviews.toarray() +reviews = load() +U,M = np.where(reviews) +import random +test_idxs = np.array(random.sample(range(len(U)), len(U)//10)) + +train = reviews.copy() +train[U[test_idxs], M[test_idxs]] = 0 + +test = np.zeros_like(reviews) +test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]] + +class NormalizePositive(object): + def __init__(self, axis=0): + self.axis = axis + + def fit(self, features, y=None): + if self.axis == 1: + features = features.T + # count features that are greater than zero in axis 0: + binary = (features > 0) + + count0 = binary.sum(axis=0) + + # to avoid division by zero, set zero counts to one: + count0[count0 == 0] = 1. 
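# With the zero counts clamped to one, a user or movie that has no ratings at
# all gets a mean of 0 and a regularized std of sqrt(0.1), so transform()
# simply leaves those all-zero entries at zero.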
+ + # computing the mean is easy: + self.mean = features.sum(axis=0)/count0 + + # only consider differences where binary is True: + diff = (features - self.mean) * binary + diff **= 2 + # regularize the estimate of std by adding 0.1 + self.std = np.sqrt(0.1 + diff.sum(axis=0)/count0) + return self + + + def transform(self, features): + if self.axis == 1: + features = features.T + binary = (features > 0) + features = features - self.mean + features /= self.std + features *= binary + if self.axis == 1: + features = features.T + return features + + def inverse_transform(self, features, copy=True): + if copy: + features = features.copy() + if self.axis == 1: + features = features.T + features *= self.std + features += self.mean + if self.axis == 1: + features = features.T + return features + + def fit_transform(self, features): + return self.fit(features).transform(features) + + +norm = NormalizePositive(axis=1) +binary = (train > 0) +train = norm.fit_transform(train) +# plot just 200x200 area for space reasons +plt.imshow(binary[:200, :200], interpolation='nearest') + +from scipy.spatial import distance +# compute all pair-wise distances: +dists = distance.pdist(binary, 'correlation') +# Convert to square form, so that dists[i,j] +# is distance between binary[i] and binary[j]: +dists = distance.squareform(dists) +neighbors = dists.argsort(axis=1) + +# We are going to fill this matrix with results +filled = train.copy() +for u in range(filled.shape[0]): + # n_u is neighbors of user + n_u = neighbors[u, 1:] + for m in range(filled.shape[1]): + # get relevant reviews in order! + revs = [train[neigh, m] + for neigh in n_u + if binary [neigh, m]] + if len(revs): + # n is the number of reviews for this movie + n = len(revs) + # take half of the reviews plus one into consideration: + n //= 2 + n += 1 + revs = revs[:n] + filled[u,m] = np.mean(revs) + +predicted = norm.inverse_transform(filled) +from sklearn import metrics +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (binary neighbors): {:.1%}'.format(r2)) + +reviews = reviews.T +# use same code as before +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (binary movie neighbors): {:.1%}'.format(r2)) + + +from sklearn.linear_model import ElasticNetCV # NOT IN BOOK + +reg = ElasticNetCV(alphas=[ + 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) +filled = train.copy() +# iterate over all users: +for u in range(train.shape[0]): + curtrain = np.delete(train, u, axis=0) + bu = binary[u] + reg.fit(curtrain[:,bu].T, train[u, bu]) + filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) +predicted = norm.inverse_transform(filled) +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (user regression): {:.1%}'.format(r2)) + + +# SHOPPING BASKET ANALYSIS +# This is the slow version of the code, which will take a long time to +# complete. + + +from collections import defaultdict +from itertools import chain + +# File is downloaded as a compressed file +import gzip +# file format is a line per transaction +# of the form '12 34 342 5...' 
+dataset = [[int(tok) for tok in line.strip().split()] + for line in gzip.open('data/retail.dat.gz')] +dataset = [set(d) for d in dataset] +# count how often each product was purchased: +counts = defaultdict(int) +for elem in chain(*dataset): + counts[elem] += 1 + +minsupport = 80 +valid = set(k for k,v in counts.items() if (v >= minsupport)) +itemsets = [frozenset([v]) for v in valid] +freqsets = [] +for i in range(16): + nextsets = [] + tested = set() + for it in itemsets: + for v in valid: + if v not in it: + # Create a new candidate set by adding v to it + c = (it | frozenset([v])) + # check If we have tested it already + if c in tested: + continue + tested.add(c) + + # Count support by looping over dataset + # This step is slow. + # Check `apriori.py` for a better implementation. + support_c = sum(1 for d in dataset if d.issuperset(c)) + if support_c > minsupport: + nextsets.append(c) + freqsets.extend(nextsets) + itemsets = nextsets + if not len(itemsets): + break +print("Finished!") + + +minlift = 5.0 +nr_transactions = float(len(dataset)) +for itemset in freqsets: + for item in itemset: + consequent = frozenset([item]) + antecedent = itemset-consequent + base = 0.0 + # acount: antecedent count + acount = 0.0 + + # ccount : consequent count + ccount = 0.0 + for d in dataset: + if item in d: base += 1 + if d.issuperset(itemset): ccount += 1 + if d.issuperset(antecedent): acount += 1 + base /= nr_transactions + p_y_given_x = ccount/acount + lift = p_y_given_x / base + if lift > minlift: + print('Rule {0} -> {1} has lift {2}' + .format(antecedent, consequent,lift)) diff --git a/ch08/corrneighbours.py b/ch08/corrneighbours.py index eef3881e..eb30e685 100644 --- a/ch08/corrneighbours.py +++ b/ch08/corrneighbours.py @@ -6,75 +6,53 @@ # It is made available under the MIT License from __future__ import print_function -from all_correlations import all_correlations import numpy as np -from load_ml100k import load - -def estimate_user(user, rest, num_neighbors=100): - '''Estimate ratings for user based on the binary rating matrix - - Returns - ------- - estimates: ndarray - Returns a rating estimate for each movie - ''' - - # Compute binary matrix correlations: - bu = user > 0 - br = rest > 0 - ws = all_correlations(bu, br) - - # Select top `num_neighbors`: - selected = ws.argsort()[-num_neighbors:] - - # Use these to compute estimates: - estimates = rest[selected].mean(0) - estimates /= (.1 + br[selected].mean(0)) - return estimates - - -def train_test(user, rest): - '''Train & test on a single user - - Returns both the prediction error and the null error - ''' - estimates = estimate_user(user, rest) - bu = user > 0 - br = rest > 0 - err = estimates[bu] - user[bu] - null = rest.mean(0) - null /= (.1 + br.mean(0)) - nerr = null[bu] - user[bu] - return np.dot(err, err), np.dot(nerr, nerr) - - -def all_estimates(reviews): - estimates = np.zeros_like(reviews) - for i in range(reviews.shape[0]): - estimates[i] = estimate_user(reviews[i], np.delete(reviews, i, 0)) - return estimates - -def main(): - reviews = load() - - err = [] - for i in range(reviews.shape[0]): - err.append( - train_test(reviews[i], np.delete(reviews, i, 0)) - ) - revs = (reviews > 0).sum(1) - err = np.array(err) - rmse = np.sqrt(err / revs[:, None]) - - rmse_model, rmse_null = np.mean(rmse, 0) - - print("Average of RMSE / Null-model RMSE") - print("{:.2}\t{:.2} (improvement: {:.1%}".format(rmse_model, rmse_null, (rmse_null-rmse_model)/rmse_null)) - print() - - rmse_model, rmse_null = np.mean(rmse[revs > 60], 0) - print("Average 
of RMSE / Null-model RMSE (users with more than 60 reviewed movies)") - print("{:.2}\t{:.2} (improvement: {:.1%}".format(rmse_model, rmse_null, (rmse_null-rmse_model)/rmse_null)) +from load_ml100k import get_train_test +from scipy.spatial import distance +from sklearn import metrics + +from norm import NormalizePositive + +def predict(otrain): + binary = (otrain > 0) + norm = NormalizePositive(axis=1) + train = norm.fit_transform(otrain) + + dists = distance.pdist(binary, 'correlation') + dists = distance.squareform(dists) + + neighbors = dists.argsort(axis=1) + filled = train.copy() + for u in range(filled.shape[0]): + # n_u are the neighbors of user + n_u = neighbors[u, 1:] + for m in range(filled.shape[1]): + # This code could be faster using numpy indexing trickery as the + # cost of readibility (this is left as an exercise to the reader): + revs = [train[neigh, m] + for neigh in n_u + if binary[neigh, m]] + if len(revs): + n = len(revs) + n //= 2 + n += 1 + revs = revs[:n] + filled[u,m] = np.mean(revs) + + return norm.inverse_transform(filled) + +def main(transpose_inputs=False): + train, test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 score (binary {} neighbours): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) if __name__ == '__main__': main() + main(transpose_inputs=True) diff --git a/ch08/data b/ch08/data deleted file mode 120000 index 32d37b53..00000000 --- a/ch08/data +++ /dev/null @@ -1 +0,0 @@ -../ch07/data \ No newline at end of file diff --git a/ch08/data/.gitignore b/ch08/data/.gitignore new file mode 100644 index 00000000..c391d625 --- /dev/null +++ b/ch08/data/.gitignore @@ -0,0 +1,3 @@ +retail.dat.gz +ml-100k.zip +/ml-100k/ diff --git a/ch08/data/download.sh b/ch08/data/download.sh new file mode 100755 index 00000000..b671171d --- /dev/null +++ b/ch08/data/download.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +curl -L -O http://files.grouplens.org/papers/ml-100k.zip +unzip ml-100k.zip +curl -L -O http://fimi.ua.ac.be/data/retail.dat.gz diff --git a/ch08/figure3.py b/ch08/figure3.py index ac6961f3..daafc300 100644 --- a/ch08/figure3.py +++ b/ch08/figure3.py @@ -12,4 +12,4 @@ plt.imshow(data[:200, :200], interpolation='nearest') plt.xlabel('User ID') plt.ylabel('Film ID') -plt.savefig('../1400_08_03+.png') +plt.savefig('Figure_08_03_DataMatrix.png') diff --git a/ch08/load_ml100k.py b/ch08/load_ml100k.py index da6add37..7096e75c 100644 --- a/ch08/load_ml100k.py +++ b/ch08/load_ml100k.py @@ -11,6 +11,10 @@ def load(): Returns the review matrix as a numpy array''' import numpy as np from scipy import sparse + from os import path + + if not path.exists('data/ml-100k/u.data'): + raise IOError("Data has not been downloaded.\nTry the following:\n\n\tcd data\n\t./download.sh") # The input is in the form of a CSC sparse matrix, so it's a natural fit to # load the data, but we then convert to a more traditional array before @@ -21,3 +25,35 @@ def load(): values = data[:, 2] reviews = sparse.csc_matrix((values, ij.T)).astype(float) return reviews.toarray() + +def get_train_test(reviews=None, random_state=None): + '''Split data into training & testing + + Parameters + ---------- + reviews : ndarray, optional + Input data + + Returns + ------- + train : ndarray + training data + test : ndarray + testing data + ''' + import numpy as np + import random + r = random.Random(random_state) + + if reviews is None: + reviews = 
load() + U,M = np.where(reviews) + test_idxs = np.array(r.sample(range(len(U)), len(U)//10)) + train = reviews.copy() + train[U[test_idxs], M[test_idxs]] = 0 + + test = np.zeros_like(reviews) + test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]] + + return train, test + diff --git a/ch08/norm.py b/ch08/norm.py new file mode 100644 index 00000000..2925bbca --- /dev/null +++ b/ch08/norm.py @@ -0,0 +1,75 @@ +import numpy as np + +class NormalizePositive(object): + + def __init__(self, axis=0): + self.axis = axis + + def fit(self, features, y=None): + # count features that are greater than zero in axis `self.axis`: + if self.axis == 1: + features = features.T + binary = (features > 0) + count = binary.sum(axis=0) + + # to avoid division by zero, set zero counts to one: + count[count == 0] = 1. + + self.mean = features.sum(axis=0)/count + + # Compute variance by average squared difference to the mean, but only + # consider differences where binary is True (i.e., where there was a + # true rating): + diff = (features - self.mean) * binary + diff **= 2 + # regularize the estimate of std by adding 0.1 + self.std = np.sqrt(0.1 + diff.sum(axis=0)/count) + return self + + def transform(self, features): + if self.axis == 1: + features = features.T + binary = (features > 0) + features = features - self.mean + features /= self.std + features *= binary + if self.axis == 1: + features = features.T + return features + + def inverse_transform(self, features, copy=True): + if copy: + features = features.copy() + if self.axis == 1: + features = features.T + features *= self.std + features += self.mean + if self.axis == 1: + features = features.T + return features + + def fit_transform(self, features): + return self.fit(features).transform(features) + + +def predict(train): + norm = NormalizePositive() + train = norm.fit_transform(train) + return norm.inverse_transform(train * 0.) 
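# Multiplying by zero discards the normalized ratings, so inverse_transform
# returns std * 0 + mean: every entry is filled with the regularized mean that
# NormalizePositive learned along its axis. This is the simplest of the six
# predictions that stacked.py and averaged.py combine.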
+ + +def main(transpose_inputs=False): + from load_ml100k import get_train_test + from sklearn import metrics + train,test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 score ({} normalization): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/regression.py b/ch08/regression.py new file mode 100644 index 00000000..693e99a4 --- /dev/null +++ b/ch08/regression.py @@ -0,0 +1,50 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +from sklearn.linear_model import ElasticNetCV +from norm import NormalizePositive +from sklearn import metrics + + +def predict(train): + binary = (train > 0) + reg = ElasticNetCV(fit_intercept=True, alphas=[ + 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) + norm = NormalizePositive() + train = norm.fit_transform(train) + + filled = train.copy() + # iterate over all users + for u in range(train.shape[0]): + # remove the current user for training + curtrain = np.delete(train, u, axis=0) + bu = binary[u] + if np.sum(bu) > 5: + reg.fit(curtrain[:,bu].T, train[u, bu]) + + # Fill the values that were not there already + filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) + return norm.inverse_transform(filled) + + +def main(transpose_inputs=False): + from load_ml100k import get_train_test + train,test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + filled = predict(train) + r2 = metrics.r2_score(test[test > 0], filled[test > 0]) + + print('R2 score ({} regression): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) + +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/stacked.py b/ch08/stacked.py index 76afe9fc..8fa6344e 100644 --- a/ch08/stacked.py +++ b/ch08/stacked.py @@ -1,40 +1,47 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from __future__ import print_function -from sklearn.linear_model import LinearRegression -from load_ml100k import load import numpy as np -import similar_movie -import usermodel +import load_ml100k +import regression import corrneighbours +from sklearn import linear_model, metrics +import norm + +def predict(train): + tr_train,tr_test = load_ml100k.get_train_test(train, random_state=34) + tr_predicted0 = regression.predict(tr_train) + tr_predicted1 = regression.predict(tr_train.T).T + tr_predicted2 = corrneighbours.predict(tr_train) + tr_predicted3 = corrneighbours.predict(tr_train.T).T + tr_predicted4 = norm.predict(tr_train) + tr_predicted5 = norm.predict(tr_train.T).T + stack_tr = np.array([ + tr_predicted0[tr_test > 0], + tr_predicted1[tr_test > 0], + tr_predicted2[tr_test > 0], + tr_predicted3[tr_test > 0], + tr_predicted4[tr_test > 0], + tr_predicted5[tr_test > 0], + ]).T + + lr = linear_model.LinearRegression() + lr.fit(stack_tr, tr_test[tr_test > 0]) + + stack_te = np.array([ + tr_predicted0.ravel(), + tr_predicted1.ravel(), + tr_predicted2.ravel(), + tr_predicted3.ravel(), + tr_predicted4.ravel(), + tr_predicted5.ravel(), + ]).T 
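# The linear regression fitted on the inner test split assigns one weight to
# each of the six base predictors; applying it to the stacked predictions for
# every entry and reshaping yields a single blended rating matrix with the
# same shape as train.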
+ + return lr.predict(stack_te).reshape(train.shape) -reviews = load() -reg = LinearRegression() -es = np.array([ - usermodel.all_estimates(reviews), - corrneighbours.all_estimates(reviews), - similar_movie.all_estimates(reviews), -]) -total_error = 0.0 -coefficients = [] -for u in range(reviews.shape[0]): - es0 = np.delete(es, u, 1) - r0 = np.delete(reviews, u, 0) - X, Y = np.where(r0 > 0) - X = es[:, X, Y] - y = r0[r0 > 0] - reg.fit(X.T, y) - coefficients.append(reg.coef_) +def main(): + train,test = load_ml100k.get_train_test(random_state=12) + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 stacked: {:.2%}'.format(r2)) - r0 = reviews[u] - X = np.where(r0 > 0) - p0 = reg.predict(es[:, u, X].squeeze().T) - err0 = r0[r0 > 0] - p0 - total_error += np.dot(err0, err0) - print(u) +if __name__ == '__main__': + main() diff --git a/ch08/stacked5.py b/ch08/stacked5.py deleted file mode 100644 index 7ace7d32..00000000 --- a/ch08/stacked5.py +++ /dev/null @@ -1,44 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from sklearn.linear_model import LinearRegression -from load_ml100k import load -import numpy as np -import similar_movie -import usermodel - -reviews = load() -# Collect several estimates -es = np.array([ - usermodel.all_estimates(reviews), - similar_movie.all_estimates(reviews, k=1), - similar_movie.all_estimates(reviews, k=2), - similar_movie.all_estimates(reviews, k=3), - similar_movie.all_estimates(reviews, k=4), - similar_movie.all_estimates(reviews, k=5), -]) - -total_error = 0.0 -coefficients = [] - -reg = LinearRegression() -# Iterate over all users -for u in range(reviews.shape[0]): - es0 = np.delete(es, u, axis=1) - r0 = np.delete(reviews, u, axis=0) - X, Y = np.where(r0 > 0) - X = es[:, X, Y] - y = r0[r0 > 0] - reg.fit(X.T, y) - coefficients.append(reg.coef_) - - r0 = reviews[u] - X = np.where(r0 > 0) - p0 = reg.predict(es[:, u, X].squeeze().T) - err0 = r0[r0 > 0] - p0 - total_error += np.dot(err0, err0) -coefficients = np.array(coefficients) diff --git a/ch08/usermodel.py b/ch08/usermodel.py deleted file mode 100644 index 589afca4..00000000 --- a/ch08/usermodel.py +++ /dev/null @@ -1,54 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.linear_model import ElasticNetCV -from sklearn.cross_validation import KFold - - -def learn_for(reviews, i): - reg = ElasticNetCV(fit_intercept=True, alphas=[ - 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) - nusers,nmovies = reviews.shape - u = reviews[i] - us = np.arange(reviews.shape[0]) - us = np.delete(us, i) - ps, = np.where(u.ravel() > 0) - x = reviews[us][:, ps].T - kf = KFold(len(ps), n_folds=4) - predictions = np.zeros(len(ps)) - for train, test in kf: - xc = x[train].copy() - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in range(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - - reg.fit(xc, u[train] - x1) - - xc = x[test].copy() - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in range(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - - p = reg.predict(xc).ravel() - predictions[test] = p - fill_preds = np.zeros(nmovies) - 
fill_preds[ps] = predictions - return fill_preds - - -def all_estimates(reviews): - whole_data = [] - for i in range(reviews.shape[0]): - s = learn_for(reviews, i) - whole_data.append(s) - return np.array(whole_data) - diff --git a/ch10/.gitignore b/ch10/.gitignore new file mode 100644 index 00000000..2f266195 --- /dev/null +++ b/ch10/.gitignore @@ -0,0 +1 @@ +AnimTransDistr/ diff --git a/ch10/README.rst b/ch10/README.rst index 7137582b..91e32756 100644 --- a/ch10/README.rst +++ b/ch10/README.rst @@ -17,23 +17,21 @@ Running ``download.sh`` will retrieve the other dataset into a directory Scripts ------- -threshold.py - Threshold building image with both Otsu and Ridley-Calvard threshold +chapter.py + Code as written in the book. +thresholded_figure.py + Computes the thresholded figures, including after Gaussian blurring lena-ring.py - Show Lena in a sharp ring around a soft focus image -figure5_6.py - Computes figures 5 and 6 in the book -figure9.py - Compute Sobel images + Lena image with center in focus and blurred edges figure10.py Just paste two images next to each others -figure13.py - Demonstrate salt&pepper effect features.py - Contains the ``edginess_sobel`` function from the book as well as a simple + Contains the color histogram function from the book as well as a simple wrapper around ``mahotas.texture.haralick`` simple_classification.py - Classify SimpleImageDataset with texture features + sobel feature -figure18.py + Classify SimpleImageDataset with texture features + color histogram features +large_classification.py Classify ``AnimTransDistr`` with both texture and SURF features. +neighbors.py + Computes image neighbors as well as the neighbor figure from the book. diff --git a/ch10/chapter.py b/ch10/chapter.py new file mode 100644 index 00000000..233720bb --- /dev/null +++ b/ch10/chapter.py @@ -0,0 +1,186 @@ +import numpy as np +import mahotas as mh +image = mh.imread('scene00.jpg') +from matplotlib import pyplot as plt +plt.imshow(image) +plt.show() +image = mh.colors.rgb2grey(image, dtype=np.uint8) +plt.imshow(image) # Display the image +plt.gray() +thresh = mh.thresholding.otsu(image) +print('Otsu threshold is {}.'.format(thresh)) +# Otsu threshold is 138. +plt.imshow(image > thresh) + +im16 = mh.gaussian_filter(image,16) +im = mh.demos.load('lenna') + +r,g,b = im.transpose(2,0,1) +r12 = mh.gaussian_filter(r, 12.) +g12 = mh.gaussian_filter(g, 12.) +b12 = mh.gaussian_filter(b, 12.) +im12 = mh.as_rgb(r12,g12,b12) +h, w = r.shape # height and width +Y, X = np.mgrid[:h,:w] +Y = Y-h/2. # center at h/2 +Y = Y / Y.max() # normalize to -1 .. +1 + +X = X-w/2. 
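+# X gets the same treatment as Y: centered at w/2, then normalized to -1..+1 before building the Gaussian weight C below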
+X = X / X.max() + +C = np.exp(-2.*(X**2+ Y**2)) + +# Normalize again to 0..1 +C = C - C.min() +C = C / C.ptp() +C = C[:,:,None] # This adds a dummy third dimension to C + +ringed = mh.stretch(im*C + (1-C)*im12) + +haralick_features = mh.features.haralick(image) +haralick_features_mean = np.mean(haralick_features, axis=0) +haralick_features_all = np.ravel(haralick_features) + +from glob import glob +images = glob('../SimpleImageDataset/*.jpg') +features = [] +labels = [] +for im in images: + labels.append(im[:-len('00.jpg')]) + im = mh.imread(im) + im = mh.colors.rgb2gray(im, dtype=np.uint8) + features.append(mh.features.haralick(im).ravel()) + +features = np.array(features) +labels = np.array(labels) + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + + +from sklearn import cross_validation +cv = cross_validation.LeaveOneOut(len(images)) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 81.1% + +def chist(im): + im = im // 64 + r,g,b = im.transpose((2,0,1)) + pixels = 1 * r + 4 * b + 16 * g + hist = np.bincount(pixels.ravel(), minlength=64) + hist = hist.astype(float) + hist = np.log1p(hist) + return hist + +features = [] +for im in images: + im = mh.imread(im) + features.append(chist(im)) + + + +features = [] +for im in images: + imcolor = mh.imread(im) + im = mh.colors.rgb2gray(imcolor, dtype=np.uint8) + features.append(np.concatenate([ + mh.features.haralick(im).ravel(), + chist(imcolor), + ])) + + +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 95.6% + + +features = [] +for im in images: + imcolor = mh.imread(im) + # Ignore everything in the 200 pixels close to the borders + imcolor = imcolor[200:-200, 200:-200] + im = mh.colors.rgb2gray(imcolor, dtype=np.uint8) + features.append(np.concatenate([ + mh.features.haralick(im).ravel(), + chist(imcolor), + ])) + +sc = StandardScaler() +features = sc.fit_transform(features) +from scipy.spatial import distance +dists = distance.squareform(distance.pdist(features)) + + +fig, axes = plt.subplots(2, 9) +for ci,i in enumerate(range(0,90,10)): + left = images[i] + dists_left = dists[i] + right = dists_left.argsort() + # right[0] is the same as left[i], so pick the next closest element + right = right[1] + right = images[right] + left = mh.imread(left) + right = mh.imread(right) + axes[0, ci].imshow(left) + axes[1, ci].imshow(right) + + + +from sklearn.grid_search import GridSearchCV +C_range = 10.0 ** np.arange(-4, 3) +grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range}) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', grid)]) + +cv = cross_validation.KFold(len(features), 5, + shuffle=True, random_state=123) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) + + + + +from mahotas.features import surf +image = mh.demos.load('lena') +image = mh.colors.rgb2gray(image, dtype=np.uint8) +descriptors = surf.surf(image, descriptor_only=True) + +from mahotas.features import surf +descriptors = surf.dense(image, spacing=16) +alldescriptors = [] +for im in images: + im = mh.imread(im, as_grey=True) + im = im.astype(np.uint8) + alldescriptors.append(surf.dense(image, spacing=16)) +# get all 
descriptors into a single array +concatenated = np.concatenate(alldescriptors) +print('Number of descriptors: {}'.format( + len(concatenated))) +# use only every 64th vector +concatenated = concatenated[::64] +from sklearn.cluster import KMeans # FIXME CAPITALIZATION +k = 256 +km = KMeans(k) +km.fit(concatenated) + +features = [] +for d in alldescriptors: + c = km.predict(d) + features.append( + np.array([np.sum(c == ci) for ci in range(k)]) + ) +# build single array and convert to float +features = np.array(features, dtype=float) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 62.6% + + diff --git a/ch10/download.sh b/ch10/download.sh index 93121a7f..fb623f3d 100755 --- a/ch10/download.sh +++ b/ch10/download.sh @@ -2,7 +2,7 @@ mkdir -p AnimTransDistr cd AnimTransDistr -wget http://vision.stanford.edu/Datasets/AnimTransDistr.rar +curl -O http://vision.stanford.edu/Datasets/AnimTransDistr.rar unrar x AnimTransDistr.rar # The following file is a weird file: rm Anims/104034.jpg diff --git a/ch10/features.py b/ch10/features.py index 27fd3e91..42847b30 100644 --- a/ch10/features.py +++ b/ch10/features.py @@ -34,6 +34,37 @@ def texture(im): 1-D array of features ''' im = im.astype(np.uint8) - return mh.features.haralick(im).mean(0) + return mh.features.haralick(im).ravel() +def chist(im): + '''Compute color histogram of input image + + Parameters + ---------- + im : ndarray + should be an RGB image + + Returns + ------- + c : ndarray + 1-D array of histogram values + ''' + + # Downsample pixel values: + im = im // 64 + + # We can also implement the following by using np.histogramdd + # im = im.reshape((-1,3)) + # bins = [np.arange(5), np.arange(5), np.arange(5)] + # hist = np.histogramdd(im, bins=bins)[0] + # hist = hist.ravel() + + # Separate RGB channels: + r,g,b = im.transpose((2,0,1)) + + pixels = 1 * r + 4 * g + 16 * b + hist = np.bincount(pixels.ravel(), minlength=64) + hist = hist.astype(float) + return np.log1p(hist) + diff --git a/ch10/figure10.py b/ch10/figure10.py index 058b1069..6cb45e7a 100644 --- a/ch10/figure10.py +++ b/ch10/figure10.py @@ -11,10 +11,10 @@ # This little script just builds an image with two examples, side-by-side: text = mh.imread("../SimpleImageDataset/text21.jpg") -scene = mh.imread("../SimpleImageDataset/scene00.jpg") +building = mh.imread("../SimpleImageDataset/building00.jpg") h, w, _ = text.shape canvas = np.zeros((h, 2 * w + 128, 3), np.uint8) -canvas[:, -w:] = scene +canvas[:, -w:] = building canvas[:, :w] = text canvas = canvas[::4, ::4] -mh.imsave('../1400OS_10_10+.jpg', canvas) +mh.imsave('figure10.jpg', canvas) diff --git a/ch10/figure13.py b/ch10/figure13.py deleted file mode 100644 index 89174e8e..00000000 --- a/ch10/figure13.py +++ /dev/null @@ -1,28 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import mahotas as mh -from mahotas.colors import rgb2grey -import numpy as np - -# Adds a little salt-n-pepper noise to an image - -im = mh.demos.load('lena') -im = rgb2grey(im) - -# Salt & pepper arrays -salt = np.random.random(im.shape) > .975 -pepper = np.random.random(im.shape) > .975 - -# salt is 170 & pepper is 30 -# Some playing around showed that setting these to more extreme values looks -# very artificial. 
These look nicer - -im = np.maximum(salt * 170, mh.stretch(im)) -im = np.minimum(pepper * 30 + im * (~pepper), im) - -mh.imsave('../1400OS_10_13+.jpg', im.astype(np.uint8)) diff --git a/ch10/figure18.py b/ch10/figure18.py deleted file mode 100644 index f17f9520..00000000 --- a/ch10/figure18.py +++ /dev/null @@ -1,121 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from __future__ import print_function -import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression -from matplotlib import pyplot as plt -import numpy as np -from glob import glob - -basedir = 'AnimTransDistr' -print('This script will test classification of the AnimTransDistr dataset') - - -def features_for(images): - fs = [] - for im in images: - im = mh.imread(im, as_grey=True).astype(np.uint8) - fs.append(mh.features.haralick(im).mean(0)) - return np.array(fs) - - -def features_labels(groups): - labels = np.zeros(sum(map(len, groups))) - st = 0 - for i, g in enumerate(groups): - labels[st:st + len(g)] = i - st += len(g) - return np.vstack(groups), labels - -classes = [ - 'Anims', - 'Cars', - 'Distras', - 'Trans', -] - -print('Computing whole-image texture features...') -features = [] -labels = [] -for ci, cl in enumerate(classes): - images = glob('{}/{}/*.jpg'.format(basedir, cl)) - features.extend(features_for(images)) - labels.extend([ci for _ in images]) - -features = np.array(features) -labels = np.array(labels) - -scores0 = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=10) -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: %s%%' % ( - 0.1 * round(1000 * scores0.mean()))) - -tfeatures = features - -from sklearn.cluster import KMeans -from mahotas.features import surf - -images = [] -labels = [] - -for ci, cl in enumerate(classes): - curimages = glob('{}/{}/*.jpg'.format(basedir, cl)) - images.extend(curimages) - labels.extend([ci for _ in curimages]) -labels = np.array(labels) - -print('Computing SURF descriptors...') -alldescriptors = [] -for im in images: - im = mh.imread(im, as_grey=1) - im = im.astype(np.uint8) - - # To use dense sampling, you can try the following line: - # alldescriptors.append(surf.dense(im, spacing=max(im.shape)//32)) - alldescriptors.append(surf.surf(im, descriptor_only=True)) - -print('Descriptors done') -k = 256 -km = KMeans(k) - -concatenated = np.concatenate(alldescriptors) -concatenated = concatenated[::64] -print('Clustering with K-means...') -km.fit(concatenated) -features = [] -for d in alldescriptors: - c = km.predict(d) - features.append( - np.array([np.sum(c == i) for i in range(k)]) - ) -features = np.array(features) -print('predicting...') -scoreSURFlr = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Log. Reg [SURF features]: %s%%' % ( - 0.1 * round(1000 * scoreSURFlr.mean()))) - -print('Attemping classification with all features combined...') -allfeatures = np.hstack([features, tfeatures]) -scoreSURFplr = cross_validation.cross_val_score( - LogisticRegression(), allfeatures, labels, cv=5).mean() - -print('Accuracy (5 fold x-val) with Log. 
Reg [All features]: %s%%' % ( - 0.1 * round(1000 * scoreSURFplr.mean()))) - -plt.plot([0, 1, 2], 100 * - np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), 'k-', lw=8) -plt.plot( - [0, 1, 2], 100 * np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), - 'o', mec='#cccccc', mew=12, mfc='white') -plt.xlim(-.5, 2.5) -plt.ylim(scores0.mean() * 90., scoreSURFplr * 110) -plt.xticks([0, 1, 2], ["Texture", "SURF", "combined"]) -plt.ylabel('Accuracy (%)') -plt.savefig('../1400OS_10_18+.png') diff --git a/ch10/figure5_6.py b/ch10/figure5_6.py deleted file mode 100644 index 37d52752..00000000 --- a/ch10/figure5_6.py +++ /dev/null @@ -1,37 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -import mahotas as mh -image = mh.imread('../SimpleImageDataset/building05.jpg') -image = mh.colors.rgb2gray(image) - -# Compute Gaussian filtered versions with increasing kernel widths -im8 = mh.gaussian_filter(image, 8) -im16 = mh.gaussian_filter(image, 16) -im32 = mh.gaussian_filter(image, 32) - -# We now build a composite image with three panels: -# -# [ IM8 | | IM16 | | IM32 ] - -h, w = im8.shape -canvas = np.ones((h, 3 * w + 256), np.uint8) -canvas *= 255 -canvas[:, :w] = im8 -canvas[:, w + 128:2 * w + 128] = im16 -canvas[:, -w:] = im32 -mh.imsave('../1400OS_10_05+.jpg', canvas[:, ::2]) - -# Threshold the image -# We need to first stretch it to convert to an integer image -im32 = mh.stretch(im32) -ot32 = mh.otsu(im32) - -# Convert to 255 np.uint8 to match the other images -im255 = 255 * (im32 > ot32).astype(np.uint8) -mh.imsave('../1400OS_10_06+.jpg', im255) diff --git a/ch10/figure9.py b/ch10/figure9.py deleted file mode 100644 index fbaa985f..00000000 --- a/ch10/figure9.py +++ /dev/null @@ -1,28 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -import mahotas as mh -image = mh.imread('../SimpleImageDataset/building05.jpg') -image = mh.colors.rgb2gray(image, dtype=np.uint8) - -# We need to downsample ``image`` so that the details are visibly pixelated. 
-# This exaggerates the effect so that the result is clear -image = image[::4, ::4] -thresh = mh.sobel(image) -filtered = mh.sobel(image, just_filter=True) - -thresh = mh.dilate(thresh, np.ones((7, 7))) -filtered = mh.dilate(mh.stretch(filtered), np.ones((7, 7))) - -# Paste the thresholded and non-thresholded versions of the image side-by-side -h, w = thresh.shape -canvas = 255 * np.ones((h, w * 2 + 64), np.uint8) -canvas[:, :w] = thresh * 255 -canvas[:, -w:] = filtered - -mh.imsave('../1400OS_10_09+.jpg', canvas) diff --git a/ch10/large_classification.py b/ch10/large_classification.py new file mode 100644 index 00000000..8db3571b --- /dev/null +++ b/ch10/large_classification.py @@ -0,0 +1,108 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import mahotas as mh +from glob import glob +from sklearn import cross_validation +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.grid_search import GridSearchCV +import numpy as np + +basedir = 'AnimTransDistr' +print('This script will test classification of the AnimTransDistr dataset') + +C_range = 10.0 ** np.arange(-4, 3) +grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range}) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', grid)]) + +def features_for(im): + from features import chist + im = mh.imread(im) + img = mh.colors.rgb2grey(im).astype(np.uint8) + return np.concatenate([mh.features.haralick(img).ravel(), + chist(im)]) + +def images(): + '''Iterate over all (image,label) pairs + + This function will return + ''' + for ci, cl in enumerate(classes): + images = glob('{}/{}/*.jpg'.format(basedir, cl)) + for im in sorted(images): + yield im, ci + +classes = [ + 'Anims', + 'Cars', + 'Distras', + 'Trans', +] + +print('Computing whole-image texture features...') +ifeatures = [] +labels = [] +for im, ell in images(): + ifeatures.append(features_for(im)) + labels.append(ell) + +ifeatures = np.array(ifeatures) +labels = np.array(labels) + +cv = cross_validation.KFold(len(ifeatures), 5, shuffle=True, random_state=123) +scores0 = cross_validation.cross_val_score( + clf, ifeatures, labels, cv=cv) +print('Accuracy (5 fold x-val) with Logistic Regression [image features]: {:.1%}'.format( + scores0.mean())) + + +from sklearn.cluster import KMeans +from mahotas.features import surf + + +print('Computing SURF descriptors...') +alldescriptors = [] +for im,_ in images(): + im = mh.imread(im, as_grey=True) + im = im.astype(np.uint8) + + # To use dense sampling, you can try the following line: + # alldescriptors.append(surf.dense(im, spacing=16)) + alldescriptors.append(surf.surf(im, descriptor_only=True)) + +print('Descriptor computation complete.') +k = 256 +km = KMeans(k) + +concatenated = np.concatenate(alldescriptors) +print('Number of descriptors: {}'.format( + len(concatenated))) +concatenated = concatenated[::64] +print('Clustering with K-means...') +km.fit(concatenated) +sfeatures = [] +for d in alldescriptors: + c = km.predict(d) + sfeatures.append(np.bincount(c, minlength=k)) +sfeatures = np.array(sfeatures, dtype=float) +print('predicting...') +score_SURF = cross_validation.cross_val_score( + clf, sfeatures, labels, cv=cv).mean() +print('Accuracy (5 fold x-val) with Logistic Regression [SURF features]: 
{:.1%}'.format( + score_SURF.mean())) + + +print('Performing classification with all features combined...') +allfeatures = np.hstack([sfeatures, ifeatures]) +score_SURF_global = cross_validation.cross_val_score( + clf, allfeatures, labels, cv=cv).mean() +print('Accuracy (5 fold x-val) with Logistic Regression [All features]: {:.1%}'.format( + score_SURF_global.mean())) diff --git a/ch10/neighbors.py b/ch10/neighbors.py new file mode 100644 index 00000000..1f71d0de --- /dev/null +++ b/ch10/neighbors.py @@ -0,0 +1,64 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing + +import numpy as np +import mahotas as mh +from glob import glob +from features import texture, chist +from matplotlib import pyplot as plt +from sklearn.preprocessing import StandardScaler +from scipy.spatial import distance + +basedir = '../SimpleImageDataset/' + + +haralicks = [] +chists = [] + +print('Computing features...') +# Use glob to get all the images +images = glob('{}/*.jpg'.format(basedir)) +# We sort the images to ensure that they are always processed in the same order +# Otherwise, this would introduce some variation just based on the random +# ordering that the filesystem uses +images.sort() + +for fname in images: + imc = mh.imread(fname) + imc = imc[200:-200,200:-200] + haralicks.append(texture(mh.colors.rgb2grey(imc))) + chists.append(chist(imc)) + +haralicks = np.array(haralicks) +chists = np.array(chists) +features = np.hstack([chists, haralicks]) + +print('Computing neighbors...') +sc = StandardScaler() +features = sc.fit_transform(features) +dists = distance.squareform(distance.pdist(features)) + +print('Plotting...') +fig, axes = plt.subplots(2, 9, figsize=(16,8)) + +# Remove ticks from all subplots +for ax in axes.flat: + ax.set_xticks([]) + ax.set_yticks([]) + +for ci,i in enumerate(range(0,90,10)): + left = images[i] + dists_left = dists[i] + right = dists_left.argsort() + # right[0] is the same as left[i], so pick the next closest element + right = right[1] + right = images[right] + left = mh.imread(left) + right = mh.imread(right) + axes[0, ci].imshow(left) + axes[1, ci].imshow(right) + +fig.tight_layout() +fig.savefig('figure_neighbors.png', dpi=300) diff --git a/ch10/scene00.jpg b/ch10/scene00.jpg new file mode 100644 index 00000000..ed727a50 Binary files /dev/null and b/ch10/scene00.jpg differ diff --git a/ch10/simple_classification.py b/ch10/simple_classification.py index 7f8a214d..a5a448d2 100644 --- a/ch10/simple_classification.py +++ b/ch10/simple_classification.py @@ -6,18 +6,20 @@ # It is made available under the MIT License import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression import numpy as np from glob import glob -from features import texture, edginess_sobel + +from features import texture, chist +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler basedir = '../SimpleImageDataset/' haralicks = [] -sobels = [] labels = [] +chists = [] print('This script will test (with cross-validation) classification of the simple 3 class dataset') print('Computing features...') @@ -28,9 +30,9 @@ # Otherwise, this would introduce some variation just based on the random # ordering that the filesystem uses for fname in sorted(images): - im = mh.imread(fname, as_grey=True) - haralicks.append(texture(im)) - 
sobels.append(edginess_sobel(im)) + imc = mh.imread(fname) + haralicks.append(texture(mh.colors.rgb2grey(imc))) + chists.append(chist(imc)) # Files are named like building00.jpg, scene23.jpg... labels.append(fname[:-len('xx.jpg')]) @@ -38,27 +40,31 @@ print('Finished computing features.') haralicks = np.array(haralicks) -sobels = np.array(sobels) labels = np.array(labels) +chists = np.array(chists) + +haralick_plus_chists = np.hstack([chists, haralicks]) + -# We use logistic regression because it is very fast. +# We use Logistic Regression because it achieves high accuracy on small(ish) datasets # Feel free to experiment with other classifiers -scores = cross_validation.cross_val_score( - LogisticRegression(), haralicks, labels, cv=5) -print('Accuracy (5 fold x-val) with Logistic Regression [std features]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) -haralick_plus_sobel = np.hstack([np.atleast_2d(sobels).T, haralicks]) +from sklearn import cross_validation +cv = cross_validation.LeaveOneOut(len(images)) scores = cross_validation.cross_val_score( - LogisticRegression(), haralick_plus_sobel, labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Logistic Regression [std features + sobel]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) + clf, haralicks, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [haralick features]: {:.1%}'.format( + scores.mean())) +scores = cross_validation.cross_val_score( + clf, chists, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [color histograms]: {:.1%}'.format( + scores.mean())) -# We can try to just use the sobel feature. The result is almost completely -# random. scores = cross_validation.cross_val_score( - LogisticRegression(), np.atleast_2d(sobels).T, labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Logistic Regression [only using sobel feature]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) + clf, haralick_plus_chists, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [texture features + color histograms]: {:.1%}'.format( + scores.mean())) diff --git a/ch10/thresholded_figure.py b/ch10/thresholded_figure.py new file mode 100644 index 00000000..947762e8 --- /dev/null +++ b/ch10/thresholded_figure.py @@ -0,0 +1,31 @@ +import mahotas as mh +import numpy as np +from matplotlib import pyplot as plt + +# Load image & convert to B&W +image = mh.imread('../SimpleImageDataset/scene00.jpg') +image = mh.colors.rgb2grey(image, dtype=np.uint8) +plt.imshow(image) +plt.gray() +plt.title('original image') + +thresh = mh.thresholding.otsu(image) +print('Otsu threshold is {}.'.format(thresh)) + +threshed = (image > thresh) +plt.figure() +plt.imshow(threshed) +plt.title('thresholded image') +mh.imsave('thresholded.png', threshed.astype(np.uint8)*255) + +im16 = mh.gaussian_filter(image, 16) + +# Repeat the thresholding operations with the blurred image +thresh = mh.thresholding.otsu(im16.astype(np.uint8)) +threshed = (im16 > thresh) +plt.figure() +plt.imshow(threshed) +plt.title('thresholded image (after blurring)') +print('Otsu threshold after blurring is {}.'.format(thresh)) +mh.imsave('thresholded16.png', threshed.astype(np.uint8)*255) +plt.show() diff --git a/ch12/.gitignore b/ch12/.gitignore index fb47e2ef..fa8fbcc2 100644 --- a/ch12/.gitignore +++ b/ch12/.gitignore @@ -1 +1,3 @@ *.jugdata/ +output.txt +results.image.txt diff --git a/ch12/README.rst b/ch12/README.rst new file mode
100644 index 00000000..fba88d4a --- /dev/null +++ b/ch12/README.rst @@ -0,0 +1,28 @@ +========== +Chapter 12 +========== + +Support code for *Chapter 12: Big(ger) Data* + +Data +---- + +This chapter relies only on the image dataset that is packaged with the +repository at ``../SimpleImageDataset/``. + +Scripts +------- + +chapter.py + Code as written in the book +jugfile.py + Example jugfile +image-classification.py + Jugfile implementation of image classification from Chapter 10 + +setup-aws.txt + Commands to setup Amazon WebServices machine +run-jugfile.sh + Wrapper script to run jug file on jugfile.py +run-image-classification.sh + Wrapper script to run jug file on image-classification.py diff --git a/ch12/chapter.py b/ch12/chapter.py new file mode 100644 index 00000000..15326680 --- /dev/null +++ b/ch12/chapter.py @@ -0,0 +1,95 @@ +from jug import TaskGenerator +from glob import glob +import mahotas as mh +@TaskGenerator +def compute_texture(im): + from features import texture + imc = mh.imread(im) + return texture(mh.colors.rgb2gray(imc)) + +@TaskGenerator +def chist_file(fname): + from features import chist + im = mh.imread(fname) + return chist(im) + +import numpy as np +to_array = TaskGenerator(np.array) +hstack = TaskGenerator(np.hstack) + +haralicks = [] +chists = [] +labels = [] + +# Change this variable to point to +# the location of the dataset is on disk +basedir = '../SimpleImageDataset/' +# Use glob to get all the images +images = glob('{}/*.jpg'.format(basedir)) + +for fname in sorted(images): + haralicks.append(compute_texture(fname)) + chists.append(chist_file(fname)) + # The class is encoded in the filename as xxxx00.jpg + labels.append(fname[:-len('00.jpg')]) + +haralicks = to_array(haralicks) +chists = to_array(chists) +labels = to_array(labels) + +@TaskGenerator +def accuracy(features, labels): + from sklearn.linear_model import LogisticRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn import cross_validation + + clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + cv = cross_validation.LeaveOneOut(len(features)) + scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) + return scores.mean() +scores_base = accuracy(haralicks, labels) +scores_chist = accuracy(chists, labels) + +combined = hstack([chists, haralicks]) +scores_combined = accuracy(combined, labels) + +@TaskGenerator +def print_results(scores): + with open('results.image.txt', 'w') as output: + for k,v in scores: + output.write('Accuracy [{}]: {:.1%}\n'.format( + k, v.mean())) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('combined' , scores_combined), + ]) + +@TaskGenerator +def compute_lbp(fname): + from mahotas.features import lbp + imc = mh.imread(fname) + im = mh.colors.rgb2grey(imc) + return lbp(im, radius=8, points=6) + +lbps = [] +for fname in sorted(images): + # the rest of the loop as before + lbps.append(compute_lbp(fname)) +lbps = to_array(lbps) + +scores_lbps = accuracy(lbps, labels) +combined_all = hstack([chists, haralicks, lbps]) +scores_combined_all = accuracy(combined_all, labels) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('lbps', scores_lbps), + ('combined' , scores_combined), + ('combined_all' , scores_combined_all), + ]) diff --git a/ch12/features.py b/ch12/features.py new file mode 120000 index 00000000..142a324d --- /dev/null +++ b/ch12/features.py @@ -0,0 +1 @@ +../ch10/features.py \ No newline at end 
of file diff --git a/ch12/image-classification.py b/ch12/image-classification.py index 6283a588..6f76d26d 100644 --- a/ch12/image-classification.py +++ b/ch12/image-classification.py @@ -6,11 +6,11 @@ # It is made available under the MIT License import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression import numpy as np from glob import glob from jug import TaskGenerator + +# We need to use the `features` module from chapter 10. from sys import path path.append('../ch10') @@ -20,7 +20,7 @@ basedir = '../SimpleImageDataset/' @TaskGenerator -def features_for(im): +def compute_texture(im): '''Compute features for an image Parameters @@ -33,57 +33,84 @@ def features_for(im): fs : ndarray 1-D array of features ''' - im = mh.imread(im, as_grey=True).astype(np.uint8) - return mh.features.haralick(im).mean(0) + from features import texture + imc = mh.imread(im) + return texture(mh.colors.rgb2grey(imc)) + +@TaskGenerator +def chist(fname): + from features import chist as color_histogram + im = mh.imread(fname) + return color_histogram(im) @TaskGenerator -def edginess_sobel_from_fname(fname): - from edginess import edginess_sobel - return edginess_sobel(mh.imread(fname, as_grey=True)) +def compute_lbp(fname): + from mahotas.features import lbp + imc = mh.imread(fname) + im = mh.colors.rgb2grey(imc) + return lbp(im, radius=8, points=6) + @TaskGenerator def accuracy(features, labels): + from sklearn.linear_model import LogisticRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn import cross_validation # We use logistic regression because it is very fast. # Feel free to experiment with other classifiers + clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + cv = cross_validation.LeaveOneOut(len(features)) scores = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=5) + clf, features, labels, cv=cv) return scores.mean() @TaskGenerator -def stack_features(sobels, haralicks): - return np.hstack([np.atleast_2d(sobels).T, haralicks]) - -@TaskGenerator -def print_results(scores_base, scores_combined): - output = open('results.image.txt', 'w') - output.write('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: {}%\n'.format( - 0.1 * round(1000 * scores_base.mean()))) - output.write('Accuracy (5 fold x-val) with Logistic Regrssion [std features + sobel]: {}%\n'.format( - 0.1 * round(1000 * scores_combined.mean()))) - output.close() +def print_results(scores): + with open('results.image.txt', 'w') as output: + for k,v in scores: + output.write('Accuracy (LOO x-val) with Logistic Regression [{0}]: {1:.1%}\n'.format( + k, v.mean())) to_array = TaskGenerator(np.array) +hstack = TaskGenerator(np.hstack) haralicks = [] -sobels = [] +chists = [] +lbps = [] labels = [] # Use glob to get all the images -images = glob('{}/*.jpg'.format(basedir)) +images = glob('{0}/*.jpg'.format(basedir)) for fname in sorted(images): - haralicks.append(features_for(fname)) - sobels.append(edginess_sobel_from_fname(fname)) + haralicks.append(compute_texture(fname)) + chists.append(chist(fname)) + lbps.append(compute_lbp(fname)) labels.append(fname[:-len('00.jpg')]) # The class is encoded in the filename as xxxx00.jpg haralicks = to_array(haralicks) -sobels = to_array(sobels) +chists = to_array(chists) +lbps = to_array(lbps) labels = to_array(labels) scores_base = accuracy(haralicks, labels) -haralick_plus_sobel = stack_features(sobels, 
haralicks) -scores_combined = accuracy(haralick_plus_sobel, labels) - -print_results(scores_base, scores_combined) +scores_chist = accuracy(chists, labels) +scores_lbps = accuracy(lbps, labels) + +combined = hstack([chists, haralicks]) +scores_combined = accuracy(combined, labels) + +combined_all = hstack([chists, haralicks, lbps]) +scores_combined_all = accuracy(combined_all, labels) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('lbps', scores_lbps), + ('combined' , scores_combined), + ('combined_all' , scores_combined_all), + ]) diff --git a/ch12/jugfile.py b/ch12/jugfile.py index 858c64f4..9d7e2b7a 100644 --- a/ch12/jugfile.py +++ b/ch12/jugfile.py @@ -23,7 +23,7 @@ def add(a, b): @TaskGenerator def print_final_result(oname, value): with open(oname, 'w') as output: - output.write("Final result: {}\n".format(value)) + output.write("Final result: {0}\n".format(value)) input = 2 y = double(input) diff --git a/ch12/run-image-classification.sh b/ch12/run-image-classification.sh new file mode 100755 index 00000000..868d07fa --- /dev/null +++ b/ch12/run-image-classification.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +jug execute image-classification.py diff --git a/ch12/run-jugfile.sh b/ch12/run-jugfile.sh new file mode 100755 index 00000000..0ff59131 --- /dev/null +++ b/ch12/run-jugfile.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +jug execute + diff --git a/ch12/setup-aws.txt b/ch12/setup-aws.txt new file mode 100644 index 00000000..292654eb --- /dev/null +++ b/ch12/setup-aws.txt @@ -0,0 +1,7 @@ +sudo yum update +sudo yum -y install python-devel python-pip numpy scipy python-matplotlib +sudo yum -y install gcc-c++ +sudo yum -y install git +sudo pip-python install -U pip +sudo pip install scikit-learn jug mahotas +
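For reference, once the setup-aws.txt commands above have run, the ch12 jug scripts can be driven directly from the shell. The session below is only a sketch: the clone location is an assumption, extra ``jug execute`` invocations simply attach more workers to the same task store, and results.image.txt is the file written by the print_results task.

git clone https://github.com/luispedro/BuildingMachineLearningSystemsWithPython
cd BuildingMachineLearningSystemsWithPython/ch12
jug execute image-classification.py &   # start one worker in the background
jug execute image-classification.py &   # run again to add a second worker
jug status image-classification.py      # report waiting/running/finished tasks
cat results.image.txt                   # accuracy summary from print_results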