diff --git a/README.md b/README.md index 05218427..cbcd3155 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,14 @@ Building Machine Learning Systems with Python ============================================= -Source Code for the book Building Machine Learning Systems with Python by -[Willi Richert](http://twotoreal.com) and [Luis Pedro -Coelho](http://luispedro.org). +Source Code for the book Building Machine Learning Systems with Python by [Luis +Pedro Coelho](http://luispedro.org) and [Willi Richert](http://twotoreal.com). -The book was published in 2013 by Packt Publishing and is available [from their +The book was published in 2013 (second edition in 2015) by Packt Publishing and +is available [from their website](http://www.packtpub.com/building-machine-learning-systems-with-python/book). +The code in the repository corresponds to the second edition. Code for the +first edition is available in [first\_edition +branch](https://github.com/luispedro/BuildingMachineLearningSystemsWithPython/tree/first_edition). diff --git a/SimpleImageDataset/scene01.jpg b/SimpleImageDataset/scene01.jpg index 29b327d2..e7b416d3 100644 Binary files a/SimpleImageDataset/scene01.jpg and b/SimpleImageDataset/scene01.jpg differ diff --git a/SimpleImageDataset/scene08.jpg b/SimpleImageDataset/scene08.jpg index acc89a74..7dee8860 100644 Binary files a/SimpleImageDataset/scene08.jpg and b/SimpleImageDataset/scene08.jpg differ diff --git a/ch01/analyze_webstats.py b/ch01/analyze_webstats.py index 8d3c4c41..5da892e2 100644 --- a/ch01/analyze_webstats.py +++ b/ch01/analyze_webstats.py @@ -26,8 +26,9 @@ x = x[~sp.isnan(y)] y = y[~sp.isnan(y)] -# plot input data + def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None): + ''' plot input data ''' plt.figure(num=None, figsize=(8, 6)) plt.clf() @@ -138,8 +139,8 @@ def error(f, x, y): train = sorted(shuffled[split_idx:]) fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1)) fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2)) -print("fbt2(x)= \n%s"%fbt2) -print("fbt2(x)-100,000= \n%s"%(fbt2-100000)) +print("fbt2(x)= \n%s" % fbt2) +print("fbt2(x)-100,000= \n%s" % (fbt2-100000)) fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3)) fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10)) fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100)) diff --git a/ch01/gen_webstats.py b/ch01/gen_webstats.py index fa133d76..61d0b738 100644 --- a/ch01/gen_webstats.py +++ b/ch01/gen_webstats.py @@ -17,26 +17,22 @@ sp.random.seed(3) # to reproduce the data later on -x = sp.arange(1, 31 * 24) -y = sp.array(200 * (sp.sin(2 * sp.pi * x / (7 * 24))), dtype=int) +x = sp.arange(1, 31*24) +y = sp.array(200*(sp.sin(2*sp.pi*x/(7*24))), dtype=int) y += gamma.rvs(15, loc=0, scale=100, size=len(x)) -y += 2 * sp.exp(x / 100.0) -y = sp.ma.array(y, mask=[y < 0]) -print(sum(y), sum(y < 0)) +y += 2 * sp.exp(x/100.0) +y = sp.ma.array(y, mask=[y<0]) +print(sum(y), sum(y<0)) plt.scatter(x, y) plt.title("Web traffic over the last month") plt.xlabel("Time") plt.ylabel("Hits/hour") -plt.xticks([w * 7 * 24 for w in [0, 1, 2, 3, 4]], ['week %i' % (w + 1) for w in - [0, 1, 2, 3, 4]]) - +plt.xticks([w*7*24 for w in range(5)], + ['week %i' %(w+1) for w in range(5)]) plt.autoscale(tight=True) plt.grid() plt.savefig(os.path.join(CHART_DIR, "1400_01_01.png")) -# sp.savetxt(os.path.join("..", "web_traffic.tsv"), -# zip(x[~y.mask],y[~y.mask]), delimiter="\t", fmt="%i") - -sp.savetxt(os.path.join( - DATA_DIR, "web_traffic.tsv"), list(zip(x, y)), delimiter="\t", fmt="%s") +sp.savetxt(os.path.join(DATA_DIR, 
"web_traffic.tsv"), + list(zip(x, y)), delimiter="\t", fmt="%s") diff --git a/ch02/README.rst b/ch02/README.rst index 4dc0b211..e2cb729a 100644 --- a/ch02/README.rst +++ b/ch02/README.rst @@ -6,6 +6,9 @@ Support code for *Chapter 2: Learning How to Classify with Real-world Examples*. The directory data contains the seeds dataset, originally downloaded from https://archive.ics.uci.edu/ml/datasets/seeds +chapter.py + The code as printed in the book. + figure1.py Figure 1 in the book: all 2-by-2 scatter plots diff --git a/ch02/chapter.py b/ch02/chapter.py new file mode 100644 index 00000000..c68b45ab --- /dev/null +++ b/ch02/chapter.py @@ -0,0 +1,164 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + + +from matplotlib import pyplot as plt +import numpy as np + +# We load the data with load_iris from sklearn +from sklearn.datasets import load_iris +data = load_iris() + +# load_iris returns an object with several fields +features = data.data +feature_names = data.feature_names +target = data.target +target_names = data.target_names + +for t in range(3): + if t == 0: + c = 'r' + marker = '>' + elif t == 1: + c = 'g' + marker = 'o' + elif t == 2: + c = 'b' + marker = 'x' + plt.scatter(features[target == t, 0], + features[target == t, 1], + marker=marker, + c=c) +# We use NumPy fancy indexing to get an array of strings: +labels = target_names[target] + +# The petal length is the feature at position 2 +plength = features[:, 2] + +# Build an array of booleans: +is_setosa = (labels == 'setosa') + +# This is the important step: +max_setosa =plength[is_setosa].max() +min_non_setosa = plength[~is_setosa].min() +print('Maximum of setosa: {0}.'.format(max_setosa)) + +print('Minimum of others: {0}.'.format(min_non_setosa)) + +# ~ is the boolean negation operator +features = features[~is_setosa] +labels = labels[~is_setosa] +# Build a new target variable, is_virigina +is_virginica = (labels == 'virginica') + +# Initialize best_acc to impossibly low value +best_acc = -1.0 +for fi in range(features.shape[1]): + # We are going to test all possible thresholds + thresh = features[:,fi] + for t in thresh: + + # Get the vector for feature `fi` + feature_i = features[:, fi] + # apply threshold `t` + pred = (feature_i > t) + acc = (pred == is_virginica).mean() + rev_acc = (pred == ~is_virginica).mean() + if rev_acc > acc: + reverse = True + acc = rev_acc + else: + reverse = False + + if acc > best_acc: + best_acc = acc + best_fi = fi + best_t = t + best_reverse = reverse + +print(best_fi, best_t, best_reverse, best_acc) + +def is_virginica_test(fi, t, reverse, example): + 'Apply threshold model to a new example' + test = example[fi] > t + if reverse: + test = not test + return test +from threshold import fit_model, predict + +# ning accuracy was 96.0%. +# ing accuracy was 90.0% (N = 50). 
+correct = 0.0 + +for ei in range(len(features)): + # select all but the one at position `ei`: + training = np.ones(len(features), bool) + training[ei] = False + testing = ~training + model = fit_model(features[training], is_virginica[training]) + predictions = predict(model, features[testing]) + correct += np.sum(predictions == is_virginica[testing]) +acc = correct/float(len(features)) +print('Accuracy: {0:.1%}'.format(acc)) + + +########################################### +############## SEEDS DATASET ############## +########################################### + +from load import load_dataset + +feature_names = [ + 'area', + 'perimeter', + 'compactness', + 'length of kernel', + 'width of kernel', + 'asymmetry coefficien', + 'length of kernel groove', +] +features, labels = load_dataset('seeds') + + + +from sklearn.neighbors import KNeighborsClassifier +classifier = KNeighborsClassifier(n_neighbors=1) +from sklearn.cross_validation import KFold + +kf = KFold(len(features), n_folds=5, shuffle=True) +means = [] +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns fraction + # of correct decisions for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Mean accuracy: {:.1%}'.format(np.mean(means))) + + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +classifier = KNeighborsClassifier(n_neighbors=1) +classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)]) + +means = [] +for training,testing in kf: + # We learn a model for this fold with `fit` and then apply it to the + # testing data with `predict`: + classifier.fit(features[training], labels[training]) + prediction = classifier.predict(features[testing]) + + # np.mean on an array of booleans returns fraction + # of correct decisions for this fold: + curmean = np.mean(prediction == labels[testing]) + means.append(curmean) +print('Mean accuracy: {:.1%}'.format(np.mean(means))) diff --git a/ch02/figure1.py b/ch02/figure1.py index d6a4d287..4ec6fff8 100644 --- a/ch02/figure1.py +++ b/ch02/figure1.py @@ -5,9 +5,12 @@ # # It is made available under the MIT License -from sklearn.datasets import load_iris from matplotlib import pyplot as plt +# We load the data with load_iris from sklearn +from sklearn.datasets import load_iris + +# load_iris returns an object with several fields data = load_iris() features = data.data feature_names = data.feature_names @@ -16,13 +19,21 @@ fig,axes = plt.subplots(2, 3) pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)] + +# Set up 3 different pairs of (color, marker) +color_markers = [ + ('r', '>'), + ('g', 'o'), + ('b', 'x'), + ] for i, (p0, p1) in enumerate(pairs): ax = axes.flat[i] - # Use a different marker/color for each class `t` - for t, marker, c in zip(range(3), ">ox", "rgb"): + for t in range(3): + # Use a different color/marker for each class `t` + c,marker = color_markers[t] ax.scatter(features[target == t, p0], features[ - target == t, p1], marker=marker, c=c, s=40) + target == t, p1], marker=marker, c=c) ax.set_xlabel(feature_names[p0]) ax.set_ylabel(feature_names[p1]) ax.set_xticks([]) diff --git a/ch02/figure2.py b/ch02/figure2.py index e7222dd6..0b69d395 100644 --- a/ch02/figure2.py +++ b/ch02/figure2.py @@ -10,17 +10,22 @@ from matplotlib import pyplot as plt 
from sklearn.datasets import load_iris data = load_iris() -features = data['data'] -feature_names = data['feature_names'] -species = data['target_names'][data['target']] +features = data.data +feature_names = data.feature_names +target = data.target +target_names = data.target_names -is_setosa = (species == 'setosa') +# We use NumPy fancy indexing to get an array of strings: +labels = target_names[target] + +is_setosa = (labels == 'setosa') features = features[~is_setosa] -species = species[~is_setosa] -is_virginica = (species == 'virginica') +labels = labels[~is_setosa] +is_virginica = (labels == 'virginica') -# Hand fixed threshold: -t = 1.75 +# Hand fixed thresholds: +t = 1.65 +t2 = 1.75 # Features to use: 3 & 2 f0, f1 = 3, 2 @@ -45,7 +50,7 @@ ax.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c) ax.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c) ax.plot([t, t], [y0, y1], 'k--', lw=2) -ax.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2) +ax.plot([t2, t2], [y0, y1], 'k:', lw=2) ax.scatter(features[is_virginica, f0], features[is_virginica, f1], c='b', marker='o', s=40) ax.scatter(features[~is_virginica, f0], diff --git a/ch02/figure4_5_no_sklearn.py b/ch02/figure4_5_no_sklearn.py index 5f67e0d7..adc83d73 100644 --- a/ch02/figure4_5_no_sklearn.py +++ b/ch02/figure4_5_no_sklearn.py @@ -45,7 +45,7 @@ def plot_decision(features, labels): model = fit_model(1, features[:, (0, 2)], np.array(labels)) C = predict( - np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape) + model, np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape) if COLOUR_FIGURE: cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)]) else: diff --git a/ch02/figure4_5_sklearn.py b/ch02/figure4_5_sklearn.py index 9d20c7cb..55ac0c80 100644 --- a/ch02/figure4_5_sklearn.py +++ b/ch02/figure4_5_sklearn.py @@ -58,11 +58,11 @@ def plot_decision(features, labels, num_neighbors=1): ax.pcolormesh(X, Y, C, cmap=cmap) if COLOUR_FIGURE: cmap = ListedColormap([(1., .0, .0), (.1, .6, .1), (.0, .0, 1.)]) - ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap, s=40) + ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap) else: for lab, ma in zip(range(3), "Do^"): ax.plot(features[labels == lab, 0], features[ - labels == lab, 2], ma, c=(1., 1., 1.), ms=8) + labels == lab, 2], ma, c=(1., 1., 1.), ms=6) return fig,ax diff --git a/ch02/knn.py b/ch02/knn.py index e1d48ce0..89ebfdb4 100644 --- a/ch02/knn.py +++ b/ch02/knn.py @@ -26,7 +26,7 @@ def plurality(xs): return k # This function was called ``apply_model`` in the first edition -def predict(features, model): +def predict(model, features): '''Apply k-nn model''' k, train_feats, labels = model results = [] @@ -42,5 +42,5 @@ def predict(features, model): def accuracy(features, labels, model): - preds = predict(features, model) + preds = predict(model, features) return np.mean(preds == labels) diff --git a/ch02/simple_threshold.py b/ch02/simple_threshold.py index b0e4e71e..d174f283 100644 --- a/ch02/simple_threshold.py +++ b/ch02/simple_threshold.py @@ -12,9 +12,12 @@ target = data['target'] target_names = data['target_names'] labels = target_names[target] - plength = features[:, 2] + +# To use numpy operations to get setosa features, +# we build a boolean array is_setosa = (labels == 'setosa') + max_setosa = plength[is_setosa].max() min_non_setosa = plength[~is_setosa].min() diff --git a/ch02/threshold.py b/ch02/threshold.py index efbd942b..d621a350 100644 --- a/ch02/threshold.py +++ b/ch02/threshold.py @@ -40,7 +40,7 @@ def fit_model(features, labels): 
# This function was called ``apply_model`` in the first edition -def predict(features, model): +def predict(model, features): '''Apply a learned model''' # A model is a pair as returned by fit_model t, fi, reverse = model @@ -51,5 +51,5 @@ def predict(features, model): def accuracy(features, labels, model): '''Compute the accuracy of the model''' - preds = predict(features, model) + preds = predict(model, features) return np.mean(preds == labels) diff --git a/ch04/.gitignore b/ch04/.gitignore index 0a7a205c..c4c0b18a 100644 --- a/ch04/.gitignore +++ b/ch04/.gitignore @@ -1,2 +1,6 @@ wiki_lda.pkl wiki_lda.pkl.state +*.png +*.npy +*.pkl +topics.txt diff --git a/ch04/README.rst b/ch04/README.rst index e40eccbe..99a3c186 100644 --- a/ch04/README.rst +++ b/ch04/README.rst @@ -4,6 +4,16 @@ Chapter 4 Support code for *Chapter 4: Topic Modeling* + +AP Data +------- + +To download the AP data, use the ``download_ap.sh`` script inside the ``data`` +directory:: + + cd data + ./download_ap.sh + Word cloud creation ------------------- @@ -33,7 +43,9 @@ may remove the input file if you want to save disk space To generate the model, you can run the ``wikitopics_create.py`` script, while the ``wikitopics_plot.py`` script will plot the most heavily discussed topic as -well as the least heavily discussed one. +well as the least heavily discussed one. The code is split into steps as the +first one can take a very long time. Then it saves the results so that you can +later explore them at leisure. You should not expect that your results will exactly match the results in the book, for two reasons: @@ -41,3 +53,13 @@ book, for two reasons: 1. The LDA algorithm is a probabilistic algorithm and can give different results every time it is run. 2. Wikipedia keeps changing. Thus, even your input data will be different. + +Scripts +------- + +blei_lda.py + Computes LDA using the AP Corpus. +wikitopics_create.py + Create the topic model for Wikipedia using LDA (must download wikipedia database first) +wikitopics_create_hdp.py + Create the topic model for Wikipedia using HDP (must download wikipedia database first) diff --git a/ch04/blei_lda.py b/ch04/blei_lda.py index 14f18349..7f6ac2b3 100644 --- a/ch04/blei_lda.py +++ b/ch04/blei_lda.py @@ -8,19 +8,13 @@ from __future__ import print_function from wordcloud import create_cloud try: - from gensim import corpora, models + from gensim import corpora, models, matutils except: print("import gensim failed.") print() print("Please install it") raise -try: - from mpltools import style - style.use('ggplot') -except: - print("Could not import mpltools: plots will not be styled correctly") - import matplotlib.pyplot as plt import numpy as np from os import path @@ -39,27 +33,22 @@ model = models.ldamodel.LdaModel( corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=None) -ti = 0 # Iterate over all the topics in the model -for ti in xrange(model.num_topics): +for ti in range(model.num_topics): words = model.show_topic(ti, 64) - tf = sum(f for f, w in words) + tf = sum(f for _, f in words) with open('topics.txt', 'w') as output: - output.write('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for f, w in words)) + output.write('\n'.join('{}:{}'.format(w, int(1000. 
* f / tf)) for w, f in words)) output.write("\n\n\n") # We first identify the most discussed topic, i.e., the one with the # highest total weight -# First, we need to sum up the weights across all the documents -weight = np.zeros(model.num_topics) -for doc in corpus: - for col, val in model[doc]: - weight[col] += val - # As a reasonable alternative, we could have used the log of val: - # weight[col] += np.log(val) +topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics) +weight = topics.sum(1) max_topic = weight.argmax() + # Get the top 64 words for this topic # Without the argument, show_topic would return only 10 words words = model.show_topic(max_topic, 64) @@ -68,24 +57,30 @@ create_cloud('cloud_blei_lda.png', words) num_topics_used = [len(model[doc]) for doc in corpus] -plt.hist(num_topics_used, np.arange(42)) -plt.ylabel('Nr of documents') -plt.xlabel('Nr of topics') -plt.savefig('../1400OS_04_01+.png') -plt.clf() +fig,ax = plt.subplots() +ax.hist(num_topics_used, np.arange(42)) +ax.set_ylabel('Nr of documents') +ax.set_xlabel('Nr of topics') +fig.tight_layout() +fig.savefig('Figure_04_01.png') -# Now, repeat the same exercise using alpha=1: +# Now, repeat the same exercise using alpha=1.0 +# You can edit the constant below to play around with this parameter +ALPHA = 1.0 model1 = models.ldamodel.LdaModel( - corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=1.) + corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=ALPHA) num_topics_used1 = [len(model1[doc]) for doc in corpus] -plt.hist([num_topics_used, num_topics_used1], np.arange(42)) -plt.ylabel('Nr of documents') -plt.xlabel('Nr of topics') +fig,ax = plt.subplots() +ax.hist([num_topics_used, num_topics_used1], np.arange(42)) +ax.set_ylabel('Nr of documents') +ax.set_xlabel('Nr of topics') # The coordinates below were fit by trial and error to look good -plt.text(9, 223, r'default alpha') -plt.text(26, 156, 'alpha=1.0') -plt.savefig('../1400OS_04_02+.png') +ax.text(9, 223, r'default alpha') +ax.text(26, 156, 'alpha=1.0') +fig.tight_layout() +fig.savefig('Figure_04_02.png') + diff --git a/ch04/build_lda.py b/ch04/build_lda.py index d4dcb7f2..a0ee9c5f 100644 --- a/ch04/build_lda.py +++ b/ch04/build_lda.py @@ -79,7 +79,7 @@ def __len__(self): distances = distance.squareform(distance.pdist(thetas)) large = distances.max() + 1 -for i in xrange(len(distances)): +for i in range(len(distances)): distances[i, i] = large print(otexts[1]) diff --git a/ch04/data/.gitignore b/ch04/data/.gitignore index 2436b020..91f24bbc 100644 --- a/ch04/data/.gitignore +++ b/ch04/data/.gitignore @@ -1,8 +1,12 @@ ap.tgz ap/ +dataset-379-20news-18828_HJRZF.zip +379/ enwiki-latest-pages-articles.xml.bz2 wiki_en_output_bow.mm +wiki_en_output_bow.mm.gz wiki_en_output_bow.mm.index wiki_en_output_tfidf.mm +wiki_en_output_tfidf.mm.gz wiki_en_output_tfidf.mm.index wiki_en_output_wordids.txt.bz2 diff --git a/ch04/data/download_ap.sh b/ch04/data/download_ap.sh index 6de8ded8..da27814a 100755 --- a/ch04/data/download_ap.sh +++ b/ch04/data/download_ap.sh @@ -1,3 +1,3 @@ #!/bin/sh -wget http://www.cs.princeton.edu/~blei/lda-c/ap.tgz +wget http://www.cs.columbia.edu/~blei/lda-c/ap.tgz tar xzf ap.tgz diff --git a/ch04/wikitopics_create_hdp.py b/ch04/wikitopics_create_hdp.py new file mode 100644 index 00000000..951d1850 --- /dev/null +++ b/ch04/wikitopics_create_hdp.py @@ -0,0 +1,39 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by 
PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import logging +import gensim +import numpy as np + +# Set up logging in order to get progress information as the model is being built: +logging.basicConfig( + format='%(asctime)s : %(levelname)s : %(message)s', + level=logging.INFO) + +# Load the preprocessed corpus (id2word & mm): +id2word = gensim.corpora.Dictionary.load_from_text( + 'data/wiki_en_output_wordids.txt.bz2') +mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm') + +# Calling the constructor is enough to build the model +# This call will take a few hours! +model = gensim.models.hdpmodel.HdpModel( + corpus=mm, + id2word=id2word, + chunksize=10000) + +# Save the model so we do not need to learn it again. +model.save('wiki_hdp.pkl') + +# Compute the document/topic matrix +topics = np.zeros((len(mm), model.num_topics)) +for di,doc in enumerate(mm): + doc_top = model[doc] + for ti,tv in doc_top: + topics[di,ti] += tv +np.save('topics_hdp.npy', topics) diff --git a/ch04/wikitopics_plot.py b/ch04/wikitopics_plot.py index f51e6dec..04adf780 100644 --- a/ch04/wikitopics_plot.py +++ b/ch04/wikitopics_plot.py @@ -31,7 +31,7 @@ topics = np.load('topics.npy', mmap_mode='r') # Compute the number of topics mentioned in each document -lens = (topics > 0).sum(1) +lens = (topics > 0).sum(axis=1) print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens))) print('Percentage of articles mentioning less than 10 topics: {0:.1%}'.format(np.mean(lens <= 10))) @@ -42,16 +42,23 @@ words = model.show_topic(weights.argmax(), 64) # The parameter ``maxsize`` often needs some manual tuning to make it look nice. -create_cloud('Wikipedia_most.png', words, maxsize=410, fontname='Neucha') -print(words) +create_cloud('Wikipedia_most.png', words, maxsize=250, fontname='Cardo') + +fraction_mention = np.mean(topics[:,weights.argmax()] > 0) +print("The most mentioned topics is mentioned in {:.1%} of documents.".format(fraction_mention)) +total_weight = np.mean(topics[:,weights.argmax()]) +print("It represents {:.1%} of the total number of words.".format(total_weight)) print() print() print() # Retrieve the **least** heavily used topic and plot it as a word cloud: words = model.show_topic(weights.argmin(), 64) -create_cloud('Wikipedia_least.png', words, maxsize=180, fontname='Neucha') -print(words) +create_cloud('Wikipedia_least.png', words, maxsize=150, fontname='Cardo') +fraction_mention = np.mean(topics[:,weights.argmin()] > 0) +print("The least mentioned topics is mentioned in {:.1%} of documents.".format(fraction_mention)) +total_weight = np.mean(topics[:,weights.argmin()]) +print("It represents {:.1%} of the total number of words.".format(total_weight)) print() print() print() diff --git a/ch04/wordcloud.py b/ch04/wordcloud.py index 6c5302ea..accca2d6 100644 --- a/ch04/wordcloud.py +++ b/ch04/wordcloud.py @@ -24,8 +24,6 @@ def create_cloud(oname, words,maxsize=120, fontname='Lobster'): # gensim returns a weight between 0 and 1 for each word, while pytagcloud # expects an integer word count. So, we multiply by a large number and # round. For a visualization this is an adequate approximation. 
- # We also need to flip the order as gensim returns (value, word), whilst - # pytagcloud expects (word, value): - words = [(w,int(v*10000)) for v,w in words] + words = [(w,int(v*10000)) for w,v in words] tags = make_tags(words, maxsize=maxsize) create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname) diff --git a/ch05/classify.py b/ch05/classify.py index ef5af419..76699819 100644 --- a/ch05/classify.py +++ b/ch05/classify.py @@ -54,6 +54,9 @@ def prepare_sent_features(): if not text: meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0 else: + from platform import python_version + if python_version().startswith('2'): + text = text.decode('utf-8') sent_lens = [len(nltk.word_tokenize( sent)) for sent in nltk.sent_tokenize(text)] meta[pid]['AvgSentLen'] = np.mean(sent_lens) diff --git a/ch06/01_start.py b/ch06/01_start.py index 2940b0fa..0bdbdd12 100644 --- a/ch06/01_start.py +++ b/ch06/01_start.py @@ -40,7 +40,7 @@ def create_ngram_model(): def train_model(clf_factory, X, Y, name="NB ngram", plot=False): cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) train_errors = [] test_errors = [] @@ -83,7 +83,7 @@ def train_model(clf_factory, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -94,18 +94,18 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) if __name__ == "__main__": X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] @@ -113,19 +113,19 @@ def print_incorrect(clf, X, Y): train_model(create_ngram_model, X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) train_model(create_ngram_model, X, Y, name="sent vs rest", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(create_ngram_model, X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(create_ngram_model, X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) diff --git a/ch06/02_tuning.py b/ch06/02_tuning.py index a48d3edc..5b6a835f 100644 --- a/ch06/02_tuning.py +++ b/ch06/02_tuning.py @@ -45,7 +45,7 @@ def create_ngram_model(params=None): def grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], @@ -64,7 +64,7 @@ def grid_search_model(clf_factory, X, Y): verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ - print clf + print(clf) return clf @@ -114,7 +114,7 @@ def train_model(clf, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -125,9 +125,9 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) def get_best_model(): @@ -149,16 +149,16 @@ def get_best_model(): X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) @@ -166,16 +166,16 @@ def get_best_model(): # rest", plot=True) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) diff --git a/ch06/03_clean.py b/ch06/03_clean.py index f00c2c44..0170ed82 100644 --- a/ch06/03_clean.py +++ b/ch06/03_clean.py @@ -57,7 +57,7 @@ } emo_repl_order = [k for (k_len, k) in reversed( - sorted([(len(k), k) for k in emo_repl.keys()]))] + sorted([(len(k), k) for k in list(emo_repl.keys())]))] re_repl = { r"\br\b": "are", @@ -84,7 +84,7 @@ def preprocessor(tweet): for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) - for r, repl in re_repl.iteritems(): + for r, repl in re_repl.items(): tweet = re.sub(r, repl, tweet) return tweet @@ -103,7 +103,7 @@ def preprocessor(tweet): def train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) train_errors = [] test_errors = [] @@ -150,7 +150,7 @@ def train_model(clf, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -161,9 +161,9 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) def get_best_model(): @@ -185,16 +185,16 @@ def get_best_model(): X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) @@ -202,16 +202,16 @@ def get_best_model(): # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) diff --git a/ch06/04_sent.py b/ch06/04_sent.py index 87fbafc9..c09a435f 100644 --- a/ch06/04_sent.py +++ b/ch06/04_sent.py @@ -153,7 +153,7 @@ def transform(self, documents): } emo_repl_order = [k for (k_len, k) in reversed( - sorted([(len(k), k) for k in emo_repl.keys()]))] + sorted([(len(k), k) for k in list(emo_repl.keys())]))] re_repl = { r"\br\b": "are", @@ -179,7 +179,7 @@ def preprocessor(tweet): for k in emo_repl_order: tweet = tweet.replace(k, emo_repl[k]) - for r, repl in re_repl.iteritems(): + for r, repl in re_repl.items(): tweet = re.sub(r, repl, tweet) return tweet.replace("-", " ").replace("_", " ") @@ -202,7 +202,7 @@ def preprocessor(tweet): def __grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], @@ -220,7 +220,7 @@ def __grid_search_model(clf_factory, X, Y): verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ - print clf + print(clf) return clf @@ -228,7 +228,7 @@ def __grid_search_model(clf_factory, X, Y): def train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( - n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) + n=len(X), n_iter=10, test_size=0.3, random_state=0) train_errors = [] test_errors = [] @@ -275,7 +275,7 @@ def train_model(clf, X, Y, name="NB ngram", plot=False): summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) - print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary + print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) @@ -286,9 +286,9 @@ def print_incorrect(clf, X, Y): X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] - for idx in xrange(len(X_wrong)): - print "clf.predict('%s')=%i instead of %i" %\ - (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]) + for idx in range(len(X_wrong)): + print("clf.predict('%s')=%i instead of %i" % + (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) def get_best_model(): @@ -315,16 +315,16 @@ def get_best_model(): #Y_orig = Y_orig[:100,] classes = np.unique(Y_orig) for c in classes: - print "#%s: %i" % (c, sum(Y_orig == c)) + print("#%s: %i" % (c, sum(Y_orig == c))) - print "== Pos vs. neg ==" + print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) - print "== Pos/neg vs. irrelevant/neutral ==" + print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) @@ -332,18 +332,18 @@ def get_best_model(): # rest", plot=True) train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True) - print "== Pos vs. rest ==" + print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) - print "== Neg vs. rest ==" + print("== Neg vs. 
rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) - print "time spent:", time.time() - start_time + print("time spent:", time.time() - start_time) json.dump(poscache, open(poscache_filename, "w")) diff --git a/ch06/install.py b/ch06/install.py index c07a195f..be61f485 100644 --- a/ch06/install.py +++ b/ch06/install.py @@ -11,23 +11,19 @@ # # Pulls tweet data from Twitter because ToS prevents distributing it directly. # -# Right now we use unauthenticated requests, which are rate-limited to 150/hr. -# We use 125/hr to stay safe. -# -# # - Niek Sanders # njs@sananalytics.com # October 20, 2011 # # -# Excuse the ugly code. I threw this together as quickly as possible and I -# don't normally code in Python. -# # In Sanders' original form, the code was using Twitter API 1.0. # Now that Twitter moved to 1.1, we had to make a few changes. # Cf. twitterauth.py for the details. +# Regarding rate limiting, please check +# https://dev.twitter.com/rest/public/rate-limiting + import sys import csv import json @@ -38,24 +34,21 @@ import twitter except ImportError: print("""\ -You need to install python-twitter: - pip install python-twitter +You need to ... + pip install twitter If pip is not found you might have to install it using easy_install. If it does not work on your system, you might want to follow instructions -at https://github.com/bear/python-twitter, most likely: - $ git clone https://github.com/bear/python-twitter - $ cd python-twitter +at https://github.com/sixohsix/twitter, most likely: + $ git clone https://github.com/sixohsix/twitter + $ cd twitter $ sudo python setup.py install """) sys.exit(1) from twitterauth import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN_KEY, ACCESS_TOKEN_SECRET -api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, - access_token_key=ACCESS_TOKEN_KEY, access_token_secret=ACCESS_TOKEN_SECRET) - - -MAX_TWEETS_PER_HR = 350 +api = twitter.Twitter(auth=twitter.OAuth(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET, + token=ACCESS_TOKEN_KEY, token_secret=ACCESS_TOKEN_SECRET)) DATA_PATH = "data" @@ -87,16 +80,15 @@ def get_user_params(DATA_PATH): def dump_user_params(user_params): # dump user params for confirmation - print 'Input: ' + user_params['inList'] - print 'Output: ' + user_params['outList'] - print 'Raw data: ' + user_params['rawDir'] - return + print('Input: ' + user_params['inList']) + print('Output: ' + user_params['outList']) + print('Raw data: ' + user_params['rawDir']) def read_total_list(in_filename): # read total fetch list csv - fp = open(in_filename, 'rb') + fp = open(in_filename, 'rt') reader = csv.reader(fp, delimiter=',', quotechar='"') if os.path.exists(MISSING_ID_FILE): @@ -111,10 +103,12 @@ def read_total_list(in_filename): else: not_authed_ids = [] - print "We will skip %i tweets that are not available/visible any more on twitter" % (len(missing_ids) + len(not_authed_ids)) + print("We will skip %i tweets that are not available or visible any more on twitter" % ( + len(missing_ids) + len(not_authed_ids))) ignore_ids = set(missing_ids + not_authed_ids) total_list = [] + for row in reader: if row[2] not in ignore_ids: total_list.append(row) @@ -140,12 +134,12 @@ def purge_already_fetched(fetch_list, raw_dir): parse_tweet_json(tweet_file) count_done += 1 except RuntimeError: - print "Error parsing", item + print("Error parsing", item) rem_list.append(item) else: rem_list.append(item) - print "We have already downloaded %i tweets." 
% count_done + print("We have already downloaded %i tweets." % count_done) return rem_list @@ -156,66 +150,50 @@ def download_tweets(fetch_list, raw_dir): if not os.path.exists(raw_dir): os.mkdir(raw_dir) - # stay within rate limits - download_pause_sec = 3600 / MAX_TWEETS_PER_HR - # download tweets for idx in range(0, len(fetch_list)): - # stay in Twitter API rate limits - print 'Pausing %d sec to obey Twitter API rate limits' % \ - (download_pause_sec) - time.sleep(download_pause_sec) - # current item item = fetch_list[idx] - print item - - # print status - print '--> downloading tweet #%s (%d of %d)' % \ - (item[2], idx + 1, len(fetch_list)) + print(item) - # Old Twitter API 1.0 - # pull data - # url = 'https://api.twitter.com/1/statuses/show.json?id=' + item[2] - # print url - # urllib.urlretrieve(url, raw_dir + item[2] + '.json') + print('--> downloading tweet #%s (%d of %d)' % + (item[2], idx + 1, len(fetch_list))) - # New Twitter API 1.1 try: - sec = api.GetSleepTime('/statuses/show/:id') - if sec > 0: - print "Sleeping %i seconds to conform to Twitter's rate limiting" % sec - time.sleep(sec) + #import pdb;pdb.set_trace() + response = api.statuses.show(_id=item[2]) - result = api.GetStatus(item[2]) - json_data = result.AsJsonString() + if response.rate_limit_remaining <= 0: + wait_seconds = response.rate_limit_reset - time.time() + print("Rate limiting requests us to wait %f seconds" % + wait_seconds) + time.sleep(wait_seconds+5) - except twitter.TwitterError, e: + except twitter.TwitterError as e: fatal = True - for m in e.message: + print(e) + for m in json.loads(e.response_data.decode())['errors']: if m['code'] == 34: - print "Tweet missing: ", item - # [{u'message': u'Sorry, that page does not exist', u'code': 34}] - with open(MISSING_ID_FILE, "a") as f: + print("Tweet missing: ", item) + with open(MISSING_ID_FILE, "at") as f: f.write(item[2] + "\n") fatal = False break elif m['code'] == 63: - print "User of tweet '%s' has been suspended." % item - # [{u'message': u'Sorry, that page does not exist', u'code': 34}] - with open(MISSING_ID_FILE, "a") as f: + print("User of tweet '%s' has been suspended." % item) + with open(MISSING_ID_FILE, "at") as f: f.write(item[2] + "\n") fatal = False break elif m['code'] == 88: - print "Rate limit exceeded. Please lower max_tweets_per_hr." + print("Rate limit exceeded.") fatal = True break elif m['code'] == 179: - print "Not authorized to view this tweet." 
- with open(NOT_AUTHORIZED_ID_FILE, "a") as f: + print("Not authorized to view this tweet.") + with open(NOT_AUTHORIZED_ID_FILE, "at") as f: f.write(item[2] + "\n") fatal = False break @@ -225,8 +203,8 @@ def download_tweets(fetch_list, raw_dir): else: continue - with open(raw_dir + item[2] + '.json', "w") as f: - f.write(json_data + "\n") + with open(raw_dir + item[2] + '.json', "wt") as f: + f.write(json.dumps(dict(response)) + "\n") return @@ -234,12 +212,13 @@ def download_tweets(fetch_list, raw_dir): def parse_tweet_json(filename): # read tweet - fp = open(filename, 'rb') + fp = open(filename, 'r') # parse json try: tweet_json = json.load(fp) - except ValueError: + except ValueError as e: + print(e) raise RuntimeError('error parsing json') # look for twitter api error msgs @@ -281,20 +260,20 @@ def build_output_corpus(out_filename, raw_dir, total_list): writer.writerow(full_row) except RuntimeError: - print '--> bad data in tweet #' + item[2] + print('--> bad data in tweet #' + item[2]) missing_count += 1 else: - print '--> missing tweet #' + item[2] + print('--> missing tweet #' + item[2]) missing_count += 1 # indicate success if missing_count == 0: - print '\nSuccessfully downloaded corpus!' - print 'Output in: ' + out_filename + '\n' + print('\nSuccessfully downloaded corpus!') + print('Output in: ' + out_filename + '\n') else: - print '\nMissing %d of %d tweets!' % (missing_count, len(total_list)) - print 'Partial output in: ' + out_filename + '\n' + print('\nMissing %d of %d tweets!' % (missing_count, len(total_list))) + print('Partial output in: ' + out_filename + '\n') return @@ -302,7 +281,7 @@ def build_output_corpus(out_filename, raw_dir, total_list): def main(): # get user parameters user_params = get_user_params(DATA_PATH) - print user_params + print(user_params) dump_user_params(user_params) # get fetch list @@ -310,7 +289,7 @@ def main(): # remove already fetched or missing tweets fetch_list = purge_already_fetched(total_list, user_params['rawDir']) - print "Fetching %i tweets..." % len(fetch_list) + print("Fetching %i tweets..." % len(fetch_list)) if fetch_list: # start fetching data from twitter @@ -319,10 +298,11 @@ def main(): # second pass for any failed downloads fetch_list = purge_already_fetched(total_list, user_params['rawDir']) if fetch_list: - print '\nStarting second pass to retry %i failed downloads...' % len(fetch_list) + print('\nStarting second pass to retry %i failed downloads...' % + len(fetch_list)) download_tweets(fetch_list, user_params['rawDir']) else: - print "Nothing to fetch any more." + print("Nothing to fetch any more.") # build output corpus build_output_corpus(user_params['outList'], user_params['rawDir'], diff --git a/ch06/utils.py b/ch06/utils.py index ca8b2357..6757354f 100644 --- a/ch06/utils.py +++ b/ch06/utils.py @@ -6,6 +6,7 @@ # It is made available under the MIT License import os +import sys import collections import csv import json @@ -57,7 +58,7 @@ def load_sanders_data(dirname=".", line_count=-1): try: tweet = json.load(open(tweet_fn, "r")) except IOError: - print("Tweet '%s' not found. Skip."%tweet_fn) + print(("Tweet '%s' not found. Skip." 
% tweet_fn)) continue if 'text' in tweet and tweet['user']['lang'] == "en": @@ -84,14 +85,14 @@ def plot_pr(auc_score, name, phase, precision, recall, label=None): pylab.title('P/R curve (AUC=%0.2f) / %s' % (auc_score, label)) filename = name.replace(" ", "_") pylab.savefig(os.path.join(CHART_DIR, "pr_%s_%s.png" % - (filename, phase)), bbox_inches="tight") + (filename, phase)), bbox_inches="tight") def show_most_informative_features(vectorizer, clf, n=20): c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names())) - top = zip(c_f[:n], c_f[:-(n + 1):-1]) + top = list(zip(c_f[:n], c_f[:-(n + 1):-1])) for (c1, f1), (c2, f2) in top: - print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2) + print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2)) def plot_log(): @@ -119,7 +120,7 @@ def plot_feat_importance(feature_names, clf, name): inds = np.argsort(coef) f_imp = f_imp[inds] coef = coef[inds] - xpos = np.array(range(len(coef))) + xpos = np.array(list(range(len(coef)))) pylab.bar(xpos, coef, width=1) pylab.title('Feature importance for %s' % (name)) @@ -181,8 +182,13 @@ def plot_bias_variance(data_sizes, train_errors, test_errors, name): def load_sent_word_net(): sent_scores = collections.defaultdict(list) + sentiwordnet_path = os.path.join(DATA_DIR, "SentiWordNet_3.0.0_20130122.txt") - with open(os.path.join(DATA_DIR, "SentiWordNet_3.0.0_20130122.txt"), "r") as csvfile: + if not os.path.exists(sentiwordnet_path): + print("Please download SentiWordNet_3.0.0 from http://sentiwordnet.isti.cnr.it/download.php, extract it and put it into the data directory") + sys.exit(1) + + with open(sentiwordnet_path, 'r') as csvfile: reader = csv.reader(csvfile, delimiter='\t', quotechar='"') for line in reader: if line[0].startswith("#"): @@ -200,7 +206,7 @@ def load_sent_word_net(): term = term.replace("-", " ").replace("_", " ") key = "%s/%s" % (POS, term.split("#")[0]) sent_scores[key].append((float(PosScore), float(NegScore))) - for key, value in sent_scores.iteritems(): + for key, value in sent_scores.items(): sent_scores[key] = np.mean(value, axis=0) return sent_scores diff --git a/ch07/.gitignore b/ch07/.gitignore index d21bb3d1..e33609d2 100644 --- a/ch07/.gitignore +++ b/ch07/.gitignore @@ -1 +1 @@ -.formula_*/ +*.png diff --git a/ch07/README.rst b/ch07/README.rst new file mode 100644 index 00000000..12a7b051 --- /dev/null +++ b/ch07/README.rst @@ -0,0 +1,42 @@ +========= +Chapter 7 +========= + +Support code for *Chapter 7: Regression* + + +Boston data analysis +-------------------- + +This dataset is shipped with sklearn. Thus, no extra download is required. + + +boston1.py + Fit a linear regression model to the Boston house price data +boston1numpy.py + Version of above script using numpy operations for linear regression +boston_cv_penalized.py + Test different penalized (and OLS) regression schemes on the Boston dataset +figure1_2.py + Show the regression line for Boston data +figure3.py + Show the regression line for Boston data with OLS and Lasso +figure4.py + Scatter plot of predicted-vs-actual for multidimensional regression + +10K data analysis +----------------- + +lr10k.py + Linear regression on 10K dataset, evaluation by cross-validation +predict10k_en.py + Elastic nets (including with inner cross-validation for parameter + settings). Produces scatter plot. + + +MovieLens data analysis +----------------------- + +In this chapter, we only consider a very simple approach, which is implemented +in the ``usermodel.py`` script. 
+ diff --git a/ch07/boston1.py b/ch07/boston1.py index abfa40b3..d0b30447 100644 --- a/ch07/boston1.py +++ b/ch07/boston1.py @@ -19,6 +19,7 @@ x = boston.data y = boston.target +# Fitting a model is trivial: call the ``fit`` method in LinearRegression: lr = LinearRegression() lr.fit(x, y) @@ -26,11 +27,13 @@ rmse = np.sqrt(lr.residues_/len(x)) print('RMSE: {}'.format(rmse)) +fig, ax = plt.subplots() +# Plot a diagonal (for reference): +ax.plot([0, 50], [0, 50], '-', color=(.9,.3,.3), lw=4) + # Plot the prediction versus real: -plt.plot(lr.predict(x), boston.target, 'ro') +ax.scatter(lr.predict(x), boston.target) -# Plot a diagonal (for reference): -plt.plot([0, 50], [0, 50], 'g-') -plt.xlabel('predicted') -plt.ylabel('real') -plt.show() +ax.set_xlabel('predicted') +ax.set_ylabel('real') +fig.savefig('Figure_07_08.png') diff --git a/ch07/boston_cv10_penalized.py b/ch07/boston_cv_penalized.py similarity index 54% rename from ch07/boston_cv10_penalized.py rename to ch07/boston_cv_penalized.py index 3ce97093..c894c4fa 100644 --- a/ch07/boston_cv10_penalized.py +++ b/ch07/boston_cv_penalized.py @@ -10,40 +10,37 @@ from __future__ import print_function import numpy as np from sklearn.cross_validation import KFold -from sklearn.linear_model import ElasticNet, Lasso, Ridge +from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge +from sklearn.metrics import r2_score from sklearn.datasets import load_boston boston = load_boston() x = boston.data y = boston.target for name, met in [ - ('elastic-net(.5)', ElasticNet(fit_intercept=True, alpha=0.5)), - ('lasso(.5)', Lasso(fit_intercept=True, alpha=0.5)), - ('ridge(.5)', Ridge(fit_intercept=True, alpha=0.5)), + ('linear regression', LinearRegression()), + ('lasso()', Lasso()), + ('elastic-net(.5)', ElasticNet(alpha=0.5)), + ('lasso(.5)', Lasso(alpha=0.5)), + ('ridge(.5)', Ridge(alpha=0.5)), ]: # Fit on the whole data: met.fit(x, y) # Predict on the whole data: p = met.predict(x) - - e = p - y - # np.dot(e, e) == sum(ei**2 for ei in e) but faster - total_error = np.dot(e, e) - rmse_train = np.sqrt(total_error / len(p)) + r2_train = r2_score(y, p) # Now, we use 10 fold cross-validation to estimate generalization error - kf = KFold(len(x), n_folds=10) - err = 0 + kf = KFold(len(x), n_folds=5) + p = np.zeros_like(y) for train, test in kf: met.fit(x[train], y[train]) - p = met.predict(x[test]) - e = p - y[test] - err += np.dot(e, e) + p[test] = met.predict(x[test]) - rmse_10cv = np.sqrt(err / len(x)) + r2_cv = r2_score(y, p) print('Method: {}'.format(name)) - print('RMSE on training: {}'.format(rmse_train)) - print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) + print('R2 on training: {}'.format(r2_train)) + print('R2 on 5-fold CV: {}'.format(r2_cv)) print() print() diff --git a/ch07/cv10_lr.py b/ch07/cv10_lr.py deleted file mode 100644 index 48ebc217..00000000 --- a/ch07/cv10_lr.py +++ /dev/null @@ -1,42 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.cross_validation import KFold -from sklearn.linear_model import LinearRegression, ElasticNet -from sklearn.datasets import load_boston -boston = load_boston() -x = boston.data -y = boston.target - - -# Switch this variable to use an Elastic Net instead of OLS -FIT_EN = False - -if FIT_EN: - model = ElasticNet(fit_intercept=True, alpha=0.5) -else: - model = 
LinearRegression(fit_intercept=True) - -model.fit(x, y) -rmse_train = np.sqrt(model.residues_/len(x)) - -# Alternatively, we could have computed rmse_train using this expression: -# rmse_train = np.sqrt(np.mean( (model.predict(x) - y) ** 2)) -# The results are equivalent - -kf = KFold(len(x), n_folds=10) -err = 0 -for train, test in kf: - model.fit(x[train], y[train]) - p = model.predict(x[test]) - e = p - y[test] - err += np.dot(e, e) # This is the same as np.sum(e * e) - -rmse_10cv = np.sqrt(err / len(x)) -print('RMSE on training: {}'.format(rmse_train)) -print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) diff --git a/ch07/data/.gitignore b/ch07/data/.gitignore new file mode 100644 index 00000000..3286ba0f --- /dev/null +++ b/ch07/data/.gitignore @@ -0,0 +1 @@ +E2006.train diff --git a/ch07/data/download.sh b/ch07/data/download.sh index 6f938e06..74753364 100755 --- a/ch07/data/download.sh +++ b/ch07/data/download.sh @@ -1,5 +1,3 @@ #!/usr/bin/env bash -wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 +curl -O http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 bunzip2 E2006.train.bz2 -wget http://www.grouplens.org/system/files/ml-100k.zip -unzip ml-100k.zip diff --git a/ch07/figure1.py b/ch07/figure1.py deleted file mode 100644 index 9585488a..00000000 --- a/ch07/figure1.py +++ /dev/null @@ -1,40 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.datasets import load_boston -from sklearn.linear_model import LinearRegression -from matplotlib import pyplot as plt -from mpltools import style -style.use('ggplot') - -boston = load_boston() - -# Index number five in the number of rooms -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("Number of rooms (RM)") -plt.ylabel("House Price") - -x = boston.data[:, 5] -# fit (used below) takes a two-dimensional array as input. We use np.atleast_2d -# to convert from one to two dimensional, then transpose to make sure that the -# format matches: -x = np.transpose(np.atleast_2d(x)) - -y = boston.target - -lr = LinearRegression(fit_intercept=False) - -lr.fit(x, y) - -plt.plot([0, boston.data[:, 5].max() + 1], - [0, lr.predict(boston.data[:, 5].max() + 1)], '-', lw=4) -plt.savefig('Figure1.png', dpi=150) - -# The instance member `residues_` contains the sum of the squared residues -rmse = np.sqrt(lr.residues_ / len(x)) -print('RMSE: {}'.format(rmse)) diff --git a/ch07/figure1_2.py b/ch07/figure1_2.py new file mode 100644 index 00000000..3f11a0c7 --- /dev/null +++ b/ch07/figure1_2.py @@ -0,0 +1,63 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +from sklearn.datasets import load_boston +from sklearn.linear_model import LinearRegression +from sklearn.metrics import mean_squared_error, r2_score +from matplotlib import pyplot as plt + +boston = load_boston() + +# Index number five in the number of rooms +fig,ax = plt.subplots() +ax.scatter(boston.data[:, 5], boston.target) +ax.set_xlabel("Average number of rooms (RM)") +ax.set_ylabel("House Price") + +x = boston.data[:, 5] +# fit (used below) takes a two-dimensional array as input. 
We use np.atleast_2d +# to convert from one to two dimensional, then transpose to make sure that the +# format matches: +x = np.transpose(np.atleast_2d(x)) + +y = boston.target + +lr = LinearRegression(fit_intercept=False) +lr.fit(x, y) + +ax.plot([0, boston.data[:, 5].max() + 1], + [0, lr.predict(boston.data[:, 5].max() + 1)], '-', lw=4) +fig.savefig('Figure1.png') + +mse = mean_squared_error(y, lr.predict(x)) +rmse = np.sqrt(mse) +print('RMSE (no intercept): {}'.format(rmse)) + +# Repeat, but fitting an intercept this time: +lr = LinearRegression(fit_intercept=True) + +lr.fit(x, y) + +fig,ax = plt.subplots() +ax.set_xlabel("Average number of rooms (RM)") +ax.set_ylabel("House Price") +ax.scatter(boston.data[:, 5], boston.target) +xmin = x.min() +xmax = x.max() +ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]) , '-', lw=4) +fig.savefig('Figure2.png') + +mse = mean_squared_error(y, lr.predict(x)) +print("Mean squared error (of training data): {:.3}".format(mse)) + +rmse = np.sqrt(mse) +print("Root mean squared error (of training data): {:.3}".format(rmse)) + +cod = r2_score(y, lr.predict(x)) +print('COD (on training data): {:.2}'.format(cod)) + diff --git a/ch07/figure2.py b/ch07/figure2.py deleted file mode 100644 index c5977207..00000000 --- a/ch07/figure2.py +++ /dev/null @@ -1,31 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.datasets import load_boston -import pylab as plt -from mpltools import style -style.use('ggplot') - -boston = load_boston() -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("RM") -plt.ylabel("House Price") - - -x = boston.data[:, 5] -xmin = x.min() -xmax = x.max() -x = np.array([[v, 1] for v in x]) -y = boston.target - -(slope, bias), res, _, _ = np.linalg.lstsq(x, y) -plt.plot([xmin, xmax], [slope * xmin + bias, slope * xmax + bias], '-', lw=4) -plt.savefig('Figure2.png', dpi=150) - -rmse = np.sqrt(res[0] / len(x)) -print('Residual: {}'.format(rmse)) diff --git a/ch07/figure3.py b/ch07/figure3.py new file mode 100644 index 00000000..7543c1ec --- /dev/null +++ b/ch07/figure3.py @@ -0,0 +1,33 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from sklearn.linear_model import LinearRegression, Lasso +import numpy as np +from sklearn.datasets import load_boston +from matplotlib import pyplot as plt + +boston = load_boston() +fig, ax = plt.subplots() +ax.scatter(boston.data[:, 5], boston.target) +ax.set_xlabel("Number of rooms (RM)") +ax.set_ylabel("House Price") + + +x = boston.data[:, 5] +xmin = x.min() +xmax = x.max() +x = np.transpose(np.atleast_2d(x)) +y = boston.target + +lr = LinearRegression() +lr.fit(x, y) +ax.plot([xmin, xmax], lr.predict([[xmin], [xmax]]), ':', lw=4, label='OLS model') + +las = Lasso() +las.fit(x, y) +ax.plot([xmin, xmax], las.predict([ [xmin], [xmax] ]), '-', lw=4, label='Lasso model') +fig.savefig('Figure3.png') diff --git a/ch07/figure4.py b/ch07/figure4.py index ab71b3a8..a24d48be 100644 --- a/ch07/figure4.py +++ b/ch07/figure4.py @@ -5,31 +5,29 @@ # # It is made available under the MIT License -from sklearn.linear_model import Lasso + +# This script plots prediction-vs-actual on training set for the Boston dataset +# using OLS 
regression import numpy as np +from sklearn.linear_model import LinearRegression from sklearn.datasets import load_boston -import pylab as plt -from mpltools import style -style.use('ggplot') +from sklearn.metrics import mean_squared_error +from matplotlib import pyplot as plt boston = load_boston() -plt.scatter(boston.data[:, 5], boston.target) -plt.xlabel("RM") -plt.ylabel("House Price") - -x = boston.data[:, 5] -xmin = x.min() -xmax = x.max() -x = np.array([[v, 1] for v in x]) +x = boston.data y = boston.target -(slope, bias), res, _, _ = np.linalg.lstsq(x, y) -plt.plot([xmin, xmax], [slope * xmin + bias, slope * xmax + bias], ':', lw=4) +lr = LinearRegression() +lr.fit(x, y) +p = lr.predict(x) +print("RMSE: {:.2}.".format(np.sqrt(mean_squared_error(y, p)))) +print("R2: {:.2}.".format(lr.score(x, y))) +fig,ax = plt.subplots() +ax.scatter(p, y) +ax.set_xlabel('Predicted price') +ax.set_ylabel('Actual price') +ax.plot([y.min(), y.max()], [y.min(), y.max()], lw=4) -las = Lasso() -las.fit(x, y) -y0 = las.predict([xmin, 1]) -y1 = las.predict([xmax, 1]) -plt.plot([xmin, xmax], [y0, y1], '-', lw=4) -plt.savefig('Figure3.png', dpi=150) +fig.savefig('Figure4.png') diff --git a/ch07/lasso_path_plot.py b/ch07/lasso_path_plot.py new file mode 100644 index 00000000..eab64c26 --- /dev/null +++ b/ch07/lasso_path_plot.py @@ -0,0 +1,29 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from sklearn.linear_model import Lasso +from sklearn.datasets import load_boston +from matplotlib import pyplot as plt +import numpy as np + +boston = load_boston() +x = boston.data +y = boston.target + +las = Lasso(normalize=1) +alphas = np.logspace(-5, 2, 1000) +alphas, coefs, _= las.path(x, y, alphas=alphas) + +fig,ax = plt.subplots() +ax.plot(alphas, coefs.T) +ax.set_xscale('log') +ax.set_xlim(alphas.max(), alphas.min()) +ax.set_xlabel('Lasso coefficient path as a function of alpha') +ax.set_xlabel('Alpha') +ax.set_ylabel('Coefficient weight') +fig.savefig('Figure_LassoPath.png') + diff --git a/ch07/lr10k.py b/ch07/lr10k.py index 05346c1c..831706a1 100644 --- a/ch07/lr10k.py +++ b/ch07/lr10k.py @@ -6,36 +6,33 @@ # It is made available under the MIT License import numpy as np +from sklearn.metrics import mean_squared_error, r2_score from sklearn.datasets import load_svmlight_file -from sklearn.linear_model import ElasticNet, LinearRegression +from sklearn.linear_model import LinearRegression from sklearn.cross_validation import KFold -USE_EN = False +# Whether to use Elastic nets (otherwise, ordinary linear regression is used) +# Load data: data, target = load_svmlight_file('data/E2006.train') -if USE_EN: - lr = ElasticNet(fit_intercept=True) -else: - lr = LinearRegression(fit_intercept=True) -kf = KFold(len(target), n_folds=10) -err = 0 -for train, test in kf: - lr.fit(data[train], target[train]) - p = lr.predict(data[test]) - p = np.array(p).ravel() - e = p - target[test] - err += np.dot(e, e) - -rmse_10cv = np.sqrt(err / len(target)) +lr = LinearRegression() +# Compute error on training data to demonstrate that we can obtain near perfect +# scores: lr.fit(data, target) -p = lr.predict(data) -p = p.ravel() -e = p - target -total_error = np.dot(e, e) -rmse_train = np.sqrt(total_error / len(p)) - -print('RMSE on training: {}'.format(rmse_train)) -print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) +pred = lr.predict(data) + +print('RMSE on 
training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('R2 on training, {:.2}'.format(r2_score(target, pred))) +print('') + +pred = np.zeros_like(target) +kf = KFold(len(target), n_folds=5) +for train, test in kf: + lr.fit(data[train], target[train]) + pred[test] = lr.predict(data[test]) + +print('RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) diff --git a/ch07/predict10k_en.py b/ch07/predict10k_en.py index 4b240574..a7dd960a 100644 --- a/ch07/predict10k_en.py +++ b/ch07/predict10k_en.py @@ -8,33 +8,66 @@ import numpy as np from sklearn.datasets import load_svmlight_file from sklearn.cross_validation import KFold -from sklearn.linear_model import ElasticNet, LinearRegression +from sklearn.linear_model import ElasticNetCV, ElasticNet +from sklearn.metrics import mean_squared_error, r2_score +from matplotlib import pyplot as plt data, target = load_svmlight_file('data/E2006.train') # Edit the lines below if you want to switch method: -# met = LinearRegression(fit_intercept=True) -met = ElasticNet(fit_intercept=True, alpha=.1) +# from sklearn.linear_model import Lasso +# met = Lasso(alpha=0.1) +met = ElasticNet(alpha=0.1) -kf = KFold(len(target), n_folds=10) -err = 0 +kf = KFold(len(target), n_folds=5) +pred = np.zeros_like(target) for train, test in kf: met.fit(data[train], target[train]) - p = met.predict(data[test]) - p = np.array(p).ravel() - e = p - target[test] - err += np.dot(e, e) + pred[test] = met.predict(data[test]) -rmse_10cv = np.sqrt(err / len(target)) +print('[EN 0.1] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN 0.1] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) +print('') +# Construct an ElasticNetCV object (use all available CPUs) +met = ElasticNetCV(n_jobs=-1) + +kf = KFold(len(target), n_folds=5) +pred = np.zeros_like(target) +for train, test in kf: + met.fit(data[train], target[train]) + pred[test] = met.predict(data[test]) + +print('[EN CV] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN CV] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) +print('') met.fit(data, target) -p = met.predict(data) -p = p.ravel() -e = p - target -total_error = np.dot(e, e) -rmse_train = np.sqrt(total_error / len(p)) +pred = met.predict(data) +print('[EN CV] RMSE on training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN CV] R2 on training, {:.2}'.format(r2_score(target, pred))) + + +# Construct an ElasticNetCV object (use all available CPUs) +met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99]) + +kf = KFold(len(target), n_folds=5) +pred = np.zeros_like(target) +for train, test in kf: + met.fit(data[train], target[train]) + pred[test] = met.predict(data[test]) + + +print('[EN CV l1_ratio] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) +print('[EN CV l1_ratio] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) +print('') + +fig, ax = plt.subplots() +y = target +ax.scatter(y, pred, c='k') +ax.plot([-5,-1], [-5,-1], 'r-', lw=2) +ax.set_xlabel('Actual value') +ax.set_ylabel('Predicted value') +fig.savefig('Figure_10k_scatter_EN_l1_ratio.png') -print('RMSE on training: {}'.format(rmse_train)) -print('RMSE on 10-fold CV: {}'.format(rmse_10cv)) diff --git a/ch07/usermodel.py b/ch07/usermodel.py deleted file mode 100644 index 9a93b2c8..00000000 --- 
a/ch07/usermodel.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# This code is supporting material for the book
-# Building Machine Learning Systems with Python
-# by Willi Richert and Luis Pedro Coelho
-# published by PACKT Publishing
-#
-# It is made available under the MIT License
-
-import numpy as np
-from scipy import sparse
-from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
-from sklearn.cross_validation import KFold
-
-data = np.array([[int(tok) for tok in line.split('\t')[:3]]
-                 for line in open('data/ml-100k/u.data')])
-ij = data[:, :2]
-ij -= 1  # original data is in 1-based system
-values = data[:, 2]
-reviews = sparse.csc_matrix((values, ij.T)).astype(float)
-
-reg = ElasticNetCV(fit_intercept=True, alphas=[
-    0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
-
-
-def movie_norm(xc):
-    '''Normalize per movie'''
-    xc = xc.copy().toarray()
-    # x1 is the mean of the positive items
-    x1 = np.array([xi[xi > 0].mean() for xi in xc])
-    x1 = np.nan_to_num(x1)
-
-    for i in range(xc.shape[0]):
-        xc[i] -= (xc[i] > 0) * x1[i]
-    return xc, x1
-
-
-def learn_for(i):
-    u = reviews[i]
-    us = np.delete(np.arange(reviews.shape[0]), i)
-    ps, = np.where(u.toarray().ravel() > 0)
-    x = reviews[us][:, ps].T
-    y = u.data
-    err = 0
-    eb = 0
-    kf = KFold(len(y), n_folds=4)
-    for train, test in kf:
-        xc, x1 = movie_norm(x[train])
-        reg.fit(xc, y[train] - x1)
-
-        xc, x1 = movie_norm(x[test])
-        p = reg.predict(xc).ravel()
-        e = (p + x1) - y[test]
-        err += np.sum(e * e)
-        eb += np.sum((y[train].mean() - y[test]) ** 2)
-    return np.sqrt(err / float(len(y))), np.sqrt(eb / float(len(y)))
-
-whole_data = []
-for i in range(reviews.shape[0]):
-    s = learn_for(i)
-    print(s[0] < s[1])
-    print(s)
-    whole_data.append(s)
diff --git a/ch08/README.rst b/ch08/README.rst
new file mode 100644
index 00000000..488844e6
--- /dev/null
+++ b/ch08/README.rst
@@ -0,0 +1,41 @@
+=========
+Chapter 8
+=========
+
+Support code for *Chapter 8: Recommendations*.
+
+The code refers to the second edition of the book and has been significantly
+refactored relative to the first edition.
+
+Ratings Prediction
+------------------
+
+Note that the partition of the data into training and testing is random, so
+the results will differ on every run (see the driver sketch after this list).
+
+
+load_ml100k.py
+    Load data & partition into test/train
+norm.py
+    Normalize the data
+corrneighbours.py
+    Neighbour models based on correlation
+regression.py
+    Regression models
+stacked.py
+    Stacked predictions
+averaged.py
+    Averaging of predictions (mentioned in book, but code is not shown there).
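The scripts listed above are meant to be used together. As a quick orientation, here is a minimal driver sketch (an illustration, not a file in the repository) that scores each of the predict(train) functions introduced in this diff on one shared ML-100k split. It assumes it is run from ch08/ after data/download.sh has fetched the dataset, and that the module names are exactly those listed above.

```python
# Illustrative driver: compare the chapter's base predictors on the same split.
from sklearn import metrics

import load_ml100k
import norm
import regression
import corrneighbours

train, test = load_ml100k.get_train_test(random_state=12)
held_out = test > 0  # only the held-out ratings are scored

for name, module in [('normalization baseline', norm),
                     ('regression', regression),
                     ('correlation neighbours', corrneighbours)]:
    predicted = module.predict(train)
    r2 = metrics.r2_score(test[held_out], predicted[held_out])
    print('R2 ({}): {:.1%}'.format(name, r2))
```

stacked.py and averaged.py, shown later in this diff, combine these same base predictions (plus their transposed, movie-wise variants) instead of reporting them separately.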
+ +Association Rule Mining +----------------------- + +Check the folder ``apriori/`` + +apriori/histogram.py + Print a histogram of how many times each product was bought +apriori/apriori.py + Implementation of Apriori algorithm and association rule building +apriori/apriori_example.py + Example of Apriori algorithm in retail dataset + diff --git a/ch08/all_correlations.py b/ch08/all_correlations.py index 509e5b8e..d3817bf0 100644 --- a/ch08/all_correlations.py +++ b/ch08/all_correlations.py @@ -7,9 +7,14 @@ import numpy as np -# This is the version in the book: - +def all_correlations(y, X): + from scipy import spatial + y = np.atleast_2d(y) + sp = spatial.distance.cdist(X, y, 'correlation') + # The "correlation distance" is 1 - corr(x,y); so we invert that to obtain the correlation + return 1 - sp.ravel() +# This is the version in the book (1st Edition): def all_correlations_book_version(bait, target): ''' corrs = all_correlations(bait, target) @@ -21,9 +26,7 @@ def all_correlations_book_version(bait, target): for c in target]) # This is a faster, but harder to read, implementation: - - -def all_correlations(y, X): +def all_correlations_fast_no_scipy(y, X): ''' Cs = all_correlations(y, X) @@ -42,12 +45,4 @@ def all_correlations(y, X): return (xy - x_ * y_ * n) / n / xs_ / ys_ -# If you have scipy installed, then you can compute correlations with -# scipy.spatial.cdist: -def all_correlations_scipy(y, X): - from scipy import spatial - y = np.atleast_2d(y) - sp = spatial.distance.cdist(X, y, 'correlation') - # The "correlation distance" is 1 - corr(x,y); so we invert that to obtain the correlation - return 1 - sp.ravel() diff --git a/ch08/apriori/.gitignore b/ch08/apriori/.gitignore index 05697a95..6379e20f 100644 --- a/ch08/apriori/.gitignore +++ b/ch08/apriori/.gitignore @@ -1 +1 @@ -retail.dat +retail.dat.gz diff --git a/ch08/apriori/apriori.py b/ch08/apriori/apriori.py index 1ebb7e54..eeaf0d46 100644 --- a/ch08/apriori/apriori.py +++ b/ch08/apriori/apriori.py @@ -5,9 +5,12 @@ # # It is made available under the MIT License +from collections import namedtuple + + def apriori(dataset, minsupport, maxsize): ''' - freqsets, baskets = apriori(dataset, minsupport, maxsize) + freqsets, support = apriori(dataset, minsupport, maxsize) Parameters ---------- @@ -21,48 +24,70 @@ def apriori(dataset, minsupport, maxsize): Returns ------- freqsets : sequence of sequences - baskets : dictionary + support : dictionary + This associates each itemset (represented as a frozenset) with a float + (the support of that itemset) ''' from collections import defaultdict baskets = defaultdict(list) pointers = defaultdict(list) + for i, ds in enumerate(dataset): for ell in ds: pointers[ell].append(i) baskets[frozenset([ell])].append(i) - pointers = dict([(k, frozenset(v)) for k, v in pointers.items()]) - baskets = dict([(k, frozenset(v)) for k, v in baskets.items()]) - valid = set(list(el)[0] - for el, c in baskets.items() if (len(c) >= minsupport)) - dataset = [[el for el in ds if (el in valid)] for ds in dataset] - dataset = [ds for ds in dataset if len(ds) > 1] - dataset = map(frozenset, dataset) + # Convert pointer items to frozensets to speed up operations later + new_pointers = dict() + for k in pointers: + if len(pointers[k]) >= minsupport: + new_pointers[k] = frozenset(pointers[k]) + pointers = new_pointers + for k in baskets: + baskets[k] = frozenset(baskets[k]) + + # Valid are all elements whose support is >= minsupport + valid = set() + for el, c in baskets.items(): + if len(c) >= minsupport: + 
valid.update(el) + + # Itemsets at first iteration are simply all singleton with valid elements: itemsets = [frozenset([v]) for v in valid] freqsets = [] for i in range(maxsize - 1): - print(len(itemsets)) + print("At iteration {}, number of frequent baskets: {}".format( + i, len(itemsets))) newsets = [] - for i, ell in enumerate(itemsets): - ccounts = baskets[ell] - for v_, pv in pointers.items(): - if v_ not in ell: + for it in itemsets: + ccounts = baskets[it] + + for v, pv in pointers.items(): + if v not in it: csup = (ccounts & pv) if len(csup) >= minsupport: - new = frozenset(ell | set([v_])) + new = frozenset(it | frozenset([v])) if new not in baskets: newsets.append(new) baskets[new] = csup freqsets.extend(itemsets) itemsets = newsets - return freqsets, baskets + if not len(itemsets): + break + support = {} + for k in baskets: + support[k] = float(len(baskets[k])) + return freqsets, support -def association_rules(dataset, freqsets, baskets, minlift): +# A namedtuple to collect all values that may be interesting +AssociationRule = namedtuple('AssociationRule', ['antecendent', 'consequent', 'base', 'py_x', 'lift']) + +def association_rules(dataset, freqsets, support, minlift): ''' - for (antecendent, consequent, base, py_x, lift) in association_rules(dataset, freqsets, baskets, minlift): + for assoc_rule in association_rules(dataset, freqsets, support, minlift): ... This function takes the returns from ``apriori``. @@ -72,9 +97,13 @@ def association_rules(dataset, freqsets, baskets, minlift): dataset : sequence of sequences input dataset freqsets : sequence of sequences - baskets : dictionary + support : dictionary minlift : int minimal lift of yielded rules + + Returns + ------- + assoc_rule : sequence of AssociationRule objects ''' nr_transactions = float(len(dataset)) freqsets = [f for f in freqsets if len(f) > 1] @@ -82,8 +111,9 @@ def association_rules(dataset, freqsets, baskets, minlift): for f in fset: consequent = frozenset([f]) antecendent = fset - consequent - base = len(baskets[consequent]) / nr_transactions - py_x = len(baskets[fset]) / float(len(baskets[antecendent])) + py_x = support[fset] / support[antecendent] + base = support[consequent] / nr_transactions lift = py_x / base if lift > minlift: - yield (antecendent, consequent, base, py_x, lift) + yield AssociationRule(antecendent, consequent, base, py_x, lift) + diff --git a/ch08/apriori/apriori_example.py b/ch08/apriori/apriori_example.py index da77e96c..971ed4d5 100644 --- a/ch08/apriori/apriori_example.py +++ b/ch08/apriori/apriori_example.py @@ -7,10 +7,17 @@ from apriori import apriori, association_rules from gzip import GzipFile + +# Load dataset dataset = [[int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')] -freqsets, baskets = apriori(dataset, 80, maxsize=5) -nr_transactions = float(len(dataset)) -for ant, con, base, pyx, lift in association_rules(dataset, freqsets, baskets, 30): - print('{} | {} | {} ({:%}) | {} | {} | {}' - .format(ant, con, len(baskets[con]), len(baskets[con]) / nr_transactions, len(baskets[ant]), len(baskets[con | ant]), int(lift))) + +freqsets, support = apriori(dataset, 80, maxsize=16) +rules = list(association_rules(dataset, freqsets, support, minlift=30.0)) + +rules.sort(key=(lambda ar: -ar.lift)) +for ar in rules: + print('{} -> {} (lift = {:.4})' + .format(set(ar.antecendent), + set(ar.consequent), + ar.lift)) diff --git a/ch08/apriori/apriori_naive.py b/ch08/apriori/apriori_naive.py index cb50a8e1..03ab9130 100644 --- a/ch08/apriori/apriori_naive.py +++ 
b/ch08/apriori/apriori_naive.py @@ -5,35 +5,83 @@ # # It is made available under the MIT License -import numpy as np from collections import defaultdict from itertools import chain from gzip import GzipFile -minsupport = 44 +minsupport = 80 dataset = [[int(tok) for tok in line.strip().split()] for line in GzipFile('retail.dat.gz')] -dataset = dataset[::20] counts = defaultdict(int) for elem in chain(*dataset): counts[elem] += 1 +# Only elements that have at least minsupport should be considered. valid = set(el for el, c in counts.items() if (c >= minsupport)) + +# Filter the dataset to contain only valid elements +# (This step is not strictly necessary, but will make the rest of the code +# faster as the itemsets will be smaller): dataset = [[el for el in ds if (el in valid)] for ds in dataset] -dataset = [frozenset(ds) for ds in dataset if len(ds) > 1] +# Convert to frozenset for fast processing +dataset = [frozenset(ds) for ds in dataset] itemsets = [frozenset([v]) for v in valid] -allsets = [itemsets] +freqsets = itemsets[:] for i in range(16): - print(len(itemsets)) + print("At iteration {}, number of frequent baskets: {}".format( + i, len(itemsets))) nextsets = [] - for i, ell in enumerate(itemsets): - for v_ in valid: - if v_ not in ell: - c = (ell | set([v_])) - if sum(1 for d in dataset if d.issuperset(c)) > minsupport: + + tested = set() + for it in itemsets: + for v in valid: + if v not in it: + # Create a new candidate set by adding v to it + c = (it | frozenset([v])) + + # Check if we have tested it already: + if c in tested: + continue + tested.add(c) + + # Count support by looping over dataset + # This step is slow. + # Check `apriori.py` for a better implementation. + support_c = sum(1 for d in dataset if d.issuperset(c)) + if support_c > minsupport: nextsets.append(c) - allsets.append(nextsets) + freqsets.extend(nextsets) itemsets = nextsets + if not len(itemsets): + break +print("Finished!") + + +def rules_from_itemset(itemset, dataset, minlift=1.): + nr_transactions = float(len(dataset)) + for item in itemset: + consequent = frozenset([item]) + antecedent = itemset-consequent + base = 0.0 + # acount: antecedent count + acount = 0.0 + + # ccount : consequent count + ccount = 0.0 + for d in dataset: + if item in d: base += 1 + if d.issuperset(itemset): ccount += 1 + if d.issuperset(antecedent): acount += 1 + base /= nr_transactions + p_y_given_x = ccount/acount + lift = p_y_given_x / base + if lift > minlift: + print('Rule {0} -> {1} has lift {2}' + .format(antecedent, consequent,lift)) + +for itemset in freqsets: + if len(itemset) > 1: + rules_from_itemset(itemset, dataset, minlift=4.) 
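apriori_naive.py recounts support by scanning the full dataset for every candidate itemset, while the refactored ch08/apriori/apriori.py keeps per-item transaction indexes and intersects them instead. To exercise that faster module without downloading retail.dat.gz, a toy run such as the one below can be used; the transaction list and numbers are invented for illustration, minsupport is an absolute transaction count, and the field name antecendent follows the spelling used in apriori.py.

```python
# Illustrative only: a handmade transaction list instead of retail.dat.gz.
from apriori import apriori, association_rules

transactions = [
    [1, 2],
    [1, 2],
    [1, 2],
    [3],
    [3],
    [3, 4],
]

# minsupport is an absolute number of transactions, not a fraction
freqsets, support = apriori(transactions, minsupport=2, maxsize=3)

for rule in association_rules(transactions, freqsets, support, minlift=1.0):
    # the namedtuple field is spelled 'antecendent' in apriori.py
    print('{} -> {}  (lift = {:.2f})'.format(
        set(rule.antecendent), set(rule.consequent), rule.lift))
```

Items 1 and 2 only ever occur together, so P(2|1) is 1.0 against a base rate of 3/6 for item 2, giving a lift of 2.0 in both directions; item 3 is frequent on its own but never forms a frequent pair, so it generates no rules.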
diff --git a/ch08/averaged.py b/ch08/averaged.py new file mode 100644 index 00000000..5b19bba7 --- /dev/null +++ b/ch08/averaged.py @@ -0,0 +1,33 @@ +import numpy as np +import load_ml100k +import regression +import corrneighbours +from sklearn import metrics +import norm + +def predict(train): + predicted0 = regression.predict(train) + predicted1 = regression.predict(train.T).T + predicted2 = corrneighbours.predict(train) + predicted3 = corrneighbours.predict(train.T).T + predicted4 = norm.predict(train) + predicted5 = norm.predict(train.T).T + stack = np.array([ + predicted0, + predicted1, + predicted2, + predicted3, + predicted4, + predicted5, + ]) + return stack.mean(0) + + +def main(): + train,test = load_ml100k.get_train_test(random_state=12) + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 averaged: {:.2%}'.format(r2)) + +if __name__ == '__main__': + main() diff --git a/ch08/chapter.py b/ch08/chapter.py new file mode 100644 index 00000000..d039d93f --- /dev/null +++ b/ch08/chapter.py @@ -0,0 +1,208 @@ +import numpy as np # NOT IN BOOK +from matplotlib import pyplot as plt # NOT IN BOOK + +def load(): + import numpy as np + from scipy import sparse + + data = np.loadtxt('data/ml-100k/u.data') + ij = data[:, :2] + ij -= 1 # original data is in 1-based system + values = data[:, 2] + reviews = sparse.csc_matrix((values, ij.T)).astype(float) + return reviews.toarray() +reviews = load() +U,M = np.where(reviews) +import random +test_idxs = np.array(random.sample(range(len(U)), len(U)//10)) + +train = reviews.copy() +train[U[test_idxs], M[test_idxs]] = 0 + +test = np.zeros_like(reviews) +test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]] + +class NormalizePositive(object): + def __init__(self, axis=0): + self.axis = axis + + def fit(self, features, y=None): + if self.axis == 1: + features = features.T + # count features that are greater than zero in axis 0: + binary = (features > 0) + + count0 = binary.sum(axis=0) + + # to avoid division by zero, set zero counts to one: + count0[count0 == 0] = 1. 
+ + # computing the mean is easy: + self.mean = features.sum(axis=0)/count0 + + # only consider differences where binary is True: + diff = (features - self.mean) * binary + diff **= 2 + # regularize the estimate of std by adding 0.1 + self.std = np.sqrt(0.1 + diff.sum(axis=0)/count0) + return self + + + def transform(self, features): + if self.axis == 1: + features = features.T + binary = (features > 0) + features = features - self.mean + features /= self.std + features *= binary + if self.axis == 1: + features = features.T + return features + + def inverse_transform(self, features, copy=True): + if copy: + features = features.copy() + if self.axis == 1: + features = features.T + features *= self.std + features += self.mean + if self.axis == 1: + features = features.T + return features + + def fit_transform(self, features): + return self.fit(features).transform(features) + + +norm = NormalizePositive(axis=1) +binary = (train > 0) +train = norm.fit_transform(train) +# plot just 200x200 area for space reasons +plt.imshow(binary[:200, :200], interpolation='nearest') + +from scipy.spatial import distance +# compute all pair-wise distances: +dists = distance.pdist(binary, 'correlation') +# Convert to square form, so that dists[i,j] +# is distance between binary[i] and binary[j]: +dists = distance.squareform(dists) +neighbors = dists.argsort(axis=1) + +# We are going to fill this matrix with results +filled = train.copy() +for u in range(filled.shape[0]): + # n_u is neighbors of user + n_u = neighbors[u, 1:] + for m in range(filled.shape[1]): + # get relevant reviews in order! + revs = [train[neigh, m] + for neigh in n_u + if binary [neigh, m]] + if len(revs): + # n is the number of reviews for this movie + n = len(revs) + # take half of the reviews plus one into consideration: + n //= 2 + n += 1 + revs = revs[:n] + filled[u,m] = np.mean(revs) + +predicted = norm.inverse_transform(filled) +from sklearn import metrics +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (binary neighbors): {:.1%}'.format(r2)) + +reviews = reviews.T +# use same code as before +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (binary movie neighbors): {:.1%}'.format(r2)) + + +from sklearn.linear_model import ElasticNetCV # NOT IN BOOK + +reg = ElasticNetCV(alphas=[ + 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) +filled = train.copy() +# iterate over all users: +for u in range(train.shape[0]): + curtrain = np.delete(train, u, axis=0) + bu = binary[u] + reg.fit(curtrain[:,bu].T, train[u, bu]) + filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) +predicted = norm.inverse_transform(filled) +r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) +print('R2 score (user regression): {:.1%}'.format(r2)) + + +# SHOPPING BASKET ANALYSIS +# This is the slow version of the code, which will take a long time to +# complete. + + +from collections import defaultdict +from itertools import chain + +# File is downloaded as a compressed file +import gzip +# file format is a line per transaction +# of the form '12 34 342 5...' 
+dataset = [[int(tok) for tok in line.strip().split()] + for line in gzip.open('data/retail.dat.gz')] +dataset = [set(d) for d in dataset] +# count how often each product was purchased: +counts = defaultdict(int) +for elem in chain(*dataset): + counts[elem] += 1 + +minsupport = 80 +valid = set(k for k,v in counts.items() if (v >= minsupport)) +itemsets = [frozenset([v]) for v in valid] +freqsets = [] +for i in range(16): + nextsets = [] + tested = set() + for it in itemsets: + for v in valid: + if v not in it: + # Create a new candidate set by adding v to it + c = (it | frozenset([v])) + # check If we have tested it already + if c in tested: + continue + tested.add(c) + + # Count support by looping over dataset + # This step is slow. + # Check `apriori.py` for a better implementation. + support_c = sum(1 for d in dataset if d.issuperset(c)) + if support_c > minsupport: + nextsets.append(c) + freqsets.extend(nextsets) + itemsets = nextsets + if not len(itemsets): + break +print("Finished!") + + +minlift = 5.0 +nr_transactions = float(len(dataset)) +for itemset in freqsets: + for item in itemset: + consequent = frozenset([item]) + antecedent = itemset-consequent + base = 0.0 + # acount: antecedent count + acount = 0.0 + + # ccount : consequent count + ccount = 0.0 + for d in dataset: + if item in d: base += 1 + if d.issuperset(itemset): ccount += 1 + if d.issuperset(antecedent): acount += 1 + base /= nr_transactions + p_y_given_x = ccount/acount + lift = p_y_given_x / base + if lift > minlift: + print('Rule {0} -> {1} has lift {2}' + .format(antecedent, consequent,lift)) diff --git a/ch08/corrneighbours.py b/ch08/corrneighbours.py index eef3881e..eb30e685 100644 --- a/ch08/corrneighbours.py +++ b/ch08/corrneighbours.py @@ -6,75 +6,53 @@ # It is made available under the MIT License from __future__ import print_function -from all_correlations import all_correlations import numpy as np -from load_ml100k import load - -def estimate_user(user, rest, num_neighbors=100): - '''Estimate ratings for user based on the binary rating matrix - - Returns - ------- - estimates: ndarray - Returns a rating estimate for each movie - ''' - - # Compute binary matrix correlations: - bu = user > 0 - br = rest > 0 - ws = all_correlations(bu, br) - - # Select top `num_neighbors`: - selected = ws.argsort()[-num_neighbors:] - - # Use these to compute estimates: - estimates = rest[selected].mean(0) - estimates /= (.1 + br[selected].mean(0)) - return estimates - - -def train_test(user, rest): - '''Train & test on a single user - - Returns both the prediction error and the null error - ''' - estimates = estimate_user(user, rest) - bu = user > 0 - br = rest > 0 - err = estimates[bu] - user[bu] - null = rest.mean(0) - null /= (.1 + br.mean(0)) - nerr = null[bu] - user[bu] - return np.dot(err, err), np.dot(nerr, nerr) - - -def all_estimates(reviews): - estimates = np.zeros_like(reviews) - for i in range(reviews.shape[0]): - estimates[i] = estimate_user(reviews[i], np.delete(reviews, i, 0)) - return estimates - -def main(): - reviews = load() - - err = [] - for i in range(reviews.shape[0]): - err.append( - train_test(reviews[i], np.delete(reviews, i, 0)) - ) - revs = (reviews > 0).sum(1) - err = np.array(err) - rmse = np.sqrt(err / revs[:, None]) - - rmse_model, rmse_null = np.mean(rmse, 0) - - print("Average of RMSE / Null-model RMSE") - print("{:.2}\t{:.2} (improvement: {:.1%}".format(rmse_model, rmse_null, (rmse_null-rmse_model)/rmse_null)) - print() - - rmse_model, rmse_null = np.mean(rmse[revs > 60], 0) - print("Average 
of RMSE / Null-model RMSE (users with more than 60 reviewed movies)") - print("{:.2}\t{:.2} (improvement: {:.1%}".format(rmse_model, rmse_null, (rmse_null-rmse_model)/rmse_null)) +from load_ml100k import get_train_test +from scipy.spatial import distance +from sklearn import metrics + +from norm import NormalizePositive + +def predict(otrain): + binary = (otrain > 0) + norm = NormalizePositive(axis=1) + train = norm.fit_transform(otrain) + + dists = distance.pdist(binary, 'correlation') + dists = distance.squareform(dists) + + neighbors = dists.argsort(axis=1) + filled = train.copy() + for u in range(filled.shape[0]): + # n_u are the neighbors of user + n_u = neighbors[u, 1:] + for m in range(filled.shape[1]): + # This code could be faster using numpy indexing trickery as the + # cost of readibility (this is left as an exercise to the reader): + revs = [train[neigh, m] + for neigh in n_u + if binary[neigh, m]] + if len(revs): + n = len(revs) + n //= 2 + n += 1 + revs = revs[:n] + filled[u,m] = np.mean(revs) + + return norm.inverse_transform(filled) + +def main(transpose_inputs=False): + train, test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 score (binary {} neighbours): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) if __name__ == '__main__': main() + main(transpose_inputs=True) diff --git a/ch08/data b/ch08/data deleted file mode 120000 index 32d37b53..00000000 --- a/ch08/data +++ /dev/null @@ -1 +0,0 @@ -../ch07/data \ No newline at end of file diff --git a/ch08/data/.gitignore b/ch08/data/.gitignore new file mode 100644 index 00000000..c391d625 --- /dev/null +++ b/ch08/data/.gitignore @@ -0,0 +1,3 @@ +retail.dat.gz +ml-100k.zip +/ml-100k/ diff --git a/ch08/data/download.sh b/ch08/data/download.sh new file mode 100755 index 00000000..b671171d --- /dev/null +++ b/ch08/data/download.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +curl -L -O http://files.grouplens.org/papers/ml-100k.zip +unzip ml-100k.zip +curl -L -O http://fimi.ua.ac.be/data/retail.dat.gz diff --git a/ch08/figure3.py b/ch08/figure3.py index ac6961f3..daafc300 100644 --- a/ch08/figure3.py +++ b/ch08/figure3.py @@ -12,4 +12,4 @@ plt.imshow(data[:200, :200], interpolation='nearest') plt.xlabel('User ID') plt.ylabel('Film ID') -plt.savefig('../1400_08_03+.png') +plt.savefig('Figure_08_03_DataMatrix.png') diff --git a/ch08/load_ml100k.py b/ch08/load_ml100k.py index b9c99b9d..7096e75c 100644 --- a/ch08/load_ml100k.py +++ b/ch08/load_ml100k.py @@ -6,15 +6,54 @@ # It is made available under the MIT License def load(): - '''Load ML-100k data - Returns a sparse matrix''' + Returns the review matrix as a numpy array''' import numpy as np from scipy import sparse + from os import path + + if not path.exists('data/ml-100k/u.data'): + raise IOError("Data has not been downloaded.\nTry the following:\n\n\tcd data\n\t./download.sh") + + # The input is in the form of a CSC sparse matrix, so it's a natural fit to + # load the data, but we then convert to a more traditional array before + # returning data = np.loadtxt('data/ml-100k/u.data') ij = data[:, :2] ij -= 1 # original data is in 1-based system values = data[:, 2] reviews = sparse.csc_matrix((values, ij.T)).astype(float) return reviews.toarray() + +def get_train_test(reviews=None, random_state=None): + '''Split data into training & testing + + Parameters + ---------- + reviews : ndarray, optional + Input data + + Returns + 
------- + train : ndarray + training data + test : ndarray + testing data + ''' + import numpy as np + import random + r = random.Random(random_state) + + if reviews is None: + reviews = load() + U,M = np.where(reviews) + test_idxs = np.array(r.sample(range(len(U)), len(U)//10)) + train = reviews.copy() + train[U[test_idxs], M[test_idxs]] = 0 + + test = np.zeros_like(reviews) + test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]] + + return train, test + diff --git a/ch08/norm.py b/ch08/norm.py new file mode 100644 index 00000000..2925bbca --- /dev/null +++ b/ch08/norm.py @@ -0,0 +1,75 @@ +import numpy as np + +class NormalizePositive(object): + + def __init__(self, axis=0): + self.axis = axis + + def fit(self, features, y=None): + # count features that are greater than zero in axis `self.axis`: + if self.axis == 1: + features = features.T + binary = (features > 0) + count = binary.sum(axis=0) + + # to avoid division by zero, set zero counts to one: + count[count == 0] = 1. + + self.mean = features.sum(axis=0)/count + + # Compute variance by average squared difference to the mean, but only + # consider differences where binary is True (i.e., where there was a + # true rating): + diff = (features - self.mean) * binary + diff **= 2 + # regularize the estimate of std by adding 0.1 + self.std = np.sqrt(0.1 + diff.sum(axis=0)/count) + return self + + def transform(self, features): + if self.axis == 1: + features = features.T + binary = (features > 0) + features = features - self.mean + features /= self.std + features *= binary + if self.axis == 1: + features = features.T + return features + + def inverse_transform(self, features, copy=True): + if copy: + features = features.copy() + if self.axis == 1: + features = features.T + features *= self.std + features += self.mean + if self.axis == 1: + features = features.T + return features + + def fit_transform(self, features): + return self.fit(features).transform(features) + + +def predict(train): + norm = NormalizePositive() + train = norm.fit_transform(train) + return norm.inverse_transform(train * 0.) 
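To make the behaviour of NormalizePositive and the predict baseline above concrete, here is a small sanity check; it is illustrative only (the matrix is invented) and assumes norm.py from this diff is importable. Zeros encode missing ratings, so they are excluded from the per-column mean and standard deviation.

```python
# Illustrative sanity check for NormalizePositive (zeros mean "not rated").
import numpy as np
from norm import NormalizePositive

ratings = np.array([[5., 0., 3.],
                    [0., 4., 4.]])

npos = NormalizePositive(axis=0)
z = npos.fit_transform(ratings)

print(npos.mean)   # per-column means over observed entries only: 5, 4, 3.5
# Inverting an all-zero matrix recovers the column-mean baseline, which is
# exactly what predict() above returns for every entry:
print(npos.inverse_transform(z * 0.))
```

Passing axis=1 computes the same statistics per row instead of per column.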
+ + +def main(transpose_inputs=False): + from load_ml100k import get_train_test + from sklearn import metrics + train,test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 score ({} normalization): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/regression.py b/ch08/regression.py new file mode 100644 index 00000000..693e99a4 --- /dev/null +++ b/ch08/regression.py @@ -0,0 +1,50 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +from sklearn.linear_model import ElasticNetCV +from norm import NormalizePositive +from sklearn import metrics + + +def predict(train): + binary = (train > 0) + reg = ElasticNetCV(fit_intercept=True, alphas=[ + 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) + norm = NormalizePositive() + train = norm.fit_transform(train) + + filled = train.copy() + # iterate over all users + for u in range(train.shape[0]): + # remove the current user for training + curtrain = np.delete(train, u, axis=0) + bu = binary[u] + if np.sum(bu) > 5: + reg.fit(curtrain[:,bu].T, train[u, bu]) + + # Fill the values that were not there already + filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) + return norm.inverse_transform(filled) + + +def main(transpose_inputs=False): + from load_ml100k import get_train_test + train,test = get_train_test(random_state=12) + if transpose_inputs: + train = train.T + test = test.T + filled = predict(train) + r2 = metrics.r2_score(test[test > 0], filled[test > 0]) + + print('R2 score ({} regression): {:.1%}'.format( + ('movie' if transpose_inputs else 'user'), + r2)) + +if __name__ == '__main__': + main() + main(transpose_inputs=True) diff --git a/ch08/stacked.py b/ch08/stacked.py index 76afe9fc..8fa6344e 100644 --- a/ch08/stacked.py +++ b/ch08/stacked.py @@ -1,40 +1,47 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from __future__ import print_function -from sklearn.linear_model import LinearRegression -from load_ml100k import load import numpy as np -import similar_movie -import usermodel +import load_ml100k +import regression import corrneighbours +from sklearn import linear_model, metrics +import norm + +def predict(train): + tr_train,tr_test = load_ml100k.get_train_test(train, random_state=34) + tr_predicted0 = regression.predict(tr_train) + tr_predicted1 = regression.predict(tr_train.T).T + tr_predicted2 = corrneighbours.predict(tr_train) + tr_predicted3 = corrneighbours.predict(tr_train.T).T + tr_predicted4 = norm.predict(tr_train) + tr_predicted5 = norm.predict(tr_train.T).T + stack_tr = np.array([ + tr_predicted0[tr_test > 0], + tr_predicted1[tr_test > 0], + tr_predicted2[tr_test > 0], + tr_predicted3[tr_test > 0], + tr_predicted4[tr_test > 0], + tr_predicted5[tr_test > 0], + ]).T + + lr = linear_model.LinearRegression() + lr.fit(stack_tr, tr_test[tr_test > 0]) + + stack_te = np.array([ + tr_predicted0.ravel(), + tr_predicted1.ravel(), + tr_predicted2.ravel(), + tr_predicted3.ravel(), + tr_predicted4.ravel(), + tr_predicted5.ravel(), + ]).T 
+ + return lr.predict(stack_te).reshape(train.shape) -reviews = load() -reg = LinearRegression() -es = np.array([ - usermodel.all_estimates(reviews), - corrneighbours.all_estimates(reviews), - similar_movie.all_estimates(reviews), -]) -total_error = 0.0 -coefficients = [] -for u in range(reviews.shape[0]): - es0 = np.delete(es, u, 1) - r0 = np.delete(reviews, u, 0) - X, Y = np.where(r0 > 0) - X = es[:, X, Y] - y = r0[r0 > 0] - reg.fit(X.T, y) - coefficients.append(reg.coef_) +def main(): + train,test = load_ml100k.get_train_test(random_state=12) + predicted = predict(train) + r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) + print('R2 stacked: {:.2%}'.format(r2)) - r0 = reviews[u] - X = np.where(r0 > 0) - p0 = reg.predict(es[:, u, X].squeeze().T) - err0 = r0[r0 > 0] - p0 - total_error += np.dot(err0, err0) - print(u) +if __name__ == '__main__': + main() diff --git a/ch08/stacked5.py b/ch08/stacked5.py deleted file mode 100644 index 7ace7d32..00000000 --- a/ch08/stacked5.py +++ /dev/null @@ -1,44 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from sklearn.linear_model import LinearRegression -from load_ml100k import load -import numpy as np -import similar_movie -import usermodel - -reviews = load() -# Collect several estimates -es = np.array([ - usermodel.all_estimates(reviews), - similar_movie.all_estimates(reviews, k=1), - similar_movie.all_estimates(reviews, k=2), - similar_movie.all_estimates(reviews, k=3), - similar_movie.all_estimates(reviews, k=4), - similar_movie.all_estimates(reviews, k=5), -]) - -total_error = 0.0 -coefficients = [] - -reg = LinearRegression() -# Iterate over all users -for u in range(reviews.shape[0]): - es0 = np.delete(es, u, axis=1) - r0 = np.delete(reviews, u, axis=0) - X, Y = np.where(r0 > 0) - X = es[:, X, Y] - y = r0[r0 > 0] - reg.fit(X.T, y) - coefficients.append(reg.coef_) - - r0 = reviews[u] - X = np.where(r0 > 0) - p0 = reg.predict(es[:, u, X].squeeze().T) - err0 = r0[r0 > 0] - p0 - total_error += np.dot(err0, err0) -coefficients = np.array(coefficients) diff --git a/ch08/usermodel.py b/ch08/usermodel.py deleted file mode 100644 index 589afca4..00000000 --- a/ch08/usermodel.py +++ /dev/null @@ -1,54 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -from sklearn.linear_model import ElasticNetCV -from sklearn.cross_validation import KFold - - -def learn_for(reviews, i): - reg = ElasticNetCV(fit_intercept=True, alphas=[ - 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) - nusers,nmovies = reviews.shape - u = reviews[i] - us = np.arange(reviews.shape[0]) - us = np.delete(us, i) - ps, = np.where(u.ravel() > 0) - x = reviews[us][:, ps].T - kf = KFold(len(ps), n_folds=4) - predictions = np.zeros(len(ps)) - for train, test in kf: - xc = x[train].copy() - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in range(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - - reg.fit(xc, u[train] - x1) - - xc = x[test].copy() - x1 = np.array([xi[xi > 0].mean() for xi in xc]) - x1 = np.nan_to_num(x1) - - for i in range(xc.shape[0]): - xc[i] -= (xc[i] > 0) * x1[i] - - p = reg.predict(xc).ravel() - predictions[test] = p - fill_preds = np.zeros(nmovies) - 
fill_preds[ps] = predictions - return fill_preds - - -def all_estimates(reviews): - whole_data = [] - for i in range(reviews.shape[0]): - s = learn_for(reviews, i) - whole_data.append(s) - return np.array(whole_data) - diff --git a/ch10/.gitignore b/ch10/.gitignore new file mode 100644 index 00000000..2f266195 --- /dev/null +++ b/ch10/.gitignore @@ -0,0 +1 @@ +AnimTransDistr/ diff --git a/ch10/README.rst b/ch10/README.rst index 6aea7be0..91e32756 100644 --- a/ch10/README.rst +++ b/ch10/README.rst @@ -17,22 +17,21 @@ Running ``download.sh`` will retrieve the other dataset into a directory Scripts ------- -threshold.py - Threshold building image with both Otsu and Ridley-Calvard threshold +chapter.py + Code as written in the book. +thresholded_figure.py + Computes the thresholded figures, including after Gaussian blurring lena-ring.py - Show Lena in a sharp ring around a soft focus image -figure5_6.py - Computes figures 5 and 6 in the book -figure9.py - Compute Sobel images + Lena image with center in focus and blurred edges figure10.py Just paste two images next to each others -figure13.py - Demonstrate salt&pepper effect -edginess.py - Contains the ``edginess_sobel`` function from the book +features.py + Contains the color histogram function from the book as well as a simple + wrapper around ``mahotas.texture.haralick`` simple_classification.py - Classify SimpleImageDataset with texture features + sobel feature -figure18.py + Classify SimpleImageDataset with texture features + color histogram features +large_classification.py Classify ``AnimTransDistr`` with both texture and SURF features. +neighbors.py + Computes image neighbors as well as the neighbor figure from the book. diff --git a/ch10/chapter.py b/ch10/chapter.py new file mode 100644 index 00000000..233720bb --- /dev/null +++ b/ch10/chapter.py @@ -0,0 +1,186 @@ +import numpy as np +import mahotas as mh +image = mh.imread('scene00.jpg') +from matplotlib import pyplot as plt +plt.imshow(image) +plt.show() +image = mh.colors.rgb2grey(image, dtype=np.uint8) +plt.imshow(image) # Display the image +plt.gray() +thresh = mh.thresholding.otsu(image) +print('Otsu threshold is {}.'.format(thresh)) +# Otsu threshold is 138. +plt.imshow(image > thresh) + +im16 = mh.gaussian_filter(image,16) +im = mh.demos.load('lenna') + +r,g,b = im.transpose(2,0,1) +r12 = mh.gaussian_filter(r, 12.) +g12 = mh.gaussian_filter(g, 12.) +b12 = mh.gaussian_filter(b, 12.) +im12 = mh.as_rgb(r12,g12,b12) +h, w = r.shape # height and width +Y, X = np.mgrid[:h,:w] +Y = Y-h/2. # center at h/2 +Y = Y / Y.max() # normalize to -1 .. +1 + +X = X-w/2. 
+X = X / X.max() + +C = np.exp(-2.*(X**2+ Y**2)) + +# Normalize again to 0..1 +C = C - C.min() +C = C / C.ptp() +C = C[:,:,None] # This adds a dummy third dimension to C + +ringed = mh.stretch(im*C + (1-C)*im12) + +haralick_features = mh.features.haralick(image) +haralick_features_mean = np.mean(haralick_features, axis=0) +haralick_features_all = np.ravel(haralick_features) + +from glob import glob +images = glob('../SimpleImageDataset/*.jpg') +features = [] +labels = [] +for im in images: + labels.append(im[:-len('00.jpg')]) + im = mh.imread(im) + im = mh.colors.rgb2gray(im, dtype=np.uint8) + features.append(mh.features.haralick(im).ravel()) + +features = np.array(features) +labels = np.array(labels) + +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + + +from sklearn import cross_validation +cv = cross_validation.LeaveOneOut(len(images)) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 81.1% + +def chist(im): + im = im // 64 + r,g,b = im.transpose((2,0,1)) + pixels = 1 * r + 4 * b + 16 * g + hist = np.bincount(pixels.ravel(), minlength=64) + hist = hist.astype(float) + hist = np.log1p(hist) + return hist + +features = [] +for im in images: + im = mh.imread(im) + features.append(chist(im)) + + + +features = [] +for im in images: + imcolor = mh.imread(im) + im = mh.colors.rgb2gray(imcolor, dtype=np.uint8) + features.append(np.concatenate([ + mh.features.haralick(im).ravel(), + chist(imcolor), + ])) + + +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 95.6% + + +features = [] +for im in images: + imcolor = mh.imread(im) + # Ignore everything in the 200 pixels close to the borders + imcolor = imcolor[200:-200, 200:-200] + im = mh.colors.rgb2gray(imcolor, dtype=np.uint8) + features.append(np.concatenate([ + mh.features.haralick(im).ravel(), + chist(imcolor), + ])) + +sc = StandardScaler() +features = sc.fit_transform(features) +from scipy.spatial import distance +dists = distance.squareform(distance.pdist(features)) + + +fig, axes = plt.subplots(2, 9) +for ci,i in enumerate(range(0,90,10)): + left = images[i] + dists_left = dists[i] + right = dists_left.argsort() + # right[0] is the same as left[i], so pick the next closest element + right = right[1] + right = images[right] + left = mh.imread(left) + right = mh.imread(right) + axes[0, ci].imshow(left) + axes[1, ci].imshow(right) + + + +from sklearn.grid_search import GridSearchCV +C_range = 10.0 ** np.arange(-4, 3) +grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range}) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', grid)]) + +cv = cross_validation.KFold(len(features), 5, + shuffle=True, random_state=123) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) + + + + +from mahotas.features import surf +image = mh.demos.load('lena') +image = mh.colors.rgb2gray(image, dtype=np.uint8) +descriptors = surf.surf(image, descriptor_only=True) + +from mahotas.features import surf +descriptors = surf.dense(image, spacing=16) +alldescriptors = [] +for im in images: + im = mh.imread(im, as_grey=True) + im = im.astype(np.uint8) + alldescriptors.append(surf.dense(image, spacing=16)) +# get all 
descriptors into a single array +concatenated = np.concatenate(alldescriptors) +print('Number of descriptors: {}'.format( + len(concatenated))) +# use only every 64th vector +concatenated = concatenated[::64] +from sklearn.cluster import KMeans # FIXME CAPITALIZATION +k = 256 +km = KMeans(k) +km.fit(concatenated) + +features = [] +for d in alldescriptors: + c = km.predict(d) + features.append( + np.array([np.sum(c == ci) for ci in range(k)]) + ) +# build single array and convert to float +features = np.array(features, dtype=float) +scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) +print('Accuracy: {:.1%}'.format(scores.mean())) +# Accuracy: 62.6% + + diff --git a/ch10/download.sh b/ch10/download.sh index 93121a7f..fb623f3d 100755 --- a/ch10/download.sh +++ b/ch10/download.sh @@ -2,7 +2,7 @@ mkdir -p AnimTransDistr cd AnimTransDistr -wget http://vision.stanford.edu/Datasets/AnimTransDistr.rar +curl -O http://vision.stanford.edu/Datasets/AnimTransDistr.rar unrar x AnimTransDistr.rar # The following file is a weird file: rm Anims/104034.jpg diff --git a/ch10/edginess.py b/ch10/edginess.py deleted file mode 100644 index 5815d654..00000000 --- a/ch10/edginess.py +++ /dev/null @@ -1,22 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -import mahotas as mh - - -def edginess_sobel(image): - '''Measure the "edginess" of an image - - image should be a 2d numpy array (an image) - - Returns a floating point value which is higher the "edgier" the image is. - - ''' - edges = mh.sobel(image, just_filter=True) - edges = edges.ravel() - return np.sqrt(np.dot(edges, edges)) diff --git a/ch10/features.py b/ch10/features.py new file mode 100644 index 00000000..42847b30 --- /dev/null +++ b/ch10/features.py @@ -0,0 +1,70 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +import numpy as np +import mahotas as mh + + +def edginess_sobel(image): + '''Measure the "edginess" of an image + + image should be a 2d numpy array (an image) + + Returns a floating point value which is higher the "edgier" the image is. 
+ + ''' + edges = mh.sobel(image, just_filter=True) + edges = edges.ravel() + return np.sqrt(np.dot(edges, edges)) + +def texture(im): + '''Compute features for an image + + Parameters + ---------- + im : ndarray + + Returns + ------- + fs : ndarray + 1-D array of features + ''' + im = im.astype(np.uint8) + return mh.features.haralick(im).ravel() + + +def chist(im): + '''Compute color histogram of input image + + Parameters + ---------- + im : ndarray + should be an RGB image + + Returns + ------- + c : ndarray + 1-D array of histogram values + ''' + + # Downsample pixel values: + im = im // 64 + + # We can also implement the following by using np.histogramdd + # im = im.reshape((-1,3)) + # bins = [np.arange(5), np.arange(5), np.arange(5)] + # hist = np.histogramdd(im, bins=bins)[0] + # hist = hist.ravel() + + # Separate RGB channels: + r,g,b = im.transpose((2,0,1)) + + pixels = 1 * r + 4 * g + 16 * b + hist = np.bincount(pixels.ravel(), minlength=64) + hist = hist.astype(float) + return np.log1p(hist) + diff --git a/ch10/figure10.py b/ch10/figure10.py index 058b1069..6cb45e7a 100644 --- a/ch10/figure10.py +++ b/ch10/figure10.py @@ -11,10 +11,10 @@ # This little script just builds an image with two examples, side-by-side: text = mh.imread("../SimpleImageDataset/text21.jpg") -scene = mh.imread("../SimpleImageDataset/scene00.jpg") +building = mh.imread("../SimpleImageDataset/building00.jpg") h, w, _ = text.shape canvas = np.zeros((h, 2 * w + 128, 3), np.uint8) -canvas[:, -w:] = scene +canvas[:, -w:] = building canvas[:, :w] = text canvas = canvas[::4, ::4] -mh.imsave('../1400OS_10_10+.jpg', canvas) +mh.imsave('figure10.jpg', canvas) diff --git a/ch10/figure13.py b/ch10/figure13.py deleted file mode 100644 index 89174e8e..00000000 --- a/ch10/figure13.py +++ /dev/null @@ -1,28 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import mahotas as mh -from mahotas.colors import rgb2grey -import numpy as np - -# Adds a little salt-n-pepper noise to an image - -im = mh.demos.load('lena') -im = rgb2grey(im) - -# Salt & pepper arrays -salt = np.random.random(im.shape) > .975 -pepper = np.random.random(im.shape) > .975 - -# salt is 170 & pepper is 30 -# Some playing around showed that setting these to more extreme values looks -# very artificial. 
These look nicer - -im = np.maximum(salt * 170, mh.stretch(im)) -im = np.minimum(pepper * 30 + im * (~pepper), im) - -mh.imsave('../1400OS_10_13+.jpg', im.astype(np.uint8)) diff --git a/ch10/figure18.py b/ch10/figure18.py deleted file mode 100644 index c0e80d0a..00000000 --- a/ch10/figure18.py +++ /dev/null @@ -1,121 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -from __future__ import print_function -import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression -from matplotlib import pyplot as plt -import numpy as np -from glob import glob - -basedir = 'AnimTransDistr' -print('This script will test classification of the AnimTransDistr dataset') - - -def features_for(images): - fs = [] - for im in images: - im = mh.imread(im, as_grey=True).astype(np.uint8) - fs.append(mh.features.haralick(im).mean(0)) - return np.array(fs) - - -def features_labels(groups): - labels = np.zeros(sum(map(len, groups))) - st = 0 - for i, g in enumerate(groups): - labels[st:st + len(g)] = i - st += len(g) - return np.vstack(groups), labels - -classes = [ - 'Anims', - 'Cars', - 'Distras', - 'Trans', -] - -print('Computing whole-image texture features...') -features = [] -labels = [] -for ci, cl in enumerate(classes): - images = glob('{}/{}/*.jpg'.format(basedir, cl)) - features.extend(features_for(images)) - labels.extend([ci for _ in images]) - -features = np.array(features) -labels = np.array(labels) - -scores0 = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=10) -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: %s%%' % ( - 0.1 * round(1000 * scores0.mean()))) - -tfeatures = features - -from sklearn.cluster import KMeans -from mahotas.features import surf - -images = [] -labels = [] - -for ci, cl in enumerate(classes): - curimages = glob('{}/{}/*.jpg'.format(basedir, cl)) - images.extend(curimages) - labels.extend([ci for _ in curimages]) -labels = np.array(labels) - -print('Computing SURF descriptors...') -alldescriptors = [] -for im in images: - im = mh.imread(im, as_grey=1) - im = im.astype(np.uint8) - - # To use dense sampling, you can try the following line: - # alldescriptors.append(surf.dense(im, spacing=max(im.shape)//32)) - alldescriptors.append(surf.surf(im, descriptor_only=True)) - -print('Descriptors done') -k = 256 -km = KMeans(k) - -concatenated = np.concatenate(alldescriptors) -concatenated = concatenated[::64] -print('Clustering with K-means...') -km.fit(concatenated) -features = [] -for d in alldescriptors: - c = km.predict(d) - features.append( - np.array([np.sum(c == i) for i in xrange(k)]) - ) -features = np.array(features) -print('predicting...') -scoreSURFlr = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Log. Reg [SURF features]: %s%%' % ( - 0.1 * round(1000 * scoreSURFlr.mean()))) - -print('Attemping classification with all features combined...') -allfeatures = np.hstack([features, tfeatures]) -scoreSURFplr = cross_validation.cross_val_score( - LogisticRegression(), allfeatures, labels, cv=5).mean() - -print('Accuracy (5 fold x-val) with Log. 
Reg [All features]: %s%%' % ( - 0.1 * round(1000 * scoreSURFplr.mean()))) - -plt.plot([0, 1, 2], 100 * - np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), 'k-', lw=8) -plt.plot( - [0, 1, 2], 100 * np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), - 'o', mec='#cccccc', mew=12, mfc='white') -plt.xlim(-.5, 2.5) -plt.ylim(scores0.mean() * 90., scoreSURFplr * 110) -plt.xticks([0, 1, 2], ["Texture", "SURF", "combined"]) -plt.ylabel('Accuracy (%)') -plt.savefig('../1400OS_10_18+.png') diff --git a/ch10/figure5_6.py b/ch10/figure5_6.py deleted file mode 100644 index 37d52752..00000000 --- a/ch10/figure5_6.py +++ /dev/null @@ -1,37 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -import mahotas as mh -image = mh.imread('../SimpleImageDataset/building05.jpg') -image = mh.colors.rgb2gray(image) - -# Compute Gaussian filtered versions with increasing kernel widths -im8 = mh.gaussian_filter(image, 8) -im16 = mh.gaussian_filter(image, 16) -im32 = mh.gaussian_filter(image, 32) - -# We now build a composite image with three panels: -# -# [ IM8 | | IM16 | | IM32 ] - -h, w = im8.shape -canvas = np.ones((h, 3 * w + 256), np.uint8) -canvas *= 255 -canvas[:, :w] = im8 -canvas[:, w + 128:2 * w + 128] = im16 -canvas[:, -w:] = im32 -mh.imsave('../1400OS_10_05+.jpg', canvas[:, ::2]) - -# Threshold the image -# We need to first stretch it to convert to an integer image -im32 = mh.stretch(im32) -ot32 = mh.otsu(im32) - -# Convert to 255 np.uint8 to match the other images -im255 = 255 * (im32 > ot32).astype(np.uint8) -mh.imsave('../1400OS_10_06+.jpg', im255) diff --git a/ch10/figure9.py b/ch10/figure9.py deleted file mode 100644 index fbaa985f..00000000 --- a/ch10/figure9.py +++ /dev/null @@ -1,28 +0,0 @@ -# This code is supporting material for the book -# Building Machine Learning Systems with Python -# by Willi Richert and Luis Pedro Coelho -# published by PACKT Publishing -# -# It is made available under the MIT License - -import numpy as np -import mahotas as mh -image = mh.imread('../SimpleImageDataset/building05.jpg') -image = mh.colors.rgb2gray(image, dtype=np.uint8) - -# We need to downsample ``image`` so that the details are visibly pixelated. 
-# This exaggerates the effect so that the result is clear -image = image[::4, ::4] -thresh = mh.sobel(image) -filtered = mh.sobel(image, just_filter=True) - -thresh = mh.dilate(thresh, np.ones((7, 7))) -filtered = mh.dilate(mh.stretch(filtered), np.ones((7, 7))) - -# Paste the thresholded and non-thresholded versions of the image side-by-side -h, w = thresh.shape -canvas = 255 * np.ones((h, w * 2 + 64), np.uint8) -canvas[:, :w] = thresh * 255 -canvas[:, -w:] = filtered - -mh.imsave('../1400OS_10_09+.jpg', canvas) diff --git a/ch10/large_classification.py b/ch10/large_classification.py new file mode 100644 index 00000000..8db3571b --- /dev/null +++ b/ch10/large_classification.py @@ -0,0 +1,108 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing +# +# It is made available under the MIT License + +from __future__ import print_function +import mahotas as mh +from glob import glob +from sklearn import cross_validation +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.grid_search import GridSearchCV +import numpy as np + +basedir = 'AnimTransDistr' +print('This script will test classification of the AnimTransDistr dataset') + +C_range = 10.0 ** np.arange(-4, 3) +grid = GridSearchCV(LogisticRegression(), param_grid={'C' : C_range}) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', grid)]) + +def features_for(im): + from features import chist + im = mh.imread(im) + img = mh.colors.rgb2grey(im).astype(np.uint8) + return np.concatenate([mh.features.haralick(img).ravel(), + chist(im)]) + +def images(): + '''Iterate over all (image,label) pairs + + This function will return + ''' + for ci, cl in enumerate(classes): + images = glob('{}/{}/*.jpg'.format(basedir, cl)) + for im in sorted(images): + yield im, ci + +classes = [ + 'Anims', + 'Cars', + 'Distras', + 'Trans', +] + +print('Computing whole-image texture features...') +ifeatures = [] +labels = [] +for im, ell in images(): + ifeatures.append(features_for(im)) + labels.append(ell) + +ifeatures = np.array(ifeatures) +labels = np.array(labels) + +cv = cross_validation.KFold(len(ifeatures), 5, shuffle=True, random_state=123) +scores0 = cross_validation.cross_val_score( + clf, ifeatures, labels, cv=cv) +print('Accuracy (5 fold x-val) with Logistic Regression [image features]: {:.1%}'.format( + scores0.mean())) + + +from sklearn.cluster import KMeans +from mahotas.features import surf + + +print('Computing SURF descriptors...') +alldescriptors = [] +for im,_ in images(): + im = mh.imread(im, as_grey=True) + im = im.astype(np.uint8) + + # To use dense sampling, you can try the following line: + # alldescriptors.append(surf.dense(im, spacing=16)) + alldescriptors.append(surf.surf(im, descriptor_only=True)) + +print('Descriptor computation complete.') +k = 256 +km = KMeans(k) + +concatenated = np.concatenate(alldescriptors) +print('Number of descriptors: {}'.format( + len(concatenated))) +concatenated = concatenated[::64] +print('Clustering with K-means...') +km.fit(concatenated) +sfeatures = [] +for d in alldescriptors: + c = km.predict(d) + sfeatures.append(np.bincount(c, minlength=k)) +sfeatures = np.array(sfeatures, dtype=float) +print('predicting...') +score_SURF = cross_validation.cross_val_score( + clf, sfeatures, labels, cv=cv).mean() +print('Accuracy (5 fold x-val) with Logistic Regression [SURF features]: 
{:.1%}'.format( + score_SURF.mean())) + + +print('Performing classification with all features combined...') +allfeatures = np.hstack([sfeatures, ifeatures]) +score_SURF_global = cross_validation.cross_val_score( + clf, allfeatures, labels, cv=cv).mean() +print('Accuracy (5 fold x-val) with Logistic Regression [All features]: {:.1%}'.format( + score_SURF_global.mean())) diff --git a/ch10/neighbors.py b/ch10/neighbors.py new file mode 100644 index 00000000..1f71d0de --- /dev/null +++ b/ch10/neighbors.py @@ -0,0 +1,64 @@ +# This code is supporting material for the book +# Building Machine Learning Systems with Python +# by Willi Richert and Luis Pedro Coelho +# published by PACKT Publishing + +import numpy as np +import mahotas as mh +from glob import glob +from features import texture, chist +from matplotlib import pyplot as plt +from sklearn.preprocessing import StandardScaler +from scipy.spatial import distance + +basedir = '../SimpleImageDataset/' + + +haralicks = [] +chists = [] + +print('Computing features...') +# Use glob to get all the images +images = glob('{}/*.jpg'.format(basedir)) +# We sort the images to ensure that they are always processed in the same order +# Otherwise, this would introduce some variation just based on the random +# ordering that the filesystem uses +images.sort() + +for fname in images: + imc = mh.imread(fname) + imc = imc[200:-200,200:-200] + haralicks.append(texture(mh.colors.rgb2grey(imc))) + chists.append(chist(imc)) + +haralicks = np.array(haralicks) +chists = np.array(chists) +features = np.hstack([chists, haralicks]) + +print('Computing neighbors...') +sc = StandardScaler() +features = sc.fit_transform(features) +dists = distance.squareform(distance.pdist(features)) + +print('Plotting...') +fig, axes = plt.subplots(2, 9, figsize=(16,8)) + +# Remove ticks from all subplots +for ax in axes.flat: + ax.set_xticks([]) + ax.set_yticks([]) + +for ci,i in enumerate(range(0,90,10)): + left = images[i] + dists_left = dists[i] + right = dists_left.argsort() + # right[0] is the same as left[i], so pick the next closest element + right = right[1] + right = images[right] + left = mh.imread(left) + right = mh.imread(right) + axes[0, ci].imshow(left) + axes[1, ci].imshow(right) + +fig.tight_layout() +fig.savefig('figure_neighbors.png', dpi=300) diff --git a/ch10/scene00.jpg b/ch10/scene00.jpg new file mode 100644 index 00000000..ed727a50 Binary files /dev/null and b/ch10/scene00.jpg differ diff --git a/ch10/simple_classification.py b/ch10/simple_classification.py index 6a512f64..a5a448d2 100644 --- a/ch10/simple_classification.py +++ b/ch10/simple_classification.py @@ -6,42 +6,33 @@ # It is made available under the MIT License import mahotas as mh -from sklearn import cross_validation -from sklearn.linear_model.logistic import LogisticRegression import numpy as np from glob import glob -from edginess import edginess_sobel - -basedir = '../SimpleImageDataset/' +from features import texture, chist +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler -def features_for(im): - '''Compute features for an image - - Parameters - ---------- - im : str - filepath for image to process +basedir = '../SimpleImageDataset/' - Returns - ------- - fs : ndarray - 1-D array of features - ''' - im = mh.imread(im, as_grey=True).astype(np.uint8) - return mh.features.haralick(im).mean(0) haralicks = [] -sobels = [] labels = [] +chists = [] print('This script will test (with cross-validation) 
classification of the simple 3 class dataset') print('Computing features...') # Use glob to get all the images images = glob('{}/*.jpg'.format(basedir)) -for fname in images: - haralicks.append(features_for(fname)) - sobels.append(edginess_sobel(mh.imread(fname, as_grey=True))) + +# We sort the images to ensure that they are always processed in the same order +# Otherwise, this would introduce some variation just based on the random +# ordering that the filesystem uses +for fname in sorted(images): + imc = mh.imread(fname) + haralicks.append(texture(mh.colors.rgb2grey(imc))) + chists.append(chist(imc)) # Files are named like building00.jpg, scene23.jpg... labels.append(fname[:-len('xx.jpg')]) @@ -49,27 +40,31 @@ def features_for(im): print('Finished computing features.') haralicks = np.array(haralicks) -sobels = np.array(sobels) labels = np.array(labels) +chists = np.array(chists) -# We use logistic regression because it is very fast. +haralick_plus_chists = np.hstack([chists, haralicks]) + + +# We use Logistic Regression because it achieves high accuracy on small(ish) datasets # Feel free to experiment with other classifiers -scores = cross_validation.cross_val_score( - LogisticRegression(), haralicks, labels, cv=5) -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) +clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) -haralick_plus_sobel = np.hstack([np.atleast_2d(sobels).T, haralicks]) +from sklearn import cross_validation +cv = cross_validation.LeaveOneOut(len(images)) scores = cross_validation.cross_val_score( - LogisticRegression(), haralick_plus_sobel, labels, cv=5).mean() -print('Accuracy (5 fold x-val) with Logistic Regrssion [std features + sobel]: {}%'.format( - 0.1 * round(1000 * scores.mean()))) + clf, haralicks, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [haralick features]: {:.1%}'.format( + scores.mean())) +scores = cross_validation.cross_val_score( + clf, chists, labels, cv=cv) +print('Accuracy (Leave-one-out) with Logistic Regression [color histograms]: {:.1%}'.format( + scores.mean())) -# We can try to just use the sobel feature. The result is almost completely -# random. 
 scores = cross_validation.cross_val_score(
-    LogisticRegression(), np.atleast_2d(sobels).T, labels, cv=5).mean()
-print('Accuracy (5 fold x-val) with Logistic Regrssion [only using sobel feature]: {}%'.format(
-    0.1 * round(1000 * scores.mean())))
+    clf, haralick_plus_chists, labels, cv=cv)
+print('Accuracy (Leave-one-out) with Logistic Regression [texture features + color histograms]: {:.1%}'.format(
+    scores.mean()))
diff --git a/ch10/thresholded_figure.py b/ch10/thresholded_figure.py
new file mode 100644
index 00000000..947762e8
--- /dev/null
+++ b/ch10/thresholded_figure.py
@@ -0,0 +1,31 @@
+import mahotas as mh
+import numpy as np
+from matplotlib import pyplot as plt
+
+# Load image & convert to B&W
+image = mh.imread('../SimpleImageDataset/scene00.jpg')
+image = mh.colors.rgb2grey(image, dtype=np.uint8)
+plt.imshow(image)
+plt.gray()
+plt.title('original image')
+
+thresh = mh.thresholding.otsu(image)
+print('Otsu threshold is {}.'.format(thresh))
+
+threshed = (image > thresh)
+plt.figure()
+plt.imshow(threshed)
+plt.title('thresholded image')
+mh.imsave('thresholded.png', threshed.astype(np.uint8)*255)
+
+im16 = mh.gaussian_filter(image, 16)
+
+# Repeat the thresholding operations with the blurred image
+thresh = mh.thresholding.otsu(im16.astype(np.uint8))
+threshed = (im16 > thresh)
+plt.figure()
+plt.imshow(threshed)
+plt.title('thresholded image (after blurring)')
+print('Otsu threshold after blurring is {}.'.format(thresh))
+mh.imsave('thresholded16.png', threshed.astype(np.uint8)*255)
+plt.show()
diff --git a/ch12/.gitignore b/ch12/.gitignore
index fb47e2ef..fa8fbcc2 100644
--- a/ch12/.gitignore
+++ b/ch12/.gitignore
@@ -1 +1,3 @@
 *.jugdata/
+output.txt
+results.image.txt
diff --git a/ch12/README.rst b/ch12/README.rst
new file mode 100644
index 00000000..fba88d4a
--- /dev/null
+++ b/ch12/README.rst
@@ -0,0 +1,28 @@
+==========
+Chapter 12
+==========
+
+Support code for *Chapter 12: Big(ger) Data*
+
+Data
+----
+
+This chapter relies only on the image dataset that is packaged with the
+repository at ``../SimpleImageDataset/``.
+ +Scripts +------- + +chapter.py + Code as written in the book +jugfile.py + Example jugfile +image-classification.py + Jugfile implementation of image classification from Chapter 10 + +setup-aws.txt + Commands to setup Amazon WebServices machine +run-jugfile.sh + Wrapper script to run jug file on jugfile.py +run-image-classification.sh + Wrapper script to run jug file on image-classification.py diff --git a/ch12/chapter.py b/ch12/chapter.py new file mode 100644 index 00000000..15326680 --- /dev/null +++ b/ch12/chapter.py @@ -0,0 +1,95 @@ +from jug import TaskGenerator +from glob import glob +import mahotas as mh +@TaskGenerator +def compute_texture(im): + from features import texture + imc = mh.imread(im) + return texture(mh.colors.rgb2gray(imc)) + +@TaskGenerator +def chist_file(fname): + from features import chist + im = mh.imread(fname) + return chist(im) + +import numpy as np +to_array = TaskGenerator(np.array) +hstack = TaskGenerator(np.hstack) + +haralicks = [] +chists = [] +labels = [] + +# Change this variable to point to +# the location of the dataset is on disk +basedir = '../SimpleImageDataset/' +# Use glob to get all the images +images = glob('{}/*.jpg'.format(basedir)) + +for fname in sorted(images): + haralicks.append(compute_texture(fname)) + chists.append(chist_file(fname)) + # The class is encoded in the filename as xxxx00.jpg + labels.append(fname[:-len('00.jpg')]) + +haralicks = to_array(haralicks) +chists = to_array(chists) +labels = to_array(labels) + +@TaskGenerator +def accuracy(features, labels): + from sklearn.linear_model import LogisticRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn import cross_validation + + clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + cv = cross_validation.LeaveOneOut(len(features)) + scores = cross_validation.cross_val_score( + clf, features, labels, cv=cv) + return scores.mean() +scores_base = accuracy(haralicks, labels) +scores_chist = accuracy(chists, labels) + +combined = hstack([chists, haralicks]) +scores_combined = accuracy(combined, labels) + +@TaskGenerator +def print_results(scores): + with open('results.image.txt', 'w') as output: + for k,v in scores: + output.write('Accuracy [{}]: {:.1%}\n'.format( + k, v.mean())) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('combined' , scores_combined), + ]) + +@TaskGenerator +def compute_lbp(fname): + from mahotas.features import lbp + imc = mh.imread(fname) + im = mh.colors.rgb2grey(imc) + return lbp(im, radius=8, points=6) + +lbps = [] +for fname in sorted(images): + # the rest of the loop as before + lbps.append(compute_lbp(fname)) +lbps = to_array(lbps) + +scores_lbps = accuracy(lbps, labels) +combined_all = hstack([chists, haralicks, lbps]) +scores_combined_all = accuracy(combined_all, labels) + +print_results([ + ('base', scores_base), + ('chists', scores_chist), + ('lbps', scores_lbps), + ('combined' , scores_combined), + ('combined_all' , scores_combined_all), + ]) diff --git a/ch12/features.py b/ch12/features.py new file mode 120000 index 00000000..142a324d --- /dev/null +++ b/ch12/features.py @@ -0,0 +1 @@ +../ch10/features.py \ No newline at end of file diff --git a/ch12/image-classification.py b/ch12/image-classification.py index bae62d6d..6f76d26d 100644 --- a/ch12/image-classification.py +++ b/ch12/image-classification.py @@ -6,11 +6,11 @@ # It is made available under the MIT License import mahotas as mh -from sklearn import 
cross_validation -from sklearn.linear_model.logistic import LogisticRegression import numpy as np from glob import glob from jug import TaskGenerator + +# We need to use the `features` module from chapter 10. from sys import path path.append('../ch10') @@ -20,7 +20,7 @@ basedir = '../SimpleImageDataset/' @TaskGenerator -def features_for(im): +def compute_texture(im): '''Compute features for an image Parameters @@ -33,57 +33,84 @@ def features_for(im): fs : ndarray 1-D array of features ''' - im = mh.imread(im, as_grey=True).astype(np.uint8) - return mh.features.haralick(im).mean(0) + from features import texture + imc = mh.imread(im) + return texture(mh.colors.rgb2grey(imc)) + +@TaskGenerator +def chist(fname): + from features import chist as color_histogram + im = mh.imread(fname) + return color_histogram(im) @TaskGenerator -def edginess_sobel_from_fname(fname): - from edginess import edginess_sobel - return edginess_sobel(mh.imread(fname, as_grey=True)) +def compute_lbp(fname): + from mahotas.features import lbp + imc = mh.imread(fname) + im = mh.colors.rgb2grey(imc) + return lbp(im, radius=8, points=6) + @TaskGenerator def accuracy(features, labels): + from sklearn.linear_model import LogisticRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn import cross_validation # We use logistic regression because it is very fast. # Feel free to experiment with other classifiers + clf = Pipeline([('preproc', StandardScaler()), + ('classifier', LogisticRegression())]) + cv = cross_validation.LeaveOneOut(len(features)) scores = cross_validation.cross_val_score( - LogisticRegression(), features, labels, cv=5) + clf, features, labels, cv=cv) return scores.mean() @TaskGenerator -def stack_features(sobels, haralicks): - return np.hstack([np.atleast_2d(sobels).T, haralicks]) - -@TaskGenerator -def print_results(scores_base, scores_combined): - output = open('results.image.txt', 'w') - output.write('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: {}%\n'.format( - 0.1 * round(1000 * scores_base.mean()))) - output.write('Accuracy (5 fold x-val) with Logistic Regrssion [std features + sobel]: {}%\n'.format( - 0.1 * round(1000 * scores_combined.mean()))) - output.close() +def print_results(scores): + with open('results.image.txt', 'w') as output: + for k,v in scores: + output.write('Accuracy (LOO x-val) with Logistic Regression [{0}]: {1:.1%}\n'.format( + k, v.mean())) to_array = TaskGenerator(np.array) +hstack = TaskGenerator(np.hstack) haralicks = [] -sobels = [] +chists = [] +lbps = [] labels = [] # Use glob to get all the images -images = glob('{}/*.jpg'.format(basedir)) -for fname in images: - haralicks.append(features_for(fname)) - sobels.append(edginess_sobel_from_fname(fname)) +images = glob('{0}/*.jpg'.format(basedir)) +for fname in sorted(images): + haralicks.append(compute_texture(fname)) + chists.append(chist(fname)) + lbps.append(compute_lbp(fname)) labels.append(fname[:-len('00.jpg')]) # The class is encoded in the filename as xxxx00.jpg haralicks = to_array(haralicks) -sobels = to_array(sobels) +chists = to_array(chists) +lbps = to_array(lbps) labels = to_array(labels) scores_base = accuracy(haralicks, labels) -haralick_plus_sobel = stack_features(sobels, haralicks) -scores_combined = accuracy(haralick_plus_sobel, labels) - -print_results(scores_base, scores_combined) +scores_chist = accuracy(chists, labels) +scores_lbps = accuracy(lbps, labels) + +combined = hstack([chists, haralicks]) +scores_combined = 
accuracy(combined, labels)
+
+combined_all = hstack([chists, haralicks, lbps])
+scores_combined_all = accuracy(combined_all, labels)
+
+print_results([
+        ('base', scores_base),
+        ('chists', scores_chist),
+        ('lbps', scores_lbps),
+        ('combined' , scores_combined),
+        ('combined_all' , scores_combined_all),
+        ])
diff --git a/ch12/jugfile.py b/ch12/jugfile.py
index 858c64f4..9d7e2b7a 100644
--- a/ch12/jugfile.py
+++ b/ch12/jugfile.py
@@ -23,7 +23,7 @@ def add(a, b):
 @TaskGenerator
 def print_final_result(oname, value):
     with open(oname, 'w') as output:
-        output.write("Final result: {}\n".format(value))
+        output.write("Final result: {0}\n".format(value))
 
 input = 2
 y = double(input)
diff --git a/ch12/run-image-classification.sh b/ch12/run-image-classification.sh
new file mode 100755
index 00000000..868d07fa
--- /dev/null
+++ b/ch12/run-image-classification.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+jug execute image-classification.py
diff --git a/ch12/run-jugfile.sh b/ch12/run-jugfile.sh
new file mode 100755
index 00000000..0ff59131
--- /dev/null
+++ b/ch12/run-jugfile.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+
+jug execute
+
diff --git a/ch12/setup-aws.txt b/ch12/setup-aws.txt
new file mode 100644
index 00000000..292654eb
--- /dev/null
+++ b/ch12/setup-aws.txt
@@ -0,0 +1,7 @@
+sudo yum update
+sudo yum -y install python-devel python-pip numpy scipy python-matplotlib
+sudo yum -y install gcc-c++
+sudo yum -y install git
+sudo pip-python install -U pip
+sudo pip install scikit-learn jug mahotas
+
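Note on the shared feature helpers: several scripts added or updated above (ch10/simple_classification.py, ch10/large_classification.py, ch10/neighbors.py, ch12/chapter.py, ch12/image-classification.py) import texture and chist from a features module (ch10/features.py, symlinked into ch12/), but that module is not shown in this diff. The following is only a rough sketch of what such a module needs to provide, consistent with how the functions are called above; the 64-colour binning and log scaling in chist are assumptions, not necessarily the repository's implementation:

    # Hypothetical sketch of ch10/features.py, inferred from how texture()
    # and chist() are used in the scripts above.
    import numpy as np
    import mahotas as mh


    def texture(im):
        '''Haralick texture features, concatenated over the four 2D directions.'''
        im = im.astype(np.uint8)
        return mh.features.haralick(im).ravel()


    def chist(im):
        '''Colour histogram of `im`, quantized to 64 colours and log-scaled.'''
        im = im // 64                      # 256 levels -> 4 bins per channel
        r, g, b = im.transpose((2, 0, 1))  # split the colour channels
        pixels = 1 * r + 4 * g + 16 * b    # combine into a single 0..63 code
        hist = np.bincount(pixels.ravel(), minlength=64)
        hist = hist.astype(float)
        return np.log1p(hist)              # compress the dynamic range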
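Compatibility note: the scripts above import sklearn.cross_validation and sklearn.grid_search, which were deprecated in scikit-learn 0.18 and removed in 0.20. On a current scikit-learn release the same setup is expressed through sklearn.model_selection; below is a minimal sketch of the replacement imports and cross-validation objects (the feature/label variable names in the commented lines are the ones used in large_classification.py and simple_classification.py):

    # sklearn.model_selection equivalents for the deprecated
    # sklearn.cross_validation / sklearn.grid_search imports used above.
    import numpy as np
    from sklearn.model_selection import (GridSearchCV, KFold, LeaveOneOut,
                                         cross_val_score)
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    C_range = 10.0 ** np.arange(-4, 3)
    grid = GridSearchCV(LogisticRegression(), param_grid={'C': C_range})
    clf = Pipeline([('preproc', StandardScaler()),
                    ('classifier', grid)])

    # KFold no longer takes the number of samples; pass n_splits instead:
    cv = KFold(n_splits=5, shuffle=True, random_state=123)
    # scores = cross_val_score(clf, ifeatures, labels, cv=cv)

    # LeaveOneOut now takes no arguments:
    loo = LeaveOneOut()
    # scores = cross_val_score(clf, haralicks, labels, cv=loo)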