diff --git a/ch01/gen_webstats.py b/ch01/gen_webstats.py
index fa133d76..61d0b738 100644
--- a/ch01/gen_webstats.py
+++ b/ch01/gen_webstats.py
@@ -17,26 +17,22 @@
 
 sp.random.seed(3)  # to reproduce the data later on
 
-x = sp.arange(1, 31 * 24)
-y = sp.array(200 * (sp.sin(2 * sp.pi * x / (7 * 24))), dtype=int)
+x = sp.arange(1, 31*24)
+y = sp.array(200*(sp.sin(2*sp.pi*x/(7*24))), dtype=int)
 y += gamma.rvs(15, loc=0, scale=100, size=len(x))
-y += 2 * sp.exp(x / 100.0)
-y = sp.ma.array(y, mask=[y < 0])
-print(sum(y), sum(y < 0))
+y += 2 * sp.exp(x/100.0)
+y = sp.ma.array(y, mask=[y<0])
+print(sum(y), sum(y<0))
 
 plt.scatter(x, y)
 plt.title("Web traffic over the last month")
 plt.xlabel("Time")
 plt.ylabel("Hits/hour")
-plt.xticks([w * 7 * 24 for w in [0, 1, 2, 3, 4]], ['week %i' % (w + 1) for w in
-           [0, 1, 2, 3, 4]])
-
+plt.xticks([w*7*24 for w in range(5)],
+           ['week %i' %(w+1) for w in range(5)])
 plt.autoscale(tight=True)
 plt.grid()
 plt.savefig(os.path.join(CHART_DIR, "1400_01_01.png"))
 
-# sp.savetxt(os.path.join("..", "web_traffic.tsv"),
-# zip(x[~y.mask],y[~y.mask]), delimiter="\t", fmt="%i")
-
-sp.savetxt(os.path.join(
-    DATA_DIR, "web_traffic.tsv"), list(zip(x, y)), delimiter="\t", fmt="%s")
+sp.savetxt(os.path.join(DATA_DIR, "web_traffic.tsv"),
+           list(zip(x, y)), delimiter="\t", fmt="%s")
diff --git a/ch02/chapter.py b/ch02/chapter.py
index ac887650..c68b45ab 100644
--- a/ch02/chapter.py
+++ b/ch02/chapter.py
@@ -100,7 +100,6 @@ def is_virginica_test(fi, t, reverse, example):
     training[ei] = False
     testing = ~training
     model = fit_model(features[training], is_virginica[training])
-    predict(model, features[testing])
     predictions = predict(model, features[testing])
     correct += np.sum(predictions == is_virginica[testing])
 acc = correct/float(len(features))
diff --git a/ch02/figure4_5_no_sklearn.py b/ch02/figure4_5_no_sklearn.py
index 5f67e0d7..adc83d73 100644
--- a/ch02/figure4_5_no_sklearn.py
+++ b/ch02/figure4_5_no_sklearn.py
@@ -45,7 +45,7 @@ def plot_decision(features, labels):
 
     model = fit_model(1, features[:, (0, 2)], np.array(labels))
     C = predict(
-        np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape)
+        model, np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape)
     if COLOUR_FIGURE:
         cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
     else:
diff --git a/ch04/README.rst b/ch04/README.rst
index 7fe0a92f..99a3c186 100644
--- a/ch04/README.rst
+++ b/ch04/README.rst
@@ -4,6 +4,16 @@ Chapter 4
 
 Support code for *Chapter 4: Topic Modeling*
 
+
+AP Data
+-------
+
+To download the AP data, use the ``download_ap.sh`` script inside the ``data``
+directory::
+
+    cd data
+    ./download_ap.sh
+
 Word cloud creation
 -------------------
 
diff --git a/ch04/blei_lda.py b/ch04/blei_lda.py
index bbad9d1f..7f6ac2b3 100644
--- a/ch04/blei_lda.py
+++ b/ch04/blei_lda.py
@@ -36,9 +36,9 @@
 # Iterate over all the topics in the model
 for ti in range(model.num_topics):
     words = model.show_topic(ti, 64)
-    tf = sum(f for f, w in words)
+    tf = sum(f for _, f in words)
     with open('topics.txt', 'w') as output:
-        output.write('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for f, w in words))
+        output.write('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for w, f in words))
         output.write("\n\n\n")
 
 # We first identify the most discussed topic, i.e., the one with the
diff --git a/ch04/data/download_ap.sh b/ch04/data/download_ap.sh
index 6de8ded8..da27814a 100755
--- a/ch04/data/download_ap.sh
+++ b/ch04/data/download_ap.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
-wget http://www.cs.princeton.edu/~blei/lda-c/ap.tgz
+wget http://www.cs.columbia.edu/~blei/lda-c/ap.tgz
 tar xzf ap.tgz
diff --git a/ch04/wordcloud.py b/ch04/wordcloud.py
index 6c5302ea..accca2d6 100644
--- a/ch04/wordcloud.py
+++ b/ch04/wordcloud.py
@@ -24,8 +24,6 @@ def create_cloud(oname, words,maxsize=120, fontname='Lobster'):
     # gensim returns a weight between 0 and 1 for each word, while pytagcloud
     # expects an integer word count. So, we multiply by a large number and
     # round. For a visualization this is an adequate approximation.
-    # We also need to flip the order as gensim returns (value, word), whilst
-    # pytagcloud expects (word, value):
-    words = [(w,int(v*10000)) for v,w in words]
+    words = [(w,int(v*10000)) for w,v in words]
     tags = make_tags(words, maxsize=maxsize)
     create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname)
diff --git a/ch10/neighbors.py b/ch10/neighbors.py
index c62a0e2c..1f71d0de 100644
--- a/ch10/neighbors.py
+++ b/ch10/neighbors.py
@@ -6,7 +6,7 @@
 import numpy as np
 import mahotas as mh
 from glob import glob
-from features import texture, color_histogram
+from features import texture, chist
 from matplotlib import pyplot as plt
 from sklearn.preprocessing import StandardScaler
 from scipy.spatial import distance
@@ -29,7 +29,7 @@
     imc = mh.imread(fname)
     imc = imc[200:-200,200:-200]
     haralicks.append(texture(mh.colors.rgb2grey(imc)))
-    chists.append(color_histogram(imc))
+    chists.append(chist(imc))
 
 haralicks = np.array(haralicks)
 chists = np.array(chists)
diff --git a/ch10/simple_classification.py b/ch10/simple_classification.py
index 0e3ab347..a5a448d2 100644
--- a/ch10/simple_classification.py
+++ b/ch10/simple_classification.py
@@ -9,7 +9,7 @@
 
 import numpy as np
 from glob import glob
-from features import texture, color_histogram
+from features import texture, chist
 from sklearn.linear_model import LogisticRegression
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
@@ -32,7 +32,7 @@
 for fname in sorted(images):
     imc = mh.imread(fname)
     haralicks.append(texture(mh.colors.rgb2grey(imc)))
-    chists.append(color_histogram(imc))
+    chists.append(chist(imc))
 
     # Files are named like building00.jpg, scene23.jpg...
     labels.append(fname[:-len('xx.jpg')])
diff --git a/ch12/image-classification.py b/ch12/image-classification.py
index 09dbd5b4..6f76d26d 100644
--- a/ch12/image-classification.py
+++ b/ch12/image-classification.py
@@ -39,7 +39,7 @@ def compute_texture(im):
 
 @TaskGenerator
 def chist(fname):
-    from features import color_histogram
+    from features import chist as color_histogram
     im = mh.imread(fname)
    return color_histogram(im)
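
Note on the pair ordering behind the blei_lda.py and wordcloud.py hunks above:
recent gensim releases return (word, weight) pairs from LdaModel.show_topic(),
while older releases returned (weight, word); that is why the unpacking flips
from ``for f, w in words`` to ``for w, f in words``. A minimal sanity check,
assuming a fitted ``model`` as built in ch04/blei_lda.py::

    # Assumes `model` is the gensim LdaModel constructed in ch04/blei_lda.py.
    words = model.show_topic(0, 5)
    w, f = words[0]
    assert isinstance(w, str)  # on old gensim, this would be the float weight
    tf = sum(f for _, f in words)
    print([(w, int(1000. * f / tf)) for w, f in words])  # per-mille weights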