Commit afcdc0b

ENH Add chapter.py files
These contain the code as typed in the text
1 parent bb9b8de commit afcdc0b

File tree

6 files changed: +494 -0 lines changed

ch08/chapter.py

Lines changed: 208 additions & 0 deletions
@@ -0,0 +1,208 @@
import numpy as np  # NOT IN BOOK
from matplotlib import pyplot as plt  # NOT IN BOOK


def load():
    import numpy as np
    from scipy import sparse

    data = np.loadtxt('data/ml-100k/u.data')
    ij = data[:, :2]
    ij -= 1  # original data is in 1-based system
    ij = ij.astype(int)  # NOT IN BOOK: sparse constructors need integer indices
    values = data[:, 2]
    reviews = sparse.csc_matrix((values, ij.T)).astype(float)
    return reviews.toarray()


reviews = load()
U, M = np.where(reviews)
import random
test_idxs = np.array(random.sample(range(len(U)), len(U)//10))

train = reviews.copy()
train[U[test_idxs], M[test_idxs]] = 0

test = np.zeros_like(reviews)
test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]]
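# NOT IN BOOK: calling random.seed(0) before the sampling above would
# make this train/test split reproducible across runs.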


class NormalizePositive(object):
    def __init__(self, axis=0):
        self.axis = axis

    def fit(self, features, y=None):
        if self.axis == 1:
            features = features.T
        # count features that are greater than zero in axis 0:
        binary = (features > 0)

        count0 = binary.sum(axis=0)

        # to avoid division by zero, set zero counts to one:
        count0[count0 == 0] = 1.

        # computing the mean is easy:
        self.mean = features.sum(axis=0)/count0

        # only consider differences where binary is True:
        diff = (features - self.mean) * binary
        diff **= 2
        # regularize the estimate of std by adding 0.1
        self.std = np.sqrt(0.1 + diff.sum(axis=0)/count0)
        return self

    def transform(self, features):
        if self.axis == 1:
            features = features.T
        binary = (features > 0)
        features = features - self.mean
        features /= self.std
        features *= binary
        if self.axis == 1:
            features = features.T
        return features

    def inverse_transform(self, features, copy=True):
        if copy:
            features = features.copy()
        if self.axis == 1:
            features = features.T
        features *= self.std
        features += self.mean
        if self.axis == 1:
            features = features.T
        return features

    def fit_transform(self, features):
        return self.fit(features).transform(features)
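

# NOT IN BOOK: a minimal sketch of the normalization round-trip on a
# made-up toy matrix (zeros mark missing ratings); the values at the
# non-zero positions survive the round-trip exactly:
toy = np.array([[5., 0., 3.], [0., 4., 4.]])
toy_norm = NormalizePositive(axis=1)
toy_restored = toy_norm.inverse_transform(toy_norm.fit_transform(toy))
assert np.allclose(toy_restored[toy > 0], toy[toy > 0])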

norm = NormalizePositive(axis=1)
binary = (train > 0)
train = norm.fit_transform(train)
# plot just a 200x200 area for space reasons
plt.imshow(binary[:200, :200], interpolation='nearest')

from scipy.spatial import distance
# compute all pair-wise distances:
dists = distance.pdist(binary, 'correlation')
# convert to square form, so that dists[i,j]
# is the distance between binary[i] and binary[j]:
dists = distance.squareform(dists)
neighbors = dists.argsort(axis=1)
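# NOT IN BOOK: neighbors[u] now lists all users from most to least
# similar to user u; position 0 is (up to ties) u itself, at distance
# zero, which is why the loop below skips it with `neighbors[u, 1:]`.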

# We are going to fill this matrix with results
filled = train.copy()
for u in range(filled.shape[0]):
    # n_u holds the neighbors of user u
    n_u = neighbors[u, 1:]
    for m in range(filled.shape[1]):
        # get relevant reviews in order!
        revs = [train[neigh, m]
                for neigh in n_u
                if binary[neigh, m]]
        if len(revs):
            # n is the number of reviews for this movie
            n = len(revs)
            # take half of the reviews plus one into consideration:
            n //= 2
            n += 1
            revs = revs[:n]
            filled[u, m] = np.mean(revs)

predicted = norm.inverse_transform(filled)
from sklearn import metrics
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (binary neighbors): {:.1%}'.format(r2))

reviews = reviews.T
# use the same code as before: the steps from the train/test split
# onward must be rerun on the transposed matrix before this score
# is computed
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (binary movie neighbors): {:.1%}'.format(r2))
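
# NOT IN BOOK: a hypothetical helper that packages the filling loop
# above, so the same code can be applied to the user-oriented and the
# transposed (movie-oriented) matrices without copy-pasting, e.g.
# filled = neighbor_fill(train, binary, neighbors):
def neighbor_fill(train, binary, neighbors):
    filled = train.copy()
    for u in range(filled.shape[0]):
        n_u = neighbors[u, 1:]
        for m in range(filled.shape[1]):
            revs = [train[neigh, m] for neigh in n_u if binary[neigh, m]]
            if len(revs):
                n = len(revs) // 2 + 1
                filled[u, m] = np.mean(revs[:n])
    return filled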


from sklearn.linear_model import ElasticNetCV  # NOT IN BOOK

reg = ElasticNetCV(alphas=[
    0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
filled = train.copy()
# iterate over all users:
for u in range(train.shape[0]):
    curtrain = np.delete(train, u, axis=0)
    bu = binary[u]
    reg.fit(curtrain[:, bu].T, train[u, bu])
    filled[u, ~bu] = reg.predict(curtrain[:, ~bu].T)
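# NOT IN BOOK: in the loop above, each movie that user u rated becomes
# a training sample whose features are the other users' (normalized)
# ratings of that movie; the fitted model then predicts u's ratings of
# the unseen movies from those same users' ratings.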
predicted = norm.inverse_transform(filled)
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (user regression): {:.1%}'.format(r2))


# SHOPPING BASKET ANALYSIS
# This is the slow version of the code, which will take a long time to
# complete.


from collections import defaultdict
from itertools import chain

# file is downloaded as a compressed file
import gzip
# file format is a line per transaction
# of the form '12 34 342 5...'
dataset = [[int(tok) for tok in line.strip().split()]
           for line in gzip.open('data/retail.dat.gz')]
dataset = [set(d) for d in dataset]
# count how often each product was purchased:
counts = defaultdict(int)
for elem in chain(*dataset):
    counts[elem] += 1

minsupport = 80
valid = set(k for k, v in counts.items() if (v >= minsupport))
itemsets = [frozenset([v]) for v in valid]
freqsets = []
for i in range(16):
    nextsets = []
    tested = set()
    for it in itemsets:
        for v in valid:
            if v not in it:
                # create a new candidate set by adding v to it:
                c = (it | frozenset([v]))
                # check if we have tested it already:
                if c in tested:
                    continue
                tested.add(c)

                # count support by looping over the dataset;
                # this step is slow.
                # Check `apriori.py` for a better implementation.
                support_c = sum(1 for d in dataset if d.issuperset(c))
                if support_c > minsupport:
                    nextsets.append(c)
    freqsets.extend(nextsets)
    itemsets = nextsets
    if not len(itemsets):
        break
print("Finished!")

minlift = 5.0
nr_transactions = float(len(dataset))
for itemset in freqsets:
    for item in itemset:
        consequent = frozenset([item])
        antecedent = itemset - consequent
        base = 0.0
        # acount: antecedent count
        acount = 0.0

        # ccount: consequent count
        ccount = 0.0
        for d in dataset:
            if item in d: base += 1
            if d.issuperset(itemset): ccount += 1
            if d.issuperset(antecedent): acount += 1
        base /= nr_transactions
        p_y_given_x = ccount / acount
        lift = p_y_given_x / base
        if lift > minlift:
            print('Rule {0} -> {1} has lift {2}'
                  .format(antecedent, consequent, lift))
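
# NOT IN BOOK: lift is P(consequent | antecedent) / P(consequent). For
# example, if the consequent shows up in 1% of all transactions but in
# 10% of the transactions containing the antecedent, the lift is 10.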

ch10/README.rst

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,8 @@ Running ``download.sh`` will retrieve the other dataset into a directory
Scripts
-------

chapter.py
    Code as written in the book.
thresholded_figure.py
    Computes the thresholded figures, including after Gaussian blurring.
lena-ring.py

0 commit comments
