import numpy as np  # NOT IN BOOK
from matplotlib import pyplot as plt  # NOT IN BOOK

def load():
    import numpy as np
    from scipy import sparse

    data = np.loadtxt('data/ml-100k/u.data')
    ij = data[:, :2]
    ij -= 1  # original data is in 1-based system
    ij = ij.astype(int)  # sparse constructors require integer indices
    values = data[:, 2]
    reviews = sparse.csc_matrix((values, ij.T)).astype(float)
    return reviews.toarray()

reviews = load()
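# NOT IN BOOK: quick sanity check. The standard ml-100k file holds
# 100,000 ratings from 943 users on 1682 movies, so the dense matrix
# should come out as 943x1682:
assert reviews.shape == (943, 1682)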
U, M = np.where(reviews)
import random
test_idxs = np.array(random.sample(range(len(U)), len(U)//10))
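# NOT IN BOOK: random.sample makes this split non-deterministic; seed
# the generator (e.g., random.seed(0)) before the call above if you
# need a reproducible train/test split.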

train = reviews.copy()
train[U[test_idxs], M[test_idxs]] = 0

test = np.zeros_like(reviews)
test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]]
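# NOT IN BOOK: the held-out ratings were zeroed in `train`, so the two
# matrices must not overlap anywhere:
assert not np.any((train > 0) & (test > 0))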

class NormalizePositive(object):
    def __init__(self, axis=0):
        self.axis = axis

    def fit(self, features, y=None):
        if self.axis == 1:
            features = features.T
        # count features that are greater than zero in axis 0:
        binary = (features > 0)

        count0 = binary.sum(axis=0)

        # to avoid division by zero, set zero counts to one:
        count0[count0 == 0] = 1.

        # computing the mean is easy:
        self.mean = features.sum(axis=0)/count0

        # only consider differences where binary is True:
        diff = (features - self.mean) * binary
        diff **= 2
        # regularize the estimate of std by adding 0.1
        self.std = np.sqrt(0.1 + diff.sum(axis=0)/count0)
        return self

    def transform(self, features):
        if self.axis == 1:
            features = features.T
        binary = (features > 0)
        features = features - self.mean
        features /= self.std
        features *= binary
        if self.axis == 1:
            features = features.T
        return features

    def inverse_transform(self, features, copy=True):
        if copy:
            features = features.copy()
        if self.axis == 1:
            features = features.T
        features *= self.std
        features += self.mean
        if self.axis == 1:
            features = features.T
        return features

    def fit_transform(self, features):
        return self.fit(features).transform(features)

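# NOT IN BOOK: a minimal demo of the normalizer on a toy matrix where
# zeros mark missing ratings; observed entries survive the round trip
# exactly, while missing entries come back as that row's mean:
_toy = np.array([[1., 5., 0.],
                 [2., 0., 4.]])
_demo_norm = NormalizePositive(axis=1)
_round_trip = _demo_norm.inverse_transform(_demo_norm.fit_transform(_toy))
assert np.allclose(_round_trip[_toy > 0], _toy[_toy > 0])
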
norm = NormalizePositive(axis=1)
binary = (train > 0)
train = norm.fit_transform(train)
# plot just 200x200 area for space reasons:
plt.imshow(binary[:200, :200], interpolation='nearest')
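# NOT IN BOOK: when running this as a plain script, add plt.show() (or
# plt.savefig(...)) here to actually display or save the figure.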

from scipy.spatial import distance

def predict_positive_nn(otrain, binary):
    # compute all pair-wise distances on the binary rated/not-rated
    # patterns, then convert to square form so that dists[i, j] is
    # the distance between binary[i] and binary[j]:
    dists = distance.squareform(distance.pdist(binary, 'correlation'))
    # for each row, all rows ordered from nearest to farthest
    # (position 0 is the row itself, at distance zero):
    neighbors = dists.argsort(axis=1)

    # We are going to fill this matrix with results
    filled = otrain.copy()
    for u in range(filled.shape[0]):
        # n_u are the neighbors of u (skipping u itself):
        n_u = neighbors[u, 1:]
        for m in range(filled.shape[1]):
            # get relevant reviews in order of neighbor proximity:
            revs = [otrain[neigh, m]
                    for neigh in n_u
                    if binary[neigh, m]]
            if len(revs):
                # take half of the reviews plus one into consideration:
                n = len(revs)//2 + 1
                filled[u, m] = np.mean(revs[:n])
    return filled

filled = predict_positive_nn(train, binary)

predicted = norm.inverse_transform(filled)
from sklearn import metrics
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (binary user neighbors): {:.1%}'.format(r2))

# The same predictor gives movie-based neighbors: transpose the
# matrices, fill along the movie axis, and transpose the result back:
filled = predict_positive_nn(train.T, binary.T).T
predicted = norm.inverse_transform(filled)
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (binary movie neighbors): {:.1%}'.format(r2))


from sklearn.linear_model import ElasticNetCV  # NOT IN BOOK

reg = ElasticNetCV(alphas=[
    0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
filled = train.copy()
# iterate over all users:
for u in range(train.shape[0]):
    # curtrain: the training matrix without user u
    curtrain = np.delete(train, u, axis=0)
    # bu: movies that user u has rated
    bu = binary[u]
    # regress u's known ratings on the other users' ratings of the
    # same movies, then predict u's ratings for the unrated movies:
    reg.fit(curtrain[:, bu].T, train[u, bu])
    filled[u, ~bu] = reg.predict(curtrain[:, ~bu].T)
predicted = norm.inverse_transform(filled)
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (user regression): {:.1%}'.format(r2))
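
# NOT IN BOOK: a simple baseline to compare the models against: predict
# each missing rating as the user's mean. Inverse-transforming the
# normalized train matrix unchanged does exactly that, because zeros
# map back to the per-user mean:
baseline = norm.inverse_transform(train)
r2 = metrics.r2_score(test[test > 0], baseline[test > 0])
print('R2 score (user-mean baseline): {:.1%}'.format(r2))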


# SHOPPING BASKET ANALYSIS
# This is the slow version of the code, which will take a long time to
# complete.


from collections import defaultdict
from itertools import chain

# File is downloaded as a compressed file
import gzip
# file format is a line per transaction
# of the form '12 34 342 5...'
dataset = [[int(tok) for tok in line.strip().split()]
           for line in gzip.open('data/retail.dat.gz', 'rt')]
dataset = [set(d) for d in dataset]
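# NOT IN BOOK: if this is the standard FIMI retail dataset, it should
# hold 88,162 transactions:
print('{0} transactions loaded'.format(len(dataset)))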
# count how often each product was purchased:
counts = defaultdict(int)
for elem in chain(*dataset):
    counts[elem] += 1

minsupport = 80
valid = set(k for k, v in counts.items() if (v >= minsupport))
itemsets = [frozenset([v]) for v in valid]
freqsets = []
for i in range(16):
    nextsets = []
    tested = set()
    for it in itemsets:
        for v in valid:
            if v not in it:
                # Create a new candidate set by adding v to it
                c = (it | frozenset([v]))
                # check if we have tested it already
                if c in tested:
                    continue
                tested.add(c)

                # Count support by looping over dataset
                # This step is slow.
                # Check `apriori.py` for a better implementation.
                support_c = sum(1 for d in dataset if d.issuperset(c))
                if support_c >= minsupport:
                    nextsets.append(c)
    freqsets.extend(nextsets)
    itemsets = nextsets
    if not len(itemsets):
        break
print("Finished!")
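
# NOT IN BOOK: summarize the search by counting frequent itemsets per
# size (the smallest sets here are pairs; singletons stay in `valid`):
from collections import Counter
size_counts = Counter(len(s) for s in freqsets)
for size in sorted(size_counts):
    print('{0} frequent itemsets of size {1}'.format(size_counts[size], size))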


minlift = 5.0
nr_transactions = float(len(dataset))
for itemset in freqsets:
    for item in itemset:
        consequent = frozenset([item])
        antecedent = itemset - consequent

        # base: transactions containing the consequent
        base = 0.0
        # acount: transactions containing the antecedent
        acount = 0.0
        # ccount: transactions containing the full itemset
        # (antecedent and consequent together)
        ccount = 0.0
        for d in dataset:
            if item in d: base += 1
            if d.issuperset(itemset): ccount += 1
            if d.issuperset(antecedent): acount += 1
        base /= nr_transactions
        # confidence: P(consequent | antecedent)
        p_y_given_x = ccount/acount
        # lift: confidence relative to the consequent's base rate
        lift = p_y_given_x / base
        if lift > minlift:
            print('Rule {0} -> {1} has lift {2:.2f}'
                  .format(antecedent, consequent, lift))
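
# NOT IN BOOK: rules print in discovery order; to rank them instead,
# append (lift, antecedent, consequent) tuples to a list inside the
# loop above and sort it with list.sort(reverse=True) before printing.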