|
| 1 | +# cython: cdivision=True |
| 2 | +# cython: boundscheck=False |
| 3 | +# cython: wraparound=False |
| 4 | +# cython: profile=True |
| 5 | +# |
| 6 | +# Author: Andreas Mueller |
| 7 | +# |
| 8 | +# Licence: BSD 3 clause |
| 9 | + |
| 10 | +import numpy as np |
| 11 | +cimport numpy as np |
| 12 | +cimport cython |
| 13 | + |
| 14 | +from libc.math cimport sqrt |
| 15 | + |
| 16 | +from ..metrics import euclidean_distances |
| 17 | +from ._k_means import _centers_dense |
| 18 | +from ..utils.fixes import partition |
| 19 | + |
| 20 | + |
| 21 | +cdef double euclidian_dist(double* a, double* b, int n_features) nogil: |
| 22 | + cdef double result, tmp |
| 23 | + result = 0 |
| 24 | + cdef int i |
| 25 | + for i in range(n_features): |
| 26 | + tmp = (a[i] - b[i]) |
| 27 | + result += tmp * tmp |
| 28 | + return sqrt(result) |
| 29 | + |
| 30 | + |
| 31 | +cdef update_labels_distances_inplace( |
| 32 | + double* X, double* centers, double[:, :] center_half_distances, |
| 33 | + int[:] labels, double[:, :] lower_bounds, double[:] upper_bounds, |
| 34 | + int n_samples, int n_features, int n_clusters): |
| 35 | + """ |
| 36 | + Calculate upper and lower bounds for each sample. |
| 37 | +
|
| 38 | + Given X, centers and the pairwise distances divided by 2.0 between the |
| 39 | + centers this calculates the upper bounds and lower bounds for each sample. |
| 40 | + The upper bound for each sample is set to the distance between the sample |
| 41 | + and the closest center. |
| 42 | +
|
| 43 | + The lower bound for each sample is a one-dimensional array of n_clusters. |
| 44 | + For each sample i assume that the previously assigned cluster is c1 and the |
| 45 | + previous closest distance is dist, for a new cluster c2, the |
| 46 | + lower_bound[i][c2] is set to distance between the sample and this new |
| 47 | + cluster, if and only if dist > center_half_distances[c1][c2]. This prevents |
| 48 | + computation of unnecessary distances for each sample to the clusters that |
| 49 | + it is unlikely to be assigned to. |
| 50 | +
|
| 51 | + Parameters |
| 52 | + ---------- |
| 53 | + X : nd-array, shape (n_samples, n_features) |
| 54 | + The input data. |
| 55 | +
|
| 56 | + centers : nd-array, shape (n_clusters, n_features) |
| 57 | + The cluster centers. |
| 58 | +
|
| 59 | + center_half_distances : nd-array, shape (n_clusters, n_clusters) |
| 60 | + The half of the distance between any 2 clusters centers. |
| 61 | +
|
| 62 | + labels : nd-array, shape(n_samples) |
| 63 | + The label for each sample. This array is modified in place. |
| 64 | +
|
| 65 | + lower_bounds : nd-array, shape(n_samples, n_clusters) |
| 66 | + The lower bound on the distance between a sample and each cluster |
| 67 | + center. It is modified in place. |
| 68 | +
|
| 69 | + upper_bounds : nd-array, shape(n_samples,) |
| 70 | + The distance of each sample from its closest cluster center. This is |
| 71 | + modified in place by the function. |
| 72 | +
|
| 73 | + n_samples : int |
| 74 | + The number of samples. |
| 75 | +
|
| 76 | + n_features : int |
| 77 | + The number of features. |
| 78 | +
|
| 79 | + n_clusters : int |
| 80 | + The number of clusters. |
| 81 | + """ |
| 82 | + # assigns closest center to X |
| 83 | + # uses triangle inequality |
| 84 | + cdef double* x |
| 85 | + cdef double* c |
| 86 | + cdef double d_c, dist |
| 87 | + cdef int c_x, j, sample |
| 88 | + for sample in range(n_samples): |
| 89 | + # assign first cluster center |
| 90 | + c_x = 0 |
| 91 | + x = X + sample * n_features |
| 92 | + d_c = euclidian_dist(x, centers, n_features) |
| 93 | + lower_bounds[sample, 0] = d_c |
| 94 | + for j in range(1, n_clusters): |
| 95 | + if d_c > center_half_distances[c_x, j]: |
| 96 | + c = centers + j * n_features |
| 97 | + dist = euclidian_dist(x, c, n_features) |
| 98 | + lower_bounds[sample, j] = dist |
| 99 | + if dist < d_c: |
| 100 | + d_c = dist |
| 101 | + c_x = j |
| 102 | + labels[sample] = c_x |
| 103 | + upper_bounds[sample] = d_c |
| 104 | + |
| 105 | + |
| 106 | +def k_means_elkan(np.ndarray[np.float64_t, ndim=2, mode='c'] X_, int n_clusters, |
| 107 | + np.ndarray[np.float64_t, ndim=2, mode='c'] init, |
| 108 | + float tol=1e-4, int max_iter=30, verbose=False): |
| 109 | + """Run Elkan's k-means. |
| 110 | +
|
| 111 | + Parameters |
| 112 | + ---------- |
| 113 | + X_ : nd-array, shape (n_samples, n_features) |
| 114 | +
|
| 115 | + n_clusters : int |
| 116 | + Number of clusters to find. |
| 117 | +
|
| 118 | + init : nd-array, shape (n_clusters, n_features) |
| 119 | + Initial position of centers. |
| 120 | +
|
| 121 | + tol : float, default=1e-4 |
| 122 | + The relative increment in cluster means before declaring convergence. |
| 123 | +
|
| 124 | + max_iter : int, default=30 |
| 125 | + Maximum number of iterations of the k-means algorithm. |
| 126 | +
|
| 127 | + verbose : bool, default=False |
| 128 | + Whether to be verbose. |
| 129 | +
|
| 130 | + """ |
| 131 | + #initialize |
| 132 | + cdef np.ndarray[np.float64_t, ndim=2, mode='c'] centers_ = init |
| 133 | + cdef double* centers_p = <double*>centers_.data |
| 134 | + cdef double* X_p = <double*>X_.data |
| 135 | + cdef double* x_p |
| 136 | + cdef Py_ssize_t n_samples = X_.shape[0] |
| 137 | + cdef Py_ssize_t n_features = X_.shape[1] |
| 138 | + cdef int point_index, center_index, label |
| 139 | + cdef float upper_bound, distance |
| 140 | + cdef double[:, :] center_half_distances = euclidean_distances(centers_) / 2. |
| 141 | + cdef double[:, :] lower_bounds = np.zeros((n_samples, n_clusters)) |
| 142 | + cdef double[:] distance_next_center |
| 143 | + labels_ = np.empty(n_samples, dtype=np.int32) |
| 144 | + cdef int[:] labels = labels_ |
| 145 | + upper_bounds_ = np.empty(n_samples, dtype=np.float) |
| 146 | + cdef double[:] upper_bounds = upper_bounds_ |
| 147 | + |
| 148 | + # Get the inital set of upper bounds and lower bounds for each sample. |
| 149 | + update_labels_distances_inplace(X_p, centers_p, center_half_distances, |
| 150 | + labels, lower_bounds, upper_bounds, |
| 151 | + n_samples, n_features, n_clusters) |
| 152 | + cdef np.uint8_t[:] bounds_tight = np.ones(n_samples, dtype=np.uint8) |
| 153 | + cdef np.uint8_t[:] points_to_update = np.zeros(n_samples, dtype=np.uint8) |
| 154 | + cdef np.ndarray[np.float64_t, ndim=2, mode='c'] new_centers |
| 155 | + |
| 156 | + if max_iter <= 0: |
| 157 | + raise ValueError('Number of iterations should be a positive number' |
| 158 | + ', got %d instead' % max_iter) |
| 159 | + |
| 160 | + col_indices = np.arange(center_half_distances.shape[0], dtype=np.int) |
| 161 | + for iteration in range(max_iter): |
| 162 | + if verbose: |
| 163 | + print("start iteration") |
| 164 | + |
| 165 | + cd = np.asarray(center_half_distances) |
| 166 | + distance_next_center = partition(cd, kth=1, axis=0)[1] |
| 167 | + |
| 168 | + if verbose: |
| 169 | + print("done sorting") |
| 170 | + |
| 171 | + for point_index in range(n_samples): |
| 172 | + upper_bound = upper_bounds[point_index] |
| 173 | + label = labels[point_index] |
| 174 | + |
| 175 | + # This means that the next likely center is far away from the |
| 176 | + # currently assigned center and the sample is unlikely to be |
| 177 | + # reassigned. |
| 178 | + if distance_next_center[label] >= upper_bound: |
| 179 | + continue |
| 180 | + x_p = X_p + point_index * n_features |
| 181 | + |
| 182 | + # TODO: get pointer to lower_bounds[point_index, center_index] |
| 183 | + for center_index in range(n_clusters): |
| 184 | + |
| 185 | + # If this holds, then center_index is a good candidate for the |
| 186 | + # sample to be relabelled, and we need to confirm this by |
| 187 | + # recomputing the upper and lower bounds. |
| 188 | + if (center_index != label |
| 189 | + and (upper_bound > lower_bounds[point_index, center_index]) |
| 190 | + and (upper_bound > center_half_distances[center_index, label])): |
| 191 | + |
| 192 | + # Recompute the upper bound by calculating the actual distance |
| 193 | + # between the sample and label. |
| 194 | + if not bounds_tight[point_index]: |
| 195 | + upper_bound = euclidian_dist(x_p, centers_p + label * n_features, n_features) |
| 196 | + lower_bounds[point_index, label] = upper_bound |
| 197 | + bounds_tight[point_index] = 1 |
| 198 | + |
| 199 | + # If the condition still holds, then compute the actual distance between |
| 200 | + # the sample and center_index. If this is still lesser than the previous |
| 201 | + # distance, reassign labels. |
| 202 | + if (upper_bound > lower_bounds[point_index, center_index] |
| 203 | + or (upper_bound > center_half_distances[label, center_index])): |
| 204 | + distance = euclidian_dist(x_p, centers_p + center_index * n_features, n_features) |
| 205 | + lower_bounds[point_index, center_index] = distance |
| 206 | + if distance < upper_bound: |
| 207 | + label = center_index |
| 208 | + upper_bound = distance |
| 209 | + |
| 210 | + labels[point_index] = label |
| 211 | + upper_bounds[point_index] = upper_bound |
| 212 | + |
| 213 | + if verbose: |
| 214 | + print("end inner loop") |
| 215 | + |
| 216 | + # compute new centers |
| 217 | + new_centers = _centers_dense(X_, labels_, n_clusters, upper_bounds_) |
| 218 | + bounds_tight[:] = 0 |
| 219 | + |
| 220 | + # compute distance each center moved |
| 221 | + center_shift = np.sqrt(np.sum((centers_ - new_centers) ** 2, axis=1)) |
| 222 | + |
| 223 | + # update bounds accordingly |
| 224 | + lower_bounds = np.maximum(lower_bounds - center_shift, 0) |
| 225 | + upper_bounds = upper_bounds + center_shift[labels_] |
| 226 | + |
| 227 | + # reassign centers |
| 228 | + centers_ = new_centers |
| 229 | + centers_p = <double*>new_centers.data |
| 230 | + |
| 231 | + # update between-center distances |
| 232 | + center_half_distances = euclidean_distances(centers_) / 2. |
| 233 | + if verbose: |
| 234 | + print('Iteration %i, inertia %s' |
| 235 | + % (iteration, np.sum((X_ - centers_[labels]) ** 2))) |
| 236 | + center_shift_total = np.sum(center_shift) |
| 237 | + if center_shift_total ** 2 < tol: |
| 238 | + if verbose: |
| 239 | + print("center shift %e within tolerance %e" |
| 240 | + % (center_shift_total, tol)) |
| 241 | + break |
| 242 | + |
| 243 | + # We need this to make sure that the labels give the same output as |
| 244 | + # predict(X) |
| 245 | + if center_shift_total > 0: |
| 246 | + update_labels_distances_inplace(X_p, centers_p, center_half_distances, |
| 247 | + labels, lower_bounds, upper_bounds, |
| 248 | + n_samples, n_features, n_clusters) |
| 249 | + return centers_, labels_, iteration |
0 commit comments