
Commit 6d7dfdb

Merge pull request scikit-learn#5414 from vighneshbirodkar/el

[MRG+1] Elkan's K-means

2 parents: 78a6748 + 88e7e77

File tree: 8 files changed (+384, -26 lines)


.gitattributes

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
 /sklearn/cluster/_dbscan_inner.cpp -diff
 /sklearn/cluster/_hierarchical.cpp -diff
 /sklearn/cluster/_k_means.c -diff
+/sklearn/cluster/_k_means_elkan.c -diff
 /sklearn/datasets/_svmlight_format.c -diff
 /sklearn/decomposition/_online_lda.c -diff
 /sklearn/decomposition/cdnmf_fast.c -diff

doc/tutorial/statistical_inference/unsupervised_learning.rst

Lines changed: 2 additions & 2 deletions
@@ -39,7 +39,7 @@ algorithms. The simplest clustering algorithm is
 
     >>> k_means = cluster.KMeans(n_clusters=3)
     >>> k_means.fit(X_iris) # doctest: +ELLIPSIS
-    KMeans(copy_x=True, init='k-means++', ...
+    KMeans(algorithm='auto', copy_x=True, init='k-means++', ...
     >>> print(k_means.labels_[::10])
     [1 1 1 1 1 0 0 0 0 0 2 2 2 2 2]
     >>> print(y_iris[::10])
@@ -118,7 +118,7 @@ algorithms. The simplest clustering algorithm is
     >>> X = face.reshape((-1, 1)) # We need an (n_sample, n_feature) array
     >>> k_means = cluster.KMeans(n_clusters=5, n_init=1)
     >>> k_means.fit(X) # doctest: +ELLIPSIS
-    KMeans(copy_x=True, init='k-means++', ...
+    KMeans(algorithm='auto', copy_x=True, init='k-means++', ...
     >>> values = k_means.cluster_centers_.squeeze()
     >>> labels = k_means.labels_
     >>> face_compressed = np.choose(labels, values)
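
Run end to end, the snippets touched by these two hunks look roughly like the sketch below. It is illustrative rather than part of the commit: X_iris is recreated from sklearn's iris loader, and a small synthetic gradient image stands in for the tutorial's face image.

import numpy as np
from sklearn import cluster, datasets

X_iris = datasets.load_iris().data
k_means = cluster.KMeans(n_clusters=3)
k_means.fit(X_iris)           # repr now begins: KMeans(algorithm='auto', ...)
print(k_means.labels_[::10])

# vector quantization, as in the second hunk; a synthetic gradient image
# stands in for the tutorial's face image
face = np.linspace(0, 255, 64 * 64).reshape(64, 64)
X = face.reshape((-1, 1))     # we need an (n_sample, n_feature) array
k_means = cluster.KMeans(n_clusters=5, n_init=1).fit(X)
values = k_means.cluster_centers_.squeeze()
labels = k_means.labels_
face_compressed = np.choose(labels, values)  # each pixel -> its cluster mean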

doc/whats_new.rst

Lines changed: 3 additions & 0 deletions
@@ -51,6 +51,9 @@ New features
      converts single output regressors to multi-output regressors by fitting
      one regressor per output. By `Tim Head`_.
 
+   - Added ``algorithm="elkan"`` to :class:`cluster.KMeans` implementing
+     Elkan's fast K-Means algorithm. By `Andreas Müller`_.
+
 Enhancements
 ............
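
A minimal usage sketch of the new option (illustrative, not from the commit; assumes a scikit-learn build that contains this branch):

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).randn(100, 2)
km = KMeans(n_clusters=3, algorithm="elkan").fit(X)  # new in this commit
print(km.cluster_centers_.shape)                     # (3, 2)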

sklearn/cluster/_k_means_elkan.pyx

Lines changed: 249 additions & 0 deletions
# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False
# cython: profile=True
#
# Author: Andreas Mueller
#
# Licence: BSD 3 clause

import numpy as np
cimport numpy as np
cimport cython

from libc.math cimport sqrt

from ..metrics import euclidean_distances
from ._k_means import _centers_dense
from ..utils.fixes import partition


cdef double euclidian_dist(double* a, double* b, int n_features) nogil:
    # Euclidean distance between two raw C arrays of length n_features.
    cdef double result, tmp
    result = 0
    cdef int i
    for i in range(n_features):
        tmp = (a[i] - b[i])
        result += tmp * tmp
    return sqrt(result)

cdef update_labels_distances_inplace(
        double* X, double* centers, double[:, :] center_half_distances,
        int[:] labels, double[:, :] lower_bounds, double[:] upper_bounds,
        int n_samples, int n_features, int n_clusters):
    """
    Calculate upper and lower bounds for each sample.

    Given X, centers, and the pairwise distances between centers divided by
    2.0, this calculates the upper and lower bounds for each sample. The
    upper bound for each sample is set to the distance between the sample
    and its closest center.

    The lower bound for each sample is a one-dimensional array of n_clusters.
    For each sample i, assume that the previously assigned cluster is c1 and
    the previous closest distance is dist; for a new cluster c2,
    lower_bound[i][c2] is set to the distance between the sample and this new
    cluster if and only if dist > center_half_distances[c1][c2]. This prevents
    computation of unnecessary distances from each sample to clusters that it
    is unlikely to be assigned to.

    Parameters
    ----------
    X : nd-array, shape (n_samples, n_features)
        The input data.

    centers : nd-array, shape (n_clusters, n_features)
        The cluster centers.

    center_half_distances : nd-array, shape (n_clusters, n_clusters)
        Half of the distance between any two cluster centers.

    labels : nd-array, shape (n_samples,)
        The label for each sample. This array is modified in place.

    lower_bounds : nd-array, shape (n_samples, n_clusters)
        The lower bound on the distance between a sample and each cluster
        center. It is modified in place.

    upper_bounds : nd-array, shape (n_samples,)
        The distance of each sample from its closest cluster center. This is
        modified in place by the function.

    n_samples : int
        The number of samples.

    n_features : int
        The number of features.

    n_clusters : int
        The number of clusters.
    """
    # assigns the closest center to each sample in X,
    # using the triangle inequality to skip distance computations
    cdef double* x
    cdef double* c
    cdef double d_c, dist
    cdef int c_x, j, sample
    for sample in range(n_samples):
        # start from the first cluster center
        c_x = 0
        x = X + sample * n_features
        d_c = euclidian_dist(x, centers, n_features)
        lower_bounds[sample, 0] = d_c
        for j in range(1, n_clusters):
            # center j can only be closer than the current best if the
            # current best distance exceeds half the distance between the
            # two centers (triangle inequality)
            if d_c > center_half_distances[c_x, j]:
                c = centers + j * n_features
                dist = euclidian_dist(x, c, n_features)
                lower_bounds[sample, j] = dist
                if dist < d_c:
                    d_c = dist
                    c_x = j
        labels[sample] = c_x
        upper_bounds[sample] = d_c

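# A concrete instance of the pruning test above: if d(c1, c2) = 10 and a
# sample x has d(x, c1) = 3 <= 10 / 2, then by the triangle inequality
# d(x, c2) >= d(c1, c2) - d(x, c1) = 7 > d(x, c1), so c2 cannot be the
# closer center and d(x, c2) is never computed.
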
def k_means_elkan(np.ndarray[np.float64_t, ndim=2, mode='c'] X_, int n_clusters,
                  np.ndarray[np.float64_t, ndim=2, mode='c'] init,
                  float tol=1e-4, int max_iter=30, verbose=False):
    """Run Elkan's k-means.

    Parameters
    ----------
    X_ : nd-array, shape (n_samples, n_features)
        The input data.

    n_clusters : int
        Number of clusters to find.

    init : nd-array, shape (n_clusters, n_features)
        Initial position of centers.

    tol : float, default=1e-4
        The relative increment in cluster means before declaring convergence.

    max_iter : int, default=30
        Maximum number of iterations of the k-means algorithm.

    verbose : bool, default=False
        Whether to be verbose.

    """
    # initialize
    cdef np.ndarray[np.float64_t, ndim=2, mode='c'] centers_ = init
    cdef double* centers_p = <double*>centers_.data
    cdef double* X_p = <double*>X_.data
    cdef double* x_p
    cdef Py_ssize_t n_samples = X_.shape[0]
    cdef Py_ssize_t n_features = X_.shape[1]
    cdef int point_index, center_index, label
    cdef float upper_bound, distance
    cdef double[:, :] center_half_distances = euclidean_distances(centers_) / 2.
    cdef double[:, :] lower_bounds = np.zeros((n_samples, n_clusters))
    cdef double[:] distance_next_center
    labels_ = np.empty(n_samples, dtype=np.int32)
    cdef int[:] labels = labels_
    upper_bounds_ = np.empty(n_samples, dtype=np.float)
    cdef double[:] upper_bounds = upper_bounds_

    # Get the initial set of upper and lower bounds for each sample.
    update_labels_distances_inplace(X_p, centers_p, center_half_distances,
                                    labels, lower_bounds, upper_bounds,
                                    n_samples, n_features, n_clusters)
    cdef np.uint8_t[:] bounds_tight = np.ones(n_samples, dtype=np.uint8)
    cdef np.uint8_t[:] points_to_update = np.zeros(n_samples, dtype=np.uint8)
    cdef np.ndarray[np.float64_t, ndim=2, mode='c'] new_centers

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number'
                         ', got %d instead' % max_iter)

    col_indices = np.arange(center_half_distances.shape[0], dtype=np.int)
    for iteration in range(max_iter):
        if verbose:
            print("start iteration")

        # For each center, half the distance to its closest other center:
        # the diagonal of cd is zero, so the second-smallest entry of each
        # column (kth=1) belongs to the nearest other center.
        cd = np.asarray(center_half_distances)
        distance_next_center = partition(cd, kth=1, axis=0)[1]

        if verbose:
            print("done sorting")

        for point_index in range(n_samples):
            upper_bound = upper_bounds[point_index]
            label = labels[point_index]

            # If half the distance from the assigned center to its closest
            # other center is at least the upper bound, no other center can
            # be closer and the sample keeps its label.
            if distance_next_center[label] >= upper_bound:
                continue
            x_p = X_p + point_index * n_features

            # TODO: get pointer to lower_bounds[point_index, center_index]
            for center_index in range(n_clusters):

                # If this holds, then center_index is a good candidate for the
                # sample to be relabelled, and we need to confirm this by
                # recomputing the upper and lower bounds.
                if (center_index != label
                        and (upper_bound > lower_bounds[point_index, center_index])
                        and (upper_bound > center_half_distances[center_index, label])):

                    # Recompute the upper bound by calculating the actual
                    # distance between the sample and its current label.
                    if not bounds_tight[point_index]:
                        upper_bound = euclidian_dist(x_p, centers_p + label * n_features, n_features)
                        lower_bounds[point_index, label] = upper_bound
                        bounds_tight[point_index] = 1

                    # If the condition still holds, compute the actual distance
                    # between the sample and center_index. If this is smaller
                    # than the previous distance, reassign the label.
                    if (upper_bound > lower_bounds[point_index, center_index]
                            or (upper_bound > center_half_distances[label, center_index])):
                        distance = euclidian_dist(x_p, centers_p + center_index * n_features, n_features)
                        lower_bounds[point_index, center_index] = distance
                        if distance < upper_bound:
                            label = center_index
                            upper_bound = distance

            labels[point_index] = label
            upper_bounds[point_index] = upper_bound

        if verbose:
            print("end inner loop")

        # compute new centers
        new_centers = _centers_dense(X_, labels_, n_clusters, upper_bounds_)
        bounds_tight[:] = 0

        # compute distance each center moved
        center_shift = np.sqrt(np.sum((centers_ - new_centers) ** 2, axis=1))

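        # Triangle inequality: moving a center by s changes its distance to
        # any fixed sample by at most s, so each lower bound may safely drop
        # by the corresponding center's shift, while each upper bound must
        # grow by the shift of the sample's assigned center.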
        # update bounds accordingly
        lower_bounds = np.maximum(lower_bounds - center_shift, 0)
        upper_bounds = upper_bounds + center_shift[labels_]

        # reassign centers
        centers_ = new_centers
        centers_p = <double*>new_centers.data

        # update between-center distances
        center_half_distances = euclidean_distances(centers_) / 2.
        if verbose:
            print('Iteration %i, inertia %s'
                  % (iteration, np.sum((X_ - centers_[labels]) ** 2)))
        center_shift_total = np.sum(center_shift)
        if center_shift_total ** 2 < tol:
            if verbose:
                print("center shift %e within tolerance %e"
                      % (center_shift_total, tol))
            break

    # We need this to make sure that the labels give the same output as
    # predict(X)
    if center_shift_total > 0:
        update_labels_distances_inplace(X_p, centers_p, center_half_distances,
                                        labels, lower_bounds, upper_bounds,
                                        n_samples, n_features, n_clusters)
    return centers_, labels_, iteration
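
The guarantee this file relies on is easy to check outside Cython. The following standalone NumPy sketch (illustrative, not part of the commit) verifies Elkan's pruning rule on random data: a center c2 may be skipped for a sample x whenever d(x, c1) <= d(c1, c2) / 2, since the triangle inequality then gives d(x, c2) >= d(c1, c2) - d(x, c1) >= d(x, c1).

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
centers = rng.randn(8, 5)

# all sample-to-center and center-to-center distances, by brute force
sample_center = np.sqrt(((X[:, None, :] - centers[None, :, :]) ** 2).sum(-1))
center_center = np.sqrt(((centers[:, None, :] - centers[None, :, :]) ** 2).sum(-1))
labels = sample_center.argmin(axis=1)

for i in range(X.shape[0]):
    c1 = labels[i]
    for c2 in range(centers.shape[0]):
        if c2 != c1 and sample_center[i, c1] <= center_center[c1, c2] / 2:
            # a pruned center is provably no closer than the assigned one
            assert sample_center[i, c2] >= sample_center[i, c1]
print("pruning rule holds for all", X.shape[0], "samples")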
