Skip to content

Commit 45e82c3

Browse files
committed
Merge branch 'pr/3613'
2 parents cb1798a + 6409357 commit 45e82c3

File tree

2 files changed

+48
-5
lines changed

2 files changed

+48
-5
lines changed

sklearn/cluster/k_means_.py

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def _tolerance(X, tol):
147147
return np.mean(variances) * tol
148148

149149

150-
def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
150+
def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
151151
n_init=10, max_iter=300, verbose=False,
152152
tol=1e-4, random_state=None, copy_x=True, n_jobs=1,
153153
return_n_iter=False):
@@ -186,6 +186,17 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
186186
If a callable is passed, it should take arguments X, k
187187
and a random state and return an initialization.
188188
189+
precompute_distances : {'auto', True, False}
190+
Precompute distances (faster but takes more memory).
191+
192+
'auto' : do not precompute distances if n_samples * n_clusters > 12
193+
million. This corresponds to about 100MB overhead per job using
194+
double precision.
195+
196+
True : always precompute distances
197+
198+
False : never precompute distances
199+
189200
tol : float, optional
190201
The relative increment in the results before declaring convergence.
191202
@@ -240,6 +251,20 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
240251
X = as_float_array(X, copy=copy_x)
241252
tol = _tolerance(X, tol)
242253

254+
# If the distances are precomputed every job will create a matrix of shape
255+
# (n_clusters, n_samples). To stop KMeans from eating up memory we only
256+
# activate this if the created matrix is guaranteed to be under 100MB. 12
257+
# million entries consume a little under 100MB if they are of type double.
258+
if precompute_distances == 'auto':
259+
n_samples = X.shape[0]
260+
precompute_distances = (n_clusters * n_samples) < 12e6
261+
elif isinstance(precompute_distances, bool):
262+
pass
263+
else:
264+
raise ValueError("precompute_distances should be 'auto' or True/False"
265+
", but a value of %r was passed" %
266+
precompute_distances)
267+
243268
# subtract the mean of X for more accurate distance computations
244269
if not sp.issparse(X) or hasattr(init, '__array__'):
245270
X_mean = X.mean(axis=0)
@@ -348,6 +373,9 @@ def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
348373
x_squared_norms: array
349374
Precomputed x_squared_norms.
350375
376+
precompute_distances : boolean, default: True
377+
Precompute distances (faster but takes more memory).
378+
351379
random_state: integer or numpy.RandomState, optional
352380
The generator used to initialize the centers. If an integer is
353381
given, it fixes the seed. Defaults to the global numpy random
@@ -624,9 +652,17 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
624652
If an ndarray is passed, it should be of shape (n_clusters, n_features)
625653
and gives the initial centers.
626654
627-
precompute_distances : boolean, default: True
655+
precompute_distances : {'auto', True, False}
628656
Precompute distances (faster but takes more memory).
629657
658+
'auto' : do not precompute distances if n_samples * n_clusters > 12
659+
million. This corresponds to about 100MB overhead per job using
660+
double precision.
661+
662+
True : always precompute distances
663+
664+
False : never precompute distances
665+
630666
tol : float, default: 1e-4
631667
Relative tolerance with regards to inertia to declare convergence
632668
@@ -683,7 +719,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
683719
"""
684720

685721
def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
686-
tol=1e-4, precompute_distances=True,
722+
tol=1e-4, precompute_distances='auto',
687723
verbose=0, random_state=None, copy_x=True, n_jobs=1):
688724

689725
if hasattr(init, '__array__'):
@@ -717,7 +753,7 @@ def _check_test_data(self, X):
717753
raise ValueError("Incorrect number of features. "
718754
"Got %d features, expected %d" % (
719755
n_features, expected_n_features))
720-
if not X.dtype.kind is 'f':
756+
if X.dtype.kind is not 'f':
721757
warnings.warn("Got data type %s, converted to float "
722758
"to avoid overflows" % X.dtype,
723759
RuntimeWarning, stacklevel=2)
@@ -910,7 +946,7 @@ def _mini_batch_step(X, x_squared_norms, centers, counts,
910946
random_state=random_state)
911947
if verbose:
912948
print("[MiniBatchKMeans] Reassigning %i cluster centers."
913-
% n_reassigns)
949+
% n_reassigns)
914950

915951
if sp.issparse(X) and not sp.issparse(centers):
916952
assign_rows_csr(X,

sklearn/cluster/tests/test_k_means.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,13 @@ def test_k_means_plus_plus_init_2_jobs():
211211
_check_fitted_model(km)
212212

213213

214+
def test_k_means_precompute_distances_flag():
215+
# check that a ValueError is raised if the precompute_distances value is
216+
# not supported
217+
km = KMeans(precompute_distances="wrong")
218+
assert_raises(ValueError, km.fit, X)
219+
220+
214221
def test_k_means_plus_plus_init_sparse():
215222
km = KMeans(init="k-means++", n_clusters=n_clusters, random_state=42)
216223
km.fit(X_csr)

0 commit comments

Comments
 (0)