@@ -147,7 +147,7 @@ def _tolerance(X, tol):
147147 return np .mean (variances ) * tol
148148
149149
150- def k_means (X , n_clusters , init = 'k-means++' , precompute_distances = True ,
150+ def k_means (X , n_clusters , init = 'k-means++' , precompute_distances = 'auto' ,
151151 n_init = 10 , max_iter = 300 , verbose = False ,
152152 tol = 1e-4 , random_state = None , copy_x = True , n_jobs = 1 ,
153153 return_n_iter = False ):
@@ -186,6 +186,17 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
186186 If a callable is passed, it should take arguments X, k and
187187 a random state and return an initialization.
188188
189+ precompute_distances : {'auto', True, False}
190+ Precompute distances (faster but takes more memory).
191+
192+ 'auto' : do not precompute distances if n_samples * n_clusters >= 12
193+ million. This corresponds to about 100MB overhead per job using
194+ double precision.
195+
196+ True : always precompute distances
197+
198+ False : never precompute distances
199+
189200 tol : float, optional
190201 The relative increment in the results before declaring convergence.
191202
@@ -240,6 +251,20 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances=True,
240251 X = as_float_array (X , copy = copy_x )
241252 tol = _tolerance (X , tol )
242253
254+ # If the distances are precomputed every job will create a matrix of shape
255+ # (n_clusters, n_samples). To stop KMeans from eating up memory we only
256+ # activate this if the created matrix is guaranteed to be under 100MB. 12
257+ # million entries consume a little under 100MB if they are of type double.
258+ if precompute_distances == 'auto' :
259+ n_samples = X .shape [0 ]
260+ precompute_distances = (n_clusters * n_samples ) < 12e6
261+ elif isinstance (precompute_distances , bool ):
262+ pass
263+ else :
264+ raise ValueError ("precompute_distances should be 'auto' or True/False"
265+ ", but a value of %r was passed" %
266+ precompute_distances )
267+
243268 # subtract the mean of X for more accurate distance computations
244269 if not sp .issparse (X ) or hasattr (init , '__array__' ):
245270 X_mean = X .mean (axis = 0 )
@@ -348,6 +373,9 @@ def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
348373 x_squared_norms: array
349374 Precomputed x_squared_norms.
350375
376+ precompute_distances : boolean, default: True
377+ Precompute distances (faster but takes more memory).
378+
351379 random_state: integer or numpy.RandomState, optional
352380 The generator used to initialize the centers. If an integer is
353381 given, it fixes the seed. Defaults to the global numpy random
@@ -624,9 +652,17 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
624652 If an ndarray is passed, it should be of shape (n_clusters, n_features)
625653 and gives the initial centers.
626654
627- precompute_distances : boolean, default: True
655+ precompute_distances : {'auto', True, False}
628656 Precompute distances (faster but takes more memory).
629657
658+ 'auto' : do not precompute distances if n_samples * n_clusters >= 12
659+ million. This corresponds to about 100MB overhead per job using
660+ double precision.
661+
662+ True : always precompute distances
663+
664+ False : never precompute distances
665+
630666 tol : float, default: 1e-4
631667 Relative tolerance with regards to inertia to declare convergence
632668
@@ -683,7 +719,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
683719 """
684720
685721 def __init__ (self , n_clusters = 8 , init = 'k-means++' , n_init = 10 , max_iter = 300 ,
686- tol = 1e-4 , precompute_distances = True ,
722+ tol = 1e-4 , precompute_distances = 'auto' ,
687723 verbose = 0 , random_state = None , copy_x = True , n_jobs = 1 ):
688724
689725 if hasattr (init , '__array__' ):
@@ -717,7 +753,7 @@ def _check_test_data(self, X):
717753 raise ValueError ("Incorrect number of features. "
718754 "Got %d features, expected %d" % (
719755 n_features , expected_n_features ))
720- if not X .dtype .kind is 'f' :
756+ if X .dtype .kind is not 'f' :
721757 warnings .warn ("Got data type %s, converted to float "
722758 "to avoid overflows" % X .dtype ,
723759 RuntimeWarning , stacklevel = 2 )
@@ -910,7 +946,7 @@ def _mini_batch_step(X, x_squared_norms, centers, counts,
910946 random_state = random_state )
911947 if verbose :
912948 print ("[MiniBatchKMeans] Reassigning %i cluster centers."
913- % n_reassigns )
949+ % n_reassigns )
914950
915951 if sp .issparse (X ) and not sp .issparse (centers ):
916952 assign_rows_csr (X ,
0 commit comments