1- """K-means clustering"""
1+ """K-means clustering."""
22
33# Authors: Gael Varoquaux <[email protected] >
44#          Thomas Rueckstiess <[email protected] >
@@ -55,7 +55,7 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
5555 should be double precision (dtype=np.float64).
5656
5757 n_clusters : int
58- The number of seeds to choose
58+ The number of seeds to choose.
5959
6060 x_squared_norms : ndarray of shape (n_samples,)
6161 Squared Euclidean norm of each data point.
@@ -145,7 +145,7 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
145145# K-means batch estimation by EM (expectation maximization)
146146
147147def _tolerance (X , tol ):
148- """Return a tolerance which is independent of the dataset"""
148+ """Return a tolerance which is independent of the dataset."""
149149 if tol == 0 :
150150 return 0
151151 if sp .issparse (X ):
@@ -166,7 +166,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++',
166166
167167 Parameters
168168 ----------
169- X : {array-like, sparse} matrix of shape (n_samples, n_features)
169+ X : {array-like, sparse matrix} of shape (n_samples, n_features)
170170 The observations to cluster. It must be noted that the data
171171 will be converted to C ordering, which will cause a memory copy
172172 if the given data is not C-contiguous.
@@ -177,7 +177,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++',
177177
178178 sample_weight : array-like of shape (n_samples,), default=None
179179 The weights for each observation in X. If None, all observations
180- are assigned equal weight
180+ are assigned equal weight.
181181
182182 init : {'k-means++', 'random'}, callable or array-like of shape \
183183 (n_clusters, n_features), default='k-means++'
@@ -227,7 +227,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++',
227227 in the cluster centers of two consecutive iterations to declare
228228 convergence.
229229
230- random_state : int, RandomState instance, default=None
230+ random_state : int, RandomState instance or None, default=None
231231 Determines random number generation for centroid initialization. Use
232232 an int to make the randomness deterministic.
233233 See :term:`Glossary <random_state>`.
@@ -315,7 +315,7 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300,
315315 Maximum number of iterations of the k-means algorithm to run.
316316
317317 verbose : bool, default=False
318- Verbosity mode
318+ Verbosity mode.
319319
320320 x_squared_norms : array-like, default=None
321321 Precomputed x_squared_norms.
@@ -659,7 +659,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
659659 verbose : int, default=0
660660 Verbosity mode.
661661
662- random_state : int, RandomState instance, default=None
662+ random_state : int, RandomState instance or None, default=None
663663 Determines random number generation for centroid initialization. Use
664664 an int to make the randomness deterministic.
665665 See :term:`Glossary <random_state>`.
@@ -843,7 +843,7 @@ def _check_params(self, X):
843843 self ._n_init = 1
844844
845845 def _validate_center_shape (self , X , centers ):
846- """Check if centers is compatible with X and n_clusters"""
846+ """Check if centers is compatible with X and n_clusters."""
847847 if centers .shape [0 ] != self .n_clusters :
848848 raise ValueError (
849849 f"The shape of the initial centers { centers .shape } does not "
@@ -867,7 +867,7 @@ def _check_test_data(self, X):
867867
868868 def _init_centroids (self , X , x_squared_norms , init , random_state ,
869869 init_size = None ):
870- """Compute the initial centroids
870+ """Compute the initial centroids.
871871
872872 Parameters
873873 ----------
@@ -1066,7 +1066,7 @@ def fit_transform(self, X, y=None, sample_weight=None):
10661066
10671067 Returns
10681068 -------
1069- X_new : array of shape (n_samples, n_clusters)
1069+ X_new : ndarray of shape (n_samples, n_clusters)
10701070 X transformed in the new space.
10711071 """
10721072 # Currently, this just skips a copy of the data if it is not in
@@ -1079,7 +1079,7 @@ def transform(self, X):
10791079 """Transform X to a cluster-distance space.
10801080
10811081 In the new space, each dimension is the distance to the cluster
1082- centers. Note that even if X is sparse, the array returned by
1082+ centers. Note that even if X is sparse, the array returned by
10831083 `transform` will typically be dense.
10841084
10851085 Parameters
@@ -1098,7 +1098,7 @@ def transform(self, X):
10981098 return self ._transform (X )
10991099
11001100 def _transform (self , X ):
1101- """guts of transform method; no input validation"""
1101+ """Guts of transform method; no input validation."""
11021102 return euclidean_distances (X , self .cluster_centers_ )
11031103
11041104 def predict (self , X , sample_weight = None ):
@@ -1191,26 +1191,28 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums,
11911191 centers : ndarray of shape (k, n_features)
11921192 The cluster centers. This array is MODIFIED IN PLACE
11931193
1194- counts : ndarray of shape (k,)
1195- The vector in which we keep track of the numbers of elements in a
1196- cluster. This array is MODIFIED IN PLACE
1194+ old_center_buffer : int
1195+ Copy of old centers for monitoring convergence.
1196+
1197+ compute_squared_diff : bool
1198+ If set to False, the squared diff computation is skipped.
11971199
11981200 distances : ndarray of shape (n_samples,), dtype=float, default=None
11991201 If not None, should be a pre-allocated array that will be used to store
12001202 the distances of each sample to its closest center.
12011203 May not be None when random_reassign is True.
12021204
1203- random_state : int, RandomState instance, default=None
1205+ random_reassign : bool, default=False
1206+ If True, centers with very low counts are randomly reassigned
1207+ to observations.
1208+
1209+ random_state : int, RandomState instance or None, default=None
12041210 Determines random number generation for centroid initialization and to
12051211 pick new clusters amongst observations with uniform probability. Use
12061212 an int to make the randomness deterministic.
12071213 See :term:`Glossary <random_state>`.
12081214
1209- random_reassign : bool, default=None
1210- If True, centers with very low counts are randomly reassigned
1211- to observations.
1212-
1213- reassignment_ratio : float, default=None
1215+ reassignment_ratio : float, default=.01
12141216 Control the fraction of the maximum number of counts for a
12151217 center to be reassigned. A higher value means that low count
12161218 centers are more likely to be reassigned, which means that the
@@ -1220,12 +1222,6 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums,
12201222 verbose : bool, default=False
12211223 Controls the verbosity.
12221224
1223- compute_squared_diff : bool
1224- If set to False, the squared diff computation is skipped.
1225-
1226- old_center_buffer : int
1227- Copy of old centers for monitoring convergence.
1228-
12291225 Returns
12301226 -------
12311227 inertia : float
@@ -1315,7 +1311,7 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums,
13151311def _mini_batch_convergence (model , iteration_idx , n_iter , tol ,
13161312 n_samples , centers_squared_diff , batch_inertia ,
13171313 context , verbose = 0 ):
1318- """Helper function to encapsulate the early stopping logic"""
1314+ """Helper function to encapsulate the early stopping logic."""
13191315 # Normalize inertia to be able to compare values when
13201316 # batch_size changes
13211317 batch_inertia /= model .batch_size
@@ -1422,7 +1418,7 @@ class MiniBatchKMeans(KMeans):
14221418 Compute label assignment and inertia for the complete dataset
14231419 once the minibatch optimization has converged in fit.
14241420
1425- random_state : int, RandomState instance, default=None
1421+ random_state : int, RandomState instance or None, default=None
14261422 Determines random number generation for centroid initialization and
14271423 random reassignment. Use an int to make the randomness deterministic.
14281424 See :term:`Glossary <random_state>`.
@@ -1469,7 +1465,7 @@ class MiniBatchKMeans(KMeans):
14691465 ----------
14701466
14711467 cluster_centers_ : ndarray of shape (n_clusters, n_features)
1472- Coordinates of cluster centers
1468+ Coordinates of cluster centers.
14731469
14741470 labels_ : int
14751471 Labels of each point (if compute_labels is set to True).
@@ -1755,7 +1751,7 @@ def fit(self, X, y=None, sample_weight=None):
17551751 def _labels_inertia_minibatch (self , X , sample_weight ):
17561752 """Compute labels and inertia using mini batches.
17571753
1758- This is slightly slower than doing everything at once but preventes
1754+ This is slightly slower than doing everything at once but prevents
17591755 memory errors / segfaults.
17601756
17611757 Parameters
0 commit comments