Commit 5008c28

DOC Add None as option for random_state and fix docstring in kmeans module (scikit-learn#18335)
1 parent e325f16 commit 5008c28

sklearn/cluster/_kmeans.py

Lines changed: 28 additions & 32 deletions
@@ -1,4 +1,4 @@
-"""K-means clustering"""
+"""K-means clustering."""

 # Authors: Gael Varoquaux <[email protected]>
 #          Thomas Rueckstiess <[email protected]>
@@ -55,7 +55,7 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
         should be double precision (dtype=np.float64).

     n_clusters : int
-        The number of seeds to choose
+        The number of seeds to choose.

     x_squared_norms : ndarray of shape (n_samples,)
         Squared Euclidean norm of each data point.
@@ -145,7 +145,7 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
 # K-means batch estimation by EM (expectation maximization)

 def _tolerance(X, tol):
-    """Return a tolerance which is independent of the dataset"""
+    """Return a tolerance which is independent of the dataset."""
     if tol == 0:
         return 0
     if sp.issparse(X):
@@ -166,7 +166,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++',

     Parameters
     ----------
-    X : {array-like, sparse} matrix of shape (n_samples, n_features)
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
         The observations to cluster. It must be noted that the data
         will be converted to C ordering, which will cause a memory copy
         if the given data is not C-contiguous.
@@ -177,7 +177,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++',

     sample_weight : array-like of shape (n_samples,), default=None
         The weights for each observation in X. If None, all observations
-        are assigned equal weight
+        are assigned equal weight.

     init : {'k-means++', 'random'}, callable or array-like of shape \
             (n_clusters, n_features), default='k-means++'
@@ -227,7 +227,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++',
         in the cluster centers of two consecutive iterations to declare
         convergence.

-    random_state : int, RandomState instance, default=None
+    random_state : int, RandomState instance or None, default=None
         Determines random number generation for centroid initialization. Use
         an int to make the randomness deterministic.
         See :term:`Glossary <random_state>`.
@@ -315,7 +315,7 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300,
         Maximum number of iterations of the k-means algorithm to run.

     verbose : bool, default=False
-        Verbosity mode
+        Verbosity mode.

     x_squared_norms : array-like, default=None
         Precomputed x_squared_norms.
@@ -659,7 +659,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
     verbose : int, default=0
         Verbosity mode.

-    random_state : int, RandomState instance, default=None
+    random_state : int, RandomState instance or None, default=None
         Determines random number generation for centroid initialization. Use
         an int to make the randomness deterministic.
         See :term:`Glossary <random_state>`.
@@ -843,7 +843,7 @@ def _check_params(self, X):
             self._n_init = 1

     def _validate_center_shape(self, X, centers):
-        """Check if centers is compatible with X and n_clusters"""
+        """Check if centers is compatible with X and n_clusters."""
         if centers.shape[0] != self.n_clusters:
             raise ValueError(
                 f"The shape of the initial centers {centers.shape} does not "
@@ -867,7 +867,7 @@ def _check_test_data(self, X):

     def _init_centroids(self, X, x_squared_norms, init, random_state,
                         init_size=None):
-        """Compute the initial centroids
+        """Compute the initial centroids.

         Parameters
         ----------
@@ -1066,7 +1066,7 @@ def fit_transform(self, X, y=None, sample_weight=None):

         Returns
         -------
-        X_new : array of shape (n_samples, n_clusters)
+        X_new : ndarray of shape (n_samples, n_clusters)
             X transformed in the new space.
         """
         # Currently, this just skips a copy of the data if it is not in
@@ -1079,7 +1079,7 @@ def transform(self, X):
         """Transform X to a cluster-distance space.

         In the new space, each dimension is the distance to the cluster
-        centers. Note that even if X is sparse, the array returned by
+        centers. Note that even if X is sparse, the array returned by
         `transform` will typically be dense.

         Parameters
@@ -1098,7 +1098,7 @@ def transform(self, X):
         return self._transform(X)

     def _transform(self, X):
-        """guts of transform method; no input validation"""
+        """Guts of transform method; no input validation."""
         return euclidean_distances(X, self.cluster_centers_)

     def predict(self, X, sample_weight=None):
@@ -1191,26 +1191,28 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums,
     centers : ndarray of shape (k, n_features)
         The cluster centers. This array is MODIFIED IN PLACE

-    counts : ndarray of shape (k,)
-        The vector in which we keep track of the numbers of elements in a
-        cluster. This array is MODIFIED IN PLACE
+    old_center_buffer : int
+        Copy of old centers for monitoring convergence.
+
+    compute_squared_diff : bool
+        If set to False, the squared diff computation is skipped.

     distances : ndarray of shape (n_samples,), dtype=float, default=None
         If not None, should be a pre-allocated array that will be used to store
         the distances of each sample to its closest center.
         May not be None when random_reassign is True.

-    random_state : int, RandomState instance, default=None
+    random_reassign : bool, default=False
+        If True, centers with very low counts are randomly reassigned
+        to observations.
+
+    random_state : int, RandomState instance or None, default=None
         Determines random number generation for centroid initialization and to
         pick new clusters amongst observations with uniform probability. Use
         an int to make the randomness deterministic.
         See :term:`Glossary <random_state>`.

-    random_reassign : bool, default=None
-        If True, centers with very low counts are randomly reassigned
-        to observations.
-
-    reassignment_ratio : float, default=None
+    reassignment_ratio : float, default=.01
         Control the fraction of the maximum number of counts for a
         center to be reassigned. A higher value means that low count
         centers are more likely to be reassigned, which means that the
@@ -1220,12 +1222,6 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums,
     verbose : bool, default=False
         Controls the verbosity.

-    compute_squared_diff : bool
-        If set to False, the squared diff computation is skipped.
-
-    old_center_buffer : int
-        Copy of old centers for monitoring convergence.
-
     Returns
     -------
     inertia : float
@@ -1315,7 +1311,7 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums,
 def _mini_batch_convergence(model, iteration_idx, n_iter, tol,
                             n_samples, centers_squared_diff, batch_inertia,
                             context, verbose=0):
-    """Helper function to encapsulate the early stopping logic"""
+    """Helper function to encapsulate the early stopping logic."""
     # Normalize inertia to be able to compare values when
     # batch_size changes
     batch_inertia /= model.batch_size
@@ -1422,7 +1418,7 @@ class MiniBatchKMeans(KMeans):
         Compute label assignment and inertia for the complete dataset
         once the minibatch optimization has converged in fit.

-    random_state : int, RandomState instance, default=None
+    random_state : int, RandomState instance or None, default=None
         Determines random number generation for centroid initialization and
         random reassignment. Use an int to make the randomness deterministic.
         See :term:`Glossary <random_state>`.
@@ -1469,7 +1465,7 @@ class MiniBatchKMeans(KMeans):
     ----------

     cluster_centers_ : ndarray of shape (n_clusters, n_features)
-        Coordinates of cluster centers
+        Coordinates of cluster centers.

     labels_ : int
         Labels of each point (if compute_labels is set to True).
@@ -1755,7 +1751,7 @@ def fit(self, X, y=None, sample_weight=None):
     def _labels_inertia_minibatch(self, X, sample_weight):
         """Compute labels and inertia using mini batches.

-        This is slightly slower than doing everything at once but preventes
+        This is slightly slower than doing everything at once but prevents
         memory errors / segfaults.

         Parameters
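
For context, here is a minimal sketch of the behaviour the updated random_state docstrings describe. It is not part of this commit; the toy data and variable names are purely illustrative.

import numpy as np
from sklearn.cluster import KMeans

# Illustrative toy data; any (n_samples, n_features) array would do.
rng = np.random.RandomState(0)
X = rng.rand(100, 2)

# An int makes centroid initialization deterministic, so repeated fits
# on the same data yield identical cluster centers.
km_a = KMeans(n_clusters=3, random_state=42).fit(X)
km_b = KMeans(n_clusters=3, random_state=42).fit(X)
assert np.allclose(km_a.cluster_centers_, km_b.cluster_centers_)

# random_state=None (the default) draws from the global RNG instead,
# so results may vary between runs.
km_c = KMeans(n_clusters=3, random_state=None).fit(X)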

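Similarly, a small sketch of the cluster-distance space mentioned in the transform/fit_transform docstrings above, using a sparse input to show that the returned array is still dense. Again, this is illustrative only and assumes scikit-learn and SciPy are installed.

import numpy as np
from scipy import sparse
from sklearn.cluster import KMeans

# Illustrative sparse input; KMeans accepts CSR matrices.
X = sparse.random(50, 4, density=0.3, random_state=0, format="csr")

km = KMeans(n_clusters=3, random_state=0).fit(X)

# transform maps each sample into cluster-distance space: one column per
# cluster, holding the Euclidean distance to that cluster center.
D = km.transform(X)
print(D.shape)   # (50, 3)
print(type(D))   # numpy.ndarray -- dense even though X is sparse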

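Finally, a hypothetical usage sketch for the MiniBatchKMeans parameters documented above: random_state drives both centroid initialization and random reassignment, and reassignment_ratio defaults to 0.01. Data and names are illustrative.

import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.RandomState(0)
X = rng.rand(500, 2)

# random_state controls initialization and the random reassignment of
# low-count centers; reassignment_ratio sets how aggressively such
# centers are reassigned (higher = more reassignments).
mbk = MiniBatchKMeans(n_clusters=5, random_state=0,
                      reassignment_ratio=0.01).fit(X)
print(mbk.cluster_centers_.shape)   # (5, 2)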