Skip to content

Commit 548a452

Browse files
ogrisel and NicolasHug authored
MNT n_features_in_ consistency in decomposition (scikit-learn#18557)
Co-authored-by: Nicolas Hug <[email protected]>
1 parent 73732e5 commit 548a452

File tree

13 files changed

+37
-59
lines changed

13 files changed

+37
-59
lines changed

sklearn/decomposition/_base.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
from scipy import linalg
1313

1414
from ..base import BaseEstimator, TransformerMixin
15-
from ..utils import check_array
1615
from ..utils.validation import check_is_fitted
1716
from abc import ABCMeta, abstractmethod
1817

@@ -124,7 +123,7 @@ def transform(self, X):
124123
"""
125124
check_is_fitted(self)
126125

127-
X = check_array(X)
126+
X = self._validate_data(X, dtype=[np.float64, np.float32], reset=False)
128127
if self.mean_ is not None:
129128
X = X - self.mean_
130129
X_transformed = np.dot(X, self.components_.T)

sklearn/decomposition/_dict_learning.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -907,7 +907,7 @@ def __init__(self, transform_algorithm, transform_n_nonzero_coefs,
907907
def _transform(self, X, dictionary):
908908
"""Private method allowing to accommodate both DictionaryLearning and
909909
SparseCoder."""
910-
X = check_array(X)
910+
X = self._validate_data(X, reset=False)
911911

912912
code = sparse_encode(
913913
X, dictionary, algorithm=self.transform_algorithm,
@@ -1622,14 +1622,14 @@ def partial_fit(self, X, y=None, iter_offset=None):
16221622
"""
16231623
if not hasattr(self, 'random_state_'):
16241624
self.random_state_ = check_random_state(self.random_state)
1625-
X = check_array(X)
16261625
if hasattr(self, 'components_'):
16271626
dict_init = self.components_
16281627
else:
16291628
dict_init = self.dict_init
16301629
inner_stats = getattr(self, 'inner_stats_', None)
16311630
if iter_offset is None:
16321631
iter_offset = getattr(self, 'iter_offset_', 0)
1632+
X = self._validate_data(X, reset=(iter_offset == 0))
16331633
U, (A, B) = dict_learning_online(
16341634
X, self.n_components, alpha=self.alpha,
16351635
n_iter=1, method=self.fit_algorithm,

sklearn/decomposition/_factor_analysis.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727

2828
from ..base import BaseEstimator, TransformerMixin
29-
from ..utils import check_array, check_random_state
29+
from ..utils import check_random_state
3030
from ..utils.extmath import fast_logdet, randomized_svd, squared_norm
3131
from ..utils.validation import check_is_fitted, _deprecate_positional_args
3232
from ..exceptions import ConvergenceWarning
@@ -279,7 +279,7 @@ def transform(self, X):
279279
"""
280280
check_is_fitted(self)
281281

282-
X = check_array(X)
282+
X = self._validate_data(X, reset=False)
283283
Ih = np.eye(len(self.components_))
284284

285285
X_transformed = X - self.mean_
@@ -350,7 +350,7 @@ def score_samples(self, X):
350350
Log-likelihood of each sample under the current model
351351
"""
352352
check_is_fitted(self)
353-
353+
X = self._validate_data(X, reset=False)
354354
Xr = X - self.mean_
355355
precision = self.get_precision()
356356
n_features = X.shape[1]

sklearn/decomposition/_fastica.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -584,15 +584,16 @@ def transform(self, X, copy=True):
584584
and n_features is the number of features.
585585
586586
copy : bool, default=True
587-
If False, data passed to fit are overwritten. Defaults to True.
587+
If False, data passed to fit can be overwritten. Defaults to True.
588588
589589
Returns
590590
-------
591591
X_new : ndarray of shape (n_samples, n_components)
592592
"""
593593
check_is_fitted(self)
594594

595-
X = check_array(X, copy=copy, dtype=FLOAT_DTYPES)
595+
X = self._validate_data(X, copy=(copy and self.whiten),
596+
dtype=FLOAT_DTYPES, reset=False)
596597
if self.whiten:
597598
X -= self.mean_
598599

sklearn/decomposition/_incremental_pca.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from scipy import linalg, sparse
99

1010
from ._base import _BasePCA
11-
from ..utils import check_array, gen_batches
11+
from ..utils import gen_batches
1212
from ..utils.extmath import svd_flip, _incremental_mean_and_var
1313
from ..utils.validation import _deprecate_positional_args
1414

@@ -234,15 +234,18 @@ def partial_fit(self, X, y=None, check_input=True):
234234
self : object
235235
Returns the instance itself.
236236
"""
237+
first_pass = not hasattr(self, "components_")
237238
if check_input:
238239
if sparse.issparse(X):
239240
raise TypeError(
240241
"IncrementalPCA.partial_fit does not support "
241242
"sparse input. Either convert data to dense "
242243
"or use IncrementalPCA.fit to do so in batches.")
243-
X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
244+
X = self._validate_data(
245+
X, copy=self.copy, dtype=[np.float64, np.float32],
246+
reset=first_pass)
244247
n_samples, n_features = X.shape
245-
if not hasattr(self, 'components_'):
248+
if first_pass:
246249
self.components_ = None
247250

248251
if self.n_components is None:

sklearn/decomposition/_kernel_pca.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,7 @@ def transform(self, X):
331331
X_new : ndarray of shape (n_samples, n_components)
332332
"""
333333
check_is_fitted(self)
334+
X = self._validate_data(X, accept_sparse='csr', reset=False)
334335

335336
# Compute centered gram matrix between X and training data X_fit_
336337
K = self._centerer.transform(self._get_kernel(X, self.X_fit_))

sklearn/decomposition/_lda.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -509,17 +509,9 @@ def partial_fit(self, X, y=None):
509509
"""
510510
self._check_params()
511511
first_time = not hasattr(self, 'components_')
512-
513-
# In theory reset should be equal to `first_time`, but there are tests
514-
# checking the input number of feature and they expect a specific
515-
# string, which is not the same one raised by check_n_features. So we
516-
# don't check n_features_in_ here for now (it's done with adhoc code in
517-
# the estimator anyway).
518-
# TODO: set reset=first_time when addressing reset in
519-
# predict/transform/etc.
520-
reset_n_features = True
521-
X = self._check_non_neg_array(X, reset_n_features,
522-
"LatentDirichletAllocation.partial_fit")
512+
X = self._check_non_neg_array(
513+
X, reset_n_features=first_time,
514+
whom="LatentDirichletAllocation.partial_fit")
523515
n_samples, n_features = X.shape
524516
batch_size = self.batch_size
525517

@@ -663,6 +655,10 @@ def transform(self, X):
663655
doc_topic_distr : ndarray of shape (n_samples, n_components)
664656
Document topic distribution for X.
665657
"""
658+
check_is_fitted(self)
659+
X = self._check_non_neg_array(
660+
X, reset_n_features=False,
661+
whom="LatentDirichletAllocation.transform")
666662
doc_topic_distr = self._unnormalized_transform(X)
667663
doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis]
668664
return doc_topic_distr
@@ -758,7 +754,8 @@ def score(self, X, y=None):
758754
score : float
759755
Use approximate bound as score.
760756
"""
761-
X = self._check_non_neg_array(X, reset_n_features=True,
757+
check_is_fitted(self)
758+
X = self._check_non_neg_array(X, reset_n_features=False,
762759
whom="LatentDirichletAllocation.score")
763760

764761
doc_topic_distr = self._unnormalized_transform(X)

sklearn/decomposition/_nmf.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1299,6 +1299,8 @@ def fit_transform(self, X, y=None, W=None, H=None):
12991299
X = self._validate_data(X, accept_sparse=('csr', 'csc'),
13001300
dtype=[np.float64, np.float32])
13011301

1302+
# XXX: input data validation is performed again in
1303+
# non_negative_factorization.
13021304
W, H, n_iter_ = non_negative_factorization(
13031305
X=X, W=W, H=H, n_components=self.n_components, init=self.init,
13041306
update_H=True, solver=self.solver, beta_loss=self.beta_loss,
@@ -1347,7 +1349,12 @@ def transform(self, X):
13471349
Transformed data.
13481350
"""
13491351
check_is_fitted(self)
1352+
X = self._validate_data(X, accept_sparse=('csr', 'csc'),
1353+
dtype=[np.float64, np.float32],
1354+
reset=False)
13501355

1356+
# XXX: input data validation is performed again in
1357+
# non_negative_factorization.
13511358
W, _, n_iter_ = non_negative_factorization(
13521359
X=X, W=None, H=self.components_, n_components=self.n_components_,
13531360
init=self.init, update_H=False, solver=self.solver,

sklearn/decomposition/_pca.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
from ._base import _BasePCA
2323
from ..utils import check_random_state
24-
from ..utils import check_array
2524
from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
2625
from ..utils.extmath import stable_cumsum
2726
from ..utils.validation import check_is_fitted
@@ -583,7 +582,7 @@ def score_samples(self, X):
583582
"""
584583
check_is_fitted(self)
585584

586-
X = check_array(X)
585+
X = self._validate_data(X, dtype=[np.float64, np.float32], reset=False)
587586
Xr = X - self.mean_
588587
n_features = X.shape[1]
589588
precision = self.get_precision()

sklearn/decomposition/_sparse_pca.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import numpy as np
66

7-
from ..utils import check_random_state, check_array
7+
from ..utils import check_random_state
88
from ..utils.validation import check_is_fitted
99
from ..utils.validation import _deprecate_positional_args
1010
from ..linear_model import ridge_regression
@@ -197,7 +197,7 @@ def transform(self, X):
197197
"""
198198
check_is_fitted(self)
199199

200-
X = check_array(X)
200+
X = self._validate_data(X, reset=False)
201201
X = X - self.mean_
202202

203203
U = ridge_regression(self.components_.T, X.T, self.ridge_alpha,

0 commit comments

Comments
 (0)