Skip to content

Commit bcee3b9

Browse files
qinhanmin2014 authored and paulha committed
FIX Incorrect implementation of noise_variance_ in PCA._fit_truncated (scikit-learn#9108)
1 parent 3ac3933 commit bcee3b9

File tree

3 files changed

+55
-1
lines changed

3 files changed

+55
-1
lines changed

doc/whats_new.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,9 @@ Decomposition, manifold learning and clustering
240240
``singular_values_``, like in :class:`decomposition.IncrementalPCA`.
241241
:issue:`7685` by :user:`Tommy Löfstedt <tomlof>`
242242

243+
- Fixed the implementation of ``noise_variance_`` in :class:`decomposition.PCA`.
244+
:issue:`9108` by `Hanmin Qin <https://github.com/qinhanmin2014>`_.
245+
243246
- :class:`decomposition.NMF` now faster when ``beta_loss=0``.
244247
:issue:`9277` by :user:`hongkahjun`.
245248

sklearn/decomposition/pca.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,9 @@ class PCA(_BasePCA):
201201
explained_variance_ : array, shape (n_components,)
202202
The amount of variance explained by each of the selected components.
203203
204+
Equal to the n_components largest eigenvalues
205+
of the covariance matrix of X.
206+
204207
.. versionadded:: 0.18
205208
206209
explained_variance_ratio_ : array, shape (n_components,)
@@ -232,6 +235,9 @@ class PCA(_BasePCA):
232235
http://www.miketipping.com/papers/met-mppca.pdf. It is required to
233236
compute the estimated data covariance and score samples.
234237
238+
Equal to the average of (min(n_features, n_samples) - n_components)
239+
smallest eigenvalues of the covariance matrix of X.
240+
235241
References
236242
----------
237243
For n_components == 'mle', this class uses the method of `Thomas P. Minka:
@@ -494,9 +500,10 @@ def _fit_truncated(self, X, n_components, svd_solver):
494500
self.explained_variance_ratio_ = \
495501
self.explained_variance_ / total_var.sum()
496502
self.singular_values_ = S.copy() # Store the singular values.
497-
if self.n_components_ < n_features:
503+
if self.n_components_ < min(n_features, n_samples):
498504
self.noise_variance_ = (total_var.sum() -
499505
self.explained_variance_.sum())
506+
self.noise_variance_ /= min(n_features, n_samples) - n_components
500507
else:
501508
self.noise_variance_ = 0.
502509

sklearn/decomposition/tests/test_pca.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,50 @@ def test_pca_score3():
529529
assert_true(ll.argmax() == 1)
530530

531531

532+
def test_pca_score_with_different_solvers():
533+
digits = datasets.load_digits()
534+
X_digits = digits.data
535+
536+
pca_dict = {svd_solver: PCA(n_components=30, svd_solver=svd_solver,
537+
random_state=0)
538+
for svd_solver in solver_list}
539+
540+
for pca in pca_dict.values():
541+
pca.fit(X_digits)
542+
# Sanity check for the noise_variance_. For more details see
543+
# https://github.com/scikit-learn/scikit-learn/issues/7568
544+
# https://github.com/scikit-learn/scikit-learn/issues/8541
545+
# https://github.com/scikit-learn/scikit-learn/issues/8544
546+
assert np.all((pca.explained_variance_ - pca.noise_variance_) >= 0)
547+
548+
# Compare scores with different svd_solvers
549+
score_dict = {svd_solver: pca.score(X_digits)
550+
for svd_solver, pca in pca_dict.items()}
551+
assert_almost_equal(score_dict['full'], score_dict['arpack'])
552+
assert_almost_equal(score_dict['full'], score_dict['randomized'],
553+
decimal=3)
554+
555+
556+
def test_pca_zero_noise_variance_edge_cases():
557+
# ensure that noise_variance_ is 0 in edge cases
558+
# when n_components == min(n_samples, n_features)
559+
n, p = 100, 3
560+
561+
rng = np.random.RandomState(0)
562+
X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
563+
# arpack raises ValueError for n_components == min(n_samples,
564+
# n_features)
565+
svd_solvers = ['full', 'randomized']
566+
567+
for svd_solver in svd_solvers:
568+
pca = PCA(svd_solver=svd_solver, n_components=p)
569+
pca.fit(X)
570+
assert pca.noise_variance_ == 0
571+
572+
pca.fit(X.T)
573+
assert pca.noise_variance_ == 0
574+
575+
532576
def test_svd_solver_auto():
533577
rng = np.random.RandomState(0)
534578
X = rng.uniform(size=(1000, 50))

0 commit comments

Comments
 (0)