Skip to content

Commit 456fb56

Browse files
acadiansith authored and TomDLT committed
[MRG + 2] Allow f_regression to accept a sparse matrix with centering (scikit-learn#8065)
* Updated centering for f_regression

  Allows f_regression to accept a sparse matrix when center=True.

* Fixed E226 spacing issue.

* Added f_regression sparse update to whats_new.rst
1 parent 5ce43f6 commit 456fb56

File tree

3 files changed

+24
-5
lines changed

3 files changed

+24
-5
lines changed

doc/whats_new.rst

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -110,6 +110,9 @@ Enhancements
110110
kernels which were previously prohibited. :issue:`8005` by `Andreas Müller`_ .
111111

112112

113+
- Added ability to use sparse matrices in :func:`feature_selection.f_regression`
114+
with ``center=True``. :issue:`8065` by :user:`Daniel LeJeune <acadiansith>`.
115+
113116
Bug fixes
114117
.........
115118

sklearn/feature_selection/tests/test_feature_select.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,12 @@ def test_f_regression():
9292
assert_true((pv[:5] < 0.05).all())
9393
assert_true((pv[5:] > 1.e-4).all())
9494

95+
# with centering, compare with sparse
96+
F, pv = f_regression(X, y, center=True)
97+
F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=True)
98+
assert_array_almost_equal(F_sparse, F)
99+
assert_array_almost_equal(pv_sparse, pv)
100+
95101
# again without centering, compare with sparse
96102
F, pv = f_regression(X, y, center=False)
97103
F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=False)

sklearn/feature_selection/univariate_selection.py

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -266,17 +266,27 @@ def f_regression(X, y, center=True):
266266
f_classif: ANOVA F-value between label/feature for classification tasks.
267267
chi2: Chi-squared stats of non-negative features for classification tasks.
268268
"""
269-
if issparse(X) and center:
270-
raise ValueError("center=True only allowed for dense data")
271269
X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64)
270+
n_samples = X.shape[0]
271+
272+
# compute centered values
273+
# note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we
274+
# need not center X
272275
if center:
273276
y = y - np.mean(y)
274-
X = X.copy('F') # faster in fortran
275-
X -= X.mean(axis=0)
277+
if issparse(X):
278+
X_means = X.mean(axis=0).getA1()
279+
else:
280+
X_means = X.mean(axis=0)
281+
# compute the scaled standard deviations via moments
282+
X_norms = np.sqrt(row_norms(X.T, squared=True) -
283+
n_samples * X_means ** 2)
284+
else:
285+
X_norms = row_norms(X.T)
276286

277287
# compute the correlation
278288
corr = safe_sparse_dot(y, X)
279-
corr /= row_norms(X.T)
289+
corr /= X_norms
280290
corr /= norm(y)
281291

282292
# convert to p-value

0 commit comments

Comments
 (0)