Skip to content

Commit b94876a

Browse files
jnothman authored and glemaitre committed
[MRG+1] ENH Polynomial features for sparse data (scikit-learn#10452)
1 parent 3e5469e commit b94876a

File tree

3 files changed

+49
-9
lines changed

3 files changed

+49
-9
lines changed

doc/whats_new/v0.20.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,11 @@ Classifiers and regressors
125125
only require X to be an object with finite length or shape.
126126
:issue:`9832` by :user:`Vrishank Bhardwaj <vrishank97>`.
127127

128+
Preprocessing
129+
130+
- :class:`preprocessing.PolynomialFeatures` now supports sparse input.
131+
:issue:`10452` by :user:`Aman Dalmia <dalmia>` and `Joel Nothman`_.
132+
128133
Model evaluation and meta-estimators
129134

130135
- A scorer based on :func:`metrics.brier_score_loss` is also available.

sklearn/preprocessing/data.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1329,7 +1329,7 @@ def fit(self, X, y=None):
13291329
-------
13301330
self : instance
13311331
"""
1332-
n_samples, n_features = check_array(X).shape
1332+
n_samples, n_features = check_array(X, accept_sparse=True).shape
13331333
combinations = self._combinations(n_features, self.degree,
13341334
self.interaction_only,
13351335
self.include_bias)
@@ -1342,31 +1342,42 @@ def transform(self, X):
13421342
13431343
Parameters
13441344
----------
1345-
X : array-like, shape [n_samples, n_features]
1345+
X : array-like or sparse matrix, shape [n_samples, n_features]
13461346
The data to transform, row by row.
1347+
Sparse input should preferably be in CSC format.
13471348
13481349
Returns
13491350
-------
1350-
XP : np.ndarray shape [n_samples, NP]
1351+
XP : np.ndarray or CSC sparse matrix, shape [n_samples, NP]
13511352
The matrix of features, where NP is the number of polynomial
13521353
features generated from the combination of inputs.
13531354
"""
13541355
check_is_fitted(self, ['n_input_features_', 'n_output_features_'])
13551356

1356-
X = check_array(X, dtype=FLOAT_DTYPES)
1357+
X = check_array(X, dtype=FLOAT_DTYPES, accept_sparse='csc')
13571358
n_samples, n_features = X.shape
13581359

13591360
if n_features != self.n_input_features_:
13601361
raise ValueError("X shape does not match training shape")
13611362

1362-
# allocate output data
1363-
XP = np.empty((n_samples, self.n_output_features_), dtype=X.dtype)
1364-
13651363
combinations = self._combinations(n_features, self.degree,
13661364
self.interaction_only,
13671365
self.include_bias)
1368-
for i, c in enumerate(combinations):
1369-
XP[:, i] = X[:, c].prod(1)
1366+
if sparse.isspmatrix(X):
1367+
columns = []
1368+
for comb in combinations:
1369+
if comb:
1370+
out_col = 1
1371+
for col_idx in comb:
1372+
out_col = X[:, col_idx].multiply(out_col)
1373+
columns.append(out_col)
1374+
else:
1375+
columns.append(sparse.csc_matrix(np.ones((X.shape[0], 1))))
1376+
XP = sparse.hstack(columns, dtype=X.dtype).tocsc()
1377+
else:
1378+
XP = np.empty((n_samples, self.n_output_features_), dtype=X.dtype)
1379+
for i, comb in enumerate(combinations):
1380+
XP[:, i] = X[:, comb].prod(1)
13701381

13711382
return XP
13721383

sklearn/preprocessing/tests/test_data.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@
77

88
import warnings
99
import re
10+
1011
import numpy as np
1112
import numpy.linalg as la
1213
from scipy import sparse, stats
1314
from distutils.version import LooseVersion
15+
import pytest
1416

1517
from sklearn.utils import gen_batches
1618

@@ -155,6 +157,28 @@ def test_polynomial_feature_names():
155157
feature_names)
156158

157159

160+
@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'],
                         [(1, True, False, int),
                          (2, True, False, int),
                          (2, True, False, np.float32),
                          (2, True, False, np.float64),
                          (3, False, False, np.float64),
                          (3, False, True, np.float64)])
def test_polynomial_features_sparse_X(deg, include_bias, interaction_only,
                                      dtype):
    """Check that sparse input yields the same features as dense input.

    The same estimator is fitted on a CSR matrix and on the equivalent
    dense array; the sparse result must be a CSC matrix whose dtype and
    values match the dense result.

    Parameters
    ----------
    deg : int
        Polynomial degree passed to ``PolynomialFeatures``.
    include_bias : bool
        Whether the bias (all-ones) column is generated.
    interaction_only : bool
        Whether only interaction features are generated.
    dtype : type
        dtype the input is cast to before being transformed.
    """
    rng = np.random.RandomState(0)
    X = rng.randint(0, 2, (100, 2))
    X_sparse = sparse.csr_matrix(X)

    # Forward interaction_only as well: without it the parametrized
    # interaction_only=True case would silently re-test the default
    # (interaction_only=False) configuration.
    est = PolynomialFeatures(deg, include_bias=include_bias,
                             interaction_only=interaction_only)
    Xt_sparse = est.fit_transform(X_sparse.astype(dtype))
    Xt_dense = est.fit_transform(X.astype(dtype))

    assert isinstance(Xt_sparse, sparse.csc_matrix)
    assert Xt_sparse.dtype == Xt_dense.dtype
    # toarray() is the explicit spelling of the ``.A`` shorthand.
    assert_array_almost_equal(Xt_sparse.toarray(), Xt_dense)
180+
181+
158182
def test_standard_scaler_1d():
159183
# Test scaling of dataset along single axis
160184
for X in [X_1row, X_1col, X_list_1row, X_list_1row]:

0 commit comments

Comments
 (0)