Skip to content

Commit 276e260

Browse files
kaushik94 authored and larsmans committed
ENH add sparse parameter to OneHotEncoder
1 parent 88ecfa0 commit 276e260

File tree

2 files changed

+27
-6
lines changed

2 files changed

+27
-6
lines changed

sklearn/preprocessing/data.py

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -927,6 +927,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
927927
dtype : number type, default=np.float
928928
Desired dtype of output.
929929
930+
sparse : boolean, default=True
931+
Will return sparse matrix if set True else will return an array.
932+
930933
Attributes
931934
----------
932935
`active_features_` : array
@@ -953,7 +956,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
953956
>>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
954957
[1, 0, 2]]) # doctest: +ELLIPSIS
955958
OneHotEncoder(categorical_features='all', dtype=<... 'float'>,
956-
n_values='auto')
959+
n_values='auto', sparse=True)
957960
>>> enc.n_values_
958961
array([2, 3, 4])
959962
>>> enc.feature_indices_
@@ -969,10 +972,11 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
969972
encoding of dictionary items or strings.
970973
"""
971974
def __init__(self, n_values="auto", categorical_features="all",
972-
dtype=np.float):
975+
dtype=np.float, sparse=True):
973976
self.n_values = n_values
974977
self.categorical_features = categorical_features
975978
self.dtype = dtype
979+
self.sparse = sparse
976980

977981
def fit(self, X, y=None):
978982
"""Fit OneHotEncoder to X.
@@ -1033,7 +1037,7 @@ def _fit_transform(self, X):
10331037
out = out[:, active_features]
10341038
self.active_features_ = active_features
10351039

1036-
return out
1040+
return out if self.sparse else out.toarray()
10371041

10381042
def fit_transform(self, X, y=None):
10391043
"""Fit OneHotEncoder to X, then transform X.
@@ -1069,7 +1073,8 @@ def _transform(self, X):
10691073
dtype=self.dtype).tocsr()
10701074
if self.n_values == 'auto':
10711075
out = out[:, self.active_features_]
1072-
return out
1076+
1077+
return out if self.sparse else out.toarray()
10731078

10741079
def transform(self, X):
10751080
"""Transform X using one-hot encoding.
@@ -1081,7 +1086,7 @@ def transform(self, X):
10811086
10821087
Returns
10831088
-------
1084-
X_out : sparse matrix, dtype=int
1089+
X_out : sparse matrix if sparse=True else a 2-d array, dtype=int
10851090
Transformed input.
10861091
"""
10871092
return _transform_selected(X, self._transform,

sklearn/preprocessing/tests/test_data.py

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -622,7 +622,7 @@ def test_add_dummy_feature_csr():
622622
assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
623623

624624

625-
def test_one_hot_encoder():
625+
def test_one_hot_encoder_sparse():
626626
"""Test OneHotEncoder's fit and transform."""
627627
X = [[3, 2, 1], [0, 1, 1]]
628628
enc = OneHotEncoder()
@@ -674,6 +674,22 @@ def test_one_hot_encoder():
674674
# test negative input to transform
675675
enc.fit([[0], [1]])
676676
assert_raises(ValueError, enc.transform, [[0], [-1]])
677+
678+
def test_one_hot_encoder_dense():
679+
"""check for sparse=False"""
680+
X = [[3, 2, 1], [0, 1, 1]]
681+
enc = OneHotEncoder(sparse=False)
682+
# discover max values automatically
683+
X_trans = enc.fit_transform(X)
684+
assert_equal(X_trans.shape, (2, 5))
685+
assert_array_equal(enc.active_features_,
686+
np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
687+
assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])
688+
689+
# check outcome
690+
assert_array_equal(X_trans,
691+
np.array([[0., 1., 0., 1., 1.],
692+
[1., 0., 1., 0., 1.]]))
677693

678694

679695
def _check_transform_selected(X, X_expected, sel):

0 commit comments

Comments
 (0)