Skip to content

Commit 992ed41

Browse files
glemaitrethomasjpfan
authored andcommitted
FIX change boolean array-likes indexing in old NumPy ver… (scikit-learn#14510)
1 parent e3cc1ea commit 992ed41

File tree

4 files changed

+72
-3
lines changed

4 files changed

+72
-3
lines changed

doc/whats_new/v0.22.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,14 @@ Changelog
6161
`sample_weights` are not supported by the wrapped estimator). :pr:`13575`
6262
by :user:`William de Vazelhes <wdevazelhes>`.
6363

64+
:mod:`sklearn.compose`
65+
......................
66+
67+
- |Fix| Fixed a bug in :class:`compose.ColumnTransformer` which failed to
68+
select the proper columns when using a boolean list, with NumPy older than
69+
1.12.
70+
:pr:`14510` by :user:`Guillaume Lemaitre <glemaitre>`.
71+
6472
:mod:`sklearn.datasets`
6573
.......................
6674

sklearn/compose/tests/test_column_transformer.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from sklearn.base import BaseEstimator
1717
from sklearn.compose import ColumnTransformer, make_column_transformer
1818
from sklearn.exceptions import NotFittedError
19+
from sklearn.preprocessing import FunctionTransformer
1920
from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder
2021
from sklearn.feature_extraction import DictVectorizer
2122

@@ -1108,3 +1109,17 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname):
11081109
err_msg = 'Specifying the columns'
11091110
with pytest.raises(ValueError, match=err_msg):
11101111
tf.transform(X_array)
1112+
1113+
1114+
@pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix])
def test_column_transformer_mask_indexing(array_type):
    # Non-regression test for #14510.
    # A boolean array-like column selector is not handled like a boolean
    # array by NumPy < 1.12, nor by sparse matrices; it must still select
    # the flagged columns.
    rows = [[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]]
    X = array_type(np.transpose(rows))
    selected_columns = [False, True, False, True]
    transformer = ColumnTransformer(
        [('identity', FunctionTransformer(), selected_columns)]
    )
    # 3 samples are kept and only the 2 flagged columns remain
    assert transformer.fit_transform(X).shape == (3, 2)

sklearn/utils/__init__.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from . import _joblib
1919
from ..exceptions import DataConversionWarning
2020
from .deprecation import deprecated
21+
from .fixes import np_version
2122
from .validation import (as_float_array,
2223
assert_all_finite,
2324
check_random_state, column_or_1d, check_array,
@@ -225,6 +226,21 @@ def safe_indexing(X, indices, axis=0):
225226
)
226227

227228

229+
def _array_indexing(array, key, axis=0):
230+
"""Index an array consistently across NumPy version."""
231+
if axis not in (0, 1):
232+
raise ValueError(
233+
"'axis' should be either 0 (to index rows) or 1 (to index "
234+
" column). Got {} instead.".format(axis)
235+
)
236+
if np_version < (1, 12) or issparse(array):
237+
# check if we have an boolean array-likes to make the proper indexing
238+
key_array = np.asarray(key)
239+
if np.issubdtype(key_array.dtype, np.bool_):
240+
key = key_array
241+
return array[key] if axis == 0 else array[:, key]
242+
243+
228244
def _safe_indexing_row(X, indices):
229245
"""Return items or rows from X using indices.
230246
@@ -266,7 +282,7 @@ def _safe_indexing_row(X, indices):
266282
# This is often substantially faster than X[indices]
267283
return X.take(indices, axis=0)
268284
else:
269-
return X[indices]
285+
return _array_indexing(X, indices, axis=0)
270286
else:
271287
return [X[idx] for idx in indices]
272288

@@ -356,7 +372,7 @@ def _safe_indexing_column(X, key):
356372
return X.iloc[:, key]
357373
else:
358374
# numpy arrays, sparse arrays
359-
return X[:, key]
375+
return _array_indexing(X, key, axis=1)
360376

361377

362378
def _get_column_indices(X, key):
@@ -371,7 +387,7 @@ def _get_column_indices(X, key):
371387
or hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_)):
372388
# Convert key into positive indexes
373389
try:
374-
idx = np.arange(n_columns)[key]
390+
idx = safe_indexing(np.arange(n_columns), key)
375391
except IndexError as e:
376392
raise ValueError(
377393
'all features must be in [0, %d]' % (n_columns - 1)

sklearn/utils/tests/test_utils.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@
1010

1111
from sklearn.utils.testing import (assert_raises,
1212
assert_array_equal,
13+
assert_allclose_dense_sparse,
1314
assert_raises_regex,
1415
assert_warns_message, assert_no_warnings)
16+
from sklearn.utils import _array_indexing
1517
from sklearn.utils import check_random_state
1618
from sklearn.utils import _check_key_type
1719
from sklearn.utils import deprecated
@@ -365,6 +367,34 @@ def test_safe_indexing_mock_pandas(asarray):
365367
assert_array_equal(np.array(X_df_indexed), X_indexed)
366368

367369

370+
@pytest.mark.parametrize("array_type", ['array', 'sparse', 'dataframe'])
def test_safe_indexing_mask_axis_1(array_type):
    # Non-regression test for #14510.
    # Column indexing with a boolean list must give the same result as
    # indexing with the equivalent boolean array, even with NumPy < 1.12.
    if array_type == 'dataframe':
        # pandas is optional: the test is skipped when it is not installed
        array_constructor = pytest.importorskip('pandas').DataFrame
    elif array_type == 'sparse':
        array_constructor = sp.csr_matrix
    elif array_type == 'array':
        array_constructor = np.asarray

    X = array_constructor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    bool_list = [True, False, True]
    bool_array = np.array(bool_list)
    selected_with_list = safe_indexing(X, bool_list, axis=1)
    selected_with_array = safe_indexing(X, bool_array, axis=1)
    assert_allclose_dense_sparse(selected_with_list, selected_with_array)
389+
390+
391+
def test_array_indexing_array_error():
    # Passing axis=3 to _array_indexing must raise a ValueError whose
    # message starts by recalling the accepted axis values.
    expected_msg = "'axis' should be either 0"
    data = np.array([[0, 1], [2, 3]])
    bool_key = [True, False]
    with pytest.raises(ValueError, match=expected_msg):
        _array_indexing(data, bool_key, axis=3)
396+
397+
368398
def test_shuffle_on_ndim_equals_three():
369399
def to_tuple(A): # to make the inner arrays hashable
370400
return tuple(tuple(tuple(C) for C in B) for B in A)

0 commit comments

Comments
 (0)