Skip to content

Commit e28a577

Browse files
brentyi authored and rth committed
Add min_features_to_select parameter to RFECV (scikit-learn#11293)
1 parent d990f72 commit e28a577

File tree

3 files changed

+55
-19
lines changed

3 files changed

+55
-19
lines changed

doc/whats_new/v0.20.rst

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,11 +345,14 @@ Support for Python 3.3 has been officially dropped.
345345
:issue:`6689` by :user:`Nihar Sheth <nsheth12>` and
346346
:user:`Quazi Rahman <qmaruf>`.
347347

348+
- |Feature| Added ``min_features_to_select`` parameter to
349+
:class:`feature_selection.RFECV` to bound evaluated feature counts.
350+
:issue:`11293` by :user:`Brent Yi <brentyi>`.
351+
348352
- |Fix| Fixed computation of ``n_features_to_compute`` for edge case with tied
349353
CV scores in :class:`feature_selection.RFECV`.
350354
:issue:`9222` by :user:`Nick Hoh <nickypie>`.
351355

352-
353356
:mod:`sklearn.gaussian_process`
354357
...............................
355358

sklearn/feature_selection/rfe.py

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,12 @@ class RFE(BaseEstimator, MetaEstimatorMixin, SelectorMixin):
6060
are selected.
6161
6262
step : int or float, optional (default=1)
63-
If greater than or equal to 1, then `step` corresponds to the (integer)
64-
number of features to remove at each iteration.
65-
If within (0.0, 1.0), then `step` corresponds to the percentage
63+
If greater than or equal to 1, then ``step`` corresponds to the
64+
(integer) number of features to remove at each iteration.
65+
If within (0.0, 1.0), then ``step`` corresponds to the percentage
6666
(rounded down) of features to remove at each iteration.
6767
68-
verbose : int, default=0
68+
verbose : int, (default=0)
6969
Controls verbosity of output.
7070
7171
Attributes
@@ -335,10 +335,18 @@ class RFECV(RFE, MetaEstimatorMixin):
335335
attribute or through a ``feature_importances_`` attribute.
336336
337337
step : int or float, optional (default=1)
338-
If greater than or equal to 1, then `step` corresponds to the (integer)
339-
number of features to remove at each iteration.
340-
If within (0.0, 1.0), then `step` corresponds to the percentage
338+
If greater than or equal to 1, then ``step`` corresponds to the
339+
(integer) number of features to remove at each iteration.
340+
If within (0.0, 1.0), then ``step`` corresponds to the percentage
341341
(rounded down) of features to remove at each iteration.
342+
Note that the last iteration may remove fewer than ``step`` features in
343+
order to reach ``min_features_to_select``.
344+
345+
min_features_to_select : int, (default=1)
346+
The minimum number of features to be selected. This number of features
347+
will always be scored, even if the difference between the original
348+
feature count and ``min_features_to_select`` isn't divisible by
349+
``step``.
342350
343351
cv : int, cross-validation generator or an iterable, optional
344352
Determines the cross-validation splitting strategy.
@@ -358,20 +366,20 @@ class RFECV(RFE, MetaEstimatorMixin):
358366
cross-validation strategies that can be used here.
359367
360368
.. versionchanged:: 0.20
361-
``cv`` default value if None will change from 3-fold to 5-fold
369+
``cv`` default value of None will change from 3-fold to 5-fold
362370
in v0.22.
363371
364-
scoring : string, callable or None, optional, default: None
372+
scoring : string, callable or None, optional, (default=None)
365373
A string (see model evaluation documentation) or
366374
a scorer callable object / function with signature
367375
``scorer(estimator, X, y)``.
368376
369-
verbose : int, default=0
377+
verbose : int, (default=0)
370378
Controls verbosity of output.
371379
372-
n_jobs : int, default 1
380+
n_jobs : int, (default=1)
373381
Number of cores to run in parallel while fitting across folds.
374-
Defaults to 1 core. If `n_jobs=-1`, then number of jobs is set
382+
Defaults to 1 core. If ``n_jobs=-1``, then number of jobs is set
375383
to number of cores.
376384
377385
Attributes
@@ -399,7 +407,8 @@ class RFECV(RFE, MetaEstimatorMixin):
399407
400408
Notes
401409
-----
402-
The size of ``grid_scores_`` is equal to ceil((n_features - 1) / step) + 1,
410+
The size of ``grid_scores_`` is equal to
411+
``ceil((n_features - min_features_to_select) / step) + 1``,
403412
where step is the number of features removed at each iteration.
404413
405414
Examples
@@ -431,14 +440,15 @@ class RFECV(RFE, MetaEstimatorMixin):
431440
for cancer classification using support vector machines",
432441
Mach. Learn., 46(1-3), 389--422, 2002.
433442
"""
434-
def __init__(self, estimator, step=1, cv='warn', scoring=None, verbose=0,
435-
n_jobs=None):
443+
def __init__(self, estimator, step=1, min_features_to_select=1, cv='warn',
444+
scoring=None, verbose=0, n_jobs=None):
436445
self.estimator = estimator
437446
self.step = step
438447
self.cv = cv
439448
self.scoring = scoring
440449
self.verbose = verbose
441450
self.n_jobs = n_jobs
451+
self.min_features_to_select = min_features_to_select
442452

443453
def fit(self, X, y, groups=None):
444454
"""Fit the RFE model and automatically tune the number of selected
@@ -464,7 +474,6 @@ def fit(self, X, y, groups=None):
464474
cv = check_cv(self.cv, y, is_classifier(self.estimator))
465475
scorer = check_scoring(self.estimator, scoring=self.scoring)
466476
n_features = X.shape[1]
467-
n_features_to_select = 1
468477

469478
if 0.0 < self.step < 1.0:
470479
step = int(max(1, self.step * n_features))
@@ -473,8 +482,10 @@ def fit(self, X, y, groups=None):
473482
if step <= 0:
474483
raise ValueError("Step must be >0")
475484

485+
# Build an RFE object, which will evaluate and score each possible
486+
# feature count, down to self.min_features_to_select
476487
rfe = RFE(estimator=self.estimator,
477-
n_features_to_select=n_features_to_select,
488+
n_features_to_select=self.min_features_to_select,
478489
step=self.step, verbose=self.verbose)
479490

480491
# Determine the number of subsets of features by fitting across
@@ -504,7 +515,7 @@ def fit(self, X, y, groups=None):
504515
argmax_idx = len(scores) - np.argmax(scores_rev) - 1
505516
n_features_to_select = max(
506517
n_features - (argmax_idx * step),
507-
n_features_to_select)
518+
self.min_features_to_select)
508519

509520
# Re-execute an elimination with best_k over the whole set
510521
rfe = RFE(estimator=self.estimator,

sklearn/feature_selection/tests/test_rfe.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
"""
22
Testing Recursive feature elimination
33
"""
4+
from __future__ import division
5+
46
import pytest
57
import numpy as np
68
from numpy.testing import assert_array_almost_equal, assert_array_equal
@@ -229,6 +231,26 @@ def test_rfecv_verbose_output():
229231
assert_greater(len(verbose_output.readline()), 0)
230232

231233

234+
def test_rfecv_grid_scores_size():
235+
generator = check_random_state(0)
236+
iris = load_iris()
237+
X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
238+
y = list(iris.target) # regression test: list should be supported
239+
240+
# Non-regression test for varying combinations of step and
241+
# min_features_to_select.
242+
for step, min_features_to_select in [[2, 1], [2, 2], [3, 3]]:
243+
rfecv = RFECV(estimator=MockClassifier(), step=step,
244+
min_features_to_select=min_features_to_select, cv=5)
245+
rfecv.fit(X, y)
246+
247+
score_len = np.ceil(
248+
(X.shape[1] - min_features_to_select) / step) + 1
249+
assert len(rfecv.grid_scores_) == score_len
250+
assert len(rfecv.ranking_) == X.shape[1]
251+
assert rfecv.n_features_ >= min_features_to_select
252+
253+
232254
@pytest.mark.filterwarnings('ignore: You should specify a value') # 0.22
233255
def test_rfe_estimator_tags():
234256
rfe = RFE(SVC(kernel='linear'))

0 commit comments

Comments
 (0)