
Commit 194c231

yl565 authored and MechCoder committed
[MRG+1] Replace or remove VotingClassifier estimators by set_params (scikit-learn#7674)
* PR to 7288: use _BaseComposition as base
* Fix flake8 problems
* Change ``pipeline``, add more tests and other changes:
  1. Use ``_BaseComposition`` in class ``Pipeline`` and ``FeatureUnion``
  2. Add tests of soft voting ``transform`` when one estimator is set to None
  3. Add estimator name validation in ``_BaseComposition`` and tests
  4. Other requested changes
* Remove the unused import warn
* Add more tests and documentation
* Resolve conflict with master
* Add test cases and modify documentation
* Add to whats_new.rst
* Fix too many blank lines
1 parent 7d1e430 commit 194c231

File tree

6 files changed: +236 -92 lines changed

doc/whats_new.rst

Lines changed: 7 additions & 0 deletions
@@ -163,6 +163,13 @@ Enhancements

    - In :class:`gaussian_process.GaussianProcessRegressor`, method ``predict``
      is a lot faster with ``return_std=True`` by :user:`Hadrien Bertrand <hbertrand>`.
+
+   - Added ability to use sparse matrices in :func:`feature_selection.f_regression`
+     with ``center=True``. :issue:`8065` by :user:`Daniel LeJeune <acadiansith>`.
+
+   - :class:`ensemble.VotingClassifier` now allows changing estimators by using
+     :meth:`ensemble.VotingClassifier.set_params`. An estimator can also be
+     removed by setting it to ``None``.
+     :issue:`7674` by :user:`Yichuan Liu <yl565>`.

 Bug fixes
 .........
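
For context, a minimal sketch of the behaviour this changelog entry describes (toy data; the estimator names 'lr', 'rf', 'nb' are illustrative, not from the commit):

    from sklearn import datasets
    from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB

    iris = datasets.load_iris()
    X, y = iris.data, iris.target

    eclf = VotingClassifier(estimators=[('lr', LogisticRegression()),
                                        ('rf', RandomForestClassifier()),
                                        ('nb', GaussianNB())])

    # Replace an estimator by addressing it with its name ...
    eclf.set_params(rf=RandomForestClassifier(n_estimators=20))
    # ... or remove it: `None` entries are skipped when fitting.
    eclf.set_params(rf=None).fit(X, y)
    print(len(eclf.estimators_))  # 2 -- only 'lr' and 'nb' were fitted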

sklearn/ensemble/tests/test_voting_classifier.py

Lines changed: 90 additions & 1 deletion
@@ -2,7 +2,7 @@

 import numpy as np
 from sklearn.utils.testing import assert_almost_equal, assert_array_equal
-from sklearn.utils.testing import assert_equal
+from sklearn.utils.testing import assert_equal, assert_true, assert_false
 from sklearn.utils.testing import assert_raise_message
 from sklearn.exceptions import NotFittedError
 from sklearn.linear_model import LogisticRegression

@@ -40,6 +40,19 @@ def test_estimator_init():
            '; got 2 weights, 1 estimators')
     assert_raise_message(ValueError, msg, eclf.fit, X, y)

+    eclf = VotingClassifier(estimators=[('lr', clf), ('lr', clf)],
+                            weights=[1, 2])
+    msg = "Names provided are not unique: ['lr', 'lr']"
+    assert_raise_message(ValueError, msg, eclf.fit, X, y)
+
+    eclf = VotingClassifier(estimators=[('lr__', clf)])
+    msg = "Estimator names must not contain __: got ['lr__']"
+    assert_raise_message(ValueError, msg, eclf.fit, X, y)
+
+    eclf = VotingClassifier(estimators=[('estimators', clf)])
+    msg = "Estimator names conflict with constructor arguments: ['estimators']"
+    assert_raise_message(ValueError, msg, eclf.fit, X, y)
+

 def test_predictproba_hardvoting():
     eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),

@@ -260,6 +273,82 @@ def test_sample_weight():
     assert_raise_message(ValueError, msg, eclf3.fit, X, y, sample_weight)


+def test_set_params():
+    """set_params should be able to set estimators"""
+    clf1 = LogisticRegression(random_state=123, C=1.0)
+    clf2 = RandomForestClassifier(random_state=123, max_depth=None)
+    clf3 = GaussianNB()
+    eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft',
+                             weights=[1, 2])
+    eclf1.fit(X, y)
+    eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft',
+                             weights=[1, 2])
+    eclf2.set_params(nb=clf2).fit(X, y)
+    assert_false(hasattr(eclf2, 'nb'))
+
+    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
+    assert_array_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
+    assert_equal(eclf2.estimators[0][1].get_params(), clf1.get_params())
+    assert_equal(eclf2.estimators[1][1].get_params(), clf2.get_params())
+
+    eclf1.set_params(lr__C=10.0)
+    eclf2.set_params(nb__max_depth=5)
+
+    assert_true(eclf1.estimators[0][1].get_params()['C'] == 10.0)
+    assert_true(eclf2.estimators[1][1].get_params()['max_depth'] == 5)
+    assert_equal(eclf1.get_params()["lr__C"],
+                 eclf1.get_params()["lr"].get_params()['C'])
+
+
+def test_set_estimator_none():
+    """VotingClassifier set_params should be able to set estimators as None"""
+    # Test predict
+    clf1 = LogisticRegression(random_state=123)
+    clf2 = RandomForestClassifier(random_state=123)
+    clf3 = GaussianNB()
+    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
+                                         ('nb', clf3)],
+                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)
+
+    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
+                                         ('nb', clf3)],
+                             voting='hard', weights=[1, 1, 0.5])
+    eclf2.set_params(rf=None).fit(X, y)
+    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
+
+    assert_true(dict(eclf2.estimators)["rf"] is None)
+    assert_true(len(eclf2.estimators_) == 2)
+    assert_true(all([not isinstance(est, RandomForestClassifier) for est in
+                     eclf2.estimators_]))
+    assert_true(eclf2.get_params()["rf"] is None)
+
+    eclf1.set_params(voting='soft').fit(X, y)
+    eclf2.set_params(voting='soft').fit(X, y)
+    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
+    assert_array_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
+    msg = ('All estimators are None. At least one is required'
+           ' to be a classifier!')
+    assert_raise_message(
+        ValueError, msg, eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)
+
+    # Test soft voting transform
+    X1 = np.array([[1], [2]])
+    y1 = np.array([1, 2])
+    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
+                             voting='soft', weights=[0, 0.5]).fit(X1, y1)
+
+    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
+                             voting='soft', weights=[1, 0.5])
+    eclf2.set_params(rf=None).fit(X1, y1)
+    assert_array_equal(eclf1.transform(X1),
+                       np.array([[[0.7, 0.3], [0.3, 0.7]],
+                                 [[1., 0.], [0., 1.]]]))
+    assert_array_equal(eclf2.transform(X1), np.array([[[1., 0.], [0., 1.]]]))
+    eclf1.set_params(voting='hard')
+    eclf2.set_params(voting='hard')
+    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
+    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
+
+
 def test_estimator_weights_format():
     # Test estimator weights inputs as list and array
     clf1 = LogisticRegression(random_state=123)
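
A note on the expected arrays in the soft-voting part of test_set_estimator_none: with voting='soft', transform returns the stacked predict_proba outputs, one block per fitted estimator, so setting 'rf' to None shrinks the first axis from 2 to 1. A standalone sketch of that shape change (same toy data as the test; default estimator settings assumed):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    from sklearn.naive_bayes import GaussianNB

    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf = VotingClassifier(estimators=[('rf', RandomForestClassifier()),
                                        ('nb', GaussianNB())],
                            voting='soft')
    eclf.set_params(rf=None).fit(X1, y1)
    print(eclf.transform(X1).shape)  # (1, 2, 2): 1 fitted clf, 2 samples, 2 classes
    eclf.set_params(voting='hard')
    print(eclf.transform(X1).shape)  # (2, 1): hard voting returns per-clf labels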

sklearn/ensemble/voting_classifier.py

Lines changed: 71 additions & 29 deletions
@@ -13,14 +13,13 @@

 import numpy as np

-from ..base import BaseEstimator
 from ..base import ClassifierMixin
 from ..base import TransformerMixin
 from ..base import clone
 from ..preprocessing import LabelEncoder
-from ..externals import six
 from ..externals.joblib import Parallel, delayed
 from ..utils.validation import has_fit_parameter, check_is_fitted
+from ..utils.metaestimators import _BaseComposition


 def _parallel_fit_estimator(estimator, X, y, sample_weight):

@@ -32,7 +31,7 @@ def _parallel_fit_estimator(estimator, X, y, sample_weight):
     return estimator


-class VotingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
+class VotingClassifier(_BaseComposition, ClassifierMixin, TransformerMixin):
     """Soft Voting/Majority Rule classifier for unfitted estimators.

     .. versionadded:: 0.17

@@ -44,7 +43,8 @@ class VotingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
     estimators : list of (string, estimator) tuples
         Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones
         of those original estimators that will be stored in the class attribute
-        `self.estimators_`.
+        ``self.estimators_``. An estimator can be set to `None` using
+        ``set_params``.

     voting : str, {'hard', 'soft'} (default='hard')
         If 'hard', uses predicted class labels for majority rule voting.

@@ -64,7 +64,8 @@ class VotingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
     Attributes
     ----------
     estimators_ : list of classifiers
-        The collection of fitted sub-estimators.
+        The collection of fitted sub-estimators as defined in ``estimators``
+        that are not `None`.

     classes_ : array-like, shape = [n_predictions]
         The classes labels.

@@ -102,11 +103,14 @@ class VotingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):

     def __init__(self, estimators, voting='hard', weights=None, n_jobs=1):
         self.estimators = estimators
-        self.named_estimators = dict(estimators)
         self.voting = voting
         self.weights = weights
         self.n_jobs = n_jobs

+    @property
+    def named_estimators(self):
+        return dict(self.estimators)
+
     def fit(self, X, y, sample_weight=None):
         """ Fit the estimators.

@@ -150,23 +154,36 @@ def fit(self, X, y, sample_weight=None):
         if sample_weight is not None:
             for name, step in self.estimators:
                 if not has_fit_parameter(step, 'sample_weight'):
-                    raise ValueError('Underlying estimator \'%s\' does not support'
-                                     ' sample weights.' % name)
-
-        self.le_ = LabelEncoder()
-        self.le_.fit(y)
+                    raise ValueError('Underlying estimator \'%s\' does not'
+                                     ' support sample weights.' % name)
+        names, clfs = zip(*self.estimators)
+        self._validate_names(names)
+
+        n_isnone = np.sum([clf is None for _, clf in self.estimators])
+        if n_isnone == len(self.estimators):
+            raise ValueError('All estimators are None. At least one is '
+                             'required to be a classifier!')
+        self.le_ = LabelEncoder().fit(y)
         self.classes_ = self.le_.classes_
         self.estimators_ = []

         transformed_y = self.le_.transform(y)

         self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                 delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y,
-                                                 sample_weight)
-                for _, clf in self.estimators)
+                                                 sample_weight)
+                for clf in clfs if clf is not None)

         return self

+    @property
+    def _weights_not_none(self):
+        """Get the weights of not `None` estimators"""
+        if self.weights is None:
+            return None
+        return [w for est, w in zip(self.estimators,
+                                    self.weights) if est[1] is not None]
+
     def predict(self, X):
         """ Predict class labels for X.

@@ -188,11 +205,10 @@ def predict(self, X):

         else:  # 'hard' voting
             predictions = self._predict(X)
-            maj = np.apply_along_axis(lambda x:
-                                      np.argmax(np.bincount(x,
-                                                weights=self.weights)),
-                                      axis=1,
-                                      arr=predictions.astype('int'))
+            maj = np.apply_along_axis(
+                lambda x: np.argmax(
+                    np.bincount(x, weights=self._weights_not_none)),
+                axis=1, arr=predictions.astype('int'))

         maj = self.le_.inverse_transform(maj)

@@ -208,7 +224,8 @@ def _predict_proba(self, X):
             raise AttributeError("predict_proba is not available when"
                                  " voting=%r" % self.voting)
         check_is_fitted(self, 'estimators_')
-        avg = np.average(self._collect_probas(X), axis=0, weights=self.weights)
+        avg = np.average(self._collect_probas(X), axis=0,
+                         weights=self._weights_not_none)
         return avg

     @property

@@ -252,17 +269,42 @@ def transform(self, X):
         else:
             return self._predict(X)

+    def set_params(self, **params):
+        """Set the parameters for the voting classifier.
+
+        Valid parameter keys can be listed with ``get_params()``.
+
+        Parameters
+        ----------
+        params : keyword arguments
+            Specific parameters using e.g. set_params(parameter_name=new_value).
+            In addition to setting the parameters of the ``VotingClassifier``,
+            the individual classifiers of the ``VotingClassifier`` can also be
+            set, or removed by setting them to None.
+
+        Examples
+        --------
+        # In this example, the RandomForestClassifier is removed
+        clf1 = LogisticRegression()
+        clf2 = RandomForestClassifier()
+        eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)])
+        eclf.set_params(rf=None)
+        """
+        super(VotingClassifier, self)._set_params('estimators', **params)
+        return self
+
     def get_params(self, deep=True):
-        """Return estimator parameter names for GridSearch support"""
-        if not deep:
-            return super(VotingClassifier, self).get_params(deep=False)
-        else:
-            out = super(VotingClassifier, self).get_params(deep=False)
-            out.update(self.named_estimators.copy())
-            for name, step in six.iteritems(self.named_estimators):
-                for key, value in six.iteritems(step.get_params(deep=True)):
-                    out['%s__%s' % (name, key)] = value
-            return out
+        """Get the parameters of the VotingClassifier.
+
+        Parameters
+        ----------
+        deep : bool
+            Setting it to True gets the various classifiers and the
+            parameters of the classifiers as well.
+        """
+        return super(VotingClassifier,
+                     self)._get_params('estimators', deep=deep)

     def _predict(self, X):
         """Collect results from clf.predict calls. """

sklearn/pipeline.py

Lines changed: 5 additions & 60 deletions
@@ -10,6 +10,7 @@
 # License: BSD

 from collections import defaultdict
+
 from abc import ABCMeta, abstractmethod

 import numpy as np

@@ -22,68 +23,12 @@
 from .utils.metaestimators import if_delegate_has_method
 from .utils import Bunch

-__all__ = ['Pipeline', 'FeatureUnion']
-
+from .utils.metaestimators import _BaseComposition

-class _BasePipeline(six.with_metaclass(ABCMeta, BaseEstimator)):
-    """Handles parameter management for classifiers composed of named steps.
-    """
+__all__ = ['Pipeline', 'FeatureUnion']

-    @abstractmethod
-    def __init__(self):
-        pass
-
-    def _replace_step(self, steps_attr, name, new_val):
-        # assumes `name` is a valid step name
-        new_steps = getattr(self, steps_attr)[:]
-        for i, (step_name, _) in enumerate(new_steps):
-            if step_name == name:
-                new_steps[i] = (name, new_val)
-                break
-        setattr(self, steps_attr, new_steps)
-
-    def _get_params(self, steps_attr, deep=True):
-        out = super(_BasePipeline, self).get_params(deep=False)
-        if not deep:
-            return out
-        steps = getattr(self, steps_attr)
-        out.update(steps)
-        for name, estimator in steps:
-            if estimator is None:
-                continue
-            for key, value in six.iteritems(estimator.get_params(deep=True)):
-                out['%s__%s' % (name, key)] = value
-        return out
-
-    def _set_params(self, steps_attr, **params):
-        # Ensure strict ordering of parameter setting:
-        # 1. All steps
-        if steps_attr in params:
-            setattr(self, steps_attr, params.pop(steps_attr))
-        # 2. Step replacement
-        step_names, _ = zip(*getattr(self, steps_attr))
-        for name in list(six.iterkeys(params)):
-            if '__' not in name and name in step_names:
-                self._replace_step(steps_attr, name, params.pop(name))
-        # 3. Step parameters and other initilisation arguments
-        super(_BasePipeline, self).set_params(**params)
-        return self
-
-    def _validate_names(self, names):
-        if len(set(names)) != len(names):
-            raise ValueError('Names provided are not unique: '
-                             '{0!r}'.format(list(names)))
-        invalid_names = set(names).intersection(self.get_params(deep=False))
-        if invalid_names:
-            raise ValueError('Step names conflict with constructor arguments: '
-                             '{0!r}'.format(sorted(invalid_names)))
-        invalid_names = [name for name in names if '__' in name]
-        if invalid_names:
-            raise ValueError('Step names must not contain __: got '
-                             '{0!r}'.format(invalid_names))
-
-
-class Pipeline(_BasePipeline):
+class Pipeline(_BaseComposition):
     """Pipeline of transforms with a final estimator.

     Sequentially apply a list of transforms and a final estimator.

@@ -631,7 +576,7 @@ def _fit_transform_one(transformer, weight, X, y,
     return res * weight, transformer


-class FeatureUnion(_BasePipeline, TransformerMixin):
+class FeatureUnion(_BaseComposition, TransformerMixin):
     """Concatenates results of multiple transformer objects.

     This estimator applies a list of transformer objects in parallel to the
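
Pipeline and FeatureUnion now reuse the same _get_params/_set_params/_validate_names machinery through _BaseComposition (moved into sklearn/utils/metaestimators.py, presumably one of the changed files not shown in this view), so steps can be replaced by name just like voting estimators. A minimal sketch under that assumption (step names are illustrative):

    from sklearn.decomposition import PCA
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    pipe = Pipeline([('reduce', PCA()), ('clf', LogisticRegression())])

    # same _set_params path as VotingClassifier: swap a whole step by its name
    pipe.set_params(reduce=PCA(n_components=2))
    # '__' keys still reach into a step's own parameters
    pipe.set_params(clf__C=10.0)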
