
Commit 9f842e3

thomasjpfan authored and jnothman committed
API Adds passthrough option to Pipeline (scikit-learn#11674)
1 parent a028416 commit 9f842e3

File tree: 5 files changed, +101 −81 lines changed


doc/modules/compose.rst

Lines changed: 2 additions & 2 deletions

@@ -107,10 +107,10 @@ This is particularly important for doing grid searches::
     >>> grid_search = GridSearchCV(pipe, param_grid=param_grid)
 
 Individual steps may also be replaced as parameters, and non-final steps may be
-ignored by setting them to ``None``::
+ignored by setting them to ``'passthrough'``::
 
     >>> from sklearn.linear_model import LogisticRegression
-    >>> param_grid = dict(reduce_dim=[None, PCA(5), PCA(10)],
+    >>> param_grid = dict(reduce_dim=['passthrough', PCA(5), PCA(10)],
     ...                   clf=[SVC(), LogisticRegression()],
    ...                   clf__C=[0.1, 10, 100])
     >>> grid_search = GridSearchCV(pipe, param_grid=param_grid)
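To try the updated documentation snippet end to end, here is a minimal, self-contained sketch. It assumes scikit-learn 0.21 or later (the release that includes this change); the iris dataset, cv=3, and the reduced PCA component counts are illustrative choices for this sketch, not part of the original doctest.

    # Grid search over a pipeline whose 'reduce_dim' step may be disabled
    # with the new 'passthrough' value (sketch, assumes scikit-learn >= 0.21).
    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC

    X, y = load_iris(return_X_y=True)
    pipe = Pipeline([
        ('reduce_dim', 'passthrough'),   # placeholder, filled in by the grid
        ('clf', SVC())
    ])
    param_grid = dict(reduce_dim=['passthrough', PCA(2), PCA(3)],
                      clf=[SVC(), LogisticRegression()],
                      clf__C=[0.1, 10, 100])
    grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=3)
    grid_search.fit(X, y)
    print(grid_search.best_params_)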

doc/whats_new/v0.21.rst

Lines changed: 5 additions & 0 deletions

@@ -71,6 +71,11 @@ Support for Python 3.4 and below has been officially dropped.
   in the dense case. Also added a new parameter ``order`` which controls output
   order for further speed performances. :issue:`12251` by `Tom Dupre la Tour`_.
 
+:mod:`sklearn.pipeline`
+.......................
+
+- |API| :class:`pipeline.Pipeline` now supports using ``'passthrough'`` as a
+  transformer. :issue:`11144` by :user:`thomasjpfan`.
 
 :mod:`sklearn.tree`
 ...................
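As a quick illustration of the changelog entry, a 'passthrough' step simply forwards its input unchanged. The sketch below assumes scikit-learn 0.21 or later; the step names and the use of StandardScaler are arbitrary choices for the example.

    # A 'passthrough' step behaves as an identity transformer in the pipeline.
    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    X = np.array([[1.0, 2.0], [3.0, 4.0]])
    pipe = Pipeline([('maybe_scale', 'passthrough'),  # disabled step
                     ('scale', StandardScaler())])
    # Only StandardScaler acts; the first step passes X through unchanged.
    print(pipe.fit_transform(X))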

examples/compose/plot_compare_reduction.py

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@
 
 pipe = Pipeline([
     # the reduce_dim stage is populated by the param_grid
-    ('reduce_dim', None),
+    ('reduce_dim', 'passthrough'),
     ('classify', LinearSVC())
 ])
 

sklearn/pipeline.py

Lines changed: 78 additions & 66 deletions

@@ -10,6 +10,7 @@
 # License: BSD
 
 from collections import defaultdict
+from itertools import islice
 
 import numpy as np
 from scipy import sparse
@@ -41,7 +42,7 @@ class Pipeline(_BaseComposition):
     names and the parameter name separated by a '__', as in the example below.
     A step's estimator may be replaced entirely by setting the parameter
     with its name to another estimator, or a transformer removed by setting
-    to None.
+    it to 'passthrough' or ``None``.
 
     Read more in the :ref:`User Guide <pipeline>`.
 
@@ -158,19 +159,34 @@ def _validate_steps(self):
         estimator = estimators[-1]
 
         for t in transformers:
-            if t is None:
+            if t is None or t == 'passthrough':
                 continue
             if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not
                     hasattr(t, "transform")):
                 raise TypeError("All intermediate steps should be "
-                                "transformers and implement fit and transform."
-                                " '%s' (type %s) doesn't" % (t, type(t)))
+                                "transformers and implement fit and transform "
+                                "or be the string 'passthrough' "
+                                "'%s' (type %s) doesn't" % (t, type(t)))
 
         # We allow last estimator to be None as an identity transformation
-        if estimator is not None and not hasattr(estimator, "fit"):
-            raise TypeError("Last step of Pipeline should implement fit. "
-                            "'%s' (type %s) doesn't"
-                            % (estimator, type(estimator)))
+        if (estimator is not None and estimator != 'passthrough'
+                and not hasattr(estimator, "fit")):
+            raise TypeError(
+                "Last step of Pipeline should implement fit "
+                "or be the string 'passthrough'. "
+                "'%s' (type %s) doesn't" % (estimator, type(estimator)))
+
+    def _iter(self, with_final=True):
+        """
+        Generate (name, trans) tuples excluding 'passthrough' transformers
+        """
+        stop = len(self.steps)
+        if not with_final:
+            stop -= 1
+
+        for name, trans in islice(self.steps, 0, stop):
+            if trans is not None and trans != 'passthrough':
+                yield name, trans
 
     @property
     def _estimator_type(self):
@@ -183,7 +199,8 @@ def named_steps(self):
 
     @property
     def _final_estimator(self):
-        return self.steps[-1][1]
+        estimator = self.steps[-1][1]
+        return 'passthrough' if estimator is None else estimator
 
     # Estimator interface
 
@@ -202,37 +219,35 @@ def _fit(self, X, y=None, **fit_params):
             step, param = pname.split('__', 1)
             fit_params_steps[step][param] = pval
         Xt = X
-        for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
-            if transformer is None:
-                pass
-            else:
-                if hasattr(memory, 'location'):
-                    # joblib >= 0.12
-                    if memory.location is None:
-                        # we do not clone when caching is disabled to
-                        # preserve backward compatibility
-                        cloned_transformer = transformer
-                    else:
-                        cloned_transformer = clone(transformer)
-                elif hasattr(memory, 'cachedir'):
-                    # joblib < 0.11
-                    if memory.cachedir is None:
-                        # we do not clone when caching is disabled to
-                        # preserve backward compatibility
-                        cloned_transformer = transformer
-                    else:
-                        cloned_transformer = clone(transformer)
+        for step_idx, (name, transformer) in enumerate(
+                self._iter(with_final=False)):
+            if hasattr(memory, 'location'):
+                # joblib >= 0.12
+                if memory.location is None:
+                    # we do not clone when caching is disabled to
+                    # preserve backward compatibility
+                    cloned_transformer = transformer
                 else:
                     cloned_transformer = clone(transformer)
-                # Fit or load from cache the current transfomer
-                Xt, fitted_transformer = fit_transform_one_cached(
-                    cloned_transformer, Xt, y, None,
-                    **fit_params_steps[name])
-                # Replace the transformer of the step with the fitted
-                # transformer. This is necessary when loading the transformer
-                # from the cache.
-                self.steps[step_idx] = (name, fitted_transformer)
-        if self._final_estimator is None:
+            elif hasattr(memory, 'cachedir'):
+                # joblib < 0.11
+                if memory.cachedir is None:
+                    # we do not clone when caching is disabled to
+                    # preserve backward compatibility
+                    cloned_transformer = transformer
+                else:
+                    cloned_transformer = clone(transformer)
+            else:
+                cloned_transformer = clone(transformer)
+            # Fit or load from cache the current transfomer
+            Xt, fitted_transformer = fit_transform_one_cached(
+                cloned_transformer, Xt, y, None,
+                **fit_params_steps[name])
+            # Replace the transformer of the step with the fitted
+            # transformer. This is necessary when loading the transformer
+            # from the cache.
+            self.steps[step_idx] = (name, fitted_transformer)
+        if self._final_estimator == 'passthrough':
             return Xt, {}
         return Xt, fit_params_steps[self.steps[-1][0]]
 
@@ -263,7 +278,7 @@ def fit(self, X, y=None, **fit_params):
         This estimator
         """
         Xt, fit_params = self._fit(X, y, **fit_params)
-        if self._final_estimator is not None:
+        if self._final_estimator != 'passthrough':
             self._final_estimator.fit(Xt, y, **fit_params)
         return self
 
@@ -298,7 +313,7 @@ def fit_transform(self, X, y=None, **fit_params):
         Xt, fit_params = self._fit(X, y, **fit_params)
         if hasattr(last_step, 'fit_transform'):
             return last_step.fit_transform(Xt, y, **fit_params)
-        elif last_step is None:
+        elif last_step == 'passthrough':
             return Xt
         else:
             return last_step.fit(Xt, y, **fit_params).transform(Xt)
@@ -326,9 +341,8 @@ def predict(self, X, **predict_params):
         y_pred : array-like
         """
         Xt = X
-        for name, transform in self.steps[:-1]:
-            if transform is not None:
-                Xt = transform.transform(Xt)
+        for name, transform in self._iter(with_final=False):
+            Xt = transform.transform(Xt)
         return self.steps[-1][-1].predict(Xt, **predict_params)
 
     @if_delegate_has_method(delegate='_final_estimator')
@@ -376,9 +390,8 @@ def predict_proba(self, X):
         y_proba : array-like, shape = [n_samples, n_classes]
         """
         Xt = X
-        for name, transform in self.steps[:-1]:
-            if transform is not None:
-                Xt = transform.transform(Xt)
+        for name, transform in self._iter(with_final=False):
+            Xt = transform.transform(Xt)
         return self.steps[-1][-1].predict_proba(Xt)
 
     @if_delegate_has_method(delegate='_final_estimator')
@@ -396,9 +409,8 @@ def decision_function(self, X):
         y_score : array-like, shape = [n_samples, n_classes]
         """
         Xt = X
-        for name, transform in self.steps[:-1]:
-            if transform is not None:
-                Xt = transform.transform(Xt)
+        for name, transform in self._iter(with_final=False):
+            Xt = transform.transform(Xt)
         return self.steps[-1][-1].decision_function(Xt)
 
     @if_delegate_has_method(delegate='_final_estimator')
@@ -416,9 +428,8 @@ def predict_log_proba(self, X):
         y_score : array-like, shape = [n_samples, n_classes]
         """
         Xt = X
-        for name, transform in self.steps[:-1]:
-            if transform is not None:
-                Xt = transform.transform(Xt)
+        for name, transform in self._iter(with_final=False):
+            Xt = transform.transform(Xt)
         return self.steps[-1][-1].predict_log_proba(Xt)
 
     @property
@@ -440,15 +451,14 @@ def transform(self):
         """
         # _final_estimator is None or has transform, otherwise attribute error
         # XXX: Handling the None case means we can't use if_delegate_has_method
-        if self._final_estimator is not None:
+        if self._final_estimator != 'passthrough':
             self._final_estimator.transform
         return self._transform
 
     def _transform(self, X):
         Xt = X
-        for name, transform in self.steps:
-            if transform is not None:
-                Xt = transform.transform(Xt)
+        for _, transform in self._iter():
+            Xt = transform.transform(Xt)
         return Xt
 
     @property
@@ -471,16 +481,15 @@ def inverse_transform(self):
         """
         # raise AttributeError if necessary for hasattr behaviour
         # XXX: Handling the None case means we can't use if_delegate_has_method
-        for name, transform in self.steps:
-            if transform is not None:
-                transform.inverse_transform
+        for _, transform in self._iter():
+            transform.inverse_transform
         return self._inverse_transform
 
     def _inverse_transform(self, X):
         Xt = X
-        for name, transform in self.steps[::-1]:
-            if transform is not None:
-                Xt = transform.inverse_transform(Xt)
+        reverse_iter = reversed(list(self._iter()))
+        for _, transform in reverse_iter:
+            Xt = transform.inverse_transform(Xt)
         return Xt
 
     @if_delegate_has_method(delegate='_final_estimator')
@@ -506,9 +515,8 @@ def score(self, X, y=None, sample_weight=None):
         score : float
         """
         Xt = X
-        for name, transform in self.steps[:-1]:
-            if transform is not None:
-                Xt = transform.transform(Xt)
+        for name, transform in self._iter(with_final=False):
+            Xt = transform.transform(Xt)
         score_params = {}
         if sample_weight is not None:
             score_params['sample_weight'] = sample_weight
@@ -527,7 +535,11 @@ def _pairwise(self):
 def _name_estimators(estimators):
     """Generate names for estimators."""
 
-    names = [type(estimator).__name__.lower() for estimator in estimators]
+    names = [
+        estimator
+        if isinstance(estimator, str) else type(estimator).__name__.lower()
+        for estimator in estimators
+    ]
     namecount = defaultdict(int)
     for est, name in zip(estimators, names):
         namecount[name] += 1
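The heart of this change is the new `_iter` helper, which the fit, predict, transform, and score paths now use to skip disabled steps instead of repeating `if transform is not None` checks. The standalone sketch below mirrors that filtering logic outside the class; `iter_active_steps` is a hypothetical name for illustration, not part of the scikit-learn API.

    from itertools import islice

    def iter_active_steps(steps, with_final=True):
        """Yield (name, transformer) pairs, skipping None/'passthrough' steps.

        Mirrors the filtering Pipeline._iter performs in this commit;
        `steps` is a list of (name, estimator) tuples.
        """
        stop = len(steps) if with_final else len(steps) - 1
        for name, trans in islice(steps, 0, stop):
            if trans is not None and trans != 'passthrough':
                yield name, trans

    # The middle step is disabled and the final step is excluded, so only
    # 'scale' is yielded here.
    steps = [('scale', object()), ('reduce_dim', 'passthrough'), ('clf', object())]
    print([name for name, _ in iter_active_steps(steps, with_final=False)])  # ['scale']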

sklearn/tests/test_pipeline.py

Lines changed: 15 additions & 12 deletions

@@ -163,7 +163,8 @@ def test_pipeline_init():
     # Check that we can't instantiate pipelines with objects without fit
     # method
     assert_raises_regex(TypeError,
-                        'Last step of Pipeline should implement fit. '
+                        'Last step of Pipeline should implement fit '
+                        'or be the string \'passthrough\''
                        '.*NoFit.*',
                        Pipeline, [('clf', NoFit())])
     # Smoke test with only an estimator
@@ -230,7 +231,7 @@ def test_pipeline_init_tuple():
     pipe.fit(X, y=None)
     pipe.score(X)
 
-    pipe.set_params(transf=None)
+    pipe.set_params(transf='passthrough')
     pipe.fit(X, y=None)
     pipe.score(X)
 
@@ -574,8 +575,8 @@ def test_pipeline_named_steps():
     assert_true(pipeline.named_steps.mult is mult2)
 
 
-def test_set_pipeline_step_none():
-    # Test setting Pipeline steps to None
+@pytest.mark.parametrize('passthrough', [None, 'passthrough'])
+def test_set_pipeline_step_passthrough(passthrough):
     X = np.array([[1]])
     y = np.array([1])
     mult2 = Mult(mult=2)
@@ -592,22 +593,22 @@ def make():
     assert_array_equal([exp], pipeline.fit(X).predict(X))
     assert_array_equal(X, pipeline.inverse_transform([[exp]]))
 
-    pipeline.set_params(m3=None)
+    pipeline.set_params(m3=passthrough)
     exp = 2 * 5
     assert_array_equal([[exp]], pipeline.fit_transform(X, y))
     assert_array_equal([exp], pipeline.fit(X).predict(X))
     assert_array_equal(X, pipeline.inverse_transform([[exp]]))
     assert_dict_equal(pipeline.get_params(deep=True),
                       {'steps': pipeline.steps,
                        'm2': mult2,
-                       'm3': None,
+                       'm3': passthrough,
                        'last': mult5,
                        'memory': None,
                        'm2__mult': 2,
                        'last__mult': 5,
                        })
 
-    pipeline.set_params(m2=None)
+    pipeline.set_params(m2=passthrough)
     exp = 5
     assert_array_equal([[exp]], pipeline.fit_transform(X, y))
     assert_array_equal([exp], pipeline.fit(X).predict(X))
@@ -626,19 +627,20 @@ def make():
     assert_array_equal(X, pipeline.inverse_transform([[exp]]))
 
     pipeline = make()
-    pipeline.set_params(last=None)
+    pipeline.set_params(last=passthrough)
     # mult2 and mult3 are active
     exp = 6
     assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
     assert_array_equal([[exp]], pipeline.fit_transform(X, y))
     assert_array_equal(X, pipeline.inverse_transform([[exp]]))
     assert_raise_message(AttributeError,
-                         "'NoneType' object has no attribute 'predict'",
+                         "'str' object has no attribute 'predict'",
                          getattr, pipeline, 'predict')
 
-    # Check None step at construction time
+    # Check 'passthrough' step at construction time
     exp = 2 * 5
-    pipeline = Pipeline([('m2', mult2), ('m3', None), ('last', mult5)])
+    pipeline = Pipeline(
+        [('m2', mult2), ('m3', passthrough), ('last', mult5)])
     assert_array_equal([[exp]], pipeline.fit_transform(X, y))
     assert_array_equal([exp], pipeline.fit(X).predict(X))
     assert_array_equal(X, pipeline.inverse_transform([[exp]]))
@@ -655,7 +657,8 @@ def test_pipeline_ducktyping():
     pipeline.transform
     pipeline.inverse_transform
 
-    pipeline = make_pipeline(None)
+    pipeline = make_pipeline('passthrough')
+    assert pipeline.steps[0] == ('passthrough', 'passthrough')
     assert_false(hasattr(pipeline, 'predict'))
     pipeline.transform
     pipeline.inverse_transform
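The parametrized test above relies on the `Mult` helper defined in scikit-learn's test suite. The simplified, standalone sketch below (assuming scikit-learn 0.21 or later; `Mult` here is a stand-in, not the test suite's class) shows the behaviour being exercised: setting a middle step to 'passthrough' removes that step's effect while fit_transform and inverse_transform keep working.

    import numpy as np
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.pipeline import Pipeline

    class Mult(BaseEstimator, TransformerMixin):
        """Toy transformer that multiplies its input by `mult`."""
        def __init__(self, mult=1):
            self.mult = mult

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return np.asarray(X) * self.mult

        def inverse_transform(self, X):
            return np.asarray(X) / self.mult

    X = np.array([[1]])
    pipe = Pipeline([('m2', Mult(2)), ('m3', Mult(3)), ('m5', Mult(5))])
    print(pipe.fit_transform(X))           # [[30]] -- 1 * 2 * 3 * 5

    pipe.set_params(m3='passthrough')       # disable the middle step
    print(pipe.fit_transform(X))            # [[10]] -- 1 * 2 * 5
    print(pipe.inverse_transform([[10]]))   # [[1.]]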
