ENH: Adds CallableTransformer

Joe Jevnik · amueller · commit cb0916c44068 · 2015-08-03T11:45:16.000-04:00
CallableTransformer allows a user to convert a standard python callable
into a transformer for use in a Pipeline.
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
@@ -1104,6 +1104,7 @@ See the :ref:`metrics` section of the user guide for further details.
    :template: class.rst
 
    preprocessing.Binarizer
+   preprocessing.CallableTransformer
    preprocessing.Imputer
    preprocessing.KernelCenterer
    preprocessing.LabelBinarizer
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
@@ -508,3 +508,23 @@ The features of X have been transformed from :math:`(X_1, X_2, X_3)` to :math:`(
 Note that polynomial features are used implicitily in `kernel methods <http://en.wikipedia.org/wiki/Kernel_method>`_ (e.g., :class:`sklearn.svm.SVC`, :class:`sklearn.decomposition.KernelPCA`) when using polynomial :ref:`svm_kernels`.
 
 See :ref:`example_linear_model_plot_polynomial_interpolation.py` for Ridge regression using created polynomial features.
+
+Custom Transformers
+===================
+
+Often, you will want to convert an existing python function into a transformer
+to assist in data cleaning or processing. Users may implement a transformer from
+an arbitrary callable with :class:`CallableTransformer`. For example, one could
+apply a log transformation in a pipeline like::
+
+    >>> import numpy as np
+    >>> from sklearn.preprocessing import CallableTransformer
+    >>> transformer = CallableTransformer(np.log)
+    >>> X = np.array([[1, 2], [3, 4]])
+    >>> transformer.transform(X)
+    array([[ 0.        ,  0.69314718],
+           [ 1.09861229,  1.38629436]])
+
+For a full code example that demonstrates using a :class:`CallableTransformer`
+to do column selection,
+see :ref:`example_preprocessing_plot_callable_transformer.py`.
diff --git a/examples/preprocessing/plot_callable_transformer.py b/examples/preprocessing/plot_callable_transformer.py
@@ -0,0 +1,69 @@
+"""
+=========================================================
+Using CallableTransformer to select columns
+=========================================================
+
+Shows how to use a callable transformer in a pipeline. If you know your
+dataset's first principal component is irrelevant for a classification task,
+you can use the CallableTransformer to select all but the first column of the
+PCA transformed data.
+"""
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.cross_validation import train_test_split
+from sklearn.decomposition import PCA
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import CallableTransformer
+
+
+def _generate_vector(shift=0.5, noise=15):
+    return np.arange(1000) + (np.random.rand(1000) - shift) * noise
+
+
+def generate_dataset():
+    """
+    This dataset is two lines with a slope ~ 1, where one has
+    a y offset of ~100
+    """
+    return np.vstack((
+        np.vstack((
+            _generate_vector(),
+            _generate_vector() + 100,
+        )).T,
+        np.vstack((
+            _generate_vector(),
+            _generate_vector(),
+        )).T,
+    )), np.hstack((np.zeros(1000), np.ones(1000)))
+
+
+def all_but_first_column(X, y):
+    return X[:, 1:]
+
+
+def drop_first_component(X, y):
+    """
+    Create a pipeline with PCA and the column selector and use it to
+    transform the dataset.
+    """
+    pipeline = make_pipeline(
+        PCA(), CallableTransformer(all_but_first_column),
+    )
+    X_train, X_test, y_train, y_test = train_test_split(X, y)
+    pipeline.fit(X_train, y_train)
+    return pipeline.transform(X_test), y_test
+
+
+if __name__ == '__main__':
+    X, y = generate_dataset()
+    plt.scatter(X[:, 0], X[:, 1], c=y, s=50)
+    plt.show()
+    X_transformed, y_transformed = drop_first_component(*generate_dataset())
+    plt.scatter(
+        X_transformed[:, 0],
+        np.zeros(len(X_transformed)),
+        c=y_transformed,
+        s=50,
+    )
+    plt.show()
diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py
@@ -3,6 +3,8 @@
 normalization, binarization and imputation methods.
 """
 
+from .callable_transformer import CallableTransformer
+
 from .data import Binarizer
 from .data import KernelCenterer
 from .data import MinMaxScaler
@@ -28,8 +30,10 @@
 
 from .imputation import Imputer
 
+
 __all__ = [
     'Binarizer',
+    'CallableTransformer',
     'Imputer',
     'KernelCenterer',
     'LabelBinarizer',
diff --git a/sklearn/preprocessing/callable_transformer.py b/sklearn/preprocessing/callable_transformer.py
@@ -0,0 +1,44 @@
+from ..base import BaseEstimator, TransformerMixin
+from ..utils import check_array
+
+
+class CallableTransformer(BaseEstimator, TransformerMixin):
+    """Allows the construction of a transformer from an arbitrary callable.
+
+    Parameters
+    ----------
+    func : callable, optional default=None
+        The callable to use for the transformation. This will be passed
+        the same arguments as transform, with args and kwargs forwarded.
+        If func is None, then func will be the identity function.
+    validate : bool, optional default=True
+        Indicate that the input X array should be checked before calling
+        func. If validate is false, there will be no input validation.
+    accept_sparse : bool, optional
+        Indicate that func accepts a sparse matrix as input.
+    args : tuple, optional
+        A tuple of positional arguments to be passed to func. These will
+        be passed after X and y.
+    kwargs : dict, optional
+        A dictionary of keyword arguments to be passed to func.
+
+    """
+    def __init__(self, func=None, validate=True, accept_sparse=False,
+                 args=None, kwargs=None):
+        self.func = func
+        self.validate = validate
+        self.accept_sparse = accept_sparse
+        self.args = args
+        self.kwargs = kwargs
+
+    def fit(self, X, y=None):
+        if self.validate:
+            check_array(X, self.accept_sparse)
+        return self
+
+    def transform(self, X, y=None):
+        if self.validate:
+            X = check_array(X, self.accept_sparse)
+        return (self.func or (lambda X, y, *args, **kwargs: X))(
+            X, y, *(self.args or ()), **(self.kwargs or {})
+        )
diff --git a/sklearn/preprocessing/tests/test_callable_transformer.py b/sklearn/preprocessing/tests/test_callable_transformer.py
@@ -0,0 +1,79 @@
+from nose.tools import assert_equal
+import numpy as np
+
+from ..callable_transformer import CallableTransformer
+
+
+def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
+    def _func(X, *args, **kwargs):
+        args_store.append(X)
+        args_store.extend(args)
+        kwargs_store.update(kwargs)
+        return func(X)
+
+    return _func
+
+
+def test_delegate_to_func():
+    # (args|kwargs)_store will hold the positional and keyword arguments
+    # passed to the function inside the CallableTransformer.
+    args_store = []
+    kwargs_store = {}
+    X = np.arange(10).reshape((5, 2))
+    np.testing.assert_array_equal(
+        CallableTransformer(_make_func(args_store, kwargs_store)).transform(X),
+        X,
+        'transform should have returned X unchanged',
+    )
+
+    # The function should only have received X and y, where y is None.
+    assert_equal(
+        args_store,
+        [X, None],
+        'Incorrect positional arguments passed to func: {args}'.format(
+            args=args_store,
+        ),
+    )
+    assert_equal(
+        kwargs_store,
+        {},
+        'Unexpected keyword arguments passed to func: {args}'.format(
+            args=kwargs_store,
+        ),
+    )
+
+
+def test_argument_closure():
+    # (args|kwargs)_store will hold the positional and keyword arguments
+    # passed to the function inside the CallableTransformer.
+    args_store = []
+    kwargs_store = {}
+    args = (object(), object())
+    kwargs = {'a': object(), 'b': object()}
+    X = np.arange(10).reshape((5, 2))
+    np.testing.assert_array_equal(
+        CallableTransformer(
+            _make_func(args_store, kwargs_store),
+            args=args,
+            kwargs=kwargs,
+        ).transform(X),
+        X,
+        'transform should have returned X unchanged',
+    )
+
+    # The function should have been passed X, y (None), and the args
+    # that were passed to the CallableTransformer.
+    assert_equal(
+        args_store,
+        [X, None] + list(args),
+        'Incorrect positional arguments passed to func: {args}'.format(
+            args=args_store,
+        ),
+    )
+    assert_equal(
+        kwargs_store,
+        kwargs,
+        'Incorrect keyword arguments passed to func: {args}'.format(
+            args=kwargs_store,
+        ),
+    )
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -138,7 +138,8 @@ def _yield_transformer_checks(name, Transformer):
                     'PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD']:
         yield check_transformer_data_not_an_array
     # these don't actually fit the data, so don't raise errors
-    if name not in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
+    if name not in ['AdditiveChi2Sampler', 'Binarizer',
+                    'Normalizer', 'CallableTransformer']:
         # basic tests
         yield check_transformer_general
         yield check_transformers_unfitted