| 
 | 1 | +"""  | 
 | 2 | +=========================================================  | 
 | 3 | +Using CallableTransformer to select columns  | 
 | 4 | +=========================================================  | 
 | 5 | +
  | 
 | 6 | +Shows how to use a callable transformer in a pipeline. If you know your  | 
 | 7 | +dataset's first principal component is irrelevant for a classification task,  | 
 | 8 | +you can use the CallableTransformer to select all but the first column of the  | 
 | 9 | +PCA transformed data.  | 
 | 10 | +"""  | 
 | 11 | +import matplotlib.pyplot as plt  | 
 | 12 | +import numpy as np  | 
 | 13 | + | 
 | 14 | +from sklearn.cross_validation import train_test_split  | 
 | 15 | +from sklearn.decomposition import PCA  | 
 | 16 | +from sklearn.pipeline import make_pipeline  | 
 | 17 | +from sklearn.preprocessing import CallableTransformer  | 
 | 18 | + | 
 | 19 | + | 
 | 20 | +def _generate_vector(shift=0.5, noise=15):  | 
 | 21 | +    return np.arange(1000) + (np.random.rand(1000) - shift) * noise  | 
 | 22 | + | 
 | 23 | + | 
 | 24 | +def generate_dataset():  | 
 | 25 | +    """  | 
 | 26 | +    This dataset is two lines with a slope ~ 1, where one has  | 
 | 27 | +    a y offset of ~100  | 
 | 28 | +    """  | 
 | 29 | +    return np.vstack((  | 
 | 30 | +        np.vstack((  | 
 | 31 | +            _generate_vector(),  | 
 | 32 | +            _generate_vector() + 100,  | 
 | 33 | +        )).T,  | 
 | 34 | +        np.vstack((  | 
 | 35 | +            _generate_vector(),  | 
 | 36 | +            _generate_vector(),  | 
 | 37 | +        )).T,  | 
 | 38 | +    )), np.hstack((np.zeros(1000), np.ones(1000)))  | 
 | 39 | + | 
 | 40 | + | 
 | 41 | +def all_but_first_column(X, y):  | 
 | 42 | +    return X[:, 1:]  | 
 | 43 | + | 
 | 44 | + | 
 | 45 | +def drop_first_component(X, y):  | 
 | 46 | +    """  | 
 | 47 | +    Create a pipeline with PCA and the column selector and use it to  | 
 | 48 | +    transform the dataset.  | 
 | 49 | +    """  | 
 | 50 | +    pipeline = make_pipeline(  | 
 | 51 | +        PCA(), CallableTransformer(all_but_first_column),  | 
 | 52 | +    )  | 
 | 53 | +    X_train, X_test, y_train, y_test = train_test_split(X, y)  | 
 | 54 | +    pipeline.fit(X_train, y_train)  | 
 | 55 | +    return pipeline.transform(X_test), y_test  | 
 | 56 | + | 
 | 57 | + | 
 | 58 | +if __name__ == '__main__':  | 
 | 59 | +    X, y = generate_dataset()  | 
 | 60 | +    plt.scatter(X[:, 0], X[:, 1], c=y, s=50)  | 
 | 61 | +    plt.show()  | 
 | 62 | +    X_transformed, y_transformed = drop_first_component(*generate_dataset())  | 
 | 63 | +    plt.scatter(  | 
 | 64 | +        X_transformed[:, 0],  | 
 | 65 | +        np.zeros(len(X_transformed)),  | 
 | 66 | +        c=y_transformed,  | 
 | 67 | +        s=50,  | 
 | 68 | +    )  | 
 | 69 | +    plt.show()  | 
0 commit comments