|
44 | 44 | # X = titanic.frame.drop('survived', axis=1) |
45 | 45 | # y = titanic.frame['survived'] |
46 | 46 |
|
47 | | -############################################################################### |
| 47 | +# %% |
48 | 48 | # Use ``ColumnTransformer`` by selecting column by names |
49 | 49 | ############################################################################### |
50 | 50 | # We will train our classifier with the following features: |
|
90 | 90 | clf.fit(X_train, y_train) |
91 | 91 | print("model score: %.3f" % clf.score(X_test, y_test)) |
92 | 92 |
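The pipeline construction that feeds this ``fit`` call sits in the elided lines of the hunk. As a minimal sketch of the by-name selection pattern this section describes (column names taken from the Titanic features mentioned above; the transformer choices are illustrative rather than the file's exact ones):

```python
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Each group of columns, selected by name, gets its own preprocessing.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])
```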
|
93 | | -############################################################################## |
| 93 | +# %% |
94 | 94 | # HTML representation of ``Pipeline`` |
95 | 95 | ############################################################################### |
96 | 96 | # When the ``Pipeline`` is printed out in a jupyter notebook an HTML |
|
100 | 100 | set_config(display='diagram') |
101 | 101 | clf |
102 | 102 |
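Outside a notebook, the same diagram can be exported as standalone HTML with the public ``estimator_html_repr`` helper. A small sketch (the output filename is arbitrary):

```python
from sklearn.utils import estimator_html_repr

# Serialize the pipeline diagram to an HTML file viewable in a browser.
with open('pipeline_diagram.html', 'w') as f:
    f.write(estimator_html_repr(clf))
```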
|
103 | | -############################################################################### |
| 103 | +# %% |
104 | 104 | # Use ``ColumnTransformer`` by selecting column by data types |
105 | 105 | ############################################################################### |
106 | 106 | # When dealing with a cleaned dataset, the preprocessing can be automatic by |
|
113 | 113 | subset_feature = ['embarked', 'sex', 'pclass', 'age', 'fare'] |
114 | 114 | X_train, X_test = X_train[subset_feature], X_test[subset_feature] |
115 | 115 |
|
116 | | -############################################################################### |
| 116 | +# %% |
117 | 117 | # Then, we introspect the information regarding each column data type. |
118 | 118 |
|
119 | 119 | X_train.info() |
120 | 120 |
|
121 | | -############################################################################### |
| 121 | +# %% |
122 | 122 | # We can observe that the `embarked` and `sex` columns were tagged as |
123 | 123 | # `category` columns when loading the data with ``fetch_openml``. Therefore, we |
124 | 124 | # can use this information to dispatch the categorical columns to the |
125 | 125 | # ``categorical_transformer`` and the remaining columns to the |
126 | 126 | # ``numerical_transformer``. |
127 | 127 |
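A minimal sketch of that dtype-based dispatch, reusing the ``numeric_transformer`` and ``categorical_transformer`` from the by-name sketch above, with the ``make_column_selector`` helper aliased to ``selector`` as in the hunks below:

```python
from sklearn.compose import ColumnTransformer, make_column_selector as selector

# Columns are now selected by dtype rather than by name: 'category'
# columns go to the categorical transformer, all remaining columns
# to the numerical one.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_exclude="category")),
    ('cat', categorical_transformer, selector(dtype_include="category"))])
```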
|
128 | | -############################################################################### |
| 128 | +# %% |
129 | 129 | # .. note:: In practice, you will have to handle the column data types yourself.
130 | 130 | # If you want some columns to be considered as `category`, you will have to |
131 | 131 | # convert them into categorical columns. If you are using pandas, you can |
|
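The continuation of this note is elided from the excerpt; the pandas conversion it alludes to is presumably ``astype``. A tiny, self-contained sketch with made-up data:

```python
import pandas as pd

df = pd.DataFrame({'embarked': ['S', 'C', 'S'], 'age': [22.0, 38.0, 26.0]})

# Mark 'embarked' as categorical so a dtype-based selector routes it
# to the categorical transformer.
df['embarked'] = df['embarked'].astype('category')
print(df.dtypes)  # embarked is now dtype 'category'
```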
145 | 145 | clf.fit(X_train, y_train) |
146 | 146 | print("model score: %.3f" % clf.score(X_test, y_test)) |
147 | 147 |
|
148 | | -############################################################################### |
| 148 | +# %% |
149 | 149 | # The resulting score is not exactly the same as the one from the previous |
150 | 150 | # pipeline because the dtype-based selector treats the ``pclass`` column as
151 | 151 | # a numeric feature instead of a categorical feature as previously:
152 | 152 |
|
153 | 153 | selector(dtype_exclude="category")(X_train) |
154 | 154 |
|
155 | | -############################################################################### |
| 155 | +# %% |
156 | 156 |
|
157 | 157 | selector(dtype_include="category")(X_train) |
158 | 158 |
|
159 | | -############################################################################### |
| 159 | +# %% |
160 | 160 | # Using the prediction pipeline in a grid search |
161 | | -############################################################################### |
| 161 | +############################################################################## |
162 | 162 | # Grid search can also be performed on the different preprocessing steps |
163 | 163 | # defined in the ``ColumnTransformer`` object, together with the classifier's |
164 | 164 | # hyperparameters as part of the ``Pipeline``. |
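The ``param_grid`` definition itself is elided from the excerpt. As a sketch of the pattern: nested parameters are addressed with the double-underscore convention ``step__substep__param``, so a grid can reach through the ``Pipeline`` into the ``ColumnTransformer``; the exact grid in the file may differ:

```python
param_grid = {
    # Pipeline step 'preprocessor' -> ColumnTransformer entry 'num'
    # -> sub-pipeline step 'imputer' -> its 'strategy' parameter.
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    # Regularization strength of the final 'classifier' step.
    'classifier__C': [0.1, 1.0, 10, 100],
}
```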
|
174 | 174 | grid_search = GridSearchCV(clf, param_grid, cv=10) |
175 | 175 | grid_search |
176 | 176 |
|
177 | | -############################################################################### |
| 177 | +# %% |
178 | 178 | # Calling 'fit' triggers the cross-validated search for the best |
179 | 179 | # hyper-parameter combination:
180 | 180 | # |
|
183 | 183 | print("Best params:")
184 | 184 | print(grid_search.best_params_) |
185 | 185 |
|
186 | | -############################################################################### |
| 186 | +# %% |
187 | 187 | # The internal cross-validation score obtained with those parameters is:
188 | 188 | print(f"Internal CV score: {grid_search.best_score_:.3f}") |
189 | 189 |
|
190 | | -############################################################################### |
| 190 | +# %% |
191 | 191 | # We can also introspect the top grid search results as a pandas dataframe: |
192 | 192 | import pandas as pd |
193 | 193 |
|
|
198 | 198 | "param_classifier__C" |
199 | 199 | ]].head(5) |
200 | 200 |
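The surrounding code is partially elided here; the usual pattern, relying on the ``import pandas as pd`` just above, is to wrap ``cv_results_`` in a dataframe and sort by mean test score. A sketch consistent with the column names shown in the hunk:

```python
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[["mean_test_score", "std_test_score",
            "param_preprocessor__num__imputer__strategy",
            "param_classifier__C"]].head(5)
```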
|
201 | | -############################################################################### |
| 201 | +# %% |
202 | 202 | # The best hyper-parameters have been used to re-fit a final model on the full
203 | 203 | # training set. We can evaluate that final model on held-out test data that was
204 | 204 | # not used for hyperparameter tuning.
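The evaluation that follows is elided from the excerpt. Since ``GridSearchCV`` refits the best pipeline on the full training set by default (``refit=True``), the fitted search object can be scored directly; a sketch:

```python
# grid_search was refit on the whole training set with the best
# parameters, so it scores like any other fitted estimator.
print("best logistic regression from grid search: %.3f"
      % grid_search.score(X_test, y_test))
```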
|