18 | 18 | from .common import Y_DTYPE |
19 | 19 | from .common import G_H_DTYPE |
20 | 20 | from ._loss import _update_gradients_least_squares |
| 21 | +from ._loss import _update_gradients_least_absolute_deviation |
21 | 22 | from ._loss import _update_gradients_hessians_binary_crossentropy |
22 | 23 | from ._loss import _update_gradients_hessians_categorical_crossentropy |
23 | 24 |
24 | 25 |
25 | 26 | class BaseLoss(ABC): |
26 | 27 | """Base class for a loss.""" |
27 | 28 |
| 29 | + # This variable indicates whether the loss requires the leaves values to |
| 30 | + # be updated once the tree has been trained. The trees are trained to |
| 31 | + # predict a Newton-Raphson step (see grower._finalize_leaf()). But for |
| 32 | + # some losses (e.g. least absolute deviation) we need to adjust the tree |
| 33 | + # values to account for the "line search" of the gradient descent |
| 34 | + # procedure. See the original paper Greedy Function Approximation: A |
| 35 | + # Gradient Boosting Machine by Friedman |
| 36 | + # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. |
| 37 | + need_update_leaves_values = False |
| 38 | + |
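As a side note on the comment above (not part of the diff): the leaf value produced by the grower is, up to shrinkage and regularization, a Newton-Raphson step computed from the sums of gradients and hessians over the leaf's samples. A minimal sketch with hypothetical names, just to make the "line search" remark concrete (this is not the actual `grower._finalize_leaf()` implementation):

```python
def newton_raphson_leaf_value(sum_gradients, sum_hessians, shrinkage,
                              l2_regularization=0.0):
    # Rough sketch: one Newton-Raphson step over the leaf's samples,
    # scaled by the learning rate (shrinkage).
    return -shrinkage * sum_gradients / (sum_hessians + l2_regularization)
```

For least absolute deviation the true hessians are zero, so this step does not minimize the in-leaf loss; losses that set `need_update_leaves_values = True` therefore re-fit each leaf after the tree is grown (see `update_leaves_values()` below).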
28 | 39 | def init_gradients_and_hessians(self, n_samples, prediction_dim): |
29 | 40 | """Return initial gradients and hessians. |
30 | 41 |
@@ -53,9 +64,10 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): |
53 | 64 | shape = (prediction_dim, n_samples) |
54 | 65 | gradients = np.empty(shape=shape, dtype=G_H_DTYPE) |
55 | 66 | if self.hessians_are_constant: |
56 | | - # if the hessians are constant, we consider they are equal to 1. |
57 | | - # this is correct as long as we adjust the gradients. See e.g. LS |
58 | | - # loss |
| 67 | + # If the hessians are constant, we consider they are equal to 1. |
| 68 | + # - This is correct for the half LS loss |
| 69 | + # - For LAD loss, hessians are actually 0, but they are always |
| 70 | + # ignored anyway. |
59 | 71 | hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE) |
60 | 72 | else: |
61 | 73 | hessians = np.empty(shape=shape, dtype=G_H_DTYPE) |
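For reference (not part of the diff), a rough pure-NumPy sketch of the gradient/hessian conventions the comment above relies on. The actual work is done by the Cython helpers imported from `._loss`; the `_ref` function names below are made up:

```python
import numpy as np

def update_gradients_half_least_squares_ref(gradients, y_true, raw_predictions):
    # Half least squares: l(y, p) = 0.5 * (y - p) ** 2, so dl/dp = p - y and
    # d2l/dp2 = 1, which is why storing the hessians as a constant 1 is exact.
    gradients[:] = raw_predictions - y_true

def update_gradients_least_absolute_deviation_ref(gradients, y_true,
                                                  raw_predictions):
    # Least absolute deviation: l(y, p) = |y - p|, so dl/dp = sign(p - y)
    # (a sub-gradient at y == p). The second derivative is 0 almost
    # everywhere, so the constant hessians are simply never used.
    gradients[:] = np.sign(raw_predictions - y_true)
```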
@@ -141,6 +153,63 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, |
141 | 153 | _update_gradients_least_squares(gradients, y_true, raw_predictions) |
142 | 154 |
143 | 155 |
| 156 | +class LeastAbsoluteDeviation(BaseLoss): |
| 157 | + """Least asbolute deviation, for regression. |
| 158 | +
| 159 | + For a given sample x_i, the loss is defined as:: |
| 160 | +
| 161 | + loss(x_i) = |y_true_i - raw_pred_i| |
| 162 | + """ |
| 163 | + |
| 164 | + hessians_are_constant = True |
| 165 | + # See the comment about need_update_leaves_values in BaseLoss. |
| 173 | + need_update_leaves_values = True |
| 174 | + |
| 175 | + def __call__(self, y_true, raw_predictions, average=True): |
| 176 | + # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to |
| 177 | + # return a view. |
| 178 | + raw_predictions = raw_predictions.reshape(-1) |
| 179 | + loss = np.abs(y_true - raw_predictions) |
| 180 | + return loss.mean() if average else loss |
| 181 | + |
| 182 | + def get_baseline_prediction(self, y_train, prediction_dim): |
| 183 | + return np.median(y_train) |
| 184 | + |
| 185 | + @staticmethod |
| 186 | + def inverse_link_function(raw_predictions): |
| 187 | + return raw_predictions |
| 188 | + |
| 189 | + def update_gradients_and_hessians(self, gradients, hessians, y_true, |
| 190 | + raw_predictions): |
| 191 | + # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to |
| 192 | + # return a view. |
| 193 | + raw_predictions = raw_predictions.reshape(-1) |
| 194 | + gradients = gradients.reshape(-1) |
| 195 | + _update_gradients_least_absolute_deviation(gradients, y_true, |
| 196 | + raw_predictions) |
| 197 | + |
| 198 | + def update_leaves_values(self, grower, y_true, raw_predictions): |
| 199 | + # Update the values predicted by the tree with |
| 200 | + # median(y_true - raw_predictions). |
| 201 | + # See note about need_update_leaves_values in BaseLoss. |
| 202 | + |
| 203 | + # TODO: ideally this should be computed in parallel over the leaves |
| 204 | + # using something similar to _update_raw_predictions(), but this |
| 205 | + # requires a cython version of median() |
| 206 | + for leaf in grower.finalized_leaves: |
| 207 | + indices = leaf.sample_indices |
| 208 | + median_res = np.median(y_true[indices] - raw_predictions[indices]) |
| 209 | + leaf.value = grower.shrinkage * median_res |
| 210 | + # Note that the regularization is ignored here |
| 211 | + |
| 212 | + |
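A small self-contained check (illustrative only, not part of the change) of why the median shows up in `update_leaves_values()`: for the absolute loss, the constant added to a leaf's raw predictions that minimizes the in-leaf loss is the median of the residuals, which the shrinkage then scales:

```python
import numpy as np

rng = np.random.default_rng(0)
y_true = rng.normal(size=101)
raw_predictions = rng.normal(size=101)
residuals = y_true - raw_predictions

# Brute-force the constant v minimizing sum_i |y_true_i - (raw_predictions_i + v)|
# over this "leaf": it coincides (up to the grid resolution) with the median
# of the residuals, i.e. the value assigned to leaf.value before shrinkage.
candidates = np.linspace(residuals.min(), residuals.max(), 20001)
objective = np.abs(residuals[None, :] - candidates[:, None]).sum(axis=1)
best = candidates[np.argmin(objective)]
assert abs(best - np.median(residuals)) < 1e-2
```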
144 | 213 | class BinaryCrossEntropy(BaseLoss): |
145 | 214 | """Binary cross-entropy loss, for binary classification. |
146 | 215 |
@@ -242,6 +311,7 @@ def predict_proba(self, raw_predictions): |
242 | 311 |
243 | 312 | _LOSSES = { |
244 | 313 | 'least_squares': LeastSquares, |
| 314 | + 'least_absolute_deviation': LeastAbsoluteDeviation, |
245 | 315 | 'binary_crossentropy': BinaryCrossEntropy, |
246 | 316 | 'categorical_crossentropy': CategoricalCrossEntropy |
247 | 317 | } |
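Downstream, this registry entry is what lets the string `'least_absolute_deviation'` be passed as the `loss` parameter of the public estimator. A usage sketch, assuming a scikit-learn build containing this change (the estimator was still behind the experimental opt-in import at the time, and later scikit-learn releases renamed this option):

```python
import numpy as np
# Experimental opt-in needed in the scikit-learn versions this diff targets.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 5))
# Heavy-tailed noise, where an L1 loss is typically more robust than L2.
y = X[:, 0] + 0.1 * rng.standard_t(df=2, size=1000)

# 'least_absolute_deviation' resolves to LeastAbsoluteDeviation through the
# _LOSSES registry defined above.
est = HistGradientBoostingRegressor(loss='least_absolute_deviation')
est.fit(X, y)
print(est.score(X, y))
```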