
Commit 80c0a29

DOC improve the ROC-AUC docstring (scikit-learn#18110)
Co-authored-by: Thomas J. Fan <[email protected]>
1 parent 1df5d94 commit 80c0a29

File tree

2 files changed: +128, -27 lines changed

doc/modules/model_evaluation.rst

Lines changed: 66 additions & 11 deletions
@@ -1326,21 +1326,48 @@ area under the roc curve, the curve information is summarized in one number.
 For more information see the `Wikipedia article on AUC
 <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>`_.
 
->>> import numpy as np
+Compared to metrics such as the subset accuracy, the Hamming loss, or the
+F1 score, ROC doesn't require optimizing a threshold for each label.
+
+.. _roc_auc_binary:
+
+Binary case
+^^^^^^^^^^^
+
+In the **binary case**, you can provide either the probability estimates,
+using the `classifier.predict_proba()` method, or the non-thresholded
+decision values given by the `classifier.decision_function()` method. When
+providing probability estimates, use the probability of the class with the
+"greater label". The "greater label" corresponds to
+`classifier.classes_[1]` and thus `classifier.predict_proba(X)[:, 1]`.
+Therefore, the `y_score` parameter is of size (n_samples,).
+
+>>> from sklearn.datasets import load_breast_cancer
+>>> from sklearn.linear_model import LogisticRegression
 >>> from sklearn.metrics import roc_auc_score
->>> y_true = np.array([0, 0, 1, 1])
->>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
->>> roc_auc_score(y_true, y_scores)
-0.75
+>>> X, y = load_breast_cancer(return_X_y=True)
+>>> clf = LogisticRegression(solver="liblinear").fit(X, y)
+>>> clf.classes_
+array([0, 1])
 
-In multi-label classification, the :func:`roc_auc_score` function is
-extended by averaging over the labels as :ref:`above <average>`.
+We can use the probability estimates corresponding to `clf.classes_[1]`.
 
-Compared to metrics such as the subset accuracy, the Hamming loss, or the
-F1 score, ROC doesn't require optimizing a threshold for each label.
+>>> y_score = clf.predict_proba(X)[:, 1]
+>>> roc_auc_score(y, y_score)
+0.99...
+
+Otherwise, we can use the non-thresholded decision values:
 
-The :func:`roc_auc_score` function can also be used in multi-class
-classification. Two averaging strategies are currently supported: the
+>>> roc_auc_score(y, clf.decision_function(X))
+0.99...
+
+.. _roc_auc_multiclass:
+
+Multi-class case
+^^^^^^^^^^^^^^^^
+
+The :func:`roc_auc_score` function can also be used in **multi-class
+classification**. Two averaging strategies are currently supported: the
 one-vs-one algorithm computes the average of the pairwise ROC AUC scores, and
 the one-vs-rest algorithm computes the average of the ROC AUC scores for each
 class against all other classes. In both cases, the predicted labels are
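The two binary-case doctests above print the same value, and that is no
accident: ROC-AUC depends only on how the samples are ranked, and
`predict_proba(X)[:, 1]` is a strictly increasing (sigmoid) transform of
`decision_function(X)`. A minimal editorial sketch, not part of the commit,
illustrating that rank invariance:

    # ROC-AUC is invariant under strictly increasing score transforms.
    import numpy as np
    from sklearn.datasets import load_breast_cancer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score

    X, y = load_breast_cancer(return_X_y=True)
    clf = LogisticRegression(solver="liblinear").fit(X, y)

    scores = clf.decision_function(X)  # unbounded decision values
    auc = roc_auc_score(y, scores)

    # An affine transform with positive slope preserves the ranking,
    # so the ROC curve, and hence the AUC, is unchanged.
    assert np.isclose(roc_auc_score(y, 2.0 * scores + 3.0), auc)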
@@ -1394,6 +1421,34 @@ to the given limit.
    :scale: 75
    :align: center
 
+.. _roc_auc_multilabel:
+
+Multi-label case
+^^^^^^^^^^^^^^^^
+
+In **multi-label classification**, the :func:`roc_auc_score` function is
+extended by averaging over the labels as :ref:`above <average>`. In this case,
+you should provide a `y_score` of shape `(n_samples, n_classes)`. Thus, when
+using the probability estimates, one needs to select the probability of the
+class with the greater label for each output.
+
+>>> from sklearn.datasets import make_multilabel_classification
+>>> from sklearn.multioutput import MultiOutputClassifier
+>>> X, y = make_multilabel_classification(random_state=0)
+>>> inner_clf = LogisticRegression(solver="liblinear", random_state=0)
+>>> clf = MultiOutputClassifier(inner_clf).fit(X, y)
+>>> y_score = np.transpose([y_pred[:, 1] for y_pred in clf.predict_proba(X)])
+>>> roc_auc_score(y, y_score, average=None)
+array([0.82..., 0.86..., 0.94..., 0.85..., 0.94...])
+
+And the decision values do not require such processing.
+
+>>> from sklearn.linear_model import RidgeClassifierCV
+>>> clf = RidgeClassifierCV().fit(X, y)
+>>> y_score = clf.decision_function(X)
+>>> roc_auc_score(y, y_score, average=None)
+array([0.81..., 0.84..., 0.93..., 0.87..., 0.94...])
+
 .. topic:: Examples:
 
   * See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py`
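To make the one-vs-rest strategy described above concrete, here is an
editorial sketch (not part of the commit) that reproduces
`multi_class="ovr"` by hand, assuming the iris setup also used in the
docstring examples below:

    # Macro-averaged one-vs-rest ROC-AUC computed manually.
    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score

    X, y = load_iris(return_X_y=True)
    proba = LogisticRegression(max_iter=1000).fit(X, y).predict_proba(X)

    # One binary AUC per class (this class vs. all the others), then an
    # unweighted mean: this is what multi_class="ovr" computes with the
    # default average="macro".
    per_class = [roc_auc_score(y == c, proba[:, c]) for c in np.unique(y)]
    manual = np.mean(per_class)

    assert np.isclose(manual, roc_auc_score(y, proba, multi_class="ovr"))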

sklearn/metrics/_ranking.py

Lines changed: 62 additions & 16 deletions
@@ -360,16 +360,31 @@ def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None,
         binary label indicators with shape (n_samples, n_classes).
 
     y_score : array-like of shape (n_samples,) or (n_samples, n_classes)
-        Target scores. In the binary and multilabel cases, these can be either
-        probability estimates or non-thresholded decision values (as returned
-        by `decision_function` on some classifiers). In the multiclass case,
-        these must be probability estimates which sum to 1. The binary
-        case expects a shape (n_samples,), and the scores must be the scores of
-        the class with the greater label. The multiclass and multilabel
-        cases expect a shape (n_samples, n_classes). In the multiclass case,
-        the order of the class scores must correspond to the order of
-        ``labels``, if provided, or else to the numerical or lexicographical
-        order of the labels in ``y_true``.
+        Target scores.
+
+        * In the binary case, it corresponds to an array of shape
+          `(n_samples,)`. Both probability estimates and non-thresholded
+          decision values can be provided. The probability estimates
+          correspond to the **probability of the class with the greater
+          label**, i.e. `estimator.classes_[1]` and thus
+          `estimator.predict_proba(X)[:, 1]`. The decision values
+          correspond to the output of `estimator.decision_function(X)`.
+          See more information in the :ref:`User guide <roc_auc_binary>`;
+        * In the multiclass case, it corresponds to an array of shape
+          `(n_samples, n_classes)` of probability estimates provided by the
+          `predict_proba` method. The probability estimates **must**
+          sum to 1 across the possible classes. In addition, the order of the
+          class scores must correspond to the order of ``labels``,
+          if provided, or else to the numerical or lexicographical order of
+          the labels in ``y_true``. See more information in the
+          :ref:`User guide <roc_auc_multiclass>`;
+        * In the multilabel case, it corresponds to an array of shape
+          `(n_samples, n_classes)`. Probability estimates are provided by the
+          `predict_proba` method and the non-thresholded decision values by
+          the `decision_function` method. The probability estimates
+          correspond to the **probability of the class with the greater
+          label for each output** of the classifier. See more information
+          in the :ref:`User guide <roc_auc_multilabel>`.
 
     average : {'micro', 'macro', 'samples', 'weighted'} or None, \
             default='macro'
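The three bullets above each pin down a shape contract for `y_score`. A short
editorial sketch (not part of the commit) that checks the binary and
multiclass contracts explicitly:

    # Expected y_score shapes for the binary and multiclass cases.
    import numpy as np
    from sklearn.datasets import load_breast_cancer, load_iris
    from sklearn.linear_model import LogisticRegression

    # Binary: y_score is 1-D, the positive-class column of predict_proba.
    Xb, yb = load_breast_cancer(return_X_y=True)
    clf = LogisticRegression(solver="liblinear").fit(Xb, yb)
    assert clf.predict_proba(Xb)[:, 1].shape == (Xb.shape[0],)

    # Multiclass: y_score is (n_samples, n_classes) and each row sums to 1,
    # which is exactly what predict_proba guarantees.
    Xm, ym = load_iris(return_X_y=True)
    proba = LogisticRegression(max_iter=1000).fit(Xm, ym).predict_proba(Xm)
    assert proba.shape == (Xm.shape[0], 3)
    assert np.allclose(proba.sum(axis=1), 1.0)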
@@ -447,7 +462,7 @@ def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None,
            Machine Learning, 45(2), 171-186.
            <http://link.springer.com/article/10.1023/A:1010920819831>`_
 
-    See also
+    See Also
     --------
     average_precision_score : Area under the precision-recall curve
@@ -457,12 +472,43 @@ def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None,
 
     Examples
     --------
-    >>> import numpy as np
+    Binary case:
+
+    >>> from sklearn.datasets import load_breast_cancer
+    >>> from sklearn.linear_model import LogisticRegression
     >>> from sklearn.metrics import roc_auc_score
-    >>> y_true = np.array([0, 0, 1, 1])
-    >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
-    >>> roc_auc_score(y_true, y_scores)
-    0.75
+    >>> X, y = load_breast_cancer(return_X_y=True)
+    >>> clf = LogisticRegression(solver="liblinear", random_state=0).fit(X, y)
+    >>> roc_auc_score(y, clf.predict_proba(X)[:, 1])
+    0.99...
+    >>> roc_auc_score(y, clf.decision_function(X))
+    0.99...
+
+    Multiclass case:
+
+    >>> from sklearn.datasets import load_iris
+    >>> X, y = load_iris(return_X_y=True)
+    >>> clf = LogisticRegression(solver="liblinear").fit(X, y)
+    >>> roc_auc_score(y, clf.predict_proba(X), multi_class='ovr')
+    0.99...
+
+    Multilabel case:
+
+    >>> import numpy as np
+    >>> from sklearn.datasets import make_multilabel_classification
+    >>> from sklearn.multioutput import MultiOutputClassifier
+    >>> X, y = make_multilabel_classification(random_state=0)
+    >>> clf = MultiOutputClassifier(clf).fit(X, y)
+    >>> # predict_proba returns a list of n_output probability arrays,
+    >>> # each of shape (n_samples, n_classes)
+    >>> y_pred = clf.predict_proba(X)
+    >>> # extract the positive columns for each output
+    >>> y_pred = np.transpose([pred[:, 1] for pred in y_pred])
+    >>> roc_auc_score(y, y_pred, average=None)
+    array([0.82..., 0.86..., 0.94..., 0.85..., 0.94...])
+    >>> from sklearn.linear_model import RidgeClassifierCV
+    >>> clf = RidgeClassifierCV().fit(X, y)
+    >>> roc_auc_score(y, clf.decision_function(X), average=None)
+    array([0.81..., 0.84..., 0.93..., 0.87..., 0.94...])
     """
 
     y_type = type_of_target(y_true)
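One step of the multilabel doctest is worth unpacking:
`MultiOutputClassifier.predict_proba` returns a list of `n_outputs` arrays,
each of shape `(n_samples, 2)`, rather than a single matrix, so the
positive-class columns must be gathered by hand. An editorial sketch of that
reshaping, under the same setup as the doctest:

    # Building the (n_samples, n_outputs) multilabel y_score.
    import numpy as np
    from sklearn.datasets import make_multilabel_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.multioutput import MultiOutputClassifier

    X, y = make_multilabel_classification(random_state=0)  # y: 5 labels
    clf = MultiOutputClassifier(
        LogisticRegression(solver="liblinear", random_state=0)
    ).fit(X, y)

    pred_list = clf.predict_proba(X)  # list of arrays, each (n_samples, 2)
    assert len(pred_list) == y.shape[1]

    # Keep column 1 (the class with the greater label) of each output and
    # stack the columns side by side.
    y_score = np.transpose([p[:, 1] for p in pred_list])
    assert y_score.shape == y.shape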
