Commit 7d6a933

Pushing the docs to 1.6/ for branch: 1.6.X, commit 66b71f059c401f5a70e8c16755b7bb98e24e88ec
1 parent b0fffa2 commit 7d6a933

File tree

5,271 files changed: +2,027,385 / -0 lines changed


1.6/.buildinfo (+4 lines)
@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: d926c2f4528fe6bc7527c4e7380cc80a
tags: 645f666f9bcd5a90fca523b33c5a78b7
@@ -0,0 +1,61 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
"\n# Segmenting the picture of greek coins in regions\n\nThis example uses `spectral_clustering` on a graph created from\nvoxel-to-voxel difference on an image to break this image into multiple\npartly-homogeneous regions.\n\nThis procedure (spectral clustering on an image) is an efficient\napproximate solution for finding normalized graph cuts.\n\nThere are three options to assign labels:\n\n* 'kmeans' spectral clustering clusters samples in the embedding space\n using a kmeans algorithm\n* 'discrete' iteratively searches for the closest partition\n space to the embedding space of spectral clustering.\n* 'cluster_qr' assigns labels using the QR factorization with pivoting\n that directly determines the partition in the embedding space.\n"
8+
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
"# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport time\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom scipy.ndimage import gaussian_filter\nfrom skimage.data import coins\nfrom skimage.transform import rescale\n\nfrom sklearn.cluster import spectral_clustering\nfrom sklearn.feature_extraction import image\n\n# load the coins as a numpy array\norig_coins = coins()\n\n# Resize it to 20% of the original size to speed up the processing\n# Applying a Gaussian filter for smoothing prior to down-scaling\n# reduces aliasing artifacts.\nsmoothened_coins = gaussian_filter(orig_coins, sigma=2)\nrescaled_coins = rescale(smoothened_coins, 0.2, mode=\"reflect\", anti_aliasing=False)\n\n# Convert the image into a graph with the value of the gradient on the\n# edges.\ngraph = image.img_to_graph(rescaled_coins)\n\n# Take a decreasing function of the gradient: an exponential\n# The smaller beta is, the more independent the segmentation is of the\n# actual image. For beta=1, the segmentation is close to a voronoi\nbeta = 10\neps = 1e-6\ngraph.data = np.exp(-beta * graph.data / graph.data.std()) + eps\n\n# The number of segmented regions to display needs to be chosen manually.\n# The current version of 'spectral_clustering' does not support determining\n# the number of good quality clusters automatically.\nn_regions = 26"
19+
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Compute and visualize the resulting regions\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
"# Computing a few extra eigenvectors may speed up the eigen_solver.\n# The spectral clustering quality may also benefit from requesting\n# extra regions for segmentation.\nn_regions_plus = 3\n\n# Apply spectral clustering using the default eigen_solver='arpack'.\n# Any implemented solver can be used: eigen_solver='arpack', 'lobpcg', or 'amg'.\n# Choosing eigen_solver='amg' requires an extra package called 'pyamg'.\n# The quality of segmentation and the speed of calculations is mostly determined\n# by the choice of the solver and the value of the tolerance 'eigen_tol'.\n# TODO: varying eigen_tol seems to have no effect for 'lobpcg' and 'amg' #21243.\nfor assign_labels in (\"kmeans\", \"discretize\", \"cluster_qr\"):\n t0 = time.time()\n labels = spectral_clustering(\n graph,\n n_clusters=(n_regions + n_regions_plus),\n eigen_tol=1e-7,\n assign_labels=assign_labels,\n random_state=42,\n )\n\n t1 = time.time()\n labels = labels.reshape(rescaled_coins.shape)\n plt.figure(figsize=(5, 5))\n plt.imshow(rescaled_coins, cmap=plt.cm.gray)\n\n plt.xticks(())\n plt.yticks(())\n title = \"Spectral clustering: %s, %.2fs\" % (assign_labels, (t1 - t0))\n print(title)\n plt.title(title)\n for l in range(n_regions):\n colors = [plt.cm.nipy_spectral((l + 4) / float(n_regions + 4))]\n plt.contour(labels == l, colors=colors)\n # To view individual segments as appear comment in plt.pause(0.5)\nplt.show()\n\n# TODO: After #21194 is merged and #21243 is fixed, check which eigen_solver\n# is the best and set eigen_solver='arpack', 'lobpcg', or 'amg' and eigen_tol\n# explicitly in this example."
37+
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.20"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
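
As a quick illustration of the graph construction and the three `assign_labels` strategies described in the notebook above, here is a minimal sketch (not part of the committed files; the toy image and variable names are only illustrative) that applies the same exponential affinity and runs `spectral_clustering` with each strategy on a tiny synthetic image:

    import numpy as np

    from sklearn.cluster import spectral_clustering
    from sklearn.feature_extraction import image

    # A tiny 20x20 image with a bright square in one corner.
    toy_image = np.zeros((20, 20))
    toy_image[:8, :8] = 1.0

    # Same construction as in the notebook: gradient graph + exponential affinity.
    graph = image.img_to_graph(toy_image)
    graph.data = np.exp(-10 * graph.data / (graph.data.std() + 1e-12)) + 1e-6

    # Compare the three label-assignment strategies on the same affinity graph.
    for assign_labels in ("kmeans", "discretize", "cluster_qr"):
        labels = spectral_clustering(
            graph, n_clusters=2, assign_labels=assign_labels, random_state=0
        )
        print(assign_labels, np.bincount(labels))

On such a clean toy image all three strategies should typically recover the bright square as one cluster; the coins image in the notebook is where differences in speed and partition quality become visible.
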
@@ -0,0 +1,89 @@
"""
=======================================================================
Plot the decision surface of decision trees trained on the iris dataset
=======================================================================

Plot the decision surface of a decision tree trained on pairs
of features of the iris dataset.

See :ref:`decision tree <tree>` for more information on the estimator.

For each pair of iris features, the decision tree learns decision
boundaries made of combinations of simple thresholding rules inferred from
the training samples.

We also show the tree structure of a model built on all of the features.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# %%
# First load the copy of the Iris dataset shipped with scikit-learn:
from sklearn.datasets import load_iris

iris = load_iris()


# %%
# Display the decision functions of trees trained on all pairs of features.
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_iris
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.tree import DecisionTreeClassifier

# Parameters
n_classes = 3
plot_colors = "ryb"
plot_step = 0.02


for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary
    ax = plt.subplot(2, 3, pairidx + 1)
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=plt.cm.RdYlBu,
        response_method="predict",
        ax=ax,
        xlabel=iris.feature_names[pair[0]],
        ylabel=iris.feature_names[pair[1]],
    )

    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(
            X[idx, 0],
            X[idx, 1],
            c=color,
            label=iris.target_names[i],
            edgecolor="black",
            s=15,
        )

plt.suptitle("Decision surface of decision trees trained on pairs of features")
plt.legend(loc="lower right", borderpad=0, handletextpad=0)
_ = plt.axis("tight")

# %%
# Display the structure of a single decision tree trained on all the features
# together.
from sklearn.tree import plot_tree

plt.figure()
clf = DecisionTreeClassifier().fit(iris.data, iris.target)
plot_tree(clf, filled=True)
plt.title("Decision tree trained on all the iris features")
plt.show()
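
The thresholding rules the tree learns can also be read as text, which complements the `plot_tree` figure in the example above. The short sketch below is not part of the committed example; the depth limit is only there to keep the printout small:

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier, export_text

    iris = load_iris()
    clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(iris.data, iris.target)

    # One indented line per split or leaf, using the original feature names.
    print(export_text(clf, feature_names=iris.feature_names))
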
@@ -0,0 +1,133 @@
"""
=============================================================
Receiver Operating Characteristic (ROC) with cross validation
=============================================================

This example presents how to estimate and visualize the variance of the Receiver
Operating Characteristic (ROC) metric using cross-validation.

ROC curves typically feature true positive rate (TPR) on the Y axis, and false
positive rate (FPR) on the X axis. This means that the top left corner of the
plot is the "ideal" point - a FPR of zero, and a TPR of one. This is not very
realistic, but it does mean that a larger Area Under the Curve (AUC) is usually
better. The "steepness" of ROC curves is also important, since it is ideal to
maximize the TPR while minimizing the FPR.

This example shows the ROC response of different datasets, created from K-fold
cross-validation. Taking all of these curves, it is possible to calculate the
mean AUC, and see the variance of the curve when the
training set is split into different subsets. This roughly shows how the
classifier output is affected by changes in the training data, and how different
the splits generated by K-fold cross-validation are from one another.

.. note::

    See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` for a
    complement of the present example explaining the averaging strategies to
    generalize the metrics for multiclass classifiers.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# %%
# Load and prepare data
# =====================
#
# We import the :ref:`iris_dataset` which contains 3 classes, each one
# corresponding to a type of iris plant. One class is linearly separable from
# the other 2; the latter are **not** linearly separable from each other.
#
# In the following we binarize the dataset by dropping the "virginica" class
# (`class_id=2`). This means that the "versicolor" class (`class_id=1`) is
# regarded as the positive class and "setosa" as the negative class
# (`class_id=0`).

import numpy as np

from sklearn.datasets import load_iris

iris = load_iris()
target_names = iris.target_names
X, y = iris.data, iris.target
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape

# %%
# We also add noisy features to make the problem harder.
random_state = np.random.RandomState(0)
X = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1)

# %%
# Classification and ROC analysis
# -------------------------------
#
# Here we run a :class:`~sklearn.svm.SVC` classifier with cross-validation and
# plot the ROC curves fold-wise. Notice that the baseline to define the chance
# level (dashed ROC curve) is a classifier that would always predict the most
# frequent class.

import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.metrics import RocCurveDisplay, auc
from sklearn.model_selection import StratifiedKFold

n_splits = 6
cv = StratifiedKFold(n_splits=n_splits)
classifier = svm.SVC(kernel="linear", probability=True, random_state=random_state)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X[test],
        y[test],
        name=f"ROC fold {fold}",
        alpha=0.3,
        lw=1,
        ax=ax,
        plot_chance_level=(fold == n_splits - 1),
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title=f"Mean ROC curve with variability\n(Positive label '{target_names[1]}')",
)
ax.legend(loc="lower right")
plt.show()
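
For a purely numerical summary of the same experiment, the per-fold AUC values can also be obtained with `cross_val_score` and the `"roc_auc"` scorer. The sketch below is not part of the committed example; it repeats the binarization and noisy-feature setup, and its mean differs slightly from the AUC of the interpolated mean curve above because here each fold's AUC is computed on its own curve and then averaged:

    import numpy as np

    from sklearn.datasets import load_iris
    from sklearn.model_selection import StratifiedKFold, cross_val_score
    from sklearn.svm import SVC

    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = X[y != 2], y[y != 2]  # same binarization as above

    rng = np.random.RandomState(0)
    X = np.concatenate([X, rng.randn(X.shape[0], 200 * X.shape[1])], axis=1)

    clf = SVC(kernel="linear", probability=True, random_state=rng)
    scores = cross_val_score(clf, X, y, cv=StratifiedKFold(n_splits=6), scoring="roc_auc")
    print("AUC per fold:", np.round(scores, 3))
    print("Mean AUC: %0.3f +/- %0.3f" % (scores.mean(), scores.std()))
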
@@ -0,0 +1,61 @@
"""
=================================================
Concatenating multiple feature extraction methods
=================================================

In many real-world examples, there are many ways to extract features from a
dataset. Often it is beneficial to combine several methods to obtain good
performance. This example shows how to use ``FeatureUnion`` to combine
features obtained by PCA and univariate selection.

Combining features using this transformer has the benefit that it allows
cross validation and grid searches over the whole process.

The combination used in this example is not particularly helpful on this
dataset and is only used to illustrate the usage of FeatureUnion.

"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import SVC

iris = load_iris()

X, y = iris.data, iris.target

# This dataset is way too high-dimensional. Better do PCA:
pca = PCA(n_components=2)

# Maybe some original features were good, too?
selection = SelectKBest(k=1)

# Build estimator from PCA and Univariate selection:

combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Use combined features to transform dataset:
X_features = combined_features.fit(X, y).transform(X)
print("Combined space has", X_features.shape[1], "features")

svm = SVC(kernel="linear")

# Do grid search over k, n_components and C:

pipeline = Pipeline([("features", combined_features), ("svm", svm)])

param_grid = dict(
    features__pca__n_components=[1, 2, 3],
    features__univ_select__k=[1, 2],
    svm__C=[0.1, 1, 10],
)

grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
grid_search.fit(X, y)
print(grid_search.best_estimator_)
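
Continuing in the same session as the script above, the winning combination of PCA components, `SelectKBest` k, and SVC C can be read from the usual `GridSearchCV` attributes; the double-underscore names used in `param_grid` are the same ones accepted by `set_params`. A brief sketch, not part of the committed example, which assumes `grid_search` and `pipeline` from above are still in scope:

    # Best parameter combination and its mean cross-validated accuracy.
    print(grid_search.best_params_)
    print("Best cross-validated accuracy: %0.3f" % grid_search.best_score_)

    # The same step__param naming also works for manual configuration.
    pipeline.set_params(features__pca__n_components=3, svm__C=1.0)
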
