Commit 7d6a933

Pushing the docs to 1.6/ for branch: 1.6.X, commit 66b71f059c401f5a70e8c16755b7bb98e24e88ec
1 parent b0fffa2 commit 7d6a933

File tree

5,271 files changed: +2,027,385 / -0 lines changed


1.6/.buildinfo (+4 lines)
@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: d926c2f4528fe6bc7527c4e7380cc80a
tags: 645f666f9bcd5a90fca523b33c5a78b7
@@ -0,0 +1,61 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
"\n# Segmenting the picture of greek coins in regions\n\nThis example uses `spectral_clustering` on a graph created from\nvoxel-to-voxel difference on an image to break this image into multiple\npartly-homogeneous regions.\n\nThis procedure (spectral clustering on an image) is an efficient\napproximate solution for finding normalized graph cuts.\n\nThere are three options to assign labels:\n\n* 'kmeans' spectral clustering clusters samples in the embedding space\n using a kmeans algorithm\n* 'discrete' iteratively searches for the closest partition\n space to the embedding space of spectral clustering.\n* 'cluster_qr' assigns labels using the QR factorization with pivoting\n that directly determines the partition in the embedding space.\n"
8+
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
"# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport time\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom scipy.ndimage import gaussian_filter\nfrom skimage.data import coins\nfrom skimage.transform import rescale\n\nfrom sklearn.cluster import spectral_clustering\nfrom sklearn.feature_extraction import image\n\n# load the coins as a numpy array\norig_coins = coins()\n\n# Resize it to 20% of the original size to speed up the processing\n# Applying a Gaussian filter for smoothing prior to down-scaling\n# reduces aliasing artifacts.\nsmoothened_coins = gaussian_filter(orig_coins, sigma=2)\nrescaled_coins = rescale(smoothened_coins, 0.2, mode=\"reflect\", anti_aliasing=False)\n\n# Convert the image into a graph with the value of the gradient on the\n# edges.\ngraph = image.img_to_graph(rescaled_coins)\n\n# Take a decreasing function of the gradient: an exponential\n# The smaller beta is, the more independent the segmentation is of the\n# actual image. For beta=1, the segmentation is close to a voronoi\nbeta = 10\neps = 1e-6\ngraph.data = np.exp(-beta * graph.data / graph.data.std()) + eps\n\n# The number of segmented regions to display needs to be chosen manually.\n# The current version of 'spectral_clustering' does not support determining\n# the number of good quality clusters automatically.\nn_regions = 26"
19+
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Compute and visualize the resulting regions\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
"# Computing a few extra eigenvectors may speed up the eigen_solver.\n# The spectral clustering quality may also benefit from requesting\n# extra regions for segmentation.\nn_regions_plus = 3\n\n# Apply spectral clustering using the default eigen_solver='arpack'.\n# Any implemented solver can be used: eigen_solver='arpack', 'lobpcg', or 'amg'.\n# Choosing eigen_solver='amg' requires an extra package called 'pyamg'.\n# The quality of segmentation and the speed of calculations is mostly determined\n# by the choice of the solver and the value of the tolerance 'eigen_tol'.\n# TODO: varying eigen_tol seems to have no effect for 'lobpcg' and 'amg' #21243.\nfor assign_labels in (\"kmeans\", \"discretize\", \"cluster_qr\"):\n t0 = time.time()\n labels = spectral_clustering(\n graph,\n n_clusters=(n_regions + n_regions_plus),\n eigen_tol=1e-7,\n assign_labels=assign_labels,\n random_state=42,\n )\n\n t1 = time.time()\n labels = labels.reshape(rescaled_coins.shape)\n plt.figure(figsize=(5, 5))\n plt.imshow(rescaled_coins, cmap=plt.cm.gray)\n\n plt.xticks(())\n plt.yticks(())\n title = \"Spectral clustering: %s, %.2fs\" % (assign_labels, (t1 - t0))\n print(title)\n plt.title(title)\n for l in range(n_regions):\n colors = [plt.cm.nipy_spectral((l + 4) / float(n_regions + 4))]\n plt.contour(labels == l, colors=colors)\n # To view individual segments as appear comment in plt.pause(0.5)\nplt.show()\n\n# TODO: After #21194 is merged and #21243 is fixed, check which eigen_solver\n# is the best and set eigen_solver='arpack', 'lobpcg', or 'amg' and eigen_tol\n# explicitly in this example."
37+
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.20"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
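
As a quick illustration of the graph construction and the three `assign_labels` strategies described in the notebook above, here is a minimal sketch (not part of the committed files; the toy image and variable names are only illustrative) that applies the same exponential affinity and runs `spectral_clustering` with each strategy on a tiny synthetic image:

    import numpy as np

    from sklearn.cluster import spectral_clustering
    from sklearn.feature_extraction import image

    # A tiny 20x20 image with a bright square in one corner.
    toy_image = np.zeros((20, 20))
    toy_image[:8, :8] = 1.0

    # Same construction as in the notebook: gradient graph + exponential affinity.
    graph = image.img_to_graph(toy_image)
    graph.data = np.exp(-10 * graph.data / (graph.data.std() + 1e-12)) + 1e-6

    # Compare the three label-assignment strategies on the same affinity graph.
    for assign_labels in ("kmeans", "discretize", "cluster_qr"):
        labels = spectral_clustering(
            graph, n_clusters=2, assign_labels=assign_labels, random_state=0
        )
        print(assign_labels, np.bincount(labels))

On such a clean toy image all three strategies should typically recover the bright square as one cluster; the coins image in the notebook is where differences in speed and partition quality become visible.
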
@@ -0,0 +1,89 @@
"""
=======================================================================
Plot the decision surface of decision trees trained on the iris dataset
=======================================================================

Plot the decision surface of a decision tree trained on pairs
of features of the iris dataset.

See :ref:`decision tree <tree>` for more information on the estimator.

For each pair of iris features, the decision tree learns decision
boundaries made of combinations of simple thresholding rules inferred from
the training samples.

We also show the tree structure of a model built on all of the features.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# %%
# First load the copy of the Iris dataset shipped with scikit-learn:
from sklearn.datasets import load_iris

iris = load_iris()


# %%
# Display the decision functions of trees trained on all pairs of features.
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_iris
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.tree import DecisionTreeClassifier

# Parameters
n_classes = 3
plot_colors = "ryb"
plot_step = 0.02


for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary
    ax = plt.subplot(2, 3, pairidx + 1)
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=plt.cm.RdYlBu,
        response_method="predict",
        ax=ax,
        xlabel=iris.feature_names[pair[0]],
        ylabel=iris.feature_names[pair[1]],
    )

    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(
            X[idx, 0],
            X[idx, 1],
            c=color,
            label=iris.target_names[i],
            edgecolor="black",
            s=15,
        )

plt.suptitle("Decision surface of decision trees trained on pairs of features")
plt.legend(loc="lower right", borderpad=0, handletextpad=0)
_ = plt.axis("tight")

# %%
# Display the structure of a single decision tree trained on all the features
# together.
from sklearn.tree import plot_tree

plt.figure()
clf = DecisionTreeClassifier().fit(iris.data, iris.target)
plot_tree(clf, filled=True)
plt.title("Decision tree trained on all the iris features")
plt.show()
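
The thresholding rules the tree learns can also be read as text, which complements the `plot_tree` figure in the example above. The short sketch below is not part of the committed example; the depth limit is only there to keep the printout small:

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier, export_text

    iris = load_iris()
    clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(iris.data, iris.target)

    # One indented line per split or leaf, using the original feature names.
    print(export_text(clf, feature_names=iris.feature_names))
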
@@ -0,0 +1,133 @@
"""
=============================================================
Receiver Operating Characteristic (ROC) with cross validation
=============================================================

This example presents how to estimate and visualize the variance of the Receiver
Operating Characteristic (ROC) metric using cross-validation.

ROC curves typically feature true positive rate (TPR) on the Y axis, and false
positive rate (FPR) on the X axis. This means that the top left corner of the
plot is the "ideal" point - a FPR of zero, and a TPR of one. This is not very
realistic, but it does mean that a larger Area Under the Curve (AUC) is usually
better. The "steepness" of ROC curves is also important, since it is ideal to
maximize the TPR while minimizing the FPR.

This example shows the ROC response of different datasets, created from K-fold
cross-validation. Taking all of these curves, it is possible to calculate the
mean AUC, and see the variance of the curve when the
training set is split into different subsets. This roughly shows how the
classifier output is affected by changes in the training data, and how different
the splits generated by K-fold cross-validation are from one another.

.. note::

    See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` for a
    complement of the present example explaining the averaging strategies to
    generalize the metrics for multiclass classifiers.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

# %%
# Load and prepare data
# =====================
#
# We import the :ref:`iris_dataset` which contains 3 classes, each one
# corresponding to a type of iris plant. One class is linearly separable from
# the other 2; the latter are **not** linearly separable from each other.
#
# In the following we binarize the dataset by dropping the "virginica" class
# (`class_id=2`). This means that the "versicolor" class (`class_id=1`) is
# regarded as the positive class and "setosa" as the negative class
# (`class_id=0`).

import numpy as np

from sklearn.datasets import load_iris

iris = load_iris()
target_names = iris.target_names
X, y = iris.data, iris.target
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape

# %%
# We also add noisy features to make the problem harder.
random_state = np.random.RandomState(0)
X = np.concatenate([X, random_state.randn(n_samples, 200 * n_features)], axis=1)

# %%
# Classification and ROC analysis
# -------------------------------
#
# Here we run a :class:`~sklearn.svm.SVC` classifier with cross-validation and
# plot the ROC curves fold-wise. Notice that the baseline to define the chance
# level (dashed ROC curve) is a classifier that would always predict the most
# frequent class.

import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.metrics import RocCurveDisplay, auc
from sklearn.model_selection import StratifiedKFold

n_splits = 6
cv = StratifiedKFold(n_splits=n_splits)
classifier = svm.SVC(kernel="linear", probability=True, random_state=random_state)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X[test],
        y[test],
        name=f"ROC fold {fold}",
        alpha=0.3,
        lw=1,
        ax=ax,
        plot_chance_level=(fold == n_splits - 1),
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title=f"Mean ROC curve with variability\n(Positive label '{target_names[1]}')",
)
ax.legend(loc="lower right")
plt.show()
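
For a purely numerical summary of the same experiment, the per-fold AUC values can also be obtained with `cross_val_score` and the `"roc_auc"` scorer. The sketch below is not part of the committed example; it repeats the binarization and noisy-feature setup, and its mean differs slightly from the AUC of the interpolated mean curve above because here each fold's AUC is computed on its own curve and then averaged:

    import numpy as np

    from sklearn.datasets import load_iris
    from sklearn.model_selection import StratifiedKFold, cross_val_score
    from sklearn.svm import SVC

    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = X[y != 2], y[y != 2]  # same binarization as above

    rng = np.random.RandomState(0)
    X = np.concatenate([X, rng.randn(X.shape[0], 200 * X.shape[1])], axis=1)

    clf = SVC(kernel="linear", probability=True, random_state=rng)
    scores = cross_val_score(clf, X, y, cv=StratifiedKFold(n_splits=6), scoring="roc_auc")
    print("AUC per fold:", np.round(scores, 3))
    print("Mean AUC: %0.3f +/- %0.3f" % (scores.mean(), scores.std()))
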
@@ -0,0 +1,61 @@
"""
=================================================
Concatenating multiple feature extraction methods
=================================================

In many real-world examples, there are many ways to extract features from a
dataset. Often it is beneficial to combine several methods to obtain good
performance. This example shows how to use ``FeatureUnion`` to combine
features obtained by PCA and univariate selection.

Combining features using this transformer has the benefit that it allows
cross validation and grid searches over the whole process.

The combination used in this example is not particularly helpful on this
dataset and is only used to illustrate the usage of FeatureUnion.

"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import SVC

iris = load_iris()

X, y = iris.data, iris.target

# This dataset is way too high-dimensional. Better do PCA:
pca = PCA(n_components=2)

# Maybe some original features were good, too?
selection = SelectKBest(k=1)

# Build estimator from PCA and Univariate selection:

combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Use combined features to transform dataset:
X_features = combined_features.fit(X, y).transform(X)
print("Combined space has", X_features.shape[1], "features")

svm = SVC(kernel="linear")

# Do grid search over k, n_components and C:

pipeline = Pipeline([("features", combined_features), ("svm", svm)])

param_grid = dict(
    features__pca__n_components=[1, 2, 3],
    features__univ_select__k=[1, 2],
    svm__C=[0.1, 1, 10],
)

grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
grid_search.fit(X, y)
print(grid_search.best_estimator_)
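
Continuing in the same session as the script above, the winning combination of PCA components, `SelectKBest` k, and SVC C can be read from the usual `GridSearchCV` attributes; the double-underscore names used in `param_grid` are the same ones accepted by `set_params`. A brief sketch, not part of the committed example, which assumes `grid_search` and `pipeline` from above are still in scope:

    # Best parameter combination and its mean cross-validated accuracy.
    print(grid_search.best_params_)
    print("Best cross-validated accuracy: %0.3f" % grid_search.best_score_)

    # The same step__param naming also works for manual configuration.
    pipeline.set_params(features__pca__n_components=3, svm__C=1.0)
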
