Skip to content

Commit 55d09ce

Browse files
committed
Pushing the docs to dev/ for branch: master, commit 6ee390d3fec73a6a57bd1761ff6e69db2e722caa
1 parent 76e4b47 commit 55d09ce

File tree

1,188 files changed

+3686
-3680
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

1,188 files changed

+3686
-3680
lines changed
Binary file not shown.

dev/_downloads/36b58500501fbf3f06587ee0039d1985/plot_johnson_lindenstrauss_bound.ipynb

+3-3
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@
6969
"cell_type": "markdown",
7070
"metadata": {},
7171
"source": [
72-
"Empirical validation\n====================\n\nWe validate the above bounds on the digits dataset or on the 20 newsgroups\ntext document (TF-IDF word frequencies) dataset:\n\n- for the digits dataset, some 8x8 gray level pixels data for 500\n handwritten digits pictures are randomly projected to spaces for various\n larger number of dimensions ``n_components``.\n\n- for the 20 newsgroups dataset some 500 documents with 100k\n features in total are projected using a sparse random matrix to smaller\n euclidean spaces with various values for the target number of dimensions\n ``n_components``.\n\nThe default dataset is the digits dataset. To run the example on the twenty\nnewsgroups dataset, pass the --twenty-newsgroups command line argument to\nthis script.\n\n"
72+
"Empirical validation\n====================\n\nWe validate the above bounds on the 20 newsgroups text document\n(TF-IDF word frequencies) dataset or on the digits dataset:\n\n- for the 20 newsgroups dataset some 500 documents with 100k\n features in total are projected using a sparse random matrix to smaller\n euclidean spaces with various values for the target number of dimensions\n ``n_components``.\n\n- for the digits dataset, some 8x8 gray level pixels data for 500\n handwritten digits pictures are randomly projected to spaces for various\n larger number of dimensions ``n_components``.\n\nThe default dataset is the 20 newsgroups dataset. To run the example on the\ndigits dataset, pass the ``--use-digits-dataset`` command line argument to\nthis script.\n\n"
7373
]
7474
},
7575
{
@@ -80,7 +80,7 @@
8080
},
8181
"outputs": [],
8282
"source": [
83-
"if '--twenty-newsgroups' in sys.argv:\n # Need an internet connection hence not enabled by default\n data = fetch_20newsgroups_vectorized().data[:500]\nelse:\n data = load_digits().data[:500]"
83+
"if '--use-digits-dataset' in sys.argv:\n data = load_digits().data[:500]\nelse:\n data = fetch_20newsgroups_vectorized().data[:500]"
8484
]
8585
},
8686
{
@@ -98,7 +98,7 @@
9898
},
9999
"outputs": [],
100100
"source": [
101-
"n_samples, n_features = data.shape\nprint(\"Embedding %d samples with dim %d using various random projections\"\n % (n_samples, n_features))\n\nn_components_range = np.array([300, 1000, 10000])\ndists = euclidean_distances(data, squared=True).ravel()\n\n# select only non-identical samples pairs\nnonzero = dists != 0\ndists = dists[nonzero]\n\nfor n_components in n_components_range:\n t0 = time()\n rp = SparseRandomProjection(n_components=n_components)\n projected_data = rp.fit_transform(data)\n print(\"Projected %d samples from %d to %d in %0.3fs\"\n % (n_samples, n_features, n_components, time() - t0))\n if hasattr(rp, 'components_'):\n n_bytes = rp.components_.data.nbytes\n n_bytes += rp.components_.indices.nbytes\n print(\"Random matrix with size: %0.3fMB\" % (n_bytes / 1e6))\n\n projected_dists = euclidean_distances(\n projected_data, squared=True).ravel()[nonzero]\n\n plt.figure()\n plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)\n plt.xlabel(\"Pairwise squared distances in original space\")\n plt.ylabel(\"Pairwise squared distances in projected space\")\n plt.title(\"Pairwise distances distribution for n_components=%d\" %\n n_components)\n cb = plt.colorbar()\n cb.set_label('Sample pairs counts')\n\n rates = projected_dists / dists\n print(\"Mean distances rate: %0.2f (%0.2f)\"\n % (np.mean(rates), np.std(rates)))\n\n plt.figure()\n plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', **density_param)\n plt.xlabel(\"Squared distances rate: projected / original\")\n plt.ylabel(\"Distribution of samples pairs\")\n plt.title(\"Histogram of pairwise distance rates for n_components=%d\" %\n n_components)\n\n # TODO: compute the expected value of eps and add them to the previous plot\n # as vertical lines / region\n\nplt.show()"
101+
"n_samples, n_features = data.shape\nprint(\"Embedding %d samples with dim %d using various random projections\"\n % (n_samples, n_features))\n\nn_components_range = np.array([300, 1000, 10000])\ndists = euclidean_distances(data, squared=True).ravel()\n\n# select only non-identical samples pairs\nnonzero = dists != 0\ndists = dists[nonzero]\n\nfor n_components in n_components_range:\n t0 = time()\n rp = SparseRandomProjection(n_components=n_components)\n projected_data = rp.fit_transform(data)\n print(\"Projected %d samples from %d to %d in %0.3fs\"\n % (n_samples, n_features, n_components, time() - t0))\n if hasattr(rp, 'components_'):\n n_bytes = rp.components_.data.nbytes\n n_bytes += rp.components_.indices.nbytes\n print(\"Random matrix with size: %0.3fMB\" % (n_bytes / 1e6))\n\n projected_dists = euclidean_distances(\n projected_data, squared=True).ravel()[nonzero]\n\n plt.figure()\n min_dist = min(projected_dists.min(), dists.min())\n max_dist = max(projected_dists.max(), dists.max())\n plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu,\n extent=[min_dist, max_dist, min_dist, max_dist])\n plt.xlabel(\"Pairwise squared distances in original space\")\n plt.ylabel(\"Pairwise squared distances in projected space\")\n plt.title(\"Pairwise distances distribution for n_components=%d\" %\n n_components)\n cb = plt.colorbar()\n cb.set_label('Sample pairs counts')\n\n rates = projected_dists / dists\n print(\"Mean distances rate: %0.2f (%0.2f)\"\n % (np.mean(rates), np.std(rates)))\n\n plt.figure()\n plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', **density_param)\n plt.xlabel(\"Squared distances rate: projected / original\")\n plt.ylabel(\"Distribution of samples pairs\")\n plt.title(\"Histogram of pairwise distance rates for n_components=%d\" %\n n_components)\n\n # TODO: compute the expected value of eps and add them to the previous plot\n # as vertical lines / region\n\nplt.show()"
102102
]
103103
},
104104
{

dev/_downloads/9806f0059c4cc6c99c54414e573e6615/plot_johnson_lindenstrauss_bound.py

+15-13
Original file line numberDiff line numberDiff line change
@@ -102,27 +102,26 @@
102102
# Empirical validation
103103
# ====================
104104
#
105-
# We validate the above bounds on the digits dataset or on the 20 newsgroups
106-
# text document (TF-IDF word frequencies) dataset:
107-
#
108-
# - for the digits dataset, some 8x8 gray level pixels data for 500
109-
# handwritten digits pictures are randomly projected to spaces for various
110-
# larger number of dimensions ``n_components``.
105+
# We validate the above bounds on the 20 newsgroups text document
106+
# (TF-IDF word frequencies) dataset or on the digits dataset:
111107
#
112108
# - for the 20 newsgroups dataset some 500 documents with 100k
113109
# features in total are projected using a sparse random matrix to smaller
114110
# euclidean spaces with various values for the target number of dimensions
115111
# ``n_components``.
116112
#
117-
# The default dataset is the digits dataset. To run the example on the twenty
118-
# newsgroups dataset, pass the --twenty-newsgroups command line argument to
113+
# - for the digits dataset, some 8x8 gray level pixels data for 500
114+
# handwritten digits pictures are randomly projected to spaces for various
115+
# larger number of dimensions ``n_components``.
116+
#
117+
# The default dataset is the 20 newsgroups dataset. To run the example on the
118+
# digits dataset, pass the ``--use-digits-dataset`` command line argument to
119119
# this script.
120120

121-
if '--twenty-newsgroups' in sys.argv:
122-
# Need an internet connection hence not enabled by default
123-
data = fetch_20newsgroups_vectorized().data[:500]
124-
else:
121+
if '--use-digits-dataset' in sys.argv:
125122
data = load_digits().data[:500]
123+
else:
124+
data = fetch_20newsgroups_vectorized().data[:500]
126125

127126
##########################################################
128127
# For each value of ``n_components``, we plot:
@@ -158,7 +157,10 @@
158157
projected_data, squared=True).ravel()[nonzero]
159158

160159
plt.figure()
161-
plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
160+
min_dist = min(projected_dists.min(), dists.min())
161+
max_dist = max(projected_dists.max(), dists.max())
162+
plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu,
163+
extent=[min_dist, max_dist, min_dist, max_dist])
162164
plt.xlabel("Pairwise squared distances in original space")
163165
plt.ylabel("Pairwise squared distances in projected space")
164166
plt.title("Pairwise distances distribution for n_components=%d" %
Binary file not shown.

dev/_downloads/scikit-learn-docs.pdf

41.3 KB
Binary file not shown.

dev/_images/iris.png

0 Bytes
-59 Bytes
973 Bytes
973 Bytes
-66 Bytes
376 Bytes
376 Bytes
271 Bytes
-16 Bytes
-157 Bytes
-157 Bytes
490 Bytes
112 Bytes
-186 Bytes
-328 Bytes
-117 Bytes
-778 Bytes
-778 Bytes
-105 Bytes
304 Bytes
304 Bytes
-89 Bytes
-89 Bytes
-56 Bytes
-56 Bytes
-109 Bytes
-109 Bytes
42 Bytes
42 Bytes
-135 Bytes
40 Bytes
29 Bytes
29 Bytes
271 Bytes
-198 Bytes
-198 Bytes
-290 Bytes
-981 Bytes
-75 Bytes
-75 Bytes
92 Bytes
9 Bytes

dev/_sources/auto_examples/applications/plot_face_recognition.rst.txt

+13-13

dev/_sources/auto_examples/applications/plot_model_complexity_influence.rst.txt

+16-16

0 commit comments

Comments
 (0)