Skip to content

Commit bc67dbf

Browse files
rolisz authored and larsmans committed
ENH Added custom kernels to SpectralClustering
Fixes scikit-learn#1791.
1 parent 87dcc8b commit bc67dbf

File tree

2 files changed

+75
-15
lines changed

2 files changed

+75
-15
lines changed

sklearn/cluster/spectral.py

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# -*- coding: utf-8 -*-
12
"""Algorithms for spectral clustering"""
23

34
# Author: Gael Varoquaux [email protected]
@@ -11,7 +12,7 @@
1112
from ..base import BaseEstimator, ClusterMixin
1213
from ..utils import check_random_state, as_float_array, deprecated
1314
from ..utils.extmath import norm
14-
from ..metrics.pairwise import rbf_kernel
15+
from ..metrics.pairwise import pairwise_kernels
1516
from ..neighbors import kneighbors_graph
1617
from ..manifold import spectral_embedding
1718
from .k_means_ import k_means
@@ -287,8 +288,9 @@ class SpectralClustering(BaseEstimator, ClusterMixin):
287288
If affinity is the adjacency matrix of a graph, this method can be
288289
used to find normalized graph cuts.
289290
290-
When calling ``fit``, an affinity matrix is constructed using either the
291-
Gaussian (aka RBF) kernel of the euclidean distanced ``d(X, X)``::
291+
When calling ``fit``, an affinity matrix is constructed using either a
292+
kernel function such as the Gaussian (aka RBF) kernel of the euclidean
293+
distance ``d(X, X)``::
292294
293295
np.exp(-gamma * d(X,X) ** 2)
294296
@@ -302,12 +304,27 @@ class SpectralClustering(BaseEstimator, ClusterMixin):
302304
n_clusters : integer, optional
303305
The dimension of the projection subspace.
304306
305-
affinity: string, 'nearest_neighbors', 'rbf' or 'precomputed'
307+
affinity : string, array-like or callable, default 'rbf'
308+
If a string, this may be one of 'nearest_neighbors', 'precomputed',
309+
'rbf' or one of the kernels supported by
310+
`sklearn.metrics.pairwise_kernels`.
311+
312+
Only kernels that produce similarity scores (non-negative values that
313+
increase with similarity) should be used. This property is not checked
314+
by the clustering algorithm.
306315
307316
gamma: float
308-
Scaling factor of Gaussian (rbf) affinity kernel. Ignored for
317+
Scaling factor of RBF, polynomial, exponential chi² and
318+
sigmoid affinity kernels. Ignored for
309319
``affinity='nearest_neighbors'``.
310320
321+
degree : float, default=3
322+
Degree of the polynomial kernel. Ignored by other kernels.
323+
324+
coef0 : float, default=1
325+
Zero coefficient for polynomial and sigmoid kernels.
326+
Ignored by other kernels.
327+
311328
n_neighbors: integer
312329
Number of neighbors to use when constructing the affinity matrix using
313330
the nearest neighbors method. Ignored for ``affinity='rbf'``.
@@ -338,6 +355,10 @@ class SpectralClustering(BaseEstimator, ClusterMixin):
338355
also be sensitive to initialization. Discretization is another approach
339356
which is less sensitive to random initialization.
340357
358+
kernel_params : dictionary of string to any, optional
359+
Parameters (keyword arguments) and values for a kernel passed as a
360+
callable object. Ignored by other kernels.
361+
341362
Attributes
342363
----------
343364
`affinity_matrix_` : array-like, shape (n_samples, n_samples)
@@ -381,7 +402,8 @@ class SpectralClustering(BaseEstimator, ClusterMixin):
381402

382403
def __init__(self, n_clusters=8, eigen_solver=None, random_state=None,
383404
n_init=10, gamma=1., affinity='rbf', n_neighbors=10, k=None,
384-
eigen_tol=0.0, assign_labels='kmeans', mode=None):
405+
eigen_tol=0.0, assign_labels='kmeans', mode=None,
406+
degree=3, coef0=1, kernel_params=None):
385407
if k is not None:
386408
warnings.warn("'k' was renamed to n_clusters and "
387409
"will be removed in 0.15.",
@@ -402,6 +424,9 @@ def __init__(self, n_clusters=8, eigen_solver=None, random_state=None,
402424
self.n_neighbors = n_neighbors
403425
self.eigen_tol = eigen_tol
404426
self.assign_labels = assign_labels
427+
self.degree = degree
428+
self.coef0 = coef0
429+
self.kernel_params = kernel_params
405430

406431
def fit(self, X):
407432
"""Creates an affinity matrix for X using the selected affinity,
@@ -419,18 +444,22 @@ def fit(self, X):
419444
" a custom affinity matrix, "
420445
"set ``affinity=precomputed``.")
421446

422-
if self.affinity == 'rbf':
423-
self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma)
424-
425-
elif self.affinity == 'nearest_neighbors':
447+
if self.affinity == 'nearest_neighbors':
426448
connectivity = kneighbors_graph(X, n_neighbors=self.n_neighbors)
427449
self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
428450
elif self.affinity == 'precomputed':
429451
self.affinity_matrix_ = X
430452
else:
431-
raise ValueError("Invalid 'affinity'. Expected 'rbf', "
432-
"'nearest_neighbors' or 'precomputed', got '%s'."
433-
% self.affinity)
453+
params = self.kernel_params
454+
if params is None:
455+
params = {}
456+
if not callable(self.affinity):
457+
params['gamma'] = self.gamma
458+
params['degree'] = self.degree
459+
params['coef0'] = self.coef0
460+
self.affinity_matrix_ = pairwise_kernels(X, metric=self.affinity,
461+
filter_params=True,
462+
**params)
434463

435464
random_state = check_random_state(self.random_state)
436465
self.labels_ = spectral_clustering(self.affinity_matrix_,
@@ -457,3 +486,5 @@ def mode(self):
457486
" 0.15.")
458487
def k(self):
459488
return self.n_clusters
489+
490+

sklearn/cluster/tests/test_spectral.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
"""Testing for Spectral Clustering methods"""
22

33
from sklearn.externals.six.moves import cPickle
4+
from sklearn.metrics.pairwise import kernel_metrics
5+
46
dumps, loads = cPickle.dumps, cPickle.loads
57

68
import numpy as np
79
from scipy import sparse
810

11+
from sklearn.utils import check_random_state
912
from sklearn.utils.testing import assert_equal
1013
from sklearn.utils.testing import assert_array_equal
1114
from sklearn.utils.testing import assert_raises
@@ -156,8 +159,8 @@ def test_affinities():
156159
# Note: in the following, random_state has been selected to have
157160
# a dataset that yields a stable eigen decomposition both when built
158161
# on OSX and Linux
159-
X, y = make_blobs(n_samples=40, random_state=2, centers=[[1, 1], [-1, -1]],
160-
cluster_std=0.4)
162+
X, y = make_blobs(n_samples=40, random_state=2,
163+
centers=[[1, 1], [-1, -1]], cluster_std=0.4)
161164
# nearest neighbors affinity
162165
sp = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
163166
random_state=0)
@@ -168,6 +171,32 @@ def test_affinities():
168171
labels = sp.fit(X).labels_
169172
assert_equal(adjusted_rand_score(y, labels), 1)
170173

174+
X = check_random_state(10).rand(10, 5) * 10
175+
176+
kernels_available = kernel_metrics()
177+
for kern in kernels_available:
178+
# Additive chi^2 gives a negative similarity matrix which
179+
# doesn't make sense for spectral clustering
180+
if kern != 'additive_chi2':
181+
sp = SpectralClustering(n_clusters=2, affinity=kern, random_state=0)
182+
labels = sp.fit(X).labels_
183+
print(labels)
184+
assert_equal((X.shape[0],), labels.shape)
185+
186+
sp = SpectralClustering(n_clusters=2, affinity=lambda x, y: 1,
187+
random_state=0)
188+
labels = sp.fit(X).labels_
189+
assert_equal((X.shape[0],), labels.shape)
190+
191+
def histogram(x, y, **kwargs):
192+
"""Histogram kernel implemented as a callable."""
193+
assert_equal(kwargs, {}) # no kernel_params that we didn't ask for
194+
return np.minimum(x, y).sum()
195+
196+
sp = SpectralClustering(n_clusters=2, affinity=histogram, random_state=0)
197+
labels = sp.fit(X).labels_
198+
assert_equal((X.shape[0],), labels.shape)
199+
171200
# raise error on unknown affinity
172201
sp = SpectralClustering(n_clusters=2, affinity='<unknown>')
173202
assert_raises(ValueError, sp.fit, X)

0 commit comments

Comments
 (0)