Skip to content

Commit 690f47d

Browse files
committed
Added test, renamed calinski_index to pseudo_F and updated a few docs
1 parent e4453b1 commit 690f47d

File tree

5 files changed

+58
-33
lines changed

5 files changed

+58
-33
lines changed

examples/cluster/plot_som_colormap.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
A demo of SelfOrganisingMap with colored neurons
44
===========================================================
55
6-
XXX : add description of example.
6+
Example of SOM clustering using 3-dimensional vectors (RGB)
7+
with 8 colors (black, white, red, green, blue, yellow, cyan, magenta)
78
89
"""
910
print __doc__

examples/cluster/som_digits.py

Lines changed: 23 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
A demo of Self-Organising Map and KMeans on the handwritten digits data
44
=======================================================================
55
6-
XXX : Should add text to describe what the example and what to expect
7-
from the output. Would it be possible to plot something?
6+
Comparison of SOM and KMeans clustering on the handwritten digits data
7+
with the pseudo_F index
8+
89
"""
910
from __future__ import division
1011
print __doc__
@@ -14,35 +15,30 @@
1415

1516
from scikits.learn.cluster import KMeans
1617
from scikits.learn.cluster import SelfOrganizingMap
17-
from scikits.learn.cluster import calinski_index
18-
18+
from scikits.learn.cluster import pseudo_F
1919
from scikits.learn.datasets import load_digits
2020
from scikits.learn.preprocessing import scale
21-
22-
23-
def display(labels, digits, n_clusters):
24-
# XXX : n_clusters unused
25-
r = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}
26-
for i, v in enumerate(labels):
27-
r[digits.target[i]].append(v)
28-
29-
for k, v in r.items():
30-
s = set(v)
31-
print 'target %i | nb cluster %i |' % (k, len(s)), s
32-
21+
from scikits.learn.metrics import confusion_matrix
22+
3323
np.random.seed(42)
3424

25+
################################################################################
26+
# Load dataset
27+
3528
digits = load_digits()
3629
data = scale(digits.data)
37-
3830
n_samples, n_features = data.shape
3931
n_digits = len(np.unique(digits.target))
4032

41-
print "n_digits: %d" % n_digits
42-
print "n_features: %d" % n_features
43-
print "n_samples: %d" % n_samples
33+
print "Digits dataset"
34+
print "n_digits : %d" % n_digits
35+
print "n_features : %d" % n_features
36+
print "n_samples : %d" % n_samples
4437
print
4538

39+
################################################################################
40+
# Digits dataset clustering using Self-Organizing Map
41+
4642
print "Self-Organizing Map "
4743
t0 = time()
4844
grid_width = 4
@@ -52,18 +48,19 @@ def display(labels, digits, n_clusters):
5248
print "done in %0.3fs" % (time() - t0)
5349
print
5450

55-
display(som.labels_, digits, grid_width**2)
56-
C = calinski_index(data, som.labels_, som.neurons_)
57-
print 'calinski index %0.2f | %0.2f%%' % (C, 100 * (C / (1 + C)))
51+
F = pseudo_F(data, som.labels_, som.neurons_)
52+
print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F)))
5853
print
5954

55+
################################################################################
56+
# Digits dataset clustering using Kmeans
57+
6058
print "KMeans "
6159
t0 = time()
6260
km = KMeans(init='k-means++', k=grid_width**2, n_init=10)
6361
km.fit(data)
6462
print "done in %0.3fs" % (time() - t0)
6563
print
6664

67-
display(km.labels_, digits, n_digits)
68-
C = calinski_index(data, km.labels_, km.cluster_centers_)
69-
print 'calinski index %0.2f | %0.2f%%' % (C, 100 * (C / (1 + C)))
65+
F = pseudo_F(data, km.labels_, km.cluster_centers_)
66+
print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F)))

scikits/learn/cluster/__init__.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,22 @@
1010

1111
import numpy as np
1212

13-
def calinski_index(X,labels,centroids):
13+
def pseudo_F(X, labels, centroids):
14+
'''
15+
The pseudo F statistic :
16+
17+
pseudo F = [(T - PG) / (G - 1)] / [PG / (n - G)]
18+
19+
The pseudo F statistic was suggested by Calinski and Harabasz (1974)
20+
21+
Calinski, T. and J. Harabasz. 1974.
22+
A dendrite method for cluster analysis. Commun. Stat. 3: 1-27.
23+
http://dx.doi.org/10.1080/03610927408827101
24+
'''
1425
mean = np.mean(X,axis=0)
1526
B = np.sum([ (c - mean)**2 for c in centroids])
1627
W = np.sum([ (x-centroids[labels[i]])**2
17-
for i,x in enumerate(X)])
28+
for i, x in enumerate(X)])
1829
c = len(centroids)
1930
n = len(X)
2031
return (B /(c-1))/(W/ (n-c))

scikits/learn/cluster/som_.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ class SelfOrganizingMap(BaseEstimator):
3939
4040
'random': randomly chosen points
4141
42-
'matrix': interpret the w parameter as a w by M array
43-
of initial centroids.
42+
'matrix': interpret the size parameter as a size by size by M array
43+
of initial neurons.
4444
4545
Methods
4646
-------
@@ -81,7 +81,6 @@ def fit(self, X, **params):
8181
if self.init == 'random':
8282
self.neurons_ = np.random.rand(self.size, self.size, dim)
8383
elif self.init == 'matrix':
84-
# XXX : untested
8584
assert len(self.size.shape) == 3
8685
self.neurons_ = self.size
8786
self.size = self.neurons_.shape[0]

scikits/learn/cluster/tests/test_som.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
from .common import generate_clustered_data
1010

1111
n_clusters = 4
12-
X = generate_clustered_data(n_clusters=n_clusters, std=.1)
12+
n_features = 2
13+
X = generate_clustered_data(n_clusters=n_clusters, n_features=2, std=.1)
1314

1415

1516
def test_som():
@@ -23,3 +24,19 @@ def test_som():
2324
assert_equal(np.unique(labels[20:40]).shape[0], 1)
2425
assert_equal(np.unique(labels[40:60]).shape[0], 1)
2526
assert_equal(np.unique(labels[60:]).shape[0], 1)
27+
28+
def test_som_init_matrix():
29+
np.random.seed(1)
30+
random_ind = np.random.randint(0, X.shape[0], size=n_clusters)
31+
init_map = X[random_ind].reshape(2,2,n_features)
32+
33+
som = SelfOrganizingMap(size=init_map, init='matrix',
34+
n_iterations=2000, learning_rate=0.1)
35+
36+
som.fit(X)
37+
labels = som.labels_
38+
assert_equal(np.unique(labels).shape[0], 4)
39+
assert_equal(np.unique(labels[:20]).shape[0], 1)
40+
assert_equal(np.unique(labels[20:40]).shape[0], 1)
41+
assert_equal(np.unique(labels[40:60]).shape[0], 1)
42+
assert_equal(np.unique(labels[60:]).shape[0], 1)

0 commit comments

Comments
 (0)