Added test, renamed calinski_index to pseudo_F and updated a few docs

scampion · scampion · commit 690f47dc3526 · 2011-02-04T12:02:59.000+01:00
diff --git a/examples/cluster/plot_som_colormap.py b/examples/cluster/plot_som_colormap.py
@@ -3,7 +3,8 @@
 A demo of SelfOrganisingMap with colored neurons
 ===========================================================
 
-XXX : add description of example.
+Example of SOM clustering using 3-dimensional vectors (RGB)
+with 8 colors (black, white, red, green, blue, yellow, cyan, magenta)
 
 """
 print __doc__
diff --git a/examples/cluster/som_digits.py b/examples/cluster/som_digits.py
@@ -3,8 +3,9 @@
 A demo of Self-Organising Map and KMeans on the handwritten digits data
 =======================================================================
 
-XXX : Should add text to describe what the example and what to expect
-from the output. Would it be possible to plot something?
+Comparison of various SOM and KMeans clusterings of the handwritten digits
+data, scored with the pseudo_F index.
+
 """
 from __future__ import division
 print __doc__
@@ -14,35 +15,30 @@
 
 from scikits.learn.cluster import KMeans
 from scikits.learn.cluster import SelfOrganizingMap
-from scikits.learn.cluster import calinski_index
-
+from scikits.learn.cluster import pseudo_F
 from scikits.learn.datasets import load_digits
 from scikits.learn.preprocessing import scale
-
-
-def display(labels, digits, n_clusters):
-    # XXX : n_clusters unused
-    r = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}
-    for i, v in enumerate(labels):
-        r[digits.target[i]].append(v)
-
-    for k, v in r.items():
-        s = set(v)
-        print 'target %i | nb cluster %i |' % (k, len(s)), s
-
+from scikits.learn.metrics import confusion_matrix
+    
 np.random.seed(42)
 
+################################################################################
+# Load dataset 
+
 digits = load_digits()
 data = scale(digits.data)
-
 n_samples, n_features = data.shape
 n_digits = len(np.unique(digits.target))
 
-print "n_digits: %d" % n_digits
-print "n_features: %d" % n_features
-print "n_samples: %d" % n_samples
+print "Digits dataset"
+print "n_digits   : %d" % n_digits
+print "n_features : %d" % n_features
+print "n_samples  : %d" % n_samples
 print
 
+################################################################################
+# Digits dataset clustering using Self-Organizing Map
+
 print "Self-Organizing Map "
 t0 = time()
 grid_width = 4
@@ -52,18 +48,19 @@ def display(labels, digits, n_clusters):
 print "done in %0.3fs" % (time() - t0)
 print
 
-display(som.labels_, digits, grid_width**2)
-C = calinski_index(data, som.labels_, som.neurons_)
-print 'calinski index %0.2f | %0.2f%%' % (C, 100 * (C / (1 + C)))
+F = pseudo_F(data, som.labels_, som.neurons_)
+print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F)))
 print
 
+################################################################################
+# Digits dataset clustering using Kmeans
+
 print "KMeans "
 t0 = time()
 km = KMeans(init='k-means++', k=grid_width**2, n_init=10)
 km.fit(data)
 print "done in %0.3fs" % (time() - t0)
 print
 
-display(km.labels_, digits, n_digits)
-C = calinski_index(data, km.labels_, km.cluster_centers_)
-print 'calinski index %0.2f | %0.2f%%' % (C, 100 * (C / (1 + C)))
+F = pseudo_F(data, km.labels_, km.cluster_centers_)
+print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F)))
diff --git a/scikits/learn/cluster/__init__.py b/scikits/learn/cluster/__init__.py
@@ -10,11 +10,22 @@
 
 import numpy as np
 
-def calinski_index(X,labels,centroids):
+def pseudo_F(X, labels, centroids):
+    '''
+    The pseudo F statistic:
+
+    pseudo F = [(T - PG) / (G - 1)] / [PG / (n - G)]
+
+    The pseudo F statistic was suggested by Calinski and Harabasz (1974)
+
+    Calinski, T. and J. Harabasz. 1974. 
+    A dendrite method for cluster analysis. Commun. Stat. 3: 1-27.
+    http://dx.doi.org/10.1080/03610927408827101
+    '''
     mean = np.mean(X,axis=0) 
     B = np.sum([ (c - mean)**2 for c in centroids])
     W = np.sum([ (x-centroids[labels[i]])**2 
-                 for i,x in enumerate(X)])
+                 for i, x in enumerate(X)])
     c = len(centroids)
     n = len(X)
     return (B /(c-1))/(W/ (n-c))
diff --git a/scikits/learn/cluster/som_.py b/scikits/learn/cluster/som_.py
@@ -39,8 +39,8 @@ class SelfOrganizingMap(BaseEstimator):
 
         'random': randomly points choosed
 
-        'matrix': interpret the w parameter as a w by M array
-         of initial centroids.
+        'matrix': interpret the size parameter as a size by size by M
+         array of initial neurons.
 
     Methods
     -------
@@ -81,7 +81,6 @@ def fit(self, X, **params):
         if self.init == 'random':
             self.neurons_ = np.random.rand(self.size, self.size, dim)
         elif self.init == 'matrix':
-            # XXX : untested
             assert len(self.size.shape) == 3
             self.neurons_ = self.size
             self.size = self.neurons_.shape[0]
diff --git a/scikits/learn/cluster/tests/test_som.py b/scikits/learn/cluster/tests/test_som.py
@@ -9,7 +9,8 @@
 from .common import generate_clustered_data
 
 n_clusters = 4
-X = generate_clustered_data(n_clusters=n_clusters, std=.1)
+n_features = 2
+X = generate_clustered_data(n_clusters=n_clusters, n_features=2, std=.1)
 
 
 def test_som():
@@ -23,3 +24,19 @@ def test_som():
     assert_equal(np.unique(labels[20:40]).shape[0], 1)
     assert_equal(np.unique(labels[40:60]).shape[0], 1)
     assert_equal(np.unique(labels[60:]).shape[0], 1)
+
+def test_som_init_matrix():
+    np.random.seed(1)
+    random_ind = np.random.randint(0, X.shape[0], size=n_clusters)
+    init_map = X[random_ind].reshape(2,2,n_features)
+
+    som = SelfOrganizingMap(size=init_map, init='matrix',
+                            n_iterations=2000, learning_rate=0.1)
+
+    som.fit(X)
+    labels = som.labels_
+    assert_equal(np.unique(labels).shape[0], 4)
+    assert_equal(np.unique(labels[:20]).shape[0], 1)
+    assert_equal(np.unique(labels[20:40]).shape[0], 1)
+    assert_equal(np.unique(labels[40:60]).shape[0], 1)
+    assert_equal(np.unique(labels[60:]).shape[0], 1)