33A demo of Self-Organizing Map and KMeans on the handwritten digits data
44=======================================================================
55
6- XXX : Should add text to describe what the example and what to expect
7- from the output. Would it be possible to plot something?
6+ Comparing SOM and KMeans clusterings of the handwritten digits data
7+ using the pseudo_F index
8+
89"""
910from __future__ import division
1011print __doc__
1415
1516from scikits .learn .cluster import KMeans
1617from scikits .learn .cluster import SelfOrganizingMap
17- from scikits .learn .cluster import calinski_index
18-
18+ from scikits .learn .cluster import pseudo_F
1919from scikits .learn .datasets import load_digits
2020from scikits .learn .preprocessing import scale
21-
22-
23- def display (labels , digits , n_clusters ):
24- # XXX : n_clusters unused
25- r = {0 : [], 1 : [], 2 : [], 3 : [], 4 : [], 5 : [], 6 : [], 7 : [], 8 : [], 9 : []}
26- for i , v in enumerate (labels ):
27- r [digits .target [i ]].append (v )
28-
29- for k , v in r .items ():
30- s = set (v )
31- print 'target %i | nb cluster %i |' % (k , len (s )), s
32-
21+ from scikits .learn .metrics import confusion_matrix
22+
3323np .random .seed (42 )
3424
25+ ################################################################################
26+ # Load dataset
27+
3528digits = load_digits ()
3629data = scale (digits .data )
37-
3830n_samples , n_features = data .shape
3931n_digits = len (np .unique (digits .target ))
4032
41- print "n_digits: %d" % n_digits
42- print "n_features: %d" % n_features
43- print "n_samples: %d" % n_samples
33+ print "Digits dataset"
34+ print "n_digits : %d" % n_digits
35+ print "n_features : %d" % n_features
36+ print "n_samples : %d" % n_samples
4437print
4538
39+ ################################################################################
40+ # Digits dataset clustering using Self-Organizing Map
41+
4642print "Self-Organizing Map "
4743t0 = time ()
4844grid_width = 4
@@ -52,18 +48,19 @@ def display(labels, digits, n_clusters):
5248print "done in %0.3fs" % (time () - t0 )
5349print
5450
55- display (som .labels_ , digits , grid_width ** 2 )
56- C = calinski_index (data , som .labels_ , som .neurons_ )
57- print 'calinski index %0.2f | %0.2f%%' % (C , 100 * (C / (1 + C )))
51+ F = pseudo_F (data , som .labels_ , som .neurons_ )
52+ print 'pseudo_F %0.2f | %0.2f%%' % (F , 100 * (F / (1 + F )))
5853print
5954
55+ ################################################################################
56+ # Digits dataset clustering using KMeans
57+
6058print "KMeans "
6159t0 = time ()
6260km = KMeans (init = 'k-means++' , k = grid_width ** 2 , n_init = 10 )
6361km .fit (data )
6462print "done in %0.3fs" % (time () - t0 )
6563print
6664
67- display (km .labels_ , digits , n_digits )
68- C = calinski_index (data , km .labels_ , km .cluster_centers_ )
69- print 'calinski index %0.2f | %0.2f%%' % (C , 100 * (C / (1 + C )))
65+ F = pseudo_F (data , km .labels_ , km .cluster_centers_ )
66+ print 'pseudo_F %0.2f | %0.2f%%' % (F , 100 * (F / (1 + F )))
0 commit comments