4040[1] http://archive.ics.uci.edu/ml/datasets/Covertype
4141
4242"""
43- from __future__ import division
43+ from __future__ import division , print_function
4444
45- print __doc__
45+ print ( __doc__ )
4646
47- # Author: Peter Prettenhoer <[email protected] >
47+ # Author: Peter Prettenhofer <[email protected] >
4848# License: BSD Style.
4949
50- # $Id$
51-
52- from time import time
50+ import logging
5351import os
5452import sys
55- import numpy as np
53+ from time import time
5654from optparse import OptionParser
5755
56+ import numpy as np
57+
58+ from sklearn .datasets import fetch_covtype
5859from sklearn .svm import LinearSVC
5960from sklearn .linear_model import SGDClassifier
6061from sklearn .naive_bayes import GaussianNB
6162from sklearn .tree import DecisionTreeClassifier
6263from sklearn .ensemble import RandomForestClassifier , ExtraTreesClassifier
6364from sklearn import metrics
6465from sklearn .externals .joblib import Memory
65- from sklearn .utils import check_random_state
66+
67+ logging .basicConfig (level = logging .INFO ,
68+ format = '%(asctime)s %(levelname)s %(message)s' )
69+ logger = logging .getLogger (__name__ )
6670
6771op = OptionParser ()
6872op .add_option ("--classifiers" ,
8084# estimators.
8185op .add_option ("--random-seed" ,
8286 dest = "random_seed" , default = 13 , type = int ,
83- help = "Common seed used by random number generator."
84- )
87+ help = "Common seed used by random number generator." )
8588
8689op .print_help ()
8790
97100joblib_cache_folder = os .path .join (bench_folder , 'bench_covertype_data' )
98101m = Memory (joblib_cache_folder , mmap_mode = 'r' )
99102
100- # Set seed for rng
101- rng = check_random_state (opts .random_seed )
102-
103103
104104# Load the data, then cache and memmap the train/test split
105105@m .cache
106106def load_data (dtype = np .float32 , order = 'F' ):
107- ######################################################################
108- ## Download the data, if not already on disk
109- if not os .path .exists (original_archive ):
110- # Download the data
111- import urllib
112- print "Downloading data, Please Wait (11MB)..."
113- opener = urllib .urlopen (
114- 'http://archive.ics.uci.edu/ml/'
115- 'machine-learning-databases/covtype/covtype.data.gz' )
116- open (original_archive , 'wb' ).write (opener .read ())
117-
118107 ######################################################################
119108 ## Load dataset
120109 print ("Loading dataset..." )
121- import gzip
122- f = gzip .open (original_archive )
123- X = np .fromstring (f .read ().replace ("," , " " ), dtype = dtype , sep = " " ,
124- count = - 1 )
125- X = X .reshape ((581012 , 55 ))
110+ data = fetch_covtype (download_if_missing = True , shuffle = True ,
111+ random_state = opts .random_seed )
112+ X , y = data .data , data .target
126113 if order .lower () == 'f' :
127114 X = np .asfortranarray (X )
128- f .close ()
129115
130116 # class 1 vs. all others.
131- y = np .ones (X .shape [0 ]) * - 1
132- y [np .where (X [:, - 1 ] == 1 )] = 1
133- X = X [:, :- 1 ]
117+ y [np .where (y != 1 )] = - 1
134118
135119 ######################################################################
136120 ## Create train-test split (as [Joachims, 2006])
137- print ("Creating train-test split..." )
138- idx = np .arange (X .shape [0 ])
139- rng .shuffle (idx )
140- train_idx = idx [:522911 ]
141- test_idx = idx [522911 :]
121+ logger .info ("Creating train-test split..." )
122+ n_train = 522911
142123
143- X_train = X [train_idx ]
144- y_train = y [train_idx ]
145- X_test = X [test_idx ]
146- y_test = y [test_idx ]
147-
148- # free memory
149- del X
150- del y
124+ X_train = X [:n_train ]
125+ y_train = y [:n_train ]
126+ X_test = X [n_train :]
127+ y_test = y [n_train :]
151128
152129 ######################################################################
153130 ## Standardize first 10 features (the numerical ones)
@@ -172,12 +149,14 @@ def load_data(dtype=np.float32, order='F'):
172149print ("%s %d" % ("number of classes:" .ljust (25 ),
173150 np .unique (y_train ).shape [0 ]))
174151print ("%s %s" % ("data type:" .ljust (25 ), X_train .dtype ))
175- print ("%s %d (pos=%d, neg=%d, size=%dMB)" % ("number of train samples:" .ljust (25 ),
176- X_train .shape [0 ], np .sum (y_train == 1 ),
177- np .sum (y_train == - 1 ), int (X_train .nbytes / 1e6 )))
178- print ("%s %d (pos=%d, neg=%d, size=%dMB)" % ("number of test samples:" .ljust (25 ),
179- X_test .shape [0 ], np .sum (y_test == 1 ),
180- np .sum (y_test == - 1 ), int (X_test .nbytes / 1e6 )))
152+ print ("%s %d (pos=%d, neg=%d, size=%dMB)"
153+ % ("number of train samples:" .ljust (25 ),
154+ X_train .shape [0 ], np .sum (y_train == 1 ),
155+ np .sum (y_train == - 1 ), int (X_train .nbytes / 1e6 )))
156+ print ("%s %d (pos=%d, neg=%d, size=%dMB)"
157+ % ("number of test samples:" .ljust (25 ),
158+ X_test .shape [0 ], np .sum (y_test == 1 ),
159+ np .sum (y_test == - 1 ), int (X_test .nbytes / 1e6 )))
181160
182161
183162classifiers = dict ()
@@ -204,7 +183,7 @@ def benchmark(clf):
204183 'dual' : False ,
205184 'tol' : 1e-3 ,
206185 "random_state" : opts .random_seed ,
207- }
186+ }
208187classifiers ['liblinear' ] = LinearSVC (** liblinear_parameters )
209188
210189######################################################################
@@ -218,7 +197,7 @@ def benchmark(clf):
218197 'n_iter' : 2 ,
219198 'n_jobs' : opts .n_jobs ,
220199 "random_state" : opts .random_seed ,
221- }
200+ }
222201classifiers ['SGD' ] = SGDClassifier (** sgd_parameters )
223202
224203######################################################################
@@ -255,21 +234,21 @@ def benchmark(clf):
255234 op .error ('classifier %r unknown' % name )
256235 sys .exit (1 )
257236
258- print ("" )
237+ print ()
259238print ("Training Classifiers" )
260239print ("====================" )
261- print ("" )
240+ print ()
262241err , train_time , test_time = {}, {}, {}
263242for name in sorted (selected_classifiers ):
264243 print ("Training %s ..." % name )
265244 err [name ], train_time [name ], test_time [name ] = benchmark (classifiers [name ])
266245
267246######################################################################
268247## Print classification performance
269- print ("" )
248+ print ()
270249print ("Classification performance:" )
271250print ("===========================" )
272- print ("" )
251+ print ()
273252
274253
275254def print_row (clf_type , train_time , test_time , err ):
@@ -284,5 +263,5 @@ def print_row(clf_type, train_time, test_time, err):
284263
285264for name in sorted (selected_classifiers , key = lambda name : err [name ]):
286265 print_row (name , train_time [name ], test_time [name ], err [name ])
287- print ("" )
288- print ("" )
266+ print ()
267+ print ()
0 commit comments