Skip to content

Commit 42c7496

Browse files
committed
Merge branch 'master' into gbrt-deviance-fix
2 parents 38af9c5 + d51fa05 commit 42c7496

File tree

316 files changed

+7729
-3994
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

316 files changed

+7729
-3994
lines changed

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
/sklearn/utils/arraybuilder.c -diff
1717
/sklearn/utils/arrayfuncs.c -diff
1818
/sklearn/utils/graph_shortest_path.c -diff
19+
/sklearn/utils/lgamma.c -diff
1920
/sklearn/utils/murmurhash.c -diff
2021
/sklearn/utils/seq_dataset.c -diff
2122
/sklearn/utils/sparsefuncs.c -diff

.travis.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
language: python
22
python:
33
- "2.7"
4-
virtualenv:
5-
system_site_packages: true
64
before_install:
5+
- deactivate
76
- sudo apt-get update -qq
8-
- sudo apt-get install -qq python-scipy
7+
- sudo apt-get install -qq python-scipy python-nose
8+
- virtualenv --system-site-packages ~/virtualenv/this
9+
- source ~/virtualenv/this/bin/activate
910
install: python setup.py build_ext --inplace
1011
script: make test

benchmarks/bench_covertype.py

Lines changed: 39 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -40,29 +40,33 @@
4040
[1] http://archive.ics.uci.edu/ml/datasets/Covertype
4141
4242
"""
43-
from __future__ import division
43+
from __future__ import division, print_function
4444

45-
print __doc__
45+
print(__doc__)
4646

47-
# Author: Peter Prettenhoer <[email protected]>
47+
# Author: Peter Prettenhofer <[email protected]>
4848
# License: BSD Style.
4949

50-
# $Id$
51-
52-
from time import time
50+
import logging
5351
import os
5452
import sys
55-
import numpy as np
53+
from time import time
5654
from optparse import OptionParser
5755

56+
import numpy as np
57+
58+
from sklearn.datasets import fetch_covtype
5859
from sklearn.svm import LinearSVC
5960
from sklearn.linear_model import SGDClassifier
6061
from sklearn.naive_bayes import GaussianNB
6162
from sklearn.tree import DecisionTreeClassifier
6263
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
6364
from sklearn import metrics
6465
from sklearn.externals.joblib import Memory
65-
from sklearn.utils import check_random_state
66+
67+
logging.basicConfig(level=logging.INFO,
68+
format='%(asctime)s %(levelname)s %(message)s')
69+
logger = logging.getLogger(__name__)
6670

6771
op = OptionParser()
6872
op.add_option("--classifiers",
@@ -80,8 +84,7 @@
8084
# estimators.
8185
op.add_option("--random-seed",
8286
dest="random_seed", default=13, type=int,
83-
help="Common seed used by random number generator."
84-
)
87+
help="Common seed used by random number generator.")
8588

8689
op.print_help()
8790

@@ -97,57 +100,31 @@
97100
joblib_cache_folder = os.path.join(bench_folder, 'bench_covertype_data')
98101
m = Memory(joblib_cache_folder, mmap_mode='r')
99102

100-
# Set seed for rng
101-
rng = check_random_state(opts.random_seed)
102-
103103

104104
# Load the data, then cache and memmap the train/test split
105105
@m.cache
106106
def load_data(dtype=np.float32, order='F'):
107-
######################################################################
108-
## Download the data, if not already on disk
109-
if not os.path.exists(original_archive):
110-
# Download the data
111-
import urllib
112-
print "Downloading data, Please Wait (11MB)..."
113-
opener = urllib.urlopen(
114-
'http://archive.ics.uci.edu/ml/'
115-
'machine-learning-databases/covtype/covtype.data.gz')
116-
open(original_archive, 'wb').write(opener.read())
117-
118107
######################################################################
119108
## Load dataset
120109
print("Loading dataset...")
121-
import gzip
122-
f = gzip.open(original_archive)
123-
X = np.fromstring(f.read().replace(",", " "), dtype=dtype, sep=" ",
124-
count=-1)
125-
X = X.reshape((581012, 55))
110+
data = fetch_covtype(download_if_missing=True, shuffle=True,
111+
random_state=opts.random_seed)
112+
X, y = data.data, data.target
126113
if order.lower() == 'f':
127114
X = np.asfortranarray(X)
128-
f.close()
129115

130116
# class 1 vs. all others.
131-
y = np.ones(X.shape[0]) * -1
132-
y[np.where(X[:, -1] == 1)] = 1
133-
X = X[:, :-1]
117+
y[np.where(y != 1)] = -1
134118

135119
######################################################################
136120
## Create train-test split (as [Joachims, 2006])
137-
print("Creating train-test split...")
138-
idx = np.arange(X.shape[0])
139-
rng.shuffle(idx)
140-
train_idx = idx[:522911]
141-
test_idx = idx[522911:]
121+
logger.info("Creating train-test split...")
122+
n_train = 522911
142123

143-
X_train = X[train_idx]
144-
y_train = y[train_idx]
145-
X_test = X[test_idx]
146-
y_test = y[test_idx]
147-
148-
# free memory
149-
del X
150-
del y
124+
X_train = X[:n_train]
125+
y_train = y[:n_train]
126+
X_test = X[n_train:]
127+
y_test = y[n_train:]
151128

152129
######################################################################
153130
## Standardize first 10 features (the numerical ones)
@@ -172,12 +149,14 @@ def load_data(dtype=np.float32, order='F'):
172149
print("%s %d" % ("number of classes:".ljust(25),
173150
np.unique(y_train).shape[0]))
174151
print("%s %s" % ("data type:".ljust(25), X_train.dtype))
175-
print("%s %d (pos=%d, neg=%d, size=%dMB)" % ("number of train samples:".ljust(25),
176-
X_train.shape[0], np.sum(y_train == 1),
177-
np.sum(y_train == -1), int(X_train.nbytes / 1e6)))
178-
print("%s %d (pos=%d, neg=%d, size=%dMB)" % ("number of test samples:".ljust(25),
179-
X_test.shape[0], np.sum(y_test == 1),
180-
np.sum(y_test == -1), int(X_test.nbytes / 1e6)))
152+
print("%s %d (pos=%d, neg=%d, size=%dMB)"
153+
% ("number of train samples:".ljust(25),
154+
X_train.shape[0], np.sum(y_train == 1),
155+
np.sum(y_train == -1), int(X_train.nbytes / 1e6)))
156+
print("%s %d (pos=%d, neg=%d, size=%dMB)"
157+
% ("number of test samples:".ljust(25),
158+
X_test.shape[0], np.sum(y_test == 1),
159+
np.sum(y_test == -1), int(X_test.nbytes / 1e6)))
181160

182161

183162
classifiers = dict()
@@ -204,7 +183,7 @@ def benchmark(clf):
204183
'dual': False,
205184
'tol': 1e-3,
206185
"random_state": opts.random_seed,
207-
}
186+
}
208187
classifiers['liblinear'] = LinearSVC(**liblinear_parameters)
209188

210189
######################################################################
@@ -218,7 +197,7 @@ def benchmark(clf):
218197
'n_iter': 2,
219198
'n_jobs': opts.n_jobs,
220199
"random_state": opts.random_seed,
221-
}
200+
}
222201
classifiers['SGD'] = SGDClassifier(**sgd_parameters)
223202

224203
######################################################################
@@ -255,21 +234,21 @@ def benchmark(clf):
255234
op.error('classifier %r unknown' % name)
256235
sys.exit(1)
257236

258-
print("")
237+
print()
259238
print("Training Classifiers")
260239
print("====================")
261-
print("")
240+
print()
262241
err, train_time, test_time = {}, {}, {}
263242
for name in sorted(selected_classifiers):
264243
print("Training %s ..." % name)
265244
err[name], train_time[name], test_time[name] = benchmark(classifiers[name])
266245

267246
######################################################################
268247
## Print classification performance
269-
print("")
248+
print()
270249
print("Classification performance:")
271250
print("===========================")
272-
print("")
251+
print()
273252

274253

275254
def print_row(clf_type, train_time, test_time, err):
@@ -284,5 +263,5 @@ def print_row(clf_type, train_time, test_time, err):
284263

285264
for name in sorted(selected_classifiers, key=lambda name: err[name]):
286265
print_row(name, train_time[name], test_time[name], err[name])
287-
print("")
288-
print("")
266+
print()
267+
print()

benchmarks/bench_glm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
for i in range(n_iter):
2626

27-
print 'Iteration %s of %s' % (i, n_iter)
27+
print('Iteration %s of %s' % (i, n_iter))
2828

2929
n_samples, n_features = 10 * i + 3, 10 * i + 3
3030

benchmarks/bench_glmnet.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,9 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef):
3838
delta = (time() - tstart)
3939
# stop time
4040

41-
print "duration: %0.3fs" % delta
42-
print "rmse: %f" % rmse(Y_test, clf.predict(X_test))
43-
print "mean coef abs diff: %f" % abs(ref_coef - clf.coef_.ravel()).mean()
41+
print("duration: %0.3fs" % delta)
42+
print("rmse: %f" % rmse(Y_test, clf.predict(X_test)))
43+
print("mean coef abs diff: %f" % abs(ref_coef - clf.coef_.ravel()).mean())
4444
return delta
4545

4646

@@ -58,9 +58,9 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef):
5858
n_informative = n_features / 10
5959
n_test_samples = 1000
6060
for i in range(1, n + 1):
61-
print '=================='
62-
print 'Iteration %s of %s' % (i, n)
63-
print '=================='
61+
print('==================')
62+
print('Iteration %s of %s' % (i, n))
63+
print('==================')
6464

6565
X, Y, coef_ = make_regression(
6666
n_samples=(i * step) + n_test_samples, n_features=n_features,
@@ -71,9 +71,9 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef):
7171
X = X[:(i * step)]
7272
Y = Y[:(i * step)]
7373

74-
print "benching scikit: "
74+
print("benching scikit-learn: ")
7575
scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_))
76-
print "benching glmnet: "
76+
print("benching glmnet: ")
7777
glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_))
7878

7979
pl.clf()
@@ -96,9 +96,9 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef):
9696
n_samples = 500
9797

9898
for i in range(1, n + 1):
99-
print '=================='
100-
print 'Iteration %02d of %02d' % (i, n)
101-
print '=================='
99+
print('==================')
100+
print('Iteration %02d of %02d' % (i, n))
101+
print('==================')
102102
n_features = i * step
103103
n_informative = n_features / 10
104104

@@ -111,9 +111,9 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef):
111111
X = X[:n_samples]
112112
Y = Y[:n_samples]
113113

114-
print "benching scikit: "
114+
print("benching scikit-learn: ")
115115
scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_))
116-
print "benching glmnet: "
116+
print("benching glmnet: ")
117117
glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_))
118118

119119
xx = np.arange(100, 100 + n * step, step)

benchmarks/bench_lasso.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,18 @@
1919

2020

2121
def compute_bench(alpha, n_samples, n_features, precompute):
22-
2322
lasso_results = []
2423
lars_lasso_results = []
2524

26-
n_test_samples = 0
2725
it = 0
2826

2927
for ns in n_samples:
3028
for nf in n_features:
3129
it += 1
32-
print '=================='
33-
print 'Iteration %s of %s' % (it, max(len(n_samples),
34-
len(n_features)))
35-
print '=================='
30+
print('==================')
31+
print('Iteration %s of %s' % (it, max(len(n_samples),
32+
len(n_features))))
33+
print('==================')
3634
n_informative = nf // 10
3735
X, Y, coef_ = make_regression(n_samples=ns, n_features=nf,
3836
n_informative=n_informative,
@@ -41,15 +39,15 @@ def compute_bench(alpha, n_samples, n_features, precompute):
4139
X /= np.sqrt(np.sum(X ** 2, axis=0)) # Normalize data
4240

4341
gc.collect()
44-
print "- benching Lasso"
42+
print("- benching Lasso")
4543
clf = Lasso(alpha=alpha, fit_intercept=False,
4644
precompute=precompute)
4745
tstart = time()
4846
clf.fit(X, Y)
4947
lasso_results.append(time() - tstart)
5048

5149
gc.collect()
52-
print "- benching LassoLars"
50+
print("- benching LassoLars")
5351
clf = LassoLars(alpha=alpha, fit_intercept=False,
5452
normalize=False, precompute=precompute)
5553
tstart = time()

0 commit comments

Comments
 (0)