Skip to content

Commit 7b12364

Browse files
committed
Merge branch 'master' of github.com:scikit-learn/scikit-learn
2 parents bf789be + b44a386 commit 7b12364

File tree

13 files changed

+5663
-119
lines changed

13 files changed

+5663
-119
lines changed

examples/glm/plot_lasso_lars.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,29 @@
2424
# something's wrong with our dataset
2525
X[:, 6] = -X[:, 6]
2626

27+
28+
m, n = 200, 200
29+
np.random.seed(0)
30+
X = np.random.randn(m, n)
31+
y = np.random.randn(m)
32+
33+
34+
_xmean = X.mean(0)
35+
_ymean = y.mean(0)
36+
X = X - _xmean
37+
y = y - _ymean
38+
_norms = np.apply_along_axis (np.linalg.norm, 0, X)
39+
nonzeros = np.flatnonzero(_norms)
40+
X[:, nonzeros] /= _norms[nonzeros]
41+
2742
################################################################################
2843
# Demo path functions
2944
################################################################################
3045

31-
46+
G = np.dot(X.T, X)
3247
print "Computing regularization path using the LARS ..."
3348
start = datetime.now()
34-
alphas, active, path = glm.lars_path(X, y, method='lasso', max_iter=12)
49+
alphas, active, path = glm.lars_path(X, y, Gram=G, method='lasso')
3550
print "This took ", datetime.now() - start
3651

3752
alphas = np.sum(np.abs(path.T), axis=1)
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
"""
2+
Benchmarks of Lasso vs LassoLARS
3+
4+
First, we fix the number of features and increase the number of
5+
samples. Then we plot the computation time as function of
6+
the number of samples.
7+
8+
In the second benchmark, we increase the number of dimensions of the
9+
training set. Then we plot the computation time as function of
10+
the number of dimensions.
11+
12+
In both cases, only 10% of the features are informative.
13+
"""
14+
import gc
15+
from time import time
16+
import numpy as np
17+
18+
from bench_glm import make_data
19+
20+
def bench(clf, X_train, Y_train, X_test, Y_test):
    """Time one ``clf.fit`` call and print the size of the active set.

    Parameters
    ----------
    clf : object
        Must expose ``fit(X, y)`` returning an object with a ``coef_``
        attribute (the return value of ``fit`` is what gets inspected).
    X_train, Y_train :
        Passed straight to ``clf.fit``. ``X_train.shape[1]`` is used as
        the denominator of the printed percentage.
    X_test, Y_test :
        Not used by this function.

    Returns
    -------
    delta : float
        Seconds elapsed (``time.time`` difference) around ``clf.fit``.
    """
    # Run a full garbage collection before the timed section
    # (presumably to reduce GC noise in the measurement).
    gc.collect()

    tstart = time()
    clf = clf.fit(X_train, Y_train)
    delta = time() - tstart

    # Count the coefficients whose absolute value is strictly positive.
    as_size = np.sum(np.abs(clf.coef_) > 0)
    # The message is labelled "%", so scale the fraction to a percentage.
    percent = 100.0 * float(as_size) / X_train.shape[1]
    # A single parenthesized argument prints identically on Python 2 and 3.
    print("active set size: %s (%s %%)" % (as_size, percent))
    return delta
32+
33+
def compute_bench(alpha, n_samples, n_features):
    """Time Lasso and LassoLARS fits over a grid of problem sizes.

    For every pair ``(ns, nf)`` taken from ``n_samples`` x ``n_features``
    a data set is generated with ``make_data``, its columns are scaled to
    unit Euclidean norm, and both estimators are timed with ``bench``.

    Parameters
    ----------
    alpha : float
        Regularization parameter handed to both estimators.
    n_samples : iterable of int
        Sample counts to benchmark (outer loop).
    n_features : iterable of int
        Feature counts to benchmark (inner loop).

    Returns
    -------
    lasso_results, larslasso_results : list, list
        Fit times as returned by ``bench``, one entry per ``(ns, nf)``
        pair, in iteration order.
    """
    # Imported here so the function does not depend on names that are
    # only bound when this file is executed as a script.
    from scikits.learn.glm import Lasso, LassoLARS

    lasso_results = []
    larslasso_results = []

    n_tests = 1000
    it = 0

    for ns in n_samples:
        for nf in n_features:
            it += 1
            print('============')
            print('Iteration %s' % it)
            print('============')
            # One tenth of the features; the module docstring describes
            # this as the share of informative features.
            k = nf // 10
            X, Y, X_test, Y_test, coef_ = make_data(
                n_samples=ns, n_tests=n_tests, n_features=nf,
                noise=0.1, k=k)

            # Scale each column to unit Euclidean norm. All-zero columns
            # are left untouched instead of being divided by zero.
            norms = np.sqrt(np.sum(X ** 2, axis=0))
            norms[norms == 0] = 1.0
            X /= norms

            print("benching Lasso: ")
            lasso_results.append(
                bench(Lasso(alpha=alpha, fit_intercept=False),
                      X, Y, X_test, Y_test))

            # NOTE(review): unlike Lasso above, LassoLARS is not given
            # fit_intercept=False; a variant passing it was previously
            # left commented out here -- confirm which one is intended.
            print("benching LassoLARS: ")
            larslasso_results.append(
                bench(LassoLARS(alpha=alpha, normalize=False),
                      X, Y, X_test, Y_test))

    return lasso_results, larslasso_results
69+
70+
if __name__ == '__main__':
    # Keep these bound at module level: code in this file may resolve
    # Lasso / LassoLARS as module globals.
    from scikits.learn.glm import Lasso, LassoLARS
    import pylab as pl

    def _plot_timings(sizes, lasso_times, lars_times, title, xlabel):
        """Plot both timing curves on the current figure and show it."""
        pl.title(title)
        pl.plot(sizes, lasso_times, 'b-', label='Lasso')
        pl.plot(sizes, lars_times, 'r-', label='LassoLARS')
        pl.legend()
        pl.xlabel(xlabel)
        pl.ylabel('time (in seconds)')
        pl.show()

    alpha = 0.01  # regularization parameter

    # Benchmark 1: fixed number of features, growing number of samples.
    n_features = 500
    list_n_samples = list(range(500, 10001, 500))
    lasso_results, larslasso_results = compute_bench(alpha, list_n_samples,
                                                     [n_features])

    pl.close('all')
    _plot_timings(list_n_samples, lasso_results, larslasso_results,
                  'Lasso benchmark (%d features - alpha=%s)'
                  % (n_features, alpha),
                  'number of samples')

    # Benchmark 2: fixed number of samples, growing number of features.
    n_samples = 500
    list_n_features = list(range(500, 3001, 500))
    lasso_results, larslasso_results = compute_bench(alpha, [n_samples],
                                                     list_n_features)

    pl.figure()
    _plot_timings(list_n_features, lasso_results, larslasso_results,
                  'Lasso benchmark (%d samples - alpha=%s)'
                  % (n_samples, alpha),
                  'number of features')
103+

0 commit comments

Comments
 (0)