@@ -91,6 +91,9 @@ class BayesianRidge(LinearModel, RegressorMixin):
     lambda_ : float
         estimated precision of the weights.
 
+    sigma_ : array, shape = (n_features, n_features)
+        estimated variance-covariance matrix of the weights.
+
     scores_ : float
         if computed, value of the objective function (to be maximized)
 
@@ -109,6 +112,16 @@ class BayesianRidge(LinearModel, RegressorMixin):
     Notes
     -----
     See examples/linear_model/plot_bayesian_ridge.py for an example.
+
+    References
+    ----------
+    D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,
+    Vol. 4, No. 3, 1992.
+
+    R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
+    http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
+    Their beta is our self.alpha_
+    Their alpha is our self.lambda_
     """
 
     def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6,
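
To make the notation mapping in the new References section concrete: writing beta for the notes' noise precision (our alpha_) and alpha for their weight precision (our lambda_), the quantities this class estimates are the standard Bayesian linear regression posterior. A sketch in LaTeX, reconstructed from the code rather than quoted from the cited notes:

    % weight posterior w ~ N(m, S); S is self.sigma_, m is self.coef_
    S^{-1} = \beta X^\top X + \alpha I
    m = \beta \, S X^\top y
    % predictive variance at a query point x,
    % as used by predict(..., return_std=True)
    \sigma_*^2(x) = \frac{1}{\beta} + x^\top S x
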
@@ -142,8 +155,10 @@ def fit(self, X, y):
         self : returns an instance of self.
         """
         X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
-        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
+        X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data(
             X, y, self.fit_intercept, self.normalize, self.copy_X)
+        self.X_offset_ = X_offset_
+        self.X_scale_ = X_scale_
         n_samples, n_features = X.shape
 
         # Initialization of the values of the parameters
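
The offsets and scales returned by _preprocess_data are now kept on the estimator because the predict method added below must apply the same standardization to query points when normalize=True. A minimal standalone sketch of that transform, with invented values standing in for what fit would store:

    import numpy as np

    # hypothetical standardization state, as fit() would record it
    X_offset_ = np.array([0.1, -0.3])  # per-feature means of the training data
    X_scale_ = np.array([1.5, 2.0])    # per-feature scales (ones if not normalizing)

    X_query = np.array([[1.0, 2.0], [0.0, -1.0]])
    # the same transform predict() applies before computing the predictive std
    X_query = (X_query - X_offset_) / X_scale_
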
@@ -171,7 +186,8 @@ def fit(self, X, y):
             # coef_ = sigma_^-1 * XT * y
             if n_samples > n_features:
                 coef_ = np.dot(Vh.T,
-                               Vh / (eigen_vals_ + lambda_ / alpha_)[:, None])
+                               Vh / (eigen_vals_ +
+                                     lambda_ / alpha_)[:, np.newaxis])
                 coef_ = np.dot(coef_, XT_y)
                 if self.compute_score:
                     logdet_sigma_ = - np.sum(
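
The rewrapped expression above is the ridge solution computed through the SVD of X (eigen_vals_ holds the squared singular values). A small numpy check of that identity for the n_samples > n_features branch; every name below is local to the sketch:

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.randn(20, 5)          # n_samples > n_features
    y = rng.randn(20)
    lam_over_alpha = 0.5          # lambda_ / alpha_ at some iteration

    U, S, Vh = np.linalg.svd(X, full_matrices=False)
    eigen_vals_ = S ** 2
    XT_y = np.dot(X.T, y)

    # the diff's form: coef_ = V diag(1 / (s_i^2 + lambda/alpha)) V^T X^T y
    coef_ = np.dot(Vh.T,
                   Vh / (eigen_vals_ +
                         lam_over_alpha)[:, np.newaxis])
    coef_ = np.dot(coef_, XT_y)

    # equivalent direct solve of (X^T X + (lambda/alpha) I) w = X^T y
    coef_direct = np.linalg.solve(X.T.dot(X) + lam_over_alpha * np.eye(5), XT_y)
    assert np.allclose(coef_, coef_direct)
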
@@ -216,10 +232,45 @@ def fit(self, X, y):
         self.alpha_ = alpha_
         self.lambda_ = lambda_
         self.coef_ = coef_
+        sigma_ = np.dot(Vh.T,
+                        Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis])
+        self.sigma_ = (1. / alpha_) * sigma_
 
-        self._set_intercept(X_offset, y_offset, X_scale)
+        self._set_intercept(X_offset_, y_offset_, X_scale_)
         return self
 
+    def predict(self, X, return_std=False):
+        """Predict using the linear model.
+
+        In addition to the mean of the predictive distribution, its
+        standard deviation can also be returned.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
+            Samples.
+
+        return_std : boolean, optional
+            Whether to return the standard deviation of posterior prediction.
+
+        Returns
+        -------
+        y_mean : array, shape = (n_samples,)
+            Mean of the predictive distribution of query points.
+
+        y_std : array, shape = (n_samples,)
+            Standard deviation of the predictive distribution of query points.
+        """
+        y_mean = self._decision_function(X)
+        if not return_std:
+            return y_mean
+        else:
+            if self.normalize:
+                X = (X - self.X_offset_) / self.X_scale_
+            sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
+            y_std = np.sqrt(sigmas_squared_data + (1. / self.alpha_))
+            return y_mean, y_std
+
 
 ###############################################################################
 # ARD (Automatic Relevance Determination) regression
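
A usage sketch of the new return_std option on BayesianRidge; the data here is invented for illustration:

    import numpy as np
    from sklearn.linear_model import BayesianRidge

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    y = X.dot([1.0, 2.0, 0.0]) + 0.1 * rng.randn(50)

    reg = BayesianRidge().fit(X, y)
    y_mean, y_std = reg.predict(X[:5], return_std=True)
    # y_std**2 combines the weight uncertainty, diag(X sigma_ X^T), with the
    # estimated noise variance 1 / alpha_
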
@@ -323,6 +374,19 @@ class ARDRegression(LinearModel, RegressorMixin):
     Notes
     --------
     See examples/linear_model/plot_ard.py for an example.
+
+    References
+    ----------
+    D. J. C. MacKay, Bayesian nonlinear modeling for the prediction
+    competition, ASHRAE Transactions, 1994.
+
+    R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
+    http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
+    Their beta is our self.alpha_
+    Their alpha is our self.lambda_
+    ARD is a little different from the slide: only dimensions/features for
+    which self.lambda_ < self.threshold_lambda are kept, and the rest are
+    discarded.
     """
 
     def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6,
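
A tiny sketch of the pruning rule the new docstring paragraph describes; the arrays are invented, and keep mirrors the mask fit uses to select active features:

    import numpy as np

    lambda_ = np.array([0.5, 2.0e4, 3.0])  # hypothetical estimated precisions
    threshold_lambda = 1.0e4               # the estimator's default value

    keep = lambda_ < threshold_lambda      # features whose weights survive
    # only X[:, keep] enters the posterior covariance sigma_, which is why
    # predict() below subsets the query matrix the same way
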
@@ -365,7 +429,9 @@ def fit(self, X, y):
         n_samples, n_features = X.shape
         coef_ = np.zeros(n_features)
 
-        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
+        X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data(
             X, y, self.fit_intercept, self.normalize, self.copy_X)
+        self.X_offset_ = X_offset_
+        self.X_scale_ = X_scale_
 
         # Launch the convergence loop
@@ -417,7 +483,7 @@ def fit(self, X, y):
                 s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
                 s += alpha_1 * log(alpha_) - alpha_2 * alpha_
                 s += 0.5 * (fast_logdet(sigma_) + n_samples * log(alpha_) +
-                                                 np.sum(np.log(lambda_)))
+                            np.sum(np.log(lambda_)))
                 s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_ ** 2).sum())
                 self.scores_.append(s)
 
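
The re-indented statement belongs to the evidence (objective) computation; written out from the surrounding code, each iteration appends the value below to scores_, where lambda_1, lambda_2, alpha_1, alpha_2 are the Gamma hyperprior parameters, Sigma is sigma_ and w is coef_ (a reconstruction from the code, not text taken from the diff):

    s = \sum_i (\lambda_1 \log \lambda_i - \lambda_2 \lambda_i)
        + \alpha_1 \log \alpha - \alpha_2 \alpha
        + \tfrac{1}{2} (\log|\Sigma| + n \log \alpha + \sum_i \log \lambda_i)
        - \tfrac{1}{2} (\alpha \, \mathrm{rmse\_} + \sum_i \lambda_i w_i^2)
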
@@ -432,5 +498,38 @@ def fit(self, X, y):
         self.alpha_ = alpha_
         self.sigma_ = sigma_
         self.lambda_ = lambda_
-        self._set_intercept(X_offset, y_offset, X_scale)
+        self._set_intercept(X_offset_, y_offset_, X_scale_)
         return self
+
+    def predict(self, X, return_std=False):
+        """Predict using the linear model.
+
+        In addition to the mean of the predictive distribution, its
+        standard deviation can also be returned.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
+            Samples.
+
+        return_std : boolean, optional
+            Whether to return the standard deviation of posterior prediction.
+
+        Returns
+        -------
+        y_mean : array, shape = (n_samples,)
+            Mean of the predictive distribution of query points.
+
+        y_std : array, shape = (n_samples,)
+            Standard deviation of the predictive distribution of query points.
+        """
+        y_mean = self._decision_function(X)
+        if not return_std:
+            return y_mean
+        else:
+            if self.normalize:
+                X = (X - self.X_offset_) / self.X_scale_
+            X = X[:, self.lambda_ < self.threshold_lambda]
+            sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
+            y_std = np.sqrt(sigmas_squared_data + (1. / self.alpha_))
+            return y_mean, y_std
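
A matching usage sketch for ARDRegression (again with invented data); note the standard deviation is computed only over the features retained by the pruning rule:

    import numpy as np
    from sklearn.linear_model import ARDRegression

    rng = np.random.RandomState(0)
    X = rng.randn(60, 4)
    y = X.dot([1.0, 0.0, -2.0, 0.0]) + 0.1 * rng.randn(60)

    reg = ARDRegression().fit(X, y)
    y_mean, y_std = reg.predict(X[:5], return_std=True)
    # columns with lambda_ >= threshold_lambda were pruned during fit, so
    # predict() drops them from X before forming diag(X sigma_ X^T)
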