@@ -104,15 +104,24 @@ class GaussianNB(BaseNB):
104104 """
105105 Gaussian Naive Bayes (GaussianNB)
106106
107+ Can perform online updates to model parameters via `partial_fit` method.
108+ For details on algorithm used to update feature means and variance online,
109+ see Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:
110+
111+ http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf
112+
107113 Attributes
108114 ----------
109- `class_prior_` : array, shape = [ n_classes]
115+ `class_prior_` : array, shape ( n_classes,)
110116 probability of each class.
111117
112- `theta_` : array, shape = [n_classes, n_features]
118+ `class_count_` : array, shape (n_classes,)
119+ number of training samples observed in each class.
120+
121+ `theta_` : array, shape (n_classes, n_features)
113122 mean of each feature per class
114123
115- `sigma_` : array, shape = [ n_classes, n_features]
124+ `sigma_` : array, shape ( n_classes, n_features)
116125 variance of each feature per class
117126
118127 Examples
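The docstring added above cites Chan, Golub, and LeVeque for the streaming update of per-feature means and variances. As a sanity check of that merge rule (a minimal sketch, not part of the patch; `merge_mean_var` is a hypothetical helper name), combining the statistics of two batches reproduces the one-shot computation:

```python
import numpy as np


def merge_mean_var(n_a, mu_a, var_a, n_b, mu_b, var_b):
    """Merge per-column mean/variance of two batches (Chan et al. style)."""
    # ssd = sum of squared differences from the mean, i.e. n * var
    # (np.var is the biased / population variance).
    ssd_a, ssd_b = n_a * var_a, n_b * var_b
    n = float(n_a + n_b)
    delta = mu_b - mu_a
    # Combine the two ssds, correcting for the gap between the batch means.
    ssd = ssd_a + ssd_b + (n_a * n_b / n) * delta ** 2
    mu = (n_a * mu_a + n_b * mu_b) / n
    return mu, ssd / n

rng = np.random.RandomState(0)
A, B = rng.randn(40, 3), rng.randn(25, 3)
mu, var = merge_mean_var(len(A), A.mean(axis=0), A.var(axis=0),
                         len(B), B.mean(axis=0), B.var(axis=0))
AB = np.vstack([A, B])
assert np.allclose(mu, AB.mean(axis=0))
assert np.allclose(var, AB.var(axis=0))
```

This is the same correction term `_update_mean_variance` below applies, just written with the new batch's mean instead of its sum.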
@@ -126,18 +135,23 @@ class GaussianNB(BaseNB):
     GaussianNB()
     >>> print(clf.predict([[-0.8, -1]]))
     [1]
+    >>> clf_pf = GaussianNB()
+    >>> clf_pf.partial_fit(X, Y, np.unique(Y))
+    GaussianNB()
+    >>> print(clf_pf.predict([[-0.8, -1]]))
+    [1]
     """

     def fit(self, X, y):
         """Fit Gaussian Naive Bayes according to X, y

         Parameters
         ----------
-        X : array-like, shape = [n_samples, n_features]
+        X : array-like, shape (n_samples, n_features)
             Training vectors, where n_samples is the number of samples
             and n_features is the number of features.

-        y : array-like, shape = [n_samples]
+        y : array-like, shape (n_samples,)
             Target values.

         Returns
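To make the `fit` contract above concrete, here is a quick look (a sketch, not part of the patch) at the attributes the batch fit fills in, using toy data in the spirit of the docstring example:

```python
import numpy as np
from sklearn.naive_bayes import GaussianNB

# Two well-separated classes, two features each.
X = np.array([[-1., -1.], [-2., -1.], [-3., -2.],
              [1., 1.], [2., 1.], [3., 2.]])
Y = np.array([1, 1, 1, 2, 2, 2])

clf = GaussianNB().fit(X, Y)
print(clf.theta_.shape)   # (2, 2): per-class feature means
print(clf.sigma_.shape)   # (2, 2): per-class feature variances (+ 1e-9)
print(clf.class_prior_)   # [ 0.5  0.5]: class frequencies in Y
print(clf.class_count_)   # [ 3.  3.]: samples seen per class (new attribute)
```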
@@ -157,12 +171,144 @@ def fit(self, X, y):
         self.theta_ = np.zeros((n_classes, n_features))
         self.sigma_ = np.zeros((n_classes, n_features))
         self.class_prior_ = np.zeros(n_classes)
+        self.class_count_ = np.zeros(n_classes)
         epsilon = 1e-9
         for i, y_i in enumerate(unique_y):
             Xi = X[y == y_i, :]
             self.theta_[i, :] = np.mean(Xi, axis=0)
             self.sigma_[i, :] = np.var(Xi, axis=0) + epsilon
-            self.class_prior_[i] = np.float(Xi.shape[0]) / n_samples
+            self.class_count_[i] = Xi.shape[0]
+        self.class_prior_[:] = self.class_count_ / n_samples
+        return self
+
+    @staticmethod
+    def _update_mean_variance(n_past, mu, var, X):
+        """Compute online update of Gaussian mean and variance.
+
+        Given starting sample count, mean, and variance, and a new set of
+        points X, return the updated mean and variance. (NB - each dimension
+        (column) in X is treated as independent -- you get variance, not
+        covariance.)
+
+        Can take scalar mean and variance, or vector mean and variance to
+        simultaneously update a number of independent Gaussians.
+
+        See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:
+
+            http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf
+
+        Parameters
+        ----------
+        n_past : int
+            Number of samples represented in old mean and variance.
+
+        mu : array-like, shape (number of Gaussians,)
+            Means for Gaussians in original set.
+
+        var : array-like, shape (number of Gaussians,)
+            Variances for Gaussians in original set.
+
+        X : array-like, shape (n_new_samples, number of Gaussians)
+            New points to fold into the running mean and variance.
+
+        Returns
+        -------
+        mu_new : array-like, shape (number of Gaussians,)
+            Updated mean for each Gaussian over the combined set.
+
+        var_new : array-like, shape (number of Gaussians,)
+            Updated variance for each Gaussian over the combined set.
+        """
+        if n_past == 0:
+            return np.mean(X, axis=0), np.var(X, axis=0)
+        elif X.shape[0] == 0:
+            return mu, var
+
+        # ssd = sum of squared differences from the mean, i.e. n * variance
+        old_ssd = var * n_past
+        n_new = X.shape[0]
+        new_ssd = n_new * np.var(X, axis=0)
+        new_sum = np.sum(X, axis=0)
+        n_total = float(n_past + n_new)
+
+        # Combine the two ssds, correcting for the difference between the
+        # batch means (Chan, Golub, and LeVeque update rule).
+        total_ssd = (old_ssd + new_ssd +
+                     (n_past / float(n_new * n_total)) *
+                     (n_new * mu - new_sum) ** 2)
+
+        total_sum = new_sum + (mu * n_past)
+
+        return total_sum / n_total, total_ssd / n_total
+
+    def partial_fit(self, X, y, classes=None):
+        """Incremental fit on a batch of samples.
+
+        This method is expected to be called several times consecutively
+        on different chunks of a dataset so as to implement out-of-core
+        or online learning.
+
+        This is especially useful when the whole dataset is too big to fit in
+        memory at once.
+
+        This method has some performance and numerical stability overhead,
+        hence it is better to call partial_fit on chunks of data that are
+        as large as possible (as long as they fit in the memory budget) to
+        hide the overhead.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Training vectors, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape (n_samples,)
+            Target values.
+
+        classes : array-like, shape (n_classes,)
+            List of all the classes that can possibly appear in the y vector.
+
+            Must be provided at the first call to partial_fit, can be omitted
+            in subsequent calls.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        X, y = check_arrays(X, y, sparse_format='dense')
+        y = column_or_1d(y, warn=True)
+        epsilon = 1e-9
+
+        if _check_partial_fit_first_call(self, classes):
+            # This is the first call to partial_fit:
+            # initialize various cumulative counters
+            n_features = X.shape[1]
+            n_classes = len(self.classes_)
+            self.theta_ = np.zeros((n_classes, n_features))
+            self.sigma_ = np.zeros((n_classes, n_features))
+            self.class_prior_ = np.zeros(n_classes)
+            self.class_count_ = np.zeros(n_classes)
+        else:
+            # Take epsilon back out of the variances; it is re-added below
+            self.sigma_[:, :] -= epsilon
+
+        class2idx = dict((cls, idx) for idx, cls in enumerate(self.classes_))
+        for y_i in np.unique(y):
+            i = class2idx[y_i]
+            X_i = X[y == y_i, :]
+            N_i = X_i.shape[0]
+
+            new_theta, new_sigma = self._update_mean_variance(
+                self.class_count_[i], self.theta_[i, :], self.sigma_[i, :],
+                X_i)
+
+            self.theta_[i, :] = new_theta
+            self.sigma_[i, :] = new_sigma
+            self.class_count_[i] += N_i
+
+        self.sigma_[:, :] += epsilon
+        self.class_prior_[:] = self.class_count_ / np.sum(self.class_count_)
         return self

     def _joint_log_likelihood(self, X):
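Finally, the out-of-core workflow the `partial_fit` docstring describes, as a sketch (assumes this patch is applied, so `GaussianNB` exposes `partial_fit` and `class_count_`): feeding the same data in chunks should reproduce the single-shot fit up to floating-point error, since the mean/variance merge is exact.

```python
import numpy as np
from sklearn.naive_bayes import GaussianNB

rng = np.random.RandomState(42)
# Three Gaussian blobs, 100 samples each, 4 features.
X = rng.randn(300, 4) + np.repeat([[0.], [3.], [6.]], 100, axis=0)
y = np.repeat([0, 1, 2], 100)

batch = GaussianNB().fit(X, y)

online = GaussianNB()
for i, chunk in enumerate(np.array_split(np.arange(300), 10)):
    # classes must be given on the first call; later calls may omit it.
    online.partial_fit(X[chunk], y[chunk],
                       classes=np.unique(y) if i == 0 else None)

assert np.allclose(batch.theta_, online.theta_)
assert np.allclose(batch.sigma_, online.sigma_)
assert np.allclose(batch.class_prior_, online.class_prior_)
```

Note that chunk size matters only for overhead, not for the result: the update in `_update_mean_variance` is exact regardless of how the stream is split.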