Commit 2f563df

betatim authored and arjoly committed
Added warm_start to bagging
BaggingClassifier and BaggingRegressor now support warm_start. Basic tests and documentation for the new functionality are included. Heavily inspired by the warm_start work for random forests.
1 parent 88e0c69 commit 2f563df
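
The new option mirrors warm_start in the forest classes; a minimal usage sketch (an illustration, not part of the commit):

    from sklearn.datasets import make_hastie_10_2
    from sklearn.ensemble import BaggingClassifier

    X, y = make_hastie_10_2(n_samples=100, random_state=1)

    # The first fit builds five estimators.
    clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=42)
    clf.fit(X, y)

    # Raising n_estimators and refitting adds only the five missing
    # estimators; the previously fitted ones are kept.
    clf.set_params(n_estimators=10)
    clf.fit(X, y)
    assert len(clf.estimators_) == 10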

File tree

2 files changed (+114 −25 lines)


sklearn/ensemble/bagging.py

Lines changed: 60 additions & 24 deletions
@@ -65,7 +65,7 @@ def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
             print("building estimator %d of %d" % (i + 1, n_estimators))
 
         random_state = check_random_state(seeds[i])
-        seed = check_random_state(random_state.randint(MAX_INT))
+        seed = random_state.randint(MAX_INT)
         estimator = ensemble._make_estimator(append=False)
 
         try:  # Not all estimators accept a random_state
@@ -206,6 +206,7 @@ def __init__(self,
                  bootstrap=True,
                  bootstrap_features=False,
                  oob_score=False,
+                 warm_start=False,
                  n_jobs=1,
                  random_state=None,
                  verbose=0):
@@ -218,6 +219,7 @@ def __init__(self,
         self.bootstrap = bootstrap
         self.bootstrap_features = bootstrap_features
         self.oob_score = oob_score
+        self.warm_start = warm_start
         self.n_jobs = n_jobs
         self.random_state = random_state
         self.verbose = verbose
@@ -278,40 +280,60 @@ def fit(self, X, y, sample_weight=None):
             raise ValueError("Out of bag estimation only available"
                              " if bootstrap=True")
 
-        # Free allocated memory, if any
-        self.estimators_ = None
+        if not self.warm_start:
+            # Free allocated memory, if any
+            self.estimators_ = []
+            self.estimators_samples_ = []
+            self.estimators_features_ = []
 
-        # Parallel loop
-        n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators,
-                                                             self.n_jobs)
-        seeds = random_state.randint(MAX_INT, size=self.n_estimators)
+        n_more_estimators = self.n_estimators - len(self.estimators_)
 
-        all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
-            delayed(_parallel_build_estimators)(
-                n_estimators[i],
-                self,
-                X,
-                y,
-                sample_weight,
-                seeds[starts[i]:starts[i + 1]],
-                verbose=self.verbose)
-            for i in range(n_jobs))
+        if n_more_estimators < 0:
+            raise ValueError('n_estimators=%d must be larger or equal to '
+                             'len(estimators_)=%d when warm_start==True'
+                             % (self.n_estimators, len(self.estimators_)))
+
+        elif n_more_estimators == 0:
+            warn("Warm-start fitting without increasing n_estimators does not "
+                 "fit new trees.")
+        else:
+            # Parallel loop
+            n_jobs, n_estimators, starts = _partition_estimators(n_more_estimators,
+                                                                 self.n_jobs)
+
+            # Advance random state to state after training
+            # the first n_estimators
+            if self.warm_start and len(self.estimators_) > 0:
+                random_state.randint(MAX_INT, size=len(self.estimators_))
+
+            seeds = random_state.randint(MAX_INT, size=n_more_estimators)
+
+            all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
+                delayed(_parallel_build_estimators)(
+                    n_estimators[i],
+                    self,
+                    X,
+                    y,
+                    sample_weight,
+                    seeds[starts[i]:starts[i + 1]],
+                    verbose=self.verbose)
+                for i in range(n_jobs))
 
         # Reduce
-        self.estimators_ = list(itertools.chain.from_iterable(
+        self.estimators_ += list(itertools.chain.from_iterable(
             t[0] for t in all_results))
         self.estimators_samples_ = list(itertools.chain.from_iterable(
             t[1] for t in all_results))
         self.estimators_features_ = list(itertools.chain.from_iterable(
             t[2] for t in all_results))
 
         if self.oob_score:
-            self._set_oob_score(X, y)
+            self._set_oob_score(X, y, n_more_estimators)
 
         return self
 
     @abstractmethod
-    def _set_oob_score(self, X, y):
+    def _set_oob_score(self, X, y, n_more_estimators):
         """Calculate out of bag predictions and score."""
 
     def _validate_y(self, y):
@@ -368,6 +390,11 @@ class BaggingClassifier(BaseBagging, ClassifierMixin):
         Whether to use out-of-bag samples to estimate
         the generalization error.
 
+    warm_start : bool, optional (default=False)
+        When set to True, reuse the solution of the previous call to fit
+        and add more estimators to the ensemble; otherwise, just fit
+        a whole new ensemble.
+
     n_jobs : int, optional (default=1)
         The number of jobs to run in parallel for both `fit` and `predict`.
         If -1, then the number of jobs is set to the number of cores.
@@ -435,6 +462,7 @@ def __init__(self,
                  bootstrap=True,
                  bootstrap_features=False,
                  oob_score=False,
+                 warm_start=False,
                  n_jobs=1,
                  random_state=None,
                  verbose=0):
@@ -447,6 +475,7 @@ def __init__(self,
            bootstrap=bootstrap,
            bootstrap_features=bootstrap_features,
            oob_score=oob_score,
+           warm_start=warm_start,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose)
@@ -456,14 +485,14 @@ def _validate_estimator(self):
         super(BaggingClassifier, self)._validate_estimator(
             default=DecisionTreeClassifier())
 
-    def _set_oob_score(self, X, y):
+    def _set_oob_score(self, X, y, n_more_estimators):
         n_classes_ = self.n_classes_
         classes_ = self.classes_
         n_samples = y.shape[0]
 
         predictions = np.zeros((n_samples, n_classes_))
 
-        for estimator, samples, features in zip(self.estimators_,
+        for estimator, samples, features in zip(self.estimators_[-n_more_estimators:],
                                                 self.estimators_samples_,
                                                 self.estimators_features_):
             mask = np.ones(n_samples, dtype=np.bool)
@@ -724,6 +753,11 @@ class BaggingRegressor(BaseBagging, RegressorMixin):
         Whether to use out-of-bag samples to estimate
         the generalization error.
 
+    warm_start : bool, optional (default=False)
+        When set to True, reuse the solution of the previous call to fit
+        and add more estimators to the ensemble; otherwise, just fit
+        a whole new ensemble.
+
     n_jobs : int, optional (default=1)
         The number of jobs to run in parallel for both `fit` and `predict`.
         If -1, then the number of jobs is set to the number of cores.
@@ -783,6 +817,7 @@ def __init__(self,
                  bootstrap=True,
                  bootstrap_features=False,
                  oob_score=False,
+                 warm_start=False,
                  n_jobs=1,
                  random_state=None,
                  verbose=0):
@@ -794,6 +829,7 @@ def __init__(self,
            bootstrap=bootstrap,
            bootstrap_features=bootstrap_features,
            oob_score=oob_score,
+           warm_start=warm_start,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose)
@@ -840,13 +876,13 @@ def _validate_estimator(self):
         super(BaggingRegressor, self)._validate_estimator(
             default=DecisionTreeRegressor())
 
-    def _set_oob_score(self, X, y):
+    def _set_oob_score(self, X, y, n_more_estimators):
         n_samples = y.shape[0]
 
         predictions = np.zeros((n_samples,))
         n_predictions = np.zeros((n_samples,))
 
-        for estimator, samples, features in zip(self.estimators_,
+        for estimator, samples, features in zip(self.estimators_[-n_more_estimators:],
                                                 self.estimators_samples_,
                                                 self.estimators_features_):
             mask = np.ones(n_samples, dtype=np.bool)
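
The "Advance random state" step in fit is what makes warm-started fits reproducible: on a warm-started call the generator first discards the len(estimators_) seeds consumed by earlier fits, so a 5+5 run draws the same seed sequence as a single 10-estimator run. A standalone NumPy sketch of this behaviour (an illustration, not code from the diff):

    import numpy as np

    MAX_INT = np.iinfo(np.int32).max

    # A single fit draws all ten seeds at once.
    seeds_full = np.random.RandomState(42).randint(MAX_INT, size=10)

    # Warm start: the first fit draws five seeds ...
    rs = np.random.RandomState(42)
    first = rs.randint(MAX_INT, size=5)

    # ... and the second fit restarts the generator, skips the five
    # already-consumed seeds, then draws the five new ones.
    rs = np.random.RandomState(42)
    rs.randint(MAX_INT, size=5)  # advance past the first fit
    second = rs.randint(MAX_INT, size=5)

    assert (seeds_full == np.concatenate([first, second])).all()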

sklearn/ensemble/tests/test_bagging.py

Lines changed: 54 additions & 1 deletion
@@ -29,7 +29,7 @@
 from sklearn.pipeline import make_pipeline
 from sklearn.feature_selection import SelectKBest
 from sklearn.cross_validation import train_test_split
-from sklearn.datasets import load_boston, load_iris
+from sklearn.datasets import load_boston, load_iris, make_hastie_10_2
 from sklearn.utils import check_random_state
 
 from scipy.sparse import csc_matrix, csr_matrix
@@ -571,6 +571,59 @@ def test_bagging_sample_weight_unsupported_but_passed():
     assert_raises(ValueError, estimator.fit, iris.data, iris.target,
                   sample_weight=rng.randint(10, size=(iris.data.shape[0])))
 
+
+def test_warm_start(random_state=42):
+    # Test if fitting incrementally with warm start gives an ensemble of the
+    # right size and the same results as a normal fit.
+    X, y = make_hastie_10_2(n_samples=20, random_state=1)
+
+    clf_ws = None
+    for n_estimators in [5, 10]:
+        if clf_ws is None:
+            clf_ws = BaggingClassifier(n_estimators=n_estimators,
+                                       random_state=random_state,
+                                       warm_start=True)
+        else:
+            clf_ws.set_params(n_estimators=n_estimators)
+        clf_ws.fit(X, y)
+        assert_equal(len(clf_ws), n_estimators)
+
+    clf_no_ws = BaggingClassifier(n_estimators=10, random_state=random_state,
+                                  warm_start=False)
+    clf_no_ws.fit(X, y)
+
+    assert_equal(set([tree.random_state for tree in clf_ws]),
+                 set([tree.random_state for tree in clf_no_ws]))
+
+
+def test_warm_start_smaller_n_estimators():
+    # Test that a warm-started second fit with a smaller n_estimators
+    # raises an error.
+    X, y = make_hastie_10_2(n_samples=20, random_state=1)
+    clf = BaggingClassifier(n_estimators=5, warm_start=True)
+    clf.fit(X, y)
+    clf.set_params(n_estimators=4)
+    assert_raises(ValueError, clf.fit, X, y)
+
+
+def test_warm_start_equivalence():
+    # A warm-started classifier with 5+5 estimators should be equivalent to
+    # one classifier fitted with 10 estimators from scratch.
+    X, y = make_hastie_10_2(n_samples=20, random_state=1)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
+
+    clf_ws = BaggingClassifier(n_estimators=5, warm_start=True)
+    clf_ws.fit(X_train, y_train)
+    clf_ws.set_params(n_estimators=10)
+    clf_ws.fit(X_train, y_train)
+    y1 = clf_ws.predict(X_test)
+
+    clf = BaggingClassifier(n_estimators=10, warm_start=False)
+    clf.fit(X_train, y_train)
+    y2 = clf.predict(X_test)
+
+    assert_array_almost_equal(y1, y2)
+
+
 if __name__ == "__main__":
     import nose
     nose.runmodule()
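
The assert_equal(len(clf_ws), n_estimators) check and the `for tree in clf_ws` loops in test_warm_start rely on the ensemble base class exposing its fitted estimators through the sequence protocol, roughly as follows (a paraphrase for illustration, not part of this diff):

    def __len__(self):
        """Return the number of estimators in the ensemble."""
        return len(self.estimators_)

    def __iter__(self):
        """Return iterator over estimators in the ensemble."""
        return iter(self.estimators_)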
