BUG: StratifiedShuffleSplit not obeying n_train

GaelVaroquaux · GaelVaroquaux · commit 41b312c604db · 2014-01-21T19:25:20.000+01:00
StratifiedShuffleSplit did not always return the requested n_train and n_test samples, because of per-class rounding
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
@@ -1004,6 +1004,19 @@ def _iter_indices(self):
                 train.extend(cls_i[:n_i[i]])
                 test.extend(cls_i[n_i[i]:n_i[i] + t_i[i]])
 
+            # Because of rounding issues (as n_train and n_test are not
+            # divisors of the number of elements per class), we may end
+            # up here with fewer samples in train and test than asked for.
+            if len(train) < self.n_train or len(test) < self.n_test:
+                # We complete by randomly assigning the missing indexes
+                missing_idx = np.where(
+                                       np.bincount(train + test,
+                                                   minlength=len(self.y)) == 0,
+                                       )[0]
+                missing_idx = rng.permutation(missing_idx)
+                train.extend(missing_idx[:(self.n_train - len(train))])
+                test.extend(missing_idx[:(self.n_test - len(test))])
+
             train = rng.permutation(train)
             test = rng.permutation(test)
 
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
@@ -400,6 +400,8 @@ def test_stratified_shuffle_split_iter_no_indices():
 def test_shuffle_split_even():
     # Test the in StratifiedShuffleSplit, indices are drawn with a
     # equal chance
+    n_folds = 5
+    n_iter = 1000
 
     def assert_counts_are_ok(idx_counts, p):
         # Here we test that the distribution of the counts
@@ -412,26 +414,30 @@ def assert_counts_are_ok(idx_counts, p):
                         "An index is not drawn with chance corresponding "
                         "to even draws")
 
-    for n_labels in (6, 22):
-        labels = np.array((n_labels // 2) * [0, 1])
-        n_folds = 5
-        splits = cval.StratifiedShuffleSplit(labels, n_iter=1000,
+    for n_samples in (6, 22):
+        labels = np.array((n_samples // 2) * [0, 1])
+        splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter,
                                     test_size=1./n_folds, random_state=0)
 
-        train_counts = [0] * len(labels)
-        test_counts = [0] * len(labels)
+        train_counts = [0] * n_samples
+        test_counts = [0] * n_samples
+        n_splits = 0
         for train, test in splits:
+            n_splits += 1
             for counter, ids in [(train_counts, train), (test_counts, test)]:
                 for id in ids:
                     counter[id] += 1
+        assert_equal(n_splits, n_iter)
+
+        assert_equal(len(train), splits.n_train)
+        assert_equal(len(test), splits.n_test)
 
-        n_splits = len(splits)
         label_counts = np.unique(labels)
         assert_equal(splits.test_size, 1.0 / n_folds)
         assert_equal(splits.n_train + splits.n_test, len(labels))
         assert_equal(len(label_counts), 2)
-        ex_test_p = (1. * splits.n_test) / n_labels
-        ex_train_p = 1.0 - ex_test_p
+        ex_test_p = float(splits.n_test) / n_samples
+        ex_train_p = float(splits.n_train) / n_samples
 
         assert_counts_are_ok(train_counts, ex_train_p)
         assert_counts_are_ok(test_counts, ex_test_p)