FIX use random_state in LogisticRegression

TomDLT · TomDLT · commit fc9d7befe109 · 2015-05-27T16:59:12.000+02:00
diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py
@@ -20,6 +20,7 @@
 from ..preprocessing import LabelEncoder, LabelBinarizer
 from ..svm.base import _fit_liblinear
 from ..utils import check_array, check_consistent_length, compute_class_weight
+from ..utils import check_random_state
 from ..utils.extmath import (logsumexp, log_logistic, safe_sparse_dot,
                              squared_norm)
 from ..utils.optimize import newton_cg
@@ -417,7 +418,8 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
                              max_iter=100, tol=1e-4, verbose=0,
                              solver='lbfgs', coef=None, copy=True,
                              class_weight=None, dual=False, penalty='l2',
-                             intercept_scaling=1., multi_class='ovr'):
+                             intercept_scaling=1., multi_class='ovr',
+                             random_state=None):
     """Compute a Logistic Regression model for a list of regularization
     parameters.
 
@@ -502,8 +504,12 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
         Multiclass option can be either 'ovr' or 'multinomial'. If the option
         chosen is 'ovr', then a binary problem is fit for each label. Else
         the loss minimised is the multinomial loss fit across
-        the entire probability distribution. Works only for the 'lbfgs'
-        solver.
+        the entire probability distribution. Works only for the 'lbfgs' and
+        'newton-cg' solvers.
+
+    random_state : int seed, RandomState instance, or None (default)
+        The seed of the pseudo random number generator to use when
+        shuffling the data. Only the 'liblinear' solver uses it.
 
     Returns
     -------
@@ -531,6 +537,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
     _, n_features = X.shape
     check_consistent_length(X, y)
     classes = np.unique(y)
+    random_state = check_random_state(random_state)
 
     if pos_class is None and multi_class != 'multinomial':
         if (classes.size > 2):
@@ -659,7 +666,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
         elif solver == 'liblinear':
             coef_, intercept_, _, = _fit_liblinear(
                 X, y, C, fit_intercept, intercept_scaling, class_weight,
-                penalty, dual, verbose, max_iter, tol,
+                penalty, dual, verbose, max_iter, tol, random_state
                 )
             if fit_intercept:
                 w0 = np.concatenate([coef_.ravel(), intercept_])
@@ -1029,7 +1036,7 @@ def fit(self, X, y):
             self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
                 X, y, self.C, self.fit_intercept, self.intercept_scaling,
                 self.class_weight, self.penalty, self.dual, self.verbose,
-                self.max_iter, self.tol
+                self.max_iter, self.tol, self.random_state
                 )
             return self
 
diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py
@@ -266,13 +266,22 @@ def test_consistency_path():
         assert_array_almost_equal(lr_coef, coefs[0], decimal=4)
 
 
-def test_liblinear_random_state():
+def test_liblinear_dual_random_state():
+    # random_state is relevant for the liblinear solver only if dual=True
     X, y = make_classification(n_samples=20)
-    lr1 = LogisticRegression(random_state=0)
+    lr1 = LogisticRegression(random_state=0, dual=True, max_iter=1, tol=1e-15)
     lr1.fit(X, y)
-    lr2 = LogisticRegression(random_state=0)
+    lr2 = LogisticRegression(random_state=0, dual=True, max_iter=1, tol=1e-15)
     lr2.fit(X, y)
+    lr3 = LogisticRegression(random_state=8, dual=True, max_iter=1, tol=1e-15)
+    lr3.fit(X, y)
+
+    # same result for same random state
     assert_array_almost_equal(lr1.coef_, lr2.coef_)
+    # different results for different random states
+    msg = "Arrays are not almost equal to 6 decimals"
+    assert_raise_message(AssertionError, msg,
+                         assert_array_almost_equal, lr1.coef_, lr3.coef_)
 
 
 def test_logistic_loss_and_grad():