
Commit 46f4c55

Merge pull request scikit-learn#1169 from larsmans/ridge-cg
Ridge CG performance improvements
2 parents 986ad8c + 0e00956 commit 46f4c55

3 files changed: +80 -62 lines changed

doc/whats_new.rst

Lines changed: 5 additions & 2 deletions
@@ -8,8 +8,11 @@
 Changelog
 ---------
 
-   - :class:`feature_selection.SelectPercentile` now breaks ties deterministically
-     instead of returning all equally ranked features.
+   - :class:`feature_selection.SelectPercentile` now breaks ties
+     deterministically instead of returning all equally ranked features.
+
+   - Ridge regression and ridge classification fitting no longer has
+     quadratic memory complexity.
 
    - Speed up of :func:`metrics.precision_recall_curve` by Conrad Lee.
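To illustrate the memory note above: with the CG-based solver, fitting Ridge on sparse data works from matrix-vector products with X, so neither X X^T nor X^T X has to be formed explicitly. A minimal sketch, not part of the commit; the shapes and alpha below are arbitrary illustration values:

    # Illustrative sketch only: arbitrary toy shapes and alpha, not from the commit.
    import numpy as np
    import scipy.sparse as sp
    from sklearn.linear_model import Ridge

    X = sp.rand(2000, 50000, density=1e-4, format='csr')  # wide, sparse design
    y = np.random.randn(2000)

    # The CG path solves the ridge system through matvec/rmatvec calls on X,
    # without materializing a Gram matrix.
    Ridge(alpha=1.0, tol=1e-3).fit(X, y)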

examples/document_classification_20newsgroups.py

Lines changed: 3 additions & 1 deletion
@@ -32,6 +32,7 @@
 from sklearn.datasets import fetch_20newsgroups
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.feature_selection import SelectKBest, chi2
+from sklearn.linear_model import RidgeClassifier
 from sklearn.svm import LinearSVC
 from sklearn.linear_model import SGDClassifier
 from sklearn.linear_model import Perceptron
@@ -190,7 +191,8 @@ def benchmark(clf):
 
 
 results = []
-for clf, name in ((Perceptron(n_iter=50), "Perceptron"),
+for clf, name in ((RidgeClassifier(tol=1e-1), "Ridge Classifier"),
+                  (Perceptron(n_iter=50), "Perceptron"),
                   (KNeighborsClassifier(n_neighbors=10), "kNN")):
     print 80 * '='
     print name
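For readers who want to try the newly benchmarked estimator outside the full example script, here is a minimal hedged sketch on sparse TF-IDF features; the toy corpus and the tol value are illustrative, not taken from the example:

    # Minimal sketch: toy corpus and illustrative parameters only.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import RidgeClassifier

    docs = ["conjugate gradient solver converged",
            "the meeting is on friday",
            "ridge regression with sparse features",
            "friday agenda and meeting minutes"]
    labels = [0, 1, 0, 1]

    X = TfidfVectorizer().fit_transform(docs)       # sparse matrix
    clf = RidgeClassifier(tol=1e-1).fit(X, labels)  # sparse input goes through the CG path
    print(clf.predict(X))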

sklearn/linear_model/ridge.py

Lines changed: 72 additions & 59 deletions
@@ -2,14 +2,19 @@
 Ridge regression
 """
 
-# Author: Mathieu Blondel <[email protected]>
-#         Reuben Fletcher-Costin <[email protected]>
+# Author: Mathieu Blondel <[email protected]>
+#         Reuben Fletcher-Costin <[email protected]>
+#         Fabian Pedregosa <[email protected]>
 # License: Simplified BSD
 
 
 from abc import ABCMeta, abstractmethod
 import warnings
+
 import numpy as np
+from scipy import linalg
+from scipy import sparse
+from scipy.sparse import linalg as sp_linalg
 
 from .base import LinearClassifierMixin, LinearModel
 from ..base import RegressorMixin
@@ -19,49 +24,22 @@
 from ..grid_search import GridSearchCV
 
 
-def _solve(A, b, solver, tol):
-    # helper method for ridge_regression, A is symmetric positive
-
-    if solver == 'auto':
-        if hasattr(A, 'todense'):
-            solver = 'sparse_cg'
-        else:
-            solver = 'dense_cholesky'
-
-    if solver == 'sparse_cg':
-        if b.ndim < 2:
-            from scipy.sparse import linalg as sp_linalg
-            sol, error = sp_linalg.cg(A, b, tol=tol)
-            if error:
-                raise ValueError("Failed with error code %d" % error)
-            return sol
-        else:
-            # sparse_cg cannot handle a 2-d b.
-            sol = []
-            for j in range(b.shape[1]):
-                sol.append(_solve(A, b[:, j], solver="sparse_cg", tol=tol))
-            return np.array(sol).T
-
-    elif solver == 'dense_cholesky':
-        from scipy import linalg
-        if hasattr(A, 'todense'):
-            A = A.todense()
-        return linalg.solve(A, b, sym_pos=True, overwrite_a=True)
-    else:
-        raise NotImplementedError('Solver %s not implemented' % solver)
-
-
-def ridge_regression(X, y, alpha, sample_weight=1.0, solver='auto', tol=1e-3):
+def ridge_regression(X, y, alpha, sample_weight=1.0, solver='auto',
+                     max_iter=None, tol=1e-3):
     """Solve the ridge equation by the method of normal equations.
 
     Parameters
     ----------
-    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+    X : {array-like, sparse matrix, LinearOperator}, shape = [n_samples, n_features]
         Training data
 
     y : array-like, shape = [n_samples] or [n_samples, n_responses]
         Target values
 
+    max_iter : int, optional
+        Maximum number of iterations for conjugate gradient solver.
+        The default value is determined by scipy.sparse.linalg.
+
     sample_weight : float or numpy array of shape [n_samples]
         Individual weights for each sample
 
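As a quick illustration of the new signature, the sketch below calls ridge_regression directly with the CG solver; the toy shapes and parameter values are assumptions made for this example, not part of the commit:

    # Hedged sketch: arbitrary toy data; parameter values are illustrative.
    import numpy as np
    import scipy.sparse as sp
    from sklearn.linear_model import ridge_regression

    X = sp.rand(1000, 300, density=0.01, format='csr')  # tall, sparse design
    y = np.random.randn(1000)

    # max_iter and tol are forwarded to scipy.sparse.linalg.cg by the 'sparse_cg' path.
    w = ridge_regression(X, y, alpha=1.0, solver='sparse_cg', max_iter=100, tol=1e-3)
    print(w.shape)  # (300,)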
@@ -86,26 +64,55 @@ def ridge_regression(X, y, alpha, sample_weight=1.0, solver='auto', tol=1e-3):
     """
 
     n_samples, n_features = X.shape
-    is_sparse = False
 
-    if hasattr(X, 'todense'):  # lazy import of scipy.sparse
-        from scipy import sparse
-        is_sparse = sparse.issparse(X)
-
-    if is_sparse:
-        if n_features > n_samples or \
-           isinstance(sample_weight, np.ndarray) or \
-           sample_weight != 1.0:
+    if solver == 'auto':
+        # cholesky if it's a dense array and cg in
+        # any other case
+        if hasattr(X, '__array__'):
+            solver = 'dense_cholesky'
+        else:
+            solver = 'sparse_cg'
 
-            I = sparse.lil_matrix((n_samples, n_samples))
-            I.setdiag(np.ones(n_samples) * alpha * sample_weight)
-            c = _solve(X * X.T + I, y, solver, tol)
-            coef = X.T * c
+    if solver == 'sparse_cg':
+        # gradient descent
+        X1 = sp_linalg.aslinearoperator(X)
+        if y.ndim == 1:
+            y1 = np.reshape(y, (-1, 1))
         else:
-            I = sparse.lil_matrix((n_features, n_features))
-            I.setdiag(np.ones(n_features) * alpha)
-            coef = _solve(X.T * X + I, X.T * y, solver, tol)
+            y1 = y
+        coefs = np.empty((y1.shape[1], n_features))
+
+        for i in range(y1.shape[1]):
+            y_column = y1[:, i]
+            if n_features > n_samples:
+                # kernel ridge
+                # w = X.T * inv(X X^t + alpha*Id) y
+                def mv(x):
+                    return X1.matvec(X1.rmatvec(x)) + alpha * x
+                C = sp_linalg.LinearOperator(
+                    (n_samples, n_samples), matvec=mv, dtype=X.dtype)
+                coef, info = sp_linalg.cg(C, y_column, tol=tol)
+                coefs[i] = X1.rmatvec(coef)
+            else:
+                # ridge
+                # w = inv(X^t X + alpha*Id) * X.T y
+                def mv(x):
+                    return X1.rmatvec(X1.matvec(x)) + alpha * x
+                y_column = X1.rmatvec(y_column)
+                C = sp_linalg.LinearOperator(
+                    (n_features, n_features), matvec=mv, dtype=X.dtype)
+                coefs[i], info = sp_linalg.cg(C, y_column, maxiter=max_iter,
+                                              tol=tol)
+            if info != 0:
+                raise ValueError("Failed with error code %d" % info)
+
+        if y.ndim == 1:
+            return np.ravel(coefs)
+        return coefs
     else:
+        # normal equations (cholesky) method
+        if sparse.issparse(X):
+            X = X.toarray()
         if n_features > n_samples or \
            isinstance(sample_weight, np.ndarray) or \
            sample_weight != 1.0:
@@ -114,13 +121,13 @@ def ridge_regression(X, y, alpha, sample_weight=1.0, solver='auto', tol=1e-3):
             # w = X.T * inv(X X^t + alpha*Id) y
             A = np.dot(X, X.T)
             A.flat[::n_samples + 1] += alpha * sample_weight
-            coef = np.dot(X.T, _solve(A, y, solver, tol))
+            coef = np.dot(X.T, linalg.solve(A, y, sym_pos=True, overwrite_a=True))
         else:
             # ridge
             # w = inv(X^t X + alpha*Id) * X.T y
             A = np.dot(X.T, X)
             A.flat[::n_features + 1] += alpha
-            coef = _solve(A, np.dot(X.T, y), solver, tol)
+            coef = linalg.solve(A, np.dot(X.T, y), sym_pos=True, overwrite_a=True)
 
     return coef.T
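The heart of the new 'sparse_cg' branch is that CG never needs the Gram matrix itself, only products with it: a scipy LinearOperator wraps X and supplies matvec/rmatvec, and the operator v -> X^T X v + alpha*v (or X X^T v + alpha*v in the kernelized case) is handed to scipy.sparse.linalg.cg. A self-contained sketch of the same idea, with toy data and an illustrative alpha, separate from the library code above:

    # Standalone sketch of the matrix-free CG solve (toy data, illustrative alpha).
    import numpy as np
    import scipy.sparse as sp
    from scipy.sparse.linalg import LinearOperator, aslinearoperator, cg

    X = sp.rand(500, 50, density=0.05, format='csr')
    y = np.random.randn(500)
    alpha = 1.0

    X1 = aslinearoperator(X)

    def mv(v):
        # (X^T X + alpha*I) v, computed with only two matrix-vector products
        return X1.rmatvec(X1.matvec(v)) + alpha * v

    C = LinearOperator((X.shape[1], X.shape[1]), matvec=mv, dtype=X.dtype)
    w, info = cg(C, X1.rmatvec(y))
    assert info == 0  # info == 0 means CG converged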

@@ -130,7 +137,7 @@ class _BaseRidge(LinearModel):
 
     @abstractmethod
     def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
-                 copy_X=True, tol=1e-3):
+                 copy_X=True, max_iter=None, tol=1e-3):
         self.alpha = alpha
         self.fit_intercept = fit_intercept
         self.normalize = normalize
@@ -204,6 +211,10 @@ class Ridge(_BaseRidge, RegressorMixin):
     copy_X : boolean, optional, default True
         If True, X will be copied; else, it may be overwritten.
 
+    max_iter : int, optional
+        Maximum number of iterations for conjugate gradient solver.
+        The default value is determined by scipy.sparse.linalg.
+
     tol : float
         Precision of the solution.
 
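In estimator form, the new max_iter and tol parameters are forwarded to the conjugate gradient solver, so a looser tolerance or a capped iteration count trades a little precision for speed. A hedged usage sketch; the data shapes and hyperparameter values are illustrative only:

    # Hedged usage sketch: toy data, illustrative hyperparameter values.
    import numpy as np
    import scipy.sparse as sp
    from sklearn.linear_model import Ridge

    X = sp.rand(1000, 200, density=0.01, format='csr')
    y = np.random.randn(1000)

    reg = Ridge(alpha=1.0, max_iter=50, tol=1e-2).fit(X, y)
    print(reg.coef_.shape)  # (200,)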
@@ -257,6 +268,10 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge):
     copy_X : boolean, optional, default True
         If True, X will be copied; else, it may be overwritten.
 
+    max_iter : int, optional
+        Maximum number of iterations for conjugate gradient solver.
+        The default value is determined by scipy.sparse.linalg.
+
     tol : float
         Precision of the solution.
 
@@ -281,10 +296,10 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge):
         advantage of the multi-variate response support in Ridge.
     """
     def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
-                 copy_X=True, tol=1e-3, class_weight=None):
+                 copy_X=True, max_iter=None, tol=1e-3, class_weight=None):
         super(RidgeClassifier, self).__init__(alpha=alpha,
                 fit_intercept=fit_intercept, normalize=normalize,
-                copy_X=copy_X, tol=tol)
+                copy_X=copy_X, max_iter=max_iter, tol=tol)
         self.class_weight = class_weight
 
     def fit(self, X, y, solver='auto'):
@@ -381,7 +396,6 @@ def __init__(self, alphas=[0.1, 1.0, 10.0], fit_intercept=True,
     def _pre_compute(self, X, y):
         # even if X is very sparse, K is usually very dense
         K = safe_sparse_dot(X, X.T, dense_output=True)
-        from scipy import linalg
         v, Q = linalg.eigh(K)
         QT_y = np.dot(Q.T, y)
         return v, Q, QT_y
@@ -418,7 +432,6 @@ def _values(self, alpha, y, v, Q, QT_y):
         return y - (c / G_diag), c
 
     def _pre_compute_svd(self, X, y):
-        from scipy import sparse
        if sparse.issparse(X) and hasattr(X, 'toarray'):
            X = X.toarray()
        U, s, _ = np.linalg.svd(X, full_matrices=0)
