Skip to content

Commit cf4940a

Browse files
committed
ENH do normalization in single pass over data
1 parent 77dc060 commit cf4940a

File tree

2 files changed

+37
-47
lines changed

2 files changed

+37
-47
lines changed

sklearn/preprocessing.py

Lines changed: 18 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,11 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
141141
that it is in the given range on the training set, i.e. between
142142
zero and one.
143143
144+
The standardization is given by::
145+
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
146+
X_scaled = X_std * (max - min) + min
147+
where min, max = feature_range.
148+
144149
This standardization is often used as an alternative to zero mean,
145150
unit variance scaling.
146151
It is in particular useful for sparse positive data, as it retains the
@@ -159,46 +164,36 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
159164
Attributes
160165
----------
161166
min_ : ndarray, shape (n_features,)
162-
Per feature minimum of the training data.
167+
Per feature adjustment for minimum.
163168
164169
scale_ : ndarray, shape (n_features,)
165-
Per feature range of the training data, i.e. max - min.
170+
Per feature relative scaling of the data.
166171
"""
167172

168173
def __init__(self, feature_range=(0, 1), copy=True):
169174
self.feature_range = feature_range
170175
self.copy = copy
171176

172177
def fit(self, X, y=None):
178+
X = check_arrays(X, sparse_format="dense", copy=self.copy)[0]
173179
feature_range = self.feature_range
174180
if feature_range[0] >= feature_range[1]:
175181
raise ValueError("Minimum of desired feature range must be smaller"
176182
" than maximum. Got %s." % str(feature_range))
177-
if sp.issparse(X):
178-
self.min_ = sp.min(X, axis=1)
179-
self.scale = sp.max(X, axis=1) - self.min_
180-
else:
181-
self.min_ = np.min(X, axis=1)
182-
self.scale_ = np.max(X, axis=1) - self.min_
183+
min_ = np.min(X, axis=0)
184+
scale_ = np.max(X, axis=0) - min_
185+
self.scale_ = (feature_range[1] - feature_range[0]) / scale_
186+
self.min_ = feature_range[0] - min_ * self.scale_
183187
return self
184188

185189
def transform(self, X):
186-
if self.copy:
187-
X = (X - self.min_[:, np.newaxis]) / self.scale_[:, np.newaxis]
188-
else:
189-
X -= self.min_
190-
X /= self.scale_
191-
feature_min, feature_max = self.feature_range
192-
if feature_min != 0:
193-
X += feature_min
194-
# denominator is X.max after adding min
195-
max_scale = feature_max / (feature_min + 1.)
196-
if max_scale != 1:
197-
X *= max_scale
190+
X = check_arrays(X, sparse_format="dense", copy=self.copy)[0]
191+
X *= self.scale_
192+
X += self.min_
198193
return X
199194

200195

201-
class UnitVarianceScaler(BaseEstimator, TransformerMixin):
196+
class StandardScaler(BaseEstimator, TransformerMixin):
202197
"""Standardize features by removing the mean and scaling to unit variance
203198
204199
Centering and scaling happen independently on each feature by computing
@@ -351,9 +346,9 @@ def inverse_transform(self, X, copy=None):
351346
return X
352347

353348

354-
class Scaler(UnitVarianceScaler):
349+
class Scaler(StandardScaler):
355350
def __init__(self, copy=True, with_mean=True, with_std=True):
356-
warnings.warn("Scaler was renamed to UnitVarianceScaler. The old name "
351+
warnings.warn("Scaler was renamed to StandardScaler. The old name "
357352
" will be removed in 0.15.", DeprecationWarning)
358353
super(Scaler, self).__init__(copy, with_mean, with_std)
359354

sklearn/tests/test_preprocessing.py

Lines changed: 19 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from sklearn.preprocessing import LabelEncoder
1717
from sklearn.preprocessing import Normalizer
1818
from sklearn.preprocessing import normalize
19-
from sklearn.preprocessing import UnitVarianceScaler
19+
from sklearn.preprocessing import StandardScaler
2020
from sklearn.preprocessing import scale
2121
from sklearn.preprocessing import MinMaxScaler
2222

@@ -38,7 +38,7 @@ def test_scaler_1d():
3838
X = rng.randn(5)
3939
X_orig_copy = X.copy()
4040

41-
scaler = UnitVarianceScaler()
41+
scaler = StandardScaler()
4242
X_scaled = scaler.fit(X).transform(X, copy=False)
4343
assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
4444
assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
@@ -49,7 +49,7 @@ def test_scaler_1d():
4949

5050
# Test with 1D list
5151
X = [0., 1., 2, 0.4, 1.]
52-
scaler = UnitVarianceScaler()
52+
scaler = StandardScaler()
5353
X_scaled = scaler.fit(X).transform(X, copy=False)
5454
assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
5555
assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
@@ -65,7 +65,7 @@ def test_scaler_2d_arrays():
6565
X = rng.randn(4, 5)
6666
X[:, 0] = 0.0 # first feature is always of zero
6767

68-
scaler = UnitVarianceScaler()
68+
scaler = StandardScaler()
6969
X_scaled = scaler.fit(X).transform(X, copy=True)
7070
assert_false(np.any(np.isnan(X_scaled)))
7171

@@ -99,7 +99,7 @@ def test_scaler_2d_arrays():
9999

100100
X = rng.randn(4, 5)
101101
X[:, 0] = 1.0 # first feature is a constant, non zero feature
102-
scaler = UnitVarianceScaler()
102+
scaler = StandardScaler()
103103
X_scaled = scaler.fit(X).transform(X, copy=True)
104104
assert_false(np.any(np.isnan(X_scaled)))
105105
assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
@@ -113,19 +113,14 @@ def test_min_max_scaler():
113113
scaler = MinMaxScaler()
114114
# default params
115115
X_trans = scaler.fit_transform(X)
116-
assert_equal(X_trans.min(axis=1), 0)
117-
assert_equal(X_trans.max(axis=1), 1)
116+
assert_equal(X_trans.min(axis=0), 0)
117+
assert_equal(X_trans.max(axis=0), 1)
118118

119119
# not default params
120120
scaler = MinMaxScaler(feature_range=(1, 2))
121121
X_trans = scaler.fit_transform(X)
122-
assert_equal(X_trans.min(axis=1), 1)
123-
assert_equal(X_trans.max(axis=1), 2)
124-
125-
# sparse
126-
X_trans = scaler.fit_transform(sp.csr_matrix(X))
127-
assert_equal(X_trans.min(axis=1), 1)
128-
assert_equal(X_trans.max(axis=1), 2)
122+
assert_equal(X_trans.min(axis=0), 1)
123+
assert_equal(X_trans.max(axis=0), 2)
129124

130125

131126
def test_scaler_without_centering():
@@ -134,11 +129,11 @@ def test_scaler_without_centering():
134129
X[:, 0] = 0.0 # first feature is always of zero
135130
X_csr = sp.csr_matrix(X)
136131

137-
scaler = UnitVarianceScaler(with_mean=False).fit(X)
132+
scaler = StandardScaler(with_mean=False).fit(X)
138133
X_scaled = scaler.transform(X, copy=True)
139134
assert_false(np.any(np.isnan(X_scaled)))
140135

141-
scaler_csr = UnitVarianceScaler(with_mean=False).fit(X_csr)
136+
scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
142137
X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
143138
assert_false(np.any(np.isnan(X_csr_scaled.data)))
144139

@@ -169,18 +164,18 @@ def test_scaler_without_centering():
169164

170165

171166
def test_scaler_without_copy():
172-
"""Check that UnitVarianceScaler.fit does not change input"""
167+
"""Check that StandardScaler.fit does not change input"""
173168
rng = np.random.RandomState(42)
174169
X = rng.randn(4, 5)
175170
X[:, 0] = 0.0 # first feature is always of zero
176171
X_csr = sp.csr_matrix(X)
177172

178173
X_copy = X.copy()
179-
UnitVarianceScaler(copy=False).fit(X)
174+
StandardScaler(copy=False).fit(X)
180175
assert_array_equal(X, X_copy)
181176

182177
X_csr_copy = X_csr.copy()
183-
UnitVarianceScaler(with_mean=False, copy=False).fit(X_csr)
178+
StandardScaler(with_mean=False, copy=False).fit(X_csr)
184179
assert_array_equal(X_csr.toarray(), X_csr_copy.toarray())
185180

186181

@@ -191,10 +186,10 @@ def test_scale_sparse_with_mean_raise_exception():
191186

192187
# check scaling and fit with direct calls on sparse data
193188
assert_raises(ValueError, scale, X_csr, with_mean=True)
194-
assert_raises(ValueError, UnitVarianceScaler(with_mean=True).fit, X_csr)
189+
assert_raises(ValueError, StandardScaler(with_mean=True).fit, X_csr)
195190

196191
# check transform and inverse_transform after a fit on a dense array
197-
scaler = UnitVarianceScaler(with_mean=True).fit(X)
192+
scaler = StandardScaler(with_mean=True).fit(X)
198193
assert_raises(ValueError, scaler.transform, X_csr)
199194

200195
X_transformed_csr = sp.csr_matrix(scaler.transform(X))
@@ -518,11 +513,11 @@ def test_label_binarizer_multilabel_unlabeled():
518513

519514

520515
def test_center_kernel():
521-
"""Test that KernelCenterer is equivalent to UnitVarianceScaler
516+
"""Test that KernelCenterer is equivalent to StandardScaler
522517
in feature space"""
523518
rng = np.random.RandomState(0)
524519
X_fit = rng.random_sample((5, 4))
525-
scaler = UnitVarianceScaler(with_std=False)
520+
scaler = StandardScaler(with_std=False)
526521
scaler.fit(X_fit)
527522
X_fit_centered = scaler.transform(X_fit)
528523
K_fit = np.dot(X_fit, X_fit.T)
@@ -545,7 +540,7 @@ def test_center_kernel():
545540
def test_fit_transform():
546541
rng = np.random.RandomState(0)
547542
X = rng.random_sample((5, 4))
548-
for obj in ((UnitVarianceScaler(), Normalizer(), Binarizer())):
543+
for obj in ((StandardScaler(), Normalizer(), Binarizer())):
549544
X_transformed = obj.fit(X).transform(X)
550545
X_transformed2 = obj.fit_transform(X)
551546
assert_array_equal(X_transformed, X_transformed2)

0 commit comments

Comments
 (0)