Skip to content

Commit cf4940a

Browse files
committed
ENH do normalization in single pass over data
1 parent 77dc060 commit cf4940a

File tree

2 files changed

+37
-47
lines changed

2 files changed

+37
-47
lines changed

sklearn/preprocessing.py

Lines changed: 18 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,11 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
141141
that it is in the given range on the training set, i.e. between
142142
zero and one.
143143
144+
The standardization is given by::
145+
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
146+
X_scaled = X_std * (max - min) + min
147+
where min, max = feature_range.
148+
144149
This standardization is often used as an alternative to zero mean,
145150
unit variance scaling.
146151
It is in particular useful for sparse positive data, as it retains the
@@ -159,46 +164,36 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
159164
Attributes
160165
----------
161166
min_ : ndarray, shape (n_features,)
162-
Per feature minimum of the training data.
167+
Per feature adjustment for minimum.
163168
164169
scale_ : ndarray, shape (n_features,)
165-
Per feature range of the training data, i.e. max - min.
170+
Per feature relative scaling of the data.
166171
"""
167172

168173
def __init__(self, feature_range=(0, 1), copy=True):
169174
self.feature_range = feature_range
170175
self.copy = copy
171176

172177
def fit(self, X, y=None):
178+
X = check_arrays(X, sparse_format="dense", copy=self.copy)[0]
173179
feature_range = self.feature_range
174180
if feature_range[0] >= feature_range[1]:
175181
raise ValueError("Minimum of desired feature range must be smaller"
176182
" than maximum. Got %s." % str(feature_range))
177-
if sp.issparse(X):
178-
self.min_ = sp.min(X, axis=1)
179-
self.scale = sp.max(X, axis=1) - self.min_
180-
else:
181-
self.min_ = np.min(X, axis=1)
182-
self.scale_ = np.max(X, axis=1) - self.min_
183+
min_ = np.min(X, axis=0)
184+
scale_ = np.max(X, axis=0) - min_
185+
self.scale_ = (feature_range[1] - feature_range[0]) / scale_
186+
self.min_ = feature_range[0] - min_ * self.scale_
183187
return self
184188

185189
def transform(self, X):
186-
if self.copy:
187-
X = (X - self.min_[:, np.newaxis]) / self.scale_[:, np.newaxis]
188-
else:
189-
X -= self.min_
190-
X /= self.scale_
191-
feature_min, feature_max = self.feature_range
192-
if feature_min != 0:
193-
X += feature_min
194-
# denominator is X.max after adding min
195-
max_scale = feature_max / (feature_min + 1.)
196-
if max_scale != 1:
197-
X *= max_scale
190+
X = check_arrays(X, sparse_format="dense", copy=self.copy)[0]
191+
X *= self.scale_
192+
X += self.min_
198193
return X
199194

200195

201-
class UnitVarianceScaler(BaseEstimator, TransformerMixin):
196+
class StandardScaler(BaseEstimator, TransformerMixin):
202197
"""Standardize features by removing the mean and scaling to unit variance
203198
204199
Centering and scaling happen independently on each feature by computing
@@ -351,9 +346,9 @@ def inverse_transform(self, X, copy=None):
351346
return X
352347

353348

354-
class Scaler(UnitVarianceScaler):
349+
class Scaler(StandardScaler):
355350
def __init__(self, copy=True, with_mean=True, with_std=True):
356-
warnings.warn("Scaler was renamed to UnitVarianceScaler. The old name "
351+
warnings.warn("Scaler was renamed to StandardScaler. The old name "
357352
" will be removed in 0.15.", DeprecationWarning)
358353
super(Scaler, self).__init__(copy, with_mean, with_std)
359354

sklearn/tests/test_preprocessing.py

Lines changed: 19 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from sklearn.preprocessing import LabelEncoder
1717
from sklearn.preprocessing import Normalizer
1818
from sklearn.preprocessing import normalize
19-
from sklearn.preprocessing import UnitVarianceScaler
19+
from sklearn.preprocessing import StandardScaler
2020
from sklearn.preprocessing import scale
2121
from sklearn.preprocessing import MinMaxScaler
2222

@@ -38,7 +38,7 @@ def test_scaler_1d():
3838
X = rng.randn(5)
3939
X_orig_copy = X.copy()
4040

41-
scaler = UnitVarianceScaler()
41+
scaler = StandardScaler()
4242
X_scaled = scaler.fit(X).transform(X, copy=False)
4343
assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
4444
assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
@@ -49,7 +49,7 @@ def test_scaler_1d():
4949

5050
# Test with 1D list
5151
X = [0., 1., 2, 0.4, 1.]
52-
scaler = UnitVarianceScaler()
52+
scaler = StandardScaler()
5353
X_scaled = scaler.fit(X).transform(X, copy=False)
5454
assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
5555
assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
@@ -65,7 +65,7 @@ def test_scaler_2d_arrays():
6565
X = rng.randn(4, 5)
6666
X[:, 0] = 0.0 # first feature is always of zero
6767

68-
scaler = UnitVarianceScaler()
68+
scaler = StandardScaler()
6969
X_scaled = scaler.fit(X).transform(X, copy=True)
7070
assert_false(np.any(np.isnan(X_scaled)))
7171

@@ -99,7 +99,7 @@ def test_scaler_2d_arrays():
9999

100100
X = rng.randn(4, 5)
101101
X[:, 0] = 1.0 # first feature is a constant, non zero feature
102-
scaler = UnitVarianceScaler()
102+
scaler = StandardScaler()
103103
X_scaled = scaler.fit(X).transform(X, copy=True)
104104
assert_false(np.any(np.isnan(X_scaled)))
105105
assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
@@ -113,19 +113,14 @@ def test_min_max_scaler():
113113
scaler = MinMaxScaler()
114114
# default params
115115
X_trans = scaler.fit_transform(X)
116-
assert_equal(X_trans.min(axis=1), 0)
117-
assert_equal(X_trans.max(axis=1), 1)
116+
assert_equal(X_trans.min(axis=0), 0)
117+
assert_equal(X_trans.max(axis=0), 1)
118118

119119
# not default params
120120
scaler = MinMaxScaler(feature_range=(1, 2))
121121
X_trans = scaler.fit_transform(X)
122-
assert_equal(X_trans.min(axis=1), 1)
123-
assert_equal(X_trans.max(axis=1), 2)
124-
125-
# sparse
126-
X_trans = scaler.fit_transform(sp.csr_matrix(X))
127-
assert_equal(X_trans.min(axis=1), 1)
128-
assert_equal(X_trans.max(axis=1), 2)
122+
assert_equal(X_trans.min(axis=0), 1)
123+
assert_equal(X_trans.max(axis=0), 2)
129124

130125

131126
def test_scaler_without_centering():
@@ -134,11 +129,11 @@ def test_scaler_without_centering():
134129
X[:, 0] = 0.0 # first feature is always of zero
135130
X_csr = sp.csr_matrix(X)
136131

137-
scaler = UnitVarianceScaler(with_mean=False).fit(X)
132+
scaler = StandardScaler(with_mean=False).fit(X)
138133
X_scaled = scaler.transform(X, copy=True)
139134
assert_false(np.any(np.isnan(X_scaled)))
140135

141-
scaler_csr = UnitVarianceScaler(with_mean=False).fit(X_csr)
136+
scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
142137
X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
143138
assert_false(np.any(np.isnan(X_csr_scaled.data)))
144139

@@ -169,18 +164,18 @@ def test_scaler_without_centering():
169164

170165

171166
def test_scaler_without_copy():
172-
"""Check that UnitVarianceScaler.fit does not change input"""
167+
"""Check that StandardScaler.fit does not change input"""
173168
rng = np.random.RandomState(42)
174169
X = rng.randn(4, 5)
175170
X[:, 0] = 0.0 # first feature is always of zero
176171
X_csr = sp.csr_matrix(X)
177172

178173
X_copy = X.copy()
179-
UnitVarianceScaler(copy=False).fit(X)
174+
StandardScaler(copy=False).fit(X)
180175
assert_array_equal(X, X_copy)
181176

182177
X_csr_copy = X_csr.copy()
183-
UnitVarianceScaler(with_mean=False, copy=False).fit(X_csr)
178+
StandardScaler(with_mean=False, copy=False).fit(X_csr)
184179
assert_array_equal(X_csr.toarray(), X_csr_copy.toarray())
185180

186181

@@ -191,10 +186,10 @@ def test_scale_sparse_with_mean_raise_exception():
191186

192187
# check scaling and fit with direct calls on sparse data
193188
assert_raises(ValueError, scale, X_csr, with_mean=True)
194-
assert_raises(ValueError, UnitVarianceScaler(with_mean=True).fit, X_csr)
189+
assert_raises(ValueError, StandardScaler(with_mean=True).fit, X_csr)
195190

196191
# check transform and inverse_transform after a fit on a dense array
197-
scaler = UnitVarianceScaler(with_mean=True).fit(X)
192+
scaler = StandardScaler(with_mean=True).fit(X)
198193
assert_raises(ValueError, scaler.transform, X_csr)
199194

200195
X_transformed_csr = sp.csr_matrix(scaler.transform(X))
@@ -518,11 +513,11 @@ def test_label_binarizer_multilabel_unlabeled():
518513

519514

520515
def test_center_kernel():
521-
"""Test that KernelCenterer is equivalent to UnitVarianceScaler
516+
"""Test that KernelCenterer is equivalent to StandardScaler
522517
in feature space"""
523518
rng = np.random.RandomState(0)
524519
X_fit = rng.random_sample((5, 4))
525-
scaler = UnitVarianceScaler(with_std=False)
520+
scaler = StandardScaler(with_std=False)
526521
scaler.fit(X_fit)
527522
X_fit_centered = scaler.transform(X_fit)
528523
K_fit = np.dot(X_fit, X_fit.T)
@@ -545,7 +540,7 @@ def test_center_kernel():
545540
def test_fit_transform():
546541
rng = np.random.RandomState(0)
547542
X = rng.random_sample((5, 4))
548-
for obj in ((UnitVarianceScaler(), Normalizer(), Binarizer())):
543+
for obj in ((StandardScaler(), Normalizer(), Binarizer())):
549544
X_transformed = obj.fit(X).transform(X)
550545
X_transformed2 = obj.fit_transform(X)
551546
assert_array_equal(X_transformed, X_transformed2)

0 commit comments

Comments
 (0)