Skip to content

Commit e05e1a4

Browse files
committed
ENH reduce memory usage of DictVectorizer.transform in sparse case
Don't materialize iterable's entire contents.
1 parent d5126a7 commit e05e1a4

File tree

2 files changed

+9
-3
lines changed

2 files changed

+9
-3
lines changed

sklearn/feature_extraction/dict_vectorizer.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -175,8 +175,6 @@ def transform(self, X, y=None):
175175
Xa : {array, sparse matrix}
176176
Feature vectors; always 2-d.
177177
"""
178-
X = _tosequence(X)
179-
180178
dtype = self.dtype
181179
vocab = self.vocabulary_
182180

@@ -198,10 +196,12 @@ def transform(self, X, y=None):
198196
except KeyError:
199197
pass
200198

199+
shape = (i + 1, len(vocab))
201200
return sp.coo_matrix((values, (i_ind, j_ind)),
202-
shape=(len(X), len(vocab)), dtype=dtype)
201+
shape=shape, dtype=dtype)
203202

204203
else:
204+
X = _tosequence(X)
205205
Xa = np.zeros((len(X), len(vocab)), dtype=dtype)
206206

207207
for i, x in enumerate(X):

sklearn/feature_extraction/tests/test_dict_vectorizer.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,12 @@ def test_dictvectorizer():
2626
assert_equal(X.sum(), 14)
2727
assert_equal(v.inverse_transform(X), D)
2828

29+
if sparse:
30+
# COO matrices can't be compared for equality
31+
assert_array_equal(X.A, v.transform(D).A)
32+
else:
33+
assert_array_equal(X, v.transform(D))
34+
2935

3036
def test_feature_selection():
3137
# make two feature dicts with two useful features and a bunch of useless

0 commit comments

Comments
 (0)