Skip to content

Commit 3de0f43

Browse files
committed
ENH add dict support to FeatureHasher and make it the default input_type
1 parent 205a947 commit 3de0f43

File tree

4 files changed

+57
-23
lines changed

4 files changed

+57
-23
lines changed

doc/modules/feature_extraction.rst

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,12 @@ This undoes some of the collision handling,
134134
but allows the output to be passed to estimators like :class:`MultinomialNB`
135135
or ``chi2`` feature selectors that expect non-negative inputs.
136136

137-
:class:`FeatureHasher` accepts either ``(feature, value)`` or strings,
137+
:class:`FeatureHasher` accepts either mappings
138+
(like Python's ``dict`` and its variants in the ``collections`` module),
139+
``(feature, value)`` pairs, or strings,
138140
depending on the constructor parameter ``input_type``.
139-
Single strings have an implicit value of 1.
141+
Mappings are treated as lists of ``(feature, value)`` pairs,
142+
while single strings have an implicit value of 1.
140143
If a feature occurs multiple times in a sample, the values will be summed.
141144
Feature hashing can be employed in document classification,
142145
but unlike :class:`text.CountVectorizer`,
@@ -166,11 +169,16 @@ can be constructed using::
166169

167170
raw_X = (token_features(tok, pos_tagger(tok)) for tok in corpus)
168171

172+
and fed to a hasher with::
173+
174+
hasher = FeatureHasher(input_type="string")
175+
X = hasher.transform(raw_X)
176+
177+
to get a ``scipy.sparse`` matrix ``X``.
178+
169179
Note the use of a generator comprehension,
170180
which introduces laziness into the feature extraction:
171-
tokens are only processed on demand from the :class:`FeatureHasher`
172-
(which should be given the constructor parameter ``input_type=strings``
173-
for this particular example).
181+
tokens are only processed on demand from the hasher.
174182

175183

176184
.. topic:: References:

examples/hashing_vs_dict_vectorizer.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
{document_classification_20newsgroups,clustering}.py for actual learning
77
on text documents.
88
9-
A discrepancy between the number of tokens reported for DictVectorizer and
9+
A discrepancy between the number of terms reported for DictVectorizer and
1010
for FeatureHasher is to be expected due to hash collisions.
1111
"""
1212

@@ -49,9 +49,12 @@ def token_freqs(doc):
4949

5050
categories = [
5151
'alt.atheism',
52-
'talk.religion.misc',
5352
'comp.graphics',
53+
'comp.sys.ibm.pc.hardware',
54+
'misc.forsale',
55+
'rec.autos',
5456
'sci.space',
57+
'talk.religion.misc',
5558
]
5659
# Uncomment the following line to use a larger set (11k+ documents)
5760
#categories=None
@@ -85,14 +88,14 @@ def token_freqs(doc):
8588
print("FeatureHasher on frequency dicts")
8689
t0 = time()
8790
hasher = FeatureHasher(n_features=n_features)
88-
X = hasher.transform(token_freqs(d).iteritems() for d in raw_data)
91+
X = hasher.transform(token_freqs(d) for d in raw_data)
8992
print("done in %fs" % (time() - t0))
9093
print("Found %d unique terms" % n_nonzero_columns(X))
9194
print()
9295

9396
print("FeatureHasher on raw tokens")
9497
t0 = time()
95-
hasher = FeatureHasher(n_features=n_features, input_type="strings")
98+
hasher = FeatureHasher(n_features=n_features, input_type="string")
9699
X = hasher.transform(tokens(d) for d in raw_data)
97100
print("done in %fs" % (time() - t0))
98101
print("Found %d unique terms" % n_nonzero_columns(X))

sklearn/feature_extraction/hashing.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Author: Lars Buitinck <[email protected]>
22
# License: 3-clause BSD.
33

4+
import itertools
45
import numbers
56

67
import numpy as np
@@ -10,6 +11,11 @@
1011
from ..base import BaseEstimator, TransformerMixin
1112

1213

14+
def _iteritems(d):
15+
"""Like d.iteritems, but accepts any collections.Mapping."""
16+
return d.iteritems() if hasattr(d, "iteritems") else d.items()
17+
18+
1319
class FeatureHasher(BaseEstimator, TransformerMixin):
1420
"""Implements feature hashing, aka the hashing trick.
1521
@@ -37,10 +43,14 @@ class FeatureHasher(BaseEstimator, TransformerMixin):
3743
as the dtype argument. Do not set this to bool, np.boolean or any
3844
unsigned integer type.
3945
input_type : string, optional
40-
Either "pairs" (the default) to accept pairs of (feature_name, value)
41-
where feature_name is a string to be hashed and value a number, or
42-
"strings" to accept only feature names with an implicit value of 1.
43-
The value's sign might be flipped (but see non_negative, below).
46+
Either "dict" (the default) to accept dictionaries over
47+
(feature_name, value); "pair" to accept pairs of (feature_name, value);
48+
or "string" to accept single strings.
49+
feature_name should be a string, while value should be a number.
50+
In the case of "string", a value of 1 is implied.
51+
The feature_name is hashed to find the appropriate column for the
52+
feature. The value's sign might be flipped in the output (but see
53+
non_negative, below).
4454
non_negative : boolean, optional
4555
Whether output matrices should contain non-negative values only;
4656
effectively calls abs on the matrix prior to returning it.
@@ -50,7 +60,7 @@ class FeatureHasher(BaseEstimator, TransformerMixin):
5060
5161
"""
5262

53-
def __init__(self, n_features=(2 ** 20), input_type="pairs",
63+
def __init__(self, n_features=(2 ** 20), input_type="dict",
5464
dtype=np.float64, non_negative=False):
5565
self._validate_params(n_features, input_type)
5666

@@ -69,9 +79,9 @@ def _validate_params(n_features, input_type):
6979
elif n_features < 1:
7080
raise ValueError("Invalid number of features (%d)." % n_features)
7181

72-
if input_type not in ("pairs", "strings"):
73-
raise ValueError("input_type must be 'pairs' or 'strings', got %r."
74-
% input_type)
82+
if input_type not in ("dict", "pair", "string"):
83+
raise ValueError("input_type must be 'dict', 'pair' or 'string',"
84+
" got %r." % input_type)
7585

7686
def fit(self, X=None, y=None):
7787
"""No-op.
@@ -108,7 +118,9 @@ def transform(self, raw_X, y=None):
108118
109119
"""
110120
raw_X = iter(raw_X)
111-
if self.input_type == "strings":
121+
if self.input_type == "dict":
122+
raw_X = (_iteritems(d) for d in raw_X)
123+
elif self.input_type == "string":
112124
raw_X = (((f, 1) for f in x) for x in raw_X)
113125
indices, indptr, values = \
114126
_hashing.transform(raw_X, self.n_features, self.dtype)

sklearn/feature_extraction/tests/test_feature_hasher.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@
88
from sklearn.utils.testing import assert_in
99

1010

11+
def test_feature_hasher_dicts():
12+
h = FeatureHasher()
13+
assert_equal("dict", h.input_type)
14+
15+
raw_X = [{"dada": 42, "tzara": 37}, {"gaga": 17}]
16+
X1 = FeatureHasher().transform(raw_X)
17+
X2 = FeatureHasher(input_type="pair").transform(d.iteritems()
18+
for d in raw_X)
19+
assert_array_equal(X1.toarray(), X2.toarray())
20+
21+
1122
def test_feature_hasher_strings():
1223
raw_X = [[u"foo", "bar", "baz", "foo"], # note: duplicate
1324
[u"bar", "baz", "quux"]]
@@ -17,7 +28,7 @@ def test_feature_hasher_strings():
1728

1829
it = (x for x in raw_X) # iterable
1930

20-
h = FeatureHasher(n_features, non_negative=True, input_type="strings")
31+
h = FeatureHasher(n_features, non_negative=True, input_type="string")
2132
X = h.transform(it)
2233

2334
assert_equal(X.shape[0], len(raw_X))
@@ -33,7 +44,7 @@ def test_feature_hasher_strings():
3344
def test_feature_hasher_pairs():
3445
raw_X = (d.iteritems() for d in [{"foo": 1, "bar": 2},
3546
{"baz": 3, "quux": 4, "foo": -1}])
36-
h = FeatureHasher(n_features=4096)
47+
h = FeatureHasher(n_features=4096, input_type="pair")
3748
x1, x2 = h.transform(raw_X).toarray()
3849
x1_nz = sorted(np.abs(x1[x1 != 0]))
3950
x2_nz = sorted(np.abs(x2[x2 != 0]))
@@ -45,7 +56,7 @@ def test_hash_empty_input():
4556
n_features = 16
4657
raw_X = [[], (), xrange(0)]
4758

48-
h = FeatureHasher(n_features=n_features, input_type="strings")
59+
h = FeatureHasher(n_features=n_features, input_type="string")
4960
X = h.transform(raw_X)
5061

5162
assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))
@@ -59,8 +70,8 @@ def test_hasher_invalid_input():
5970

6071
h = FeatureHasher(n_features=np.uint16(2**6))
6172
assert_raises(ValueError, h.transform, [])
62-
assert_raises(TypeError, h.transform, [[5.5]])
63-
assert_raises(TypeError, h.transform, [[None]])
73+
assert_raises(Exception, h.transform, [[5.5]])
74+
assert_raises(Exception, h.transform, [[None]])
6475

6576

6677
def test_hasher_set_params():

0 commit comments

Comments
 (0)