Skip to content

Commit 3de0f43

Browse files
committed
ENH add dict support to FeatureHasher and make it the default input_type
1 parent 205a947 commit 3de0f43

File tree

4 files changed

+57
-23
lines changed

4 files changed

+57
-23
lines changed

doc/modules/feature_extraction.rst

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,12 @@ This undoes some of the collision handling,
134134
but allows the output to be passed to estimators like :class:`MultinomialNB`
135135
or ``chi2`` feature selectors that expect non-negative inputs.
136136

137-
:class:`FeatureHasher` accepts either ``(feature, value)`` or strings,
137+
:class:`FeatureHasher` accepts either mappings
138+
(like Python's ``dict`` and its variants in the ``collections`` module),
139+
``(feature, value)`` pairs, or strings,
138140
depending on the constructor parameter ``input_type``.
139-
Single strings have an implicit value of 1.
141+
Mappings are treated as lists of ``(feature, value)`` pairs,
142+
while single strings have an implicit value of 1.
140143
If a feature occurs multiple times in a sample, the values will be summed.
141144
Feature hashing can be employed in document classification,
142145
but unlike :class:`text.CountVectorizer`,
@@ -166,11 +169,16 @@ can be constructed using::
166169

167170
raw_X = (token_features(tok, pos_tagger(tok)) for tok in corpus)
168171

172+
and fed to a hasher with::
173+
174+
hasher = FeatureHasher(input_type="string")
175+
X = hasher.transform(raw_X)
176+
177+
to get a ``scipy.sparse`` matrix ``X``.
178+
169179
Note the use of a generator comprehension,
170180
which introduces laziness into the feature extraction:
171-
tokens are only processed on demand from the :class:`FeatureHasher`
172-
(which should be given the constructor parameter ``input_type=strings``
173-
for this particular example).
181+
tokens are only processed on demand from the hasher.
174182

175183

176184
.. topic:: References:

examples/hashing_vs_dict_vectorizer.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
{document_classification_20newsgroups,clustering}.py for actual learning
77
on text documents.
88
9-
A discrepancy between the number of tokens reported for DictVectorizer and
9+
A discrepancy between the number of terms reported for DictVectorizer and
1010
for FeatureHasher is to be expected due to hash collisions.
1111
"""
1212

@@ -49,9 +49,12 @@ def token_freqs(doc):
4949

5050
categories = [
5151
'alt.atheism',
52-
'talk.religion.misc',
5352
'comp.graphics',
53+
'comp.sys.ibm.pc.hardware',
54+
'misc.forsale',
55+
'rec.autos',
5456
'sci.space',
57+
'talk.religion.misc',
5558
]
5659
# Uncomment the following line to use a larger set (11k+ documents)
5760
#categories=None
@@ -85,14 +88,14 @@ def token_freqs(doc):
8588
print("FeatureHasher on frequency dicts")
8689
t0 = time()
8790
hasher = FeatureHasher(n_features=n_features)
88-
X = hasher.transform(token_freqs(d).iteritems() for d in raw_data)
91+
X = hasher.transform(token_freqs(d) for d in raw_data)
8992
print("done in %fs" % (time() - t0))
9093
print("Found %d unique terms" % n_nonzero_columns(X))
9194
print()
9295

9396
print("FeatureHasher on raw tokens")
9497
t0 = time()
95-
hasher = FeatureHasher(n_features=n_features, input_type="strings")
98+
hasher = FeatureHasher(n_features=n_features, input_type="string")
9699
X = hasher.transform(tokens(d) for d in raw_data)
97100
print("done in %fs" % (time() - t0))
98101
print("Found %d unique terms" % n_nonzero_columns(X))

sklearn/feature_extraction/hashing.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Author: Lars Buitinck <[email protected]>
22
# License: 3-clause BSD.
33

4+
import itertools
45
import numbers
56

67
import numpy as np
@@ -10,6 +11,11 @@
1011
from ..base import BaseEstimator, TransformerMixin
1112

1213

14+
def _iteritems(d):
15+
"""Like d.iteritems, but accepts any collections.Mapping."""
16+
return d.iteritems() if hasattr(d, "iteritems") else d.items()
17+
18+
1319
class FeatureHasher(BaseEstimator, TransformerMixin):
1420
"""Implements feature hashing, aka the hashing trick.
1521
@@ -37,10 +43,14 @@ class FeatureHasher(BaseEstimator, TransformerMixin):
3743
as the dtype argument. Do not set this to bool, np.boolean or any
3844
unsigned integer type.
3945
input_type : string, optional
40-
Either "pairs" (the default) to accept pairs of (feature_name, value)
41-
where feature_name is a string to be hashed and value a number, or
42-
"strings" to accept only feature names with an implicit value of 1.
43-
The value's sign might be flipped (but see non_negative, below).
46+
Either "dict" (the default) to accept dictionaries over
47+
(feature_name, value); "pair" to accept pairs of (feature_name, value);
48+
or "string" to accept single strings.
49+
feature_name should be a string, while value should be a number.
50+
In the case of "string", a value of 1 is implied.
51+
The feature_name is hashed to find the appropriate column for the
52+
feature. The value's sign might be flipped in the output (but see
53+
non_negative, below).
4454
non_negative : boolean, optional
4555
Whether output matrices should contain non-negative values only;
4656
effectively calls abs on the matrix prior to returning it.
@@ -50,7 +60,7 @@ class FeatureHasher(BaseEstimator, TransformerMixin):
5060
5161
"""
5262

53-
def __init__(self, n_features=(2 ** 20), input_type="pairs",
63+
def __init__(self, n_features=(2 ** 20), input_type="dict",
5464
dtype=np.float64, non_negative=False):
5565
self._validate_params(n_features, input_type)
5666

@@ -69,9 +79,9 @@ def _validate_params(n_features, input_type):
6979
elif n_features < 1:
7080
raise ValueError("Invalid number of features (%d)." % n_features)
7181

72-
if input_type not in ("pairs", "strings"):
73-
raise ValueError("input_type must be 'pairs' or 'strings', got %r."
74-
% input_type)
82+
if input_type not in ("dict", "pair", "string"):
83+
raise ValueError("input_type must be 'dict', 'pair' or 'string',"
84+
" got %r." % input_type)
7585

7686
def fit(self, X=None, y=None):
7787
"""No-op.
@@ -108,7 +118,9 @@ def transform(self, raw_X, y=None):
108118
109119
"""
110120
raw_X = iter(raw_X)
111-
if self.input_type == "strings":
121+
if self.input_type == "dict":
122+
raw_X = (_iteritems(d) for d in raw_X)
123+
elif self.input_type == "string":
112124
raw_X = (((f, 1) for f in x) for x in raw_X)
113125
indices, indptr, values = \
114126
_hashing.transform(raw_X, self.n_features, self.dtype)

sklearn/feature_extraction/tests/test_feature_hasher.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@
88
from sklearn.utils.testing import assert_in
99

1010

11+
def test_feature_hasher_dicts():
12+
h = FeatureHasher()
13+
assert_equal("dict", h.input_type)
14+
15+
raw_X = [{"dada": 42, "tzara": 37}, {"gaga": 17}]
16+
X1 = FeatureHasher().transform(raw_X)
17+
X2 = FeatureHasher(input_type="pair").transform(d.iteritems()
18+
for d in raw_X)
19+
assert_array_equal(X1.toarray(), X2.toarray())
20+
21+
1122
def test_feature_hasher_strings():
1223
raw_X = [[u"foo", "bar", "baz", "foo"], # note: duplicate
1324
[u"bar", "baz", "quux"]]
@@ -17,7 +28,7 @@ def test_feature_hasher_strings():
1728

1829
it = (x for x in raw_X) # iterable
1930

20-
h = FeatureHasher(n_features, non_negative=True, input_type="strings")
31+
h = FeatureHasher(n_features, non_negative=True, input_type="string")
2132
X = h.transform(it)
2233

2334
assert_equal(X.shape[0], len(raw_X))
@@ -33,7 +44,7 @@ def test_feature_hasher_strings():
3344
def test_feature_hasher_pairs():
3445
raw_X = (d.iteritems() for d in [{"foo": 1, "bar": 2},
3546
{"baz": 3, "quux": 4, "foo": -1}])
36-
h = FeatureHasher(n_features=4096)
47+
h = FeatureHasher(n_features=4096, input_type="pair")
3748
x1, x2 = h.transform(raw_X).toarray()
3849
x1_nz = sorted(np.abs(x1[x1 != 0]))
3950
x2_nz = sorted(np.abs(x2[x2 != 0]))
@@ -45,7 +56,7 @@ def test_hash_empty_input():
4556
n_features = 16
4657
raw_X = [[], (), xrange(0)]
4758

48-
h = FeatureHasher(n_features=n_features, input_type="strings")
59+
h = FeatureHasher(n_features=n_features, input_type="string")
4960
X = h.transform(raw_X)
5061

5162
assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))
@@ -59,8 +70,8 @@ def test_hasher_invalid_input():
5970

6071
h = FeatureHasher(n_features=np.uint16(2**6))
6172
assert_raises(ValueError, h.transform, [])
62-
assert_raises(TypeError, h.transform, [[5.5]])
63-
assert_raises(TypeError, h.transform, [[None]])
73+
assert_raises(Exception, h.transform, [[5.5]])
74+
assert_raises(Exception, h.transform, [[None]])
6475

6576

6677
def test_hasher_set_params():

0 commit comments

Comments
 (0)