"""Compares FeatureHasher and DictVectorizer by using both to vectorize
text documents.

The example demonstrates syntax and speed only; it doesn't actually do
anything useful with the extracted vectors. See the example scripts
{document_classification_20newsgroups,clustering}.py for actual learning
on text documents.

A discrepancy between the number of tokens reported for DictVectorizer and
for FeatureHasher is to be expected due to hash collisions.
"""

# Author: Lars Buitinck <[email protected]>
# License: 3-clause BSD

from __future__ import print_function
from collections import defaultdict
import re
import sys
from time import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import DictVectorizer, FeatureHasher


def n_nonzero_columns(X):
    """Returns the number of non-zero columns in a CSR matrix X."""
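    # X.nonzero() returns the (row, column) indices of all stored nonzero
    # entries; deduplicating the column indices with np.unique counts how
    # many distinct columns are actually populated.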
    return len(np.unique(X.nonzero()[1]))


def tokens(doc):
    """Extract tokens from doc.

    This uses a simple regex to break strings into tokens. For a more
    principled approach, see CountVectorizer or TfidfVectorizer.
    """
    return (tok.lower() for tok in re.findall(r"\w+", doc))


def token_freqs(doc):
    """Extract a dict mapping tokens from doc to their frequencies."""
    freq = defaultdict(int)
    for tok in tokens(doc):
        freq[tok] += 1
    return freq


categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Uncomment the following line to use a larger set (11k+ documents)
# categories = None

print(__doc__)
print("Usage: %s [n_features_for_hashing]" % sys.argv[0])
print("    The default number of features is 2**18.")
print()

try:
    n_features = int(sys.argv[1])
except IndexError:
    n_features = 2 ** 18
except ValueError:
    print("not a valid number of features: %r" % sys.argv[1])
    sys.exit(1)
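
# With the default 2**18 hash columns, the table is far larger than the
# vocabulary of this newsgroup subset, so collisions should be rare;
# passing a much smaller value on the command line makes the collision
# discrepancy described in the module docstring visible.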

print("Loading 20 newsgroups training data")
raw_data = fetch_20newsgroups(subset='train', categories=categories).data
print("%d documents" % len(raw_data))
print()

print("DictVectorizer")
t0 = time()
vectorizer = DictVectorizer()
vectorizer.fit_transform(token_freqs(d) for d in raw_data)
print("done in %fs" % (time() - t0))
print("Found %d unique terms" % len(vectorizer.get_feature_names()))
print()
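# FeatureHasher is stateless: it holds no vocabulary, so there is nothing
# to fit and a single transform call suffices; its memory use is
# independent of the number of distinct tokens seen.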
print("FeatureHasher on frequency dicts")
t0 = time()
hasher = FeatureHasher(n_features=n_features)
X = hasher.transform(token_freqs(d) for d in raw_data)
print("done in %fs" % (time() - t0))
print("Found %d unique terms" % n_nonzero_columns(X))
print()
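
# With input_type="string", each token is taken as a feature with an
# implicit value of 1; repeated tokens hash to the same column and their
# values are summed, so no per-document frequency dict is needed.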
print("FeatureHasher on raw tokens")
t0 = time()
hasher = FeatureHasher(n_features=n_features, input_type="string")
X = hasher.transform(tokens(d) for d in raw_data)
print("done in %fs" % (time() - t0))
print("Found %d unique terms" % n_nonzero_columns(X))
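
# Note: passing a much smaller n_features_for_hashing on the command line
# (say, 1000) forces collisions, since there are then far more distinct
# terms than hash columns, so the hashed counts above drop well below
# DictVectorizer's exact vocabulary size.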