
Commit 6e3472c

ENH added docs, example and tests.
1 parent 973edd5 commit 6e3472c

5 files changed: +124, -6 lines

doc/modules/classes.rst

Lines changed: 1 addition & 0 deletions
@@ -250,6 +250,7 @@ Samples generator
    :template: class.rst

    ensemble.RandomForestClassifier
+   ensemble.RandomForestHasher
    ensemble.RandomForestRegressor
    ensemble.ExtraTreesClassifier
    ensemble.ExtraTreesRegressor

sklearn/ensemble/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@
 from .base import BaseEnsemble
 from .forest import RandomForestClassifier
 from .forest import RandomForestRegressor
+from .forest import RandomForestHasher
 from .forest import ExtraTreesClassifier
 from .forest import ExtraTreesRegressor
 from .gradient_boosting import GradientBoostingClassifier
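With this re-export in place, the new estimator is importable directly from the ensemble package. A quick sanity check (assuming an installed build that contains this commit; the defaults come from the forest.py diff below):

    from sklearn.ensemble import RandomForestHasher

    hasher = RandomForestHasher()
    print(hasher.n_estimators)   # 10 trees by default
    print(hasher.max_depth)      # each grown to depth at most 5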

sklearn/ensemble/forest.py

Lines changed: 92 additions & 3 deletions
@@ -1159,7 +1159,65 @@ def __init__(self, n_estimators=10,
         self.max_features = max_features


-class RandomHashingForest(ExtraTreesClassifier):
+class RandomForestHasher(ExtraTreesClassifier):
+    """Use a completely random forest to create sparse, binary representations.
+
+    An unsupervised transformation of a dataset to a high-dimensional
+    sparse representation. A datapoint is coded according to which leaf of
+    each tree it is sorted into. Using a one-hot encoding of the leaves,
+    this leads to a binary coding with as many ones as there are trees in
+    the forest.
+
+    The dimensionality of the resulting representation is at most
+    ``n_estimators * 2 ** max_depth``.
+
+    Parameters
+    ----------
+    n_estimators : int
+        Number of trees in the forest.
+
+    max_depth : int
+        Maximum depth of each tree.
+
+    min_samples_split : integer, optional (default=1)
+        The minimum number of samples required to split an internal node.
+        Note: this parameter is tree-specific.
+
+    min_samples_leaf : integer, optional (default=1)
+        The minimum number of samples in newly created leaves. A split is
+        discarded if, after the split, one of the leaves would contain fewer
+        than ``min_samples_leaf`` samples.
+        Note: this parameter is tree-specific.
+
+    min_density : float, optional (default=0.1)
+        This parameter controls a trade-off in an optimization heuristic. It
+        controls the minimum density of the `sample_mask` (i.e. the
+        fraction of samples in the mask). If the density falls below this
+        threshold the mask is recomputed and the input data is packed,
+        which results in data copying. If `min_density` equals one, the
+        partitions are always represented as copies of the original
+        data. Otherwise, partitions are represented as bit masks (aka
+        sample masks).
+
+    n_jobs : integer, optional (default=1)
+        The number of jobs to run in parallel. If -1, then the number of
+        jobs is set to the number of cores.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance
+        used by `np.random`.
+
+    verbose : int, optional (default=0)
+        Controls the verbosity of the tree building process.
+
+    Attributes
+    ----------
+    `estimators_` : list of DecisionTreeClassifier
+        The collection of fitted sub-estimators.
+    """
+
     def __init__(self, n_estimators=10,
                  max_depth=5,
                  min_samples_split=1,
@@ -1168,7 +1226,7 @@ def __init__(self, n_estimators=10,
                  n_jobs=1,
                  random_state=None,
                  verbose=0):
-        super(RandomHashingForest, self).__init__(
+        super(RandomForestHasher, self).__init__(
             n_estimators=n_estimators,
             max_depth=max_depth,
             min_samples_split=min_samples_split,
@@ -1183,14 +1241,45 @@ def __init__(self, n_estimators=10,
             verbose=verbose)

     def fit(self, X, y=None):
+        """Fit estimator.
+
+        Parameters
+        ----------
+        X : array-like, shape=(n_samples, n_features)
+            Input data used to build forests.
+        """
         self.fit_transform(X, y)
         return self

     def fit_transform(self, X, y=None):
+        """Fit estimator and transform dataset.
+
+        Parameters
+        ----------
+        X : array-like, shape=(n_samples, n_features)
+            Input data used to build forests.
+
+        Returns
+        -------
+        X_transformed : sparse matrix, shape=(n_samples, n_out)
+            Transformed dataset.
+        """
         y = np.arange(len(X))
-        super(RandomHashingForest, self).fit(X, y)
+        super(RandomForestHasher, self).fit(X, y)
         self.one_hot_encoder_ = OneHotEncoder()
         return self.one_hot_encoder_.fit_transform(self.apply(X))

     def transform(self, X):
+        """Transform dataset.
+
+        Parameters
+        ----------
+        X : array-like, shape=(n_samples, n_features)
+            Input data to be transformed.
+
+        Returns
+        -------
+        X_transformed : sparse matrix, shape=(n_samples, n_out)
+            Transformed dataset.
+        """
         return self.one_hot_encoder_.transform(self.apply(X))
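A minimal usage sketch of the class added above (assuming a scikit-learn build containing this commit). Note the trick in `fit_transform`: `np.arange(len(X))` gives every sample its own dummy label, so the completely random trees partition the data without any real targets; `apply` then records the leaf each sample reaches in each tree, and `OneHotEncoder` turns those leaf indices into the sparse binary code:

    import numpy as np
    from sklearn import datasets
    from sklearn.ensemble import RandomForestHasher

    X, y = datasets.make_circles(factor=0.5)   # 100 samples, 2 features

    # 10 completely random trees of depth at most 5: the one-hot coded
    # leaves give at most 10 * 2 ** 5 = 320 binary output features.
    hasher = RandomForestHasher(n_estimators=10, max_depth=5, random_state=0)
    X_transformed = hasher.fit_transform(X)

    print(X_transformed.shape)         # (100, n_out) with n_out <= 320
    # exactly one leaf per tree is active for every sample:
    print(X_transformed.sum(axis=1))   # every row sums to n_estimators = 10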

sklearn/ensemble/tests/test_forest.py

Lines changed: 27 additions & 1 deletion
@@ -2,7 +2,7 @@
 Testing for the forest module (sklearn.ensemble.forest).
 """

-# Authors: Gilles Louppe, Brian Holt
+# Authors: Gilles Louppe, Brian Holt, Andreas Mueller
 # License: BSD 3

 import numpy as np
@@ -17,8 +17,11 @@
 from sklearn.grid_search import GridSearchCV
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.ensemble import RandomForestRegressor
+from sklearn.ensemble import RandomForestHasher
 from sklearn.ensemble import ExtraTreesClassifier
 from sklearn.ensemble import ExtraTreesRegressor
+from sklearn.svm import LinearSVC
+from sklearn.decomposition import RandomizedPCA
 from sklearn import datasets

 # toy sample
@@ -372,6 +375,29 @@ def test_multioutput():
     np.seterr(**olderr)


+def test_random_hasher():
+    # Test random forest hashing on the circles dataset: the transformed
+    # data should be linearly separable, even after being projected down
+    # to two PCA dimensions.
+    hasher = RandomForestHasher(n_estimators=30, random_state=0)
+    X, y = datasets.make_circles(factor=0.5)
+    X_transformed = hasher.fit_transform(X)
+
+    # fit followed by transform must match fit_transform:
+    hasher = RandomForestHasher(n_estimators=30, random_state=0)
+    assert_array_equal(hasher.fit(X).transform(X).toarray(),
+                       X_transformed.toarray())
+
+    # exactly one leaf per tree is active for each data point
+    assert_equal(X_transformed.shape[0], X.shape[0])
+    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)
+
+    pca = RandomizedPCA(n_components=2)
+    X_reduced = pca.fit_transform(X_transformed)
+    linear_clf = LinearSVC()
+    linear_clf.fit(X_reduced, y)
+    assert_equal(linear_clf.score(X_reduced, y), 1.)
+
+
 if __name__ == "__main__":
     import nose
     nose.runmodule()
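For intuition on what the new test asserts: the row sums equal `n_estimators` because exactly one leaf per tree fires for each sample, and `transform` is simply the one-hot encoding of the leaf indices returned by `apply`. A short sketch of that equivalence, using the `one_hot_encoder_` attribute that `fit_transform` stores per the forest.py diff above:

    import numpy as np
    from sklearn import datasets
    from sklearn.ensemble import RandomForestHasher

    X, _ = datasets.make_circles(factor=0.5)
    hasher = RandomForestHasher(n_estimators=30, random_state=0)
    X_transformed = hasher.fit_transform(X)

    # transform(X) == one-hot encoding of the per-tree leaf indices
    leaves = hasher.apply(X)            # shape (n_samples, n_estimators)
    manual = hasher.one_hot_encoder_.transform(leaves)
    np.testing.assert_array_equal(manual.toarray(), X_transformed.toarray())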

sklearn/tests/test_common.py

Lines changed: 3 additions & 2 deletions
@@ -37,7 +37,7 @@
 from sklearn.decomposition import SparseCoder
 from sklearn.pipeline import Pipeline, FeatureUnion
 from sklearn.pls import _PLS, PLSCanonical, PLSRegression, CCA, PLSSVD
-from sklearn.ensemble import BaseEnsemble
+from sklearn.ensemble import BaseEnsemble, RandomForestHasher
 from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier,\
     OutputCodeClassifier
 from sklearn.feature_selection import RFE, RFECV, SelectKBest
@@ -54,7 +54,8 @@

 dont_test = [Pipeline, FeatureUnion, GridSearchCV, SparseCoder,
     EllipticEnvelope, EllipticEnvelop, DictVectorizer, LabelBinarizer,
-    LabelEncoder, TfidfTransformer, IsotonicRegression, OneHotEncoder]
+    LabelEncoder, TfidfTransformer, IsotonicRegression, OneHotEncoder,
+    RandomForestHasher]
 meta_estimators = [BaseEnsemble, OneVsOneClassifier, OutputCodeClassifier,
     OneVsRestClassifier, RFE, RFECV]
