
Commit fec75b0

Code Commit
1 parent 32aab6d commit fec75b0

32 files changed: +8997 −0 lines changed

deepforest/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
from .cascade import CascadeForestClassifier
from .forest import RandomForestClassifier
from .forest import ExtraTreesClassifier
from .tree import DecisionTreeClassifier
from .tree import ExtraTreeClassifier


__all__ = ["CascadeForestClassifier",
           "RandomForestClassifier",
           "ExtraTreesClassifier",
           "DecisionTreeClassifier",
           "ExtraTreeClassifier"]

deepforest/_binner.py

Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
"""
Implementation of the Binner in Deep Forest.

This class is modified from:
https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/ensemble/_hist_gradient_boosting/binning.py
"""


__all__ = ["Binner"]

import numpy as np
from sklearn.utils import check_random_state
from sklearn.base import BaseEstimator, TransformerMixin

from . import _cutils as _LIB


X_DTYPE = np.float64
X_BINNED_DTYPE = np.uint8
ALMOST_INF = 1e300


def _find_binning_thresholds_per_feature(
    col_data, n_bins, bin_type="percentile"
):
    """
    Private function used to find midpoints for samples along a
    specific feature.
    """
    if len(col_data.shape) != 1:
        msg = (
            "Per-feature data should be of the shape (n_samples,), but"
            " got {}-dims instead."
        )
        raise RuntimeError(msg.format(len(col_data.shape)))

    missing_mask = np.isnan(col_data)
    if missing_mask.any():
        col_data = col_data[~missing_mask]
    col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
    distinct_values = np.unique(col_data)
    # Too few distinct values
    if len(distinct_values) <= n_bins:
        midpoints = distinct_values[:-1] + distinct_values[1:]
        midpoints *= 0.5
    else:
        # Equal interval in terms of percentile
        if bin_type == "percentile":
            percentiles = np.linspace(0, 100, num=n_bins + 1)
            percentiles = percentiles[1:-1]
            # NumPy >= 1.22 renames `interpolation` to `method`
            midpoints = np.percentile(
                col_data, percentiles, interpolation="midpoint"
            ).astype(X_DTYPE)
            assert midpoints.shape[0] == n_bins - 1
            np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)
        # Equal interval in terms of value
        elif bin_type == "interval":
            min_value, max_value = np.min(col_data), np.max(col_data)
            intervals = np.linspace(min_value, max_value, num=n_bins + 1)
            midpoints = intervals[1:-1]
            assert midpoints.shape[0] == n_bins - 1
        else:
            raise ValueError("Unknown binning type: {}.".format(bin_type))

    return midpoints


def _find_binning_thresholds(
    X, n_bins, bin_subsample=2e5, bin_type="percentile", random_state=None
):
    n_samples, n_features = X.shape
    rng = check_random_state(random_state)

    if n_samples > bin_subsample:
        # Cast to int: `rng.choice` expects an integer sample size
        subset = rng.choice(
            np.arange(n_samples), int(bin_subsample), replace=False
        )
        X = X.take(subset, axis=0)

    binning_thresholds = []
    for f_idx in range(n_features):
        threshold = _find_binning_thresholds_per_feature(
            X[:, f_idx],
            n_bins,
            bin_type
        )
        binning_thresholds.append(threshold)

    return binning_thresholds


class Binner(TransformerMixin, BaseEstimator):

    def __init__(
        self,
        n_bins=255,
        bin_subsample=2e5,
        bin_type="percentile",
        random_state=None
    ):
        self.n_bins = n_bins + 1  # + 1 for missing values
        self.bin_subsample = int(bin_subsample)
        self.bin_type = bin_type
        self.random_state = random_state
        self._is_fitted = False

    def _validate_params(self):

        if not 2 <= self.n_bins - 1 <= 255:
            msg = ("`n_bins` should be in the range [2, 255], but got"
                   " {} instead.")
            raise ValueError(msg.format(self.n_bins - 1))

        if not self.bin_subsample > 0:
            msg = (
                "The number of samples used to construct the Binner"
                " should be strictly positive, but got {} instead."
            )
            raise ValueError(msg.format(self.bin_subsample))

        if self.bin_type not in ("percentile", "interval"):
            msg = ("The type of binner should be one of {{percentile, interval"
                   "}}, but got {} instead.")
            raise ValueError(msg.format(self.bin_type))

    def fit(self, X):

        self._validate_params()

        self.bin_thresholds_ = _find_binning_thresholds(
            X,
            self.n_bins - 1,
            self.bin_subsample,
            self.bin_type,
            self.random_state,
        )

        self.n_bins_non_missing_ = np.array(
            [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_],
            dtype=np.uint32,
        )

        self.missing_values_bin_idx_ = self.n_bins - 1
        self._is_fitted = True

        return self

    def transform(self, X):

        if not self._is_fitted:
            msg = (
                "The binner has not been fitted yet when calling `transform`."
            )
            raise RuntimeError(msg)

        if not X.shape[1] == self.n_bins_non_missing_.shape[0]:
            msg = (
                "The binner was fitted with {} features but {} features got"
                " passed to `transform`."
            )
            raise ValueError(
                msg.format(self.n_bins_non_missing_.shape[0], X.shape[1])
            )

        X_binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F")
        _LIB._map_to_bins(
            X, self.bin_thresholds_, self.missing_values_bin_idx_, X_binned
        )

        return X_binned
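
A short sketch of driving the Binner end to end, based only on the `fit`/`transform` contract above (it requires the compiled `_cutils` extension, since `transform` delegates to `_LIB._map_to_bins`). Note how missing values land in the reserved extra bin:

# Sketch: Binner round trip on float64 data with one missing value.
import numpy as np
from deepforest._binner import Binner

X = np.random.rand(1000, 3)           # float64, C-contiguous
X[0, 0] = np.nan

binner = Binner(n_bins=255, random_state=0)
X_binned = binner.fit(X).transform(X)

assert X_binned.dtype == np.uint8     # X_BINNED_DTYPE
# NaN is mapped to the reserved last bin (n_bins - 1 after the +1 shift):
assert X_binned[0, 0] == binner.missing_values_bin_idx_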

deepforest/_cutils.pyx

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False
# cython: nonecheck=False
# cython: language_level=3

# Author: Yi-Xuan Xu


cimport cython
import numpy as np
cimport numpy as np
from libc.math cimport isnan

ctypedef np.npy_bool BOOL
ctypedef np.npy_intp SIZE_t
ctypedef np.npy_int32 INT32_t
ctypedef np.npy_float64 X_DTYPE_C
ctypedef np.npy_uint8 X_BINNED_DTYPE_C

np.import_array()


cpdef void _c_merge_proba(np.ndarray[X_DTYPE_C, ndim=2] probas,
                          SIZE_t n_outputs,
                          np.ndarray[X_DTYPE_C, ndim=2] out):
    """Average the horizontally stacked per-estimator probability blocks,
    each of width `n_outputs`, into `out`."""
    cdef:
        SIZE_t n_features = probas.shape[1]
        SIZE_t start = 0
        SIZE_t count = 0

    while start < n_features:
        out += probas[:, start : (start + n_outputs)]
        start += n_outputs
        count += 1

    out /= count


cpdef np.ndarray _c_sample_mask(const INT32_t [:] indices,
                                int n_samples):
    """Generate the sample mask given indices without resorting to
    `np.unique`."""
    cdef:
        SIZE_t i
        SIZE_t n = indices.shape[0]
        SIZE_t sample_id
        # np.bool_ here: the plain np.bool alias was removed in NumPy 1.24
        np.ndarray[BOOL, ndim=1] sample_mask = np.zeros((n_samples,),
                                                        dtype=np.bool_)

    with nogil:
        for i in range(n):
            sample_id = indices[i]
            if not sample_mask[sample_id]:
                sample_mask[sample_id] = True

    return sample_mask


# Modified from HGBDT in Scikit-Learn
cpdef _map_to_bins(object X,
                   list binning_thresholds,
                   const unsigned char missing_values_bin_idx,
                   X_BINNED_DTYPE_C [::1, :] binned):
    """Bin numerical values to discrete integer-coded levels.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        The numerical data to bin.
    binning_thresholds : list of arrays
        For each feature, stores the increasing numeric values that are
        used to separate the bins.
    binned : ndarray, shape (n_samples, n_features)
        Output array, must be Fortran-aligned.
    """
    cdef:
        const X_DTYPE_C[:, :] X_ndarray = X
        SIZE_t n_features = X_ndarray.shape[1]
        SIZE_t feature_idx

    for feature_idx in range(n_features):
        _map_num_col_to_bins(X[:, feature_idx],
                             binning_thresholds[feature_idx],
                             missing_values_bin_idx,
                             binned[:, feature_idx])


cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data,
                               const X_DTYPE_C [:] binning_thresholds,
                               const unsigned char missing_values_bin_idx,
                               X_BINNED_DTYPE_C [:] binned):
    """Binary search to find the bin index for each value in the data."""
    cdef:
        SIZE_t i
        SIZE_t left
        SIZE_t right
        SIZE_t middle

    for i in range(data.shape[0]):

        if isnan(data[i]):
            binned[i] = missing_values_bin_idx
        else:
            # For known values, use binary search
            left, right = 0, binning_thresholds.shape[0]
            while left < right:
                middle = (right + left - 1) // 2
                if data[i] <= binning_thresholds[middle]:
                    right = middle
                else:
                    left = middle + 1
            binned[i] = left
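
The binary search above is equivalent, for finite values, to NumPy's `searchsorted` with `side="left"` on the ascending thresholds: both return the index of the first threshold greater than or equal to the value, so a value equal to a threshold falls into the lower bin. A pure-NumPy cross-check, useful when the extension is not built:

# Pure-NumPy equivalent of _map_num_col_to_bins, for cross-checking.
import numpy as np

def map_num_col_to_bins_py(data, thresholds, missing_values_bin_idx):
    # Index of the first threshold >= value, i.e. the bin index.
    binned = np.searchsorted(thresholds, data, side="left").astype(np.uint8)
    binned[np.isnan(data)] = missing_values_bin_idx
    return binned

thresholds = np.array([0.25, 0.5, 0.75])
data = np.array([0.1, 0.5, 0.9, np.nan])
print(map_num_col_to_bins_py(data, thresholds, 255))  # [  0   1   3 255]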

deepforest/_estimator.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
"""A wrapper on base estimator."""


__all__ = ["Estimator"]

from .forest import RandomForestClassifier, ExtraTreesClassifier


def make_estimator(
    name,
    n_trees=100,
    max_depth=None,
    min_samples_leaf=1,
    n_jobs=None,
    random_state=None
):
    # RandomForestClassifier
    if name == "rf":
        estimator = RandomForestClassifier(
            n_estimators=n_trees,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            n_jobs=n_jobs,
            random_state=random_state,
        )
    # ExtraTreesClassifier
    elif name == "erf":
        estimator = ExtraTreesClassifier(
            n_estimators=n_trees,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            n_jobs=n_jobs,
            random_state=random_state
        )
    else:
        msg = "Unknown type of estimator, which should be one of {rf, erf}."
        raise NotImplementedError(msg)

    return estimator


class Estimator(object):

    def __init__(
        self,
        name,
        n_trees=100,
        max_depth=None,
        min_samples_leaf=1,
        n_jobs=None,
        random_state=None
    ):
        self.estimator_ = make_estimator(name,
                                         n_trees,
                                         max_depth,
                                         min_samples_leaf,
                                         n_jobs,
                                         random_state)

    @property
    def oob_decision_function_(self):
        return self.estimator_.oob_decision_function_

    def fit_transform(self, X, y):
        # Fit the forest, then use its out-of-bag predictions as the
        # augmented features passed on to the next cascade layer.
        self.estimator_.fit(X, y)
        X_aug = self.estimator_.oob_decision_function_

        return X_aug

    def transform(self, X):
        return self.estimator_.predict_proba(X)

    def predict(self, X):
        # Note: class probabilities are returned here; the caller is
        # responsible for turning them into class predictions.
        return self.estimator_.predict_proba(X)
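
A sketch of the wrapper in use. Two assumptions are hedged here: the bundled `deepforest.forest` classifiers compute `oob_decision_function_` during a plain `fit` (which `fit_transform` above relies on), and they accept raw numeric features, although inside the cascade they are normally fed uint8-binned data from the Binner.

# Sketch: one cascade building block -- OOB probabilities as features.
import numpy as np
from deepforest._estimator import Estimator

X = np.random.rand(200, 5)
y = np.random.randint(0, 3, size=200)

est = Estimator(name="rf", n_trees=50, random_state=0)
X_aug = est.fit_transform(X, y)   # out-of-bag class probabilities
assert X_aug.shape == (200, 3)    # (n_samples, n_classes)

proba = est.transform(X)          # in-sample class probabilities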
