
Commit fec75b0

Code Commit
1 parent 32aab6d commit fec75b0

32 files changed: +8997 −0 lines changed

deepforest/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
from .cascade import CascadeForestClassifier
from .forest import RandomForestClassifier
from .forest import ExtraTreesClassifier
from .tree import DecisionTreeClassifier
from .tree import ExtraTreeClassifier


__all__ = ["CascadeForestClassifier",
           "RandomForestClassifier",
           "ExtraTreesClassifier",
           "DecisionTreeClassifier",
           "ExtraTreeClassifier"]

deepforest/_binner.py

Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
"""
Implementation of the Binner in Deep Forest.

This class is modified from:
https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/ensemble/_hist_gradient_boosting/binning.py
"""


__all__ = ["Binner"]

import numpy as np
from sklearn.utils import check_random_state
from sklearn.base import BaseEstimator, TransformerMixin

from . import _cutils as _LIB


X_DTYPE = np.float64
X_BINNED_DTYPE = np.uint8
ALMOST_INF = 1e300


def _find_binning_thresholds_per_feature(
    col_data, n_bins, bin_type="percentile"
):
    """
    Private function used to find midpoints for samples along a
    specific feature.
    """
    if len(col_data.shape) != 1:
        msg = (
            "Per-feature data should be of the shape (n_samples,), but"
            " got {}-dims instead."
        )
        raise RuntimeError(msg.format(len(col_data.shape)))

    missing_mask = np.isnan(col_data)
    if missing_mask.any():
        col_data = col_data[~missing_mask]
    col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
    distinct_values = np.unique(col_data)
    # Too few distinct values
    if len(distinct_values) <= n_bins:
        midpoints = distinct_values[:-1] + distinct_values[1:]
        midpoints *= 0.5
    else:
        # Equal interval in terms of percentile
        if bin_type == "percentile":
            percentiles = np.linspace(0, 100, num=n_bins + 1)
            percentiles = percentiles[1:-1]
            # NumPy >= 1.22 renames `interpolation` to `method`
            midpoints = np.percentile(
                col_data, percentiles, interpolation="midpoint"
            ).astype(X_DTYPE)
            assert midpoints.shape[0] == n_bins - 1
            np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)
        # Equal interval in terms of value
        elif bin_type == "interval":
            min_value, max_value = np.min(col_data), np.max(col_data)
            intervals = np.linspace(min_value, max_value, num=n_bins + 1)
            midpoints = intervals[1:-1]
            assert midpoints.shape[0] == n_bins - 1
        else:
            raise ValueError("Unknown binning type: {}.".format(bin_type))

    return midpoints


def _find_binning_thresholds(
    X, n_bins, bin_subsample=2e5, bin_type="percentile", random_state=None
):
    n_samples, n_features = X.shape
    rng = check_random_state(random_state)

    if n_samples > bin_subsample:
        # Cast to int: `rng.choice` expects an integer sample size
        subset = rng.choice(
            np.arange(n_samples), int(bin_subsample), replace=False
        )
        X = X.take(subset, axis=0)

    binning_thresholds = []
    for f_idx in range(n_features):
        threshold = _find_binning_thresholds_per_feature(
            X[:, f_idx],
            n_bins,
            bin_type
        )
        binning_thresholds.append(threshold)

    return binning_thresholds


class Binner(TransformerMixin, BaseEstimator):

    def __init__(
        self,
        n_bins=255,
        bin_subsample=2e5,
        bin_type="percentile",
        random_state=None
    ):
        self.n_bins = n_bins + 1  # + 1 for missing values
        self.bin_subsample = int(bin_subsample)
        self.bin_type = bin_type
        self.random_state = random_state
        self._is_fitted = False

    def _validate_params(self):

        if not 2 <= self.n_bins - 1 <= 255:
            msg = ("`n_bins` should be in the range [2, 255], but got"
                   " {} instead.")
            raise ValueError(msg.format(self.n_bins - 1))

        if not self.bin_subsample > 0:
            msg = (
                "The number of samples used to construct the Binner"
                " should be strictly positive, but got {} instead."
            )
            raise ValueError(msg.format(self.bin_subsample))

        if self.bin_type not in ("percentile", "interval"):
            msg = ("The type of binner should be one of {{percentile, interval"
                   "}}, but got {} instead.")
            raise ValueError(msg.format(self.bin_type))

    def fit(self, X):

        self._validate_params()

        self.bin_thresholds_ = _find_binning_thresholds(
            X,
            self.n_bins - 1,
            self.bin_subsample,
            self.bin_type,
            self.random_state,
        )

        self.n_bins_non_missing_ = np.array(
            [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_],
            dtype=np.uint32,
        )

        self.missing_values_bin_idx_ = self.n_bins - 1
        self._is_fitted = True

        return self

    def transform(self, X):

        if not self._is_fitted:
            msg = (
                "The binner has not been fitted yet when calling `transform`."
            )
            raise RuntimeError(msg)

        if not X.shape[1] == self.n_bins_non_missing_.shape[0]:
            msg = (
                "The binner was fitted with {} features but {} features got"
                " passed to `transform`."
            )
            raise ValueError(
                msg.format(self.n_bins_non_missing_.shape[0], X.shape[1])
            )

        X_binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F")
        _LIB._map_to_bins(
            X, self.bin_thresholds_, self.missing_values_bin_idx_, X_binned
        )

        return X_binned
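
A short sketch of driving the Binner end to end, based only on the `fit`/`transform` contract above (it requires the compiled `_cutils` extension, since `transform` delegates to `_LIB._map_to_bins`). Note how missing values land in the reserved extra bin:

# Sketch: Binner round trip on float64 data with one missing value.
import numpy as np
from deepforest._binner import Binner

X = np.random.rand(1000, 3)           # float64, C-contiguous
X[0, 0] = np.nan

binner = Binner(n_bins=255, random_state=0)
X_binned = binner.fit(X).transform(X)

assert X_binned.dtype == np.uint8     # X_BINNED_DTYPE
# NaN is mapped to the reserved last bin (n_bins - 1 after the +1 shift):
assert X_binned[0, 0] == binner.missing_values_bin_idx_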

deepforest/_cutils.pyx

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False
# cython: nonecheck=False
# cython: language_level=3

# Author: Yi-Xuan Xu


cimport cython
import numpy as np
cimport numpy as np
from libc.math cimport isnan

ctypedef np.npy_bool BOOL
ctypedef np.npy_intp SIZE_t
ctypedef np.npy_int32 INT32_t
ctypedef np.npy_float64 X_DTYPE_C
ctypedef np.npy_uint8 X_BINNED_DTYPE_C

np.import_array()


cpdef void _c_merge_proba(np.ndarray[X_DTYPE_C, ndim=2] probas,
                          SIZE_t n_outputs,
                          np.ndarray[X_DTYPE_C, ndim=2] out):
    """Average the horizontally stacked per-estimator probability blocks,
    each of width `n_outputs`, into `out`."""
    cdef:
        SIZE_t n_features = probas.shape[1]
        SIZE_t start = 0
        SIZE_t count = 0

    while start < n_features:
        out += probas[:, start : (start + n_outputs)]
        start += n_outputs
        count += 1

    out /= count


cpdef np.ndarray _c_sample_mask(const INT32_t [:] indices,
                                int n_samples):
    """Generate the sample mask given indices without resorting to
    `np.unique`."""
    cdef:
        SIZE_t i
        SIZE_t n = indices.shape[0]
        SIZE_t sample_id
        # np.bool_ here: the plain np.bool alias was removed in NumPy 1.24
        np.ndarray[BOOL, ndim=1] sample_mask = np.zeros((n_samples,),
                                                        dtype=np.bool_)

    with nogil:
        for i in range(n):
            sample_id = indices[i]
            if not sample_mask[sample_id]:
                sample_mask[sample_id] = True

    return sample_mask


# Modified from HGBDT in Scikit-Learn
cpdef _map_to_bins(object X,
                   list binning_thresholds,
                   const unsigned char missing_values_bin_idx,
                   X_BINNED_DTYPE_C [::1, :] binned):
    """Bin numerical values to discrete integer-coded levels.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        The numerical data to bin.
    binning_thresholds : list of arrays
        For each feature, stores the increasing numeric values that are
        used to separate the bins.
    binned : ndarray, shape (n_samples, n_features)
        Output array, must be Fortran-aligned.
    """
    cdef:
        const X_DTYPE_C[:, :] X_ndarray = X
        SIZE_t n_features = X_ndarray.shape[1]
        SIZE_t feature_idx

    for feature_idx in range(n_features):
        _map_num_col_to_bins(X[:, feature_idx],
                             binning_thresholds[feature_idx],
                             missing_values_bin_idx,
                             binned[:, feature_idx])


cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data,
                               const X_DTYPE_C [:] binning_thresholds,
                               const unsigned char missing_values_bin_idx,
                               X_BINNED_DTYPE_C [:] binned):
    """Binary search to find the bin index for each value in the data."""
    cdef:
        SIZE_t i
        SIZE_t left
        SIZE_t right
        SIZE_t middle

    for i in range(data.shape[0]):

        if isnan(data[i]):
            binned[i] = missing_values_bin_idx
        else:
            # For known values, use binary search
            left, right = 0, binning_thresholds.shape[0]
            while left < right:
                middle = (right + left - 1) // 2
                if data[i] <= binning_thresholds[middle]:
                    right = middle
                else:
                    left = middle + 1
            binned[i] = left
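
The binary search above is equivalent, for finite values, to NumPy's `searchsorted` with `side="left"` on the ascending thresholds: both return the index of the first threshold greater than or equal to the value, so a value equal to a threshold falls into the lower bin. A pure-NumPy cross-check, useful when the extension is not built:

# Pure-NumPy equivalent of _map_num_col_to_bins, for cross-checking.
import numpy as np

def map_num_col_to_bins_py(data, thresholds, missing_values_bin_idx):
    # Index of the first threshold >= value, i.e. the bin index.
    binned = np.searchsorted(thresholds, data, side="left").astype(np.uint8)
    binned[np.isnan(data)] = missing_values_bin_idx
    return binned

thresholds = np.array([0.25, 0.5, 0.75])
data = np.array([0.1, 0.5, 0.9, np.nan])
print(map_num_col_to_bins_py(data, thresholds, 255))  # [  0   1   3 255]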

deepforest/_estimator.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
"""A wrapper on base estimator."""


__all__ = ["Estimator"]

from .forest import RandomForestClassifier, ExtraTreesClassifier


def make_estimator(
    name,
    n_trees=100,
    max_depth=None,
    min_samples_leaf=1,
    n_jobs=None,
    random_state=None
):
    # RandomForestClassifier
    if name == "rf":
        estimator = RandomForestClassifier(
            n_estimators=n_trees,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            n_jobs=n_jobs,
            random_state=random_state,
        )
    # ExtraTreesClassifier
    elif name == "erf":
        estimator = ExtraTreesClassifier(
            n_estimators=n_trees,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            n_jobs=n_jobs,
            random_state=random_state
        )
    else:
        msg = "Unknown type of estimator, which should be one of {rf, erf}."
        raise NotImplementedError(msg)

    return estimator


class Estimator(object):

    def __init__(
        self,
        name,
        n_trees=100,
        max_depth=None,
        min_samples_leaf=1,
        n_jobs=None,
        random_state=None
    ):
        self.estimator_ = make_estimator(name,
                                         n_trees,
                                         max_depth,
                                         min_samples_leaf,
                                         n_jobs,
                                         random_state)

    @property
    def oob_decision_function_(self):
        return self.estimator_.oob_decision_function_

    def fit_transform(self, X, y):
        # Fit the forest, then use its out-of-bag predictions as the
        # augmented features passed on to the next cascade layer.
        self.estimator_.fit(X, y)
        X_aug = self.estimator_.oob_decision_function_

        return X_aug

    def transform(self, X):
        return self.estimator_.predict_proba(X)

    def predict(self, X):
        # Note: class probabilities are returned here; the caller is
        # responsible for turning them into class predictions.
        return self.estimator_.predict_proba(X)
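
A sketch of the wrapper in use. Two assumptions are hedged here: the bundled `deepforest.forest` classifiers compute `oob_decision_function_` during a plain `fit` (which `fit_transform` above relies on), and they accept raw numeric features, although inside the cascade they are normally fed uint8-binned data from the Binner.

# Sketch: one cascade building block -- OOB probabilities as features.
import numpy as np
from deepforest._estimator import Estimator

X = np.random.rand(200, 5)
y = np.random.randint(0, 3, size=200)

est = Estimator(name="rf", n_trees=50, random_state=0)
X_aug = est.fit_transform(X, y)   # out-of-bag class probabilities
assert X_aug.shape == (200, 3)    # (n_samples, n_classes)

proba = est.transform(X)          # in-sample class probabilities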
