| 
==========================================
IsolationForest benchmark
==========================================
-
A test of IsolationForest on classical anomaly detection datasets.
-
"""
-print(__doc__)

from time import time
import numpy as np
import matplotlib.pyplot as plt
+
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
-from sklearn.preprocessing import LabelBinarizer
+from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle as sh

-np.random.seed(1)
+print(__doc__)

-datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

+def print_outlier_ratio(y):
+    """
+    Helper function to show the distinct value counts of elements in the target.
+    Useful indicator for the datasets used in bench_isolation_forest.py.
+    """
+    uniq, cnt = np.unique(y, return_counts=True)
+    print("----- Target count values: ")
+    for u, c in zip(uniq, cnt):
+        print("------ %s -> %d occurrences" % (str(u), c))
+    print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))
+
+
+np.random.seed(1)
fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))

+# Set this to True for plotting score histograms for each dataset:
+with_decision_function_histograms = False

+# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
+# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
+datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
+
+# Loop over all datasets for fitting and scoring the estimator:
for dat in datasets:
-    # loading and vectorization
-    print('loading data')
-    if dat in ['http', 'smtp', 'SA', 'SF']:
+
+    # Loading and vectorizing the data:
+    print('====== %s ======' % dat)
+    print('--- Fetching data...')
+    if dat in ['http', 'smtp', 'SF', 'SA']:
        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
        X = dataset.data
        y = dataset.target
...
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)
+        print('----- ')

    if dat == 'forestcover':
        dataset = fetch_covtype(shuffle=True)
...
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)
+        print_outlier_ratio(y)

-    print('vectorizing data')
+    print('--- Vectorizing data...')

    if dat == 'SF':
-        lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
+        lb = MultiLabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1])
        X = np.c_[X[:, :1], x1, X[:, 2:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)
+        print_outlier_ratio(y)

    if dat == 'SA':
-        lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
-        lb.fit(X[:, 2])
-        x2 = lb.transform(X[:, 2])
-        lb.fit(X[:, 3])
-        x3 = lb.transform(X[:, 3])
+        lb = MultiLabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1])
+        x2 = lb.fit_transform(X[:, 2])
+        x3 = lb.fit_transform(X[:, 3])
        X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)
+        print_outlier_ratio(y)

-    if dat == 'http' or dat == 'smtp':
-        y = (y != 'normal.').astype(int)
+    if dat in ('http', 'smtp'):
+        y = (y != b'normal.').astype(int)
+        print_outlier_ratio(y)

    n_samples, n_features = X.shape
    n_samples_train = n_samples // 2
...
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

-    print('IsolationForest processing...')
+    print('--- Fitting the IsolationForest estimator...')
    model = IsolationForest(n_jobs=-1)
    tstart = time()
    model.fit(X_train)
    fit_time = time() - tstart
    tstart = time()

-    scoring = - model.decision_function(X_test)  # the lower, the more normal
-
-    # Show score histograms
-    fig, ax = plt.subplots(3, sharex=True, sharey=True)
-    bins = np.linspace(-0.5, 0.5, 200)
-    ax[0].hist(scoring, bins, color='black')
-    ax[0].set_title('decision function for %s dataset' % dat)
-    ax[0].legend(loc="lower right")
-    ax[1].hist(scoring[y_test == 0], bins, color='b',
-               label='normal data')
-    ax[1].legend(loc="lower right")
-    ax[2].hist(scoring[y_test == 1], bins, color='r',
-               label='outliers')
-    ax[2].legend(loc="lower right")
+    scoring = - model.decision_function(X_test)  # the lower the decision_function, the more abnormal
+
+    print("--- Preparing the plot elements...")
+    if with_decision_function_histograms:
+        fig, ax = plt.subplots(3, sharex=True, sharey=True)
+        bins = np.linspace(-0.5, 0.5, 200)
+        ax[0].hist(scoring, bins, color='black')
+        ax[0].set_title('Decision function for %s dataset' % dat)
+        ax[1].hist(scoring[y_test == 0], bins, color='b', label='normal data')
+        ax[1].legend(loc="lower right")
+        ax[2].hist(scoring[y_test == 1], bins, color='r', label='outliers')
+        ax[2].legend(loc="lower right")

    # Show ROC Curves
    predict_time = time() - tstart
    fpr, tpr, thresholds = roc_curve(y_test, scoring)
-    AUC = auc(fpr, tpr)
-    label = ('%s (area: %0.3f, train-time: %0.2fs, '
-             'test-time: %0.2fs)' % (dat, AUC, fit_time, predict_time))
+    auc_score = auc(fpr, tpr)
+    label = ('%s (AUC: %0.3f, train_time= %0.2fs, '
+             'test_time= %0.2fs)' % (dat, auc_score, fit_time, predict_time))
+    # Print AUC score and train/test time:
+    print(label)
    ax_roc.plot(fpr, tpr, lw=1, label=label)

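Note (not part of the change above): a small, self-contained sketch of what the new print_outlier_ratio helper reports. The toy label vector is made up for illustration only; it just shows how to read the "----- Outlier ratio" lines in the benchmark output.

# Toy illustration of the print_outlier_ratio helper introduced above.
# With 9 inliers (label 0) and 1 outlier (label 1), the reported ratio is 0.10000.
import numpy as np


def print_outlier_ratio(y):
    uniq, cnt = np.unique(y, return_counts=True)
    print("----- Target count values: ")
    for u, c in zip(uniq, cnt):
        print("------ %s -> %d occurrences" % (str(u), c))
    print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))


print_outlier_ratio(np.array([0] * 9 + [1]))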
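A second sketch, also not part of the change: it reproduces the benchmark's scoring convention (y == 1 for outliers, scoring = -decision_function) on synthetic data, so the ROC/AUC pipeline can be sanity-checked without fetching the KDD or covertype datasets. The blob/noise data and sample sizes below are arbitrary choices, not taken from the benchmark.

# Sanity check of the scoring convention on toy data (hypothetical example).
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(1)
X_train = 0.5 * rng.randn(500, 2)                        # inliers only, for fitting
X_inliers = 0.5 * rng.randn(500, 2)                      # test inliers (label 0)
X_outliers = rng.uniform(low=-6, high=6, size=(25, 2))   # test outliers (label 1)
X_test = np.r_[X_inliers, X_outliers]
y_test = np.r_[np.zeros(500, dtype=int), np.ones(25, dtype=int)]

model = IsolationForest(n_jobs=-1, random_state=rng)
model.fit(X_train)
scoring = -model.decision_function(X_test)   # higher scoring -> more abnormal

fpr, tpr, _ = roc_curve(y_test, scoring)
print("toy AUC: %0.3f" % auc(fpr, tpr))      # expected to be close to 1.0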