|
13 | 13 |
|
14 | 14 | In the total set of features, only the first 4 are significant. We
15 | 15 | can see that they have the highest score with univariate feature |
16 | | -selection. The SVM attributes small weights to these features, but these |
17 | | -weight are non zero. Applying univariate feature selection before the SVM |
| 16 | +selection. The SVM assigns a large weight to one of these features, but also |
| 17 | +selects many of the non-informative features.
| 18 | +Applying univariate feature selection before the SVM |
18 | 19 | increases the SVM weights attributed to the significant features, and will
19 | 20 | thus improve classification. |
20 | 21 | """ |
|
29 | 30 | ############################################################################### |
30 | 31 | # import some data to play with |
31 | 32 |
|
32 | | -# The IRIS dataset |
| 33 | +# The iris dataset |
33 | 34 | iris = datasets.load_iris() |
34 | 35 |
|
35 | 36 | # Some noisy data, not correlated with the target
36 | | -E = np.random.normal(size=(len(iris.data), 35)) |
| 37 | +E = np.random.uniform(0, 0.1, size=(len(iris.data), 20)) |
37 | 38 |
|
38 | 39 | # Add the noisy data to the informative features |
39 | | -x = np.hstack((iris.data, E)) |
| 40 | +X = np.hstack((iris.data, E)) |
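| | +# (X: 150 samples by 24 features, the 4 iris features plus the 20 noise columns)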
40 | 41 | y = iris.target |
41 | 42 |
|
42 | 43 | ############################################################################### |
43 | 44 | pl.figure(1) |
44 | 45 | pl.clf() |
45 | 46 |
|
46 | | -x_indices = np.arange(x.shape[-1]) |
| 47 | +X_indices = np.arange(X.shape[-1]) |
47 | 48 |
|
48 | 49 | ############################################################################### |
49 | 50 | # Univariate feature selection with F-test for feature scoring |
50 | 51 | # We use the default selection function: the 10% most significant features |
51 | 52 | selector = SelectPercentile(f_classif, percentile=10) |
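| | +# (f_classif scores each feature independently with an ANOVA F-test)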
52 | | -selector.fit(x, y) |
53 | | -scores = -np.log10(selector.scores_) |
| 53 | +selector.fit(X, y) |
| 54 | +scores = -np.log10(selector.pvalues_) |
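| | +# (take -log10 of the p-values: the smaller the p-value, the larger the score)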
54 | 55 | scores /= scores.max() |
55 | | -pl.bar(x_indices - .45, scores, width=.3, |
| 56 | +pl.bar(X_indices - .45, scores, width=.2, |
56 | 57 | label=r'Univariate score ($-Log(p_{value})$)', |
57 | 58 | color='g') |
58 | 59 |
|
59 | 60 | ############################################################################### |
60 | 61 | # Compare to the weights of an SVM |
61 | 62 | clf = svm.SVC(kernel='linear') |
62 | | -clf.fit(x, y) |
| 63 | +clf.fit(X, y) |
63 | 64 |
|
64 | 65 | svm_weights = (clf.coef_ ** 2).sum(axis=0) |
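| | +# (sum the squared weights over the classifiers: one importance score per feature)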
65 | 66 | svm_weights /= svm_weights.max() |
66 | | -pl.bar(x_indices - .15, svm_weights, width=.3, label='SVM weight', |
| 67 | + |
| 68 | +pl.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight', |
67 | 69 | color='r') |
68 | 70 |
|
| 71 | +clf_selected = svm.SVC(kernel='linear') |
| 72 | +clf_selected.fit(selector.transform(X), y) |
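| | +# (selector.transform(X) keeps only the features chosen by SelectPercentile)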
| 73 | + |
| 74 | +svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0) |
| 75 | +svm_weights_selected /= svm_weights_selected.max() |
| 76 | + |
| 77 | +pl.bar(X_indices[selector.get_support()] - .05, svm_weights_selected, width=.2, |
| 78 | + label='SVM weights after selection', color='b') |
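| | +# (get_support() returns a boolean mask over the original feature indices)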
| 79 | + |
| 80 | + |
69 | 81 | pl.title("Comparing feature selection") |
70 | 82 | pl.xlabel('Feature number') |
71 | 83 | pl.yticks(()) |
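
The same two-step flow, univariate selection followed by a linear SVM, can also be written as a single estimator. A minimal sketch, assuming sklearn.pipeline.Pipeline; the name anova_svm is our own:

import numpy as np
from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.pipeline import Pipeline

# Rebuild the noisy design matrix from the example: the 4 informative iris
# features plus 20 uniform-noise columns (mirroring the diff above).
iris = datasets.load_iris()
E = np.random.uniform(0, 0.1, size=(len(iris.data), 20))
X = np.hstack((iris.data, E))

# Chain the ANOVA-based selector and the linear SVM: fitting the pipeline
# fits the selector, transforms X, then fits the SVM on the selected columns.
anova_svm = Pipeline([('anova', SelectPercentile(f_classif, percentile=10)),
                      ('svc', svm.SVC(kernel='linear'))])
anova_svm.fit(X, iris.target)

With 24 features and percentile=10, the selector keeps only a couple of columns, so the SVM is trained on a much smaller design matrix.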
|