+ "import numpy as np\nfrom scipy import stats\nimport matplotlib.pyplot as plt\nimport matplotlib.font_manager\n\nfrom sklearn import svm\nfrom sklearn.covariance import EllipticEnvelope\nfrom sklearn.ensemble import IsolationForest\nfrom sklearn.neighbors import LocalOutlierFactor\n\nprint(__doc__)\n\nSEED = 42\nGRID_PRECISION = 100\n\nrng = np.random.RandomState(SEED)\n\n# Example settings\nn_samples = 200\noutliers_fraction = 0.25\nclusters_separation = (0, 1, 2)\n\n# define two outlier detection tools to be compared\nclassifiers = {\n \"One-Class SVM\": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,\n kernel=\"rbf\", gamma=0.1),\n \"Robust covariance\": EllipticEnvelope(contamination=outliers_fraction),\n \"Isolation Forest\": IsolationForest(max_samples=n_samples,\n contamination=outliers_fraction,\n random_state=rng),\n \"Local Outlier Factor\": LocalOutlierFactor(\n n_neighbors=35,\n contamination=outliers_fraction)}\n\n# Compare given classifiers under given settings\nxx, yy = np.meshgrid(np.linspace(-7, 7, GRID_PRECISION),\n np.linspace(-7, 7, GRID_PRECISION))\nn_outliers = int(outliers_fraction * n_samples)\nn_inliers = n_samples - n_outliers\nground_truth = np.ones(n_samples, dtype=int)\nground_truth[-n_outliers:] = -1\n\n# Fit the problem with varying cluster separation\nfor _, offset in enumerate(clusters_separation):\n np.random.seed(SEED)\n # Data generation\n X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset\n X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset\n X = np.concatenate([X1, X2], axis=0)\n # Add outliers\n X = np.concatenate([X, np.random.uniform(low=-6, high=6,\n size=(n_outliers, 2))], axis=0)\n\n # Fit the model\n plt.figure(figsize=(9, 7))\n for i, (clf_name, clf) in enumerate(classifiers.items()):\n # fit the data and tag outliers\n if clf_name == \"Local Outlier Factor\":\n y_pred = clf.fit_predict(X)\n scores_pred = clf.negative_outlier_factor_\n else:\n clf.fit(X)\n scores_pred = clf.decision_function(X)\n y_pred = clf.predict(X)\n threshold = stats.scoreatpercentile(scores_pred,\n 100 * outliers_fraction)\n n_errors = (y_pred != ground_truth).sum()\n # plot the levels lines and the points\n if clf_name == \"Local Outlier Factor\":\n # decision_function is private for LOF\n Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])\n else:\n Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n Z = Z.reshape(xx.shape)\n subplot = plt.subplot(2, 2, i + 1)\n subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),\n cmap=plt.cm.Blues_r)\n a = subplot.contour(xx, yy, Z, levels=[threshold],\n linewidths=2, colors='red')\n subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],\n colors='orange')\n b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white',\n s=20, edgecolor='k')\n c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black',\n s=20, edgecolor='k')\n subplot.axis('tight')\n subplot.legend(\n [a.collections[0], b, c],\n ['learned decision function', 'true inliers', 'true outliers'],\n prop=matplotlib.font_manager.FontProperties(size=10),\n loc='lower right')\n subplot.set_xlabel(\"%d. %s (errors: %d)\" % (i + 1, clf_name, n_errors))\n subplot.set_xlim((-7, 7))\n subplot.set_ylim((-7, 7))\n plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)\n plt.suptitle(\"Outlier detection\")\n\nplt.show()"