|
| 1 | +#-*-coding:utf-8-*- |
| 2 | +''' |
| 3 | +Created on 2016年5月10日 |
| 4 | +
|
| 5 | +@author: Gamer Think |
| 6 | +''' |
| 7 | +from test.inspect_fodder import StupidGit |
| 8 | + |
| 9 | +__author__="thinkgamer" |
| 10 | + |
| 11 | +from numpy import * |
| 12 | + |
# Build the small demo data set from "Machine Learning in Action".
def loadSimData():
    """Return (datMat, classLabels): five 2-D points and their +/-1.0 labels."""
    points = [[1.0, 2.1],
              [2.0, 1.1],
              [1.3, 1.0],
              [1.0, 1.0],
              [2.0, 1.0]]
    datMat = matrix(points)
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return datMat, classLabels
| 23 | + |
# Decision-stump (one-level decision tree) classifier.
def stumpClassify(dataMatrix, dimen, threshVal, threshInsq):
    """Label every sample +1.0 or -1.0 by thresholding a single feature.

    dataMatrix -- m x n matrix of samples
    dimen      -- index of the feature (column) to test
    threshVal  -- threshold the feature is compared against
    threshInsq -- 'lt': samples <= threshold become -1.0;
                  anything else: samples > threshold become -1.0
    Returns an m x 1 array of +/-1.0 predictions.
    """
    predictions = ones((shape(dataMatrix)[0], 1))
    feature = dataMatrix[:, dimen]
    if threshInsq == 'lt':
        mask = feature <= threshVal
    else:
        mask = feature > threshVal
    predictions[mask] = -1.0
    return predictions
| 32 | + |
def buildStump(dataArr, classLabels, D):
    """Find the lowest-weighted-error decision stump for the data set.

    dataArr     -- array-like of samples (m x n)
    classLabels -- sequence of m labels, each +1.0 or -1.0
    D           -- m x 1 matrix of sample weights

    Scans every feature, trying numSteps+2 candidate thresholds per feature
    and both inequality directions, and keeps the stump whose D-weighted
    misclassification error is smallest.

    Returns (bestStump, minError, bestClassEnt): the dict describing the
    winning stump ('dim', 'thresh', 'ineq'), its 1x1 weighted-error matrix,
    and its m x 1 prediction vector.
    """
    dataMatrix = mat(dataArr)        # mat() guarantees a 2-D matrix
    labelMat = mat(classLabels).T    # .T: transpose into a column vector
    m, n = shape(dataMatrix)         # rows and columns of the data set
    numSteps = 10.0                  # threshold steps taken across each feature's range
    bestStump = {}                   # best stump found so far for weight vector D
    bestClassEnt = mat(zeros((m, 1)))
    minError = inf                   # start at +infinity so the first stump always wins
    for dim in range(n):
        lo = dataMatrix[:, dim].min()
        hi = dataMatrix[:, dim].max()
        stepSize = (hi - lo) / numSteps
        for step in range(-1, int(numSteps) + 1):
            # 'lt' marks samples <= threshold as -1.0, 'gt' the opposite side.
            for inequal in ['lt', 'gt']:
                threshVal = lo + float(step) * stepSize
                predictedVals = stumpClassify(dataMatrix, dim, threshVal, inequal)
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr   # D-weighted misclassification rate
                # Remember this stump if it beats everything seen so far.
                if weightedError < minError:
                    minError = weightedError
                    bestClassEnt = predictedVals.copy()
                    bestStump['dim'] = dim
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal

    return bestStump, minError, bestClassEnt
| 66 | + |
# AdaBoost training using decision stumps ("DS") as the weak learner.
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    dataArr     -- array-like of samples (m x n)
    classLabels -- sequence of m labels, each +1.0 or -1.0
    numIt       -- maximum number of boosting rounds (default 40)

    Returns the list of weak classifiers; each entry is the stump dict from
    buildStump() plus an 'alpha' key holding that stump's vote weight.
    Stops early as soon as the aggregate training error reaches zero.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)          # start with uniform sample weights
    aggClassEst = mat(zeros((m, 1)))   # running alpha-weighted vote per sample
    for i in range(numIt):
        # Best stump for the current weight vector D.
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        print("D: %s" % D.T)
        # alpha = 0.5 * ln((1 - err) / err); max(error, 1e-16) guards
        # against division by zero when the stump classifies perfectly.
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print("classEst: %s" % classEst.T)
        # Re-weight the samples for the next round: misclassified samples
        # gain weight, correctly classified ones lose weight.
        expon = multiply(-1 * alpha * mat(classLabels).T, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        # Aggregate error of the ensemble built so far.
        aggClassEst += alpha * classEst
        print("aggClassEst: %s" % aggClassEst.T)
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: %s" % errorRate)
        # Perfect training fit: no further rounds needed.
        if errorRate == 0.0:
            break
    return weakClassArr
| 97 | + |
| 98 | + |
# AdaBoost classification function.
def adaClassify(datToClass, classifierArr):
    """Classify samples with the weak classifiers from adaBoostTrainDS().

    datToClass    -- one sample or an array of samples to classify
    classifierArr -- list of stump dicts with 'dim'/'thresh'/'ineq'/'alpha'

    Returns an m x 1 matrix of +/-1.0 labels: the sign of the alpha-weighted
    sum of the individual stump votes.
    """
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    for classifier in classifierArr:
        # Each stump votes; its vote is scaled by its alpha weight.
        classEst = stumpClassify(dataMatrix,
                                 classifier['dim'],
                                 classifier['thresh'],
                                 classifier['ineq'])
        aggClassEst += classifier['alpha'] * classEst
        print(aggClassEst)
    return sign(aggClassEst)
| 112 | + |
| 113 | + |
# Demo driver.
if __name__ == "__main__":
    # Load the toy data set.
    datMat, classLabels = loadSimData()

    # Train the AdaBoost ensemble (at most 30 rounds; this data converges
    # to zero training error after 3 stumps).
    classifierArray = adaBoostTrainDS(datMat, classLabels, 30)

    # Classify new points with the trained ensemble.
    print("[0,0]:\n%s" % adaClassify([0, 0], classifierArray))
    print("\n\n[[5,5],[0,0]]:\n%s" % adaClassify([[5, 5], [0, 0]], classifierArray))
0 commit comments