# encoding=utf-8
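"""
ID3 decision tree classifier for handwritten-digit recognition.

Each 28x28 image is binarized and its pixels are treated as 784 categorical
features; the tree is grown with the information-gain criterion and a
pre-pruning threshold epsilon. The data layout assumed below is a Kaggle-style
MNIST train.csv (label in the first column, 784 pixel values after it).
"""
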
import cv2
import time

import numpy as np
import pandas as pd

# train_test_split lives in sklearn.model_selection in current scikit-learn
# releases (the old sklearn.cross_validation module has been removed).
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

total_class = 10

# Binarize a grayscale image: every pixel becomes 0 or 1.
def binaryzation(img):
    cv_img = img.astype(np.uint8)
    # The legacy cv2.cv.CV_THRESH_BINARY_INV constant was removed in OpenCV 3;
    # use cv2.THRESH_BINARY_INV instead.
    _, cv_img = cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV)
    return cv_img

# Binarize a whole image set and flatten each image back into a 784-vector.
def binaryzation_features(trainset):
    features = []

    for img in trainset:
        img = np.reshape(img, (28, 28))
        cv_img = img.astype(np.uint8)

        img_b = binaryzation(cv_img)
        features.append(img_b)

    features = np.array(features)
    features = np.reshape(features, (-1, 784))

    return features


class Tree(object):
    def __init__(self, node_type, Class=None, feature=None):
        self.node_type = node_type   # 'leaf' or 'internal'
        self.dict = {}               # feature value -> child subtree
        self.Class = Class           # class at a leaf / majority class at an internal node
        self.feature = feature       # index of the feature this node splits on

    def add_tree(self, val, tree):
        self.dict[val] = tree

    def predict(self, features):
        if self.node_type == 'leaf':
            return self.Class

        # A feature value never seen in this branch during training would raise
        # a KeyError; fall back to the majority class stored on this node.
        sub_tree = self.dict.get(features[self.feature])
        if sub_tree is None:
            return self.Class
        return sub_tree.predict(features)

def calc_ent(x):
    """
        Calculate the Shannon entropy H(x) = -sum_i p_i * log2(p_i).
    """

    x_value_list = set([x[i] for i in range(x.shape[0])])
    ent = 0.0
    for x_value in x_value_list:
        p = float(x[x == x_value].shape[0]) / x.shape[0]
        logp = np.log2(p)
        ent -= p * logp

    return ent

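# Sanity-check example: a balanced binary label vector carries exactly one bit,
# e.g. calc_ent(np.array([0, 0, 1, 1])) == 1.0.
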
def calc_condition_ent(x, y):
    """
        Calculate the conditional entropy H(y|x) = sum_v p(x=v) * H(y|x=v).
    """

    x_value_list = set([x[i] for i in range(x.shape[0])])
    ent = 0.0
    for x_value in x_value_list:
        sub_y = y[x == x_value]
        temp_ent = calc_ent(sub_y)
        ent += (float(sub_y.shape[0]) / y.shape[0]) * temp_ent

    return ent

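# Sanity-check example: when x completely determines y there is nothing left to
# predict, e.g. calc_condition_ent(np.array([0, 0, 1, 1]), np.array([0, 0, 1, 1])) == 0.0.
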
def calc_ent_grap(x, y):
    """
        Calculate the information gain g(y, x) = H(y) - H(y|x).
    """

    base_ent = calc_ent(y)
    condition_ent = calc_condition_ent(x, y)
    ent_grap = base_ent - condition_ent

    return ent_grap

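# Combining the two examples above: for x = y = np.array([0, 0, 1, 1]) the gain
# is 1.0 - 0.0 = 1.0, i.e. the feature removes all uncertainty about the label.
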
def train(train_set, train_label, features, epsilon):
    """
        Recursively build an ID3 decision tree: split on the feature with the
        largest information gain, and pre-prune when the gain drops below epsilon.
    """

    LEAF = 'leaf'
    INTERNAL = 'internal'

    # Step 1: if every instance in train_set belongs to the same class Ck,
    # return a single leaf labelled Ck.
    label_count = [0 for i in range(total_class)]
    for label in train_label:
        label_count[label] += 1

    for label, count in enumerate(label_count):
        if count == len(train_label):
            return Tree(LEAF, Class=label)

    # Step 2: find the majority class; if no features are left to split on,
    # return a leaf labelled with it.
    max_len, max_class = 0, 0
    for i in range(total_class):
        class_i = [label for label in train_label if label == i]
        if len(class_i) > max_len:
            max_class = i
            max_len = len(class_i)

    if len(features) == 0:
        return Tree(LEAF, Class=max_class)

    # Step 3: compute the information gain of every remaining feature and keep
    # the best one.
    max_feature = 0
    max_gda = 0

    D = train_label
    HD = calc_ent(D)
    for feature in features:
        A = np.array(train_set[:, feature].flat)
        gda = HD - calc_condition_ent(A, D)

        if gda > max_gda:
            max_gda, max_feature = gda, feature

    # Step 4: if even the best gain is below the threshold, stop splitting and
    # return a leaf labelled with the majority class.
    if max_gda < epsilon:
        return Tree(LEAF, Class=max_class)

    # Step 5: split on the best feature, remove it from the candidate set, and
    # recurse on every non-empty subset. The majority class is also stored on
    # the internal node so predict() has a fallback for unseen feature values.
    sub_features = [f for f in features if f != max_feature]
    tree = Tree(INTERNAL, Class=max_class, feature=max_feature)

    feature_col = np.array(train_set[:, max_feature].flat)
    feature_value_list = set([feature_col[i] for i in range(feature_col.shape[0])])
    for feature_value in feature_value_list:

        index = []
        for i in range(len(train_label)):
            if train_set[i][max_feature] == feature_value:
                index.append(i)

        sub_train_set = train_set[index]
        sub_train_label = train_label[index]

        sub_tree = train(sub_train_set, sub_train_label, sub_features, epsilon)
        tree.add_tree(feature_value, sub_tree)

    return tree

def predict(test_set, tree):
    """
        Predict a class for every sample in test_set with a trained tree.
    """
    result = []
    for features in test_set:
        tmp_predict = tree.predict(features)
        result.append(tmp_predict)
    return np.array(result)

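# Minimal usage sketch (the names X and y here are illustrative): given a binary
# feature matrix X of shape (n_samples, 784) and a label vector y,
#     tree = train(X, y, list(range(784)), epsilon=0.1)
#     predictions = predict(X, tree)
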
if __name__ == '__main__':
    # A small toy data set (age/occupation/house/loan features), kept here
    # commented out for quick experiments:
    #
    # classes = [0,0,1,1,0,0,0,1,1,1,1,1,1,1,0]
    #
    # age = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2]
    # occupation = [0,0,1,1,0,0,0,1,0,0,0,0,1,1,0]
    # house = [0,0,0,1,0,0,0,1,1,1,1,1,0,0,0]
    # loan = [0,1,1,0,0,0,1,1,2,2,2,1,1,2,0]
    #
    # features = []
    #
    # for i in range(15):
    #     feature = [age[i],occupation[i],house[i],loan[i]]
    #     features.append(feature)
    #
    # trainset = np.array(features)
    #
    # tree = train(trainset,np.array(classes),[0,1,2,3],0.1)
    #
    # print(type(tree))
    # features = [0,0,0,1]
    # print(tree.predict(np.array(features)))

    print('Start read data')

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train.csv', header=0)
    data = raw_data.values

    imgs = data[:, 1:]     # pixel columns
    labels = data[:, 0]    # first column is the digit label

    features = binaryzation_features(imgs)

    # Use roughly 2/3 of the data for training and 1/3 for testing.
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, random_state=23323)

    time_2 = time.time()
    print('read data cost', time_2 - time_1, 'second\n')

    print('Start training')
    tree = train(train_features, train_labels, [i for i in range(784)], 0.2)
    time_3 = time.time()
    print('training cost', time_3 - time_2, 'second\n')

    print('Start predicting')
    test_predict = predict(test_features, tree)
    time_4 = time.time()
    print('predicting cost', time_4 - time_3, 'second\n')

    score = accuracy_score(test_labels, test_predict)
    print('The accuracy score is', score)