| 
5 | 5 | # @Last modified by:   WenDesi  | 
6 | 6 | # @Last modified time: 08-11-16  | 
7 | 7 | 
 
  | 
import time
import math
import random

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
11 | 15 | 
 
  | 
class LogisticRegression(object):
    """Binary logistic regression trained with stochastic gradient descent.

    Labels are expected to be 0/1.  The bias term is handled internally by
    appending a constant 1.0 feature to every sample, so after train()
    ``self.w`` has ``n_features + 1`` entries (the last is the intercept).
    """

    def __init__(self):
        # SGD step size; deliberately tiny because raw feature values
        # (e.g. 0..255 pixels) make the per-sample gradient large.
        self.learning_step = 0.00001
        # Bound on the number of correcting updates; the same number is
        # reused in train() as the "converged" streak length.
        self.max_iteration = 5000

    def predict_(self, x):
        """Return the predicted class (0 or 1) for one augmented sample x.

        P(y=1) = exp(wx)/(1+exp(wx)) exceeds P(y=0) = 1/(1+exp(wx))
        exactly when wx > 0, so we compare wx against 0 directly.
        Unlike the naive math.exp(wx) comparison this cannot raise
        OverflowError for large |wx|.
        """
        wx = sum(self.w[j] * x[j] for j in range(len(self.w)))
        return 1 if wx > 0 else 0

    def train(self, features, labels):
        """Fit self.w by per-sample gradient steps on the log-likelihood.

        Stops after max_iteration correcting updates, or once
        max_iteration consecutive random samples are already classified
        correctly (treated as convergence).
        """
        self.w = [0.0] * (len(features[0]) + 1)  # +1 for the bias weight

        correct_count = 0
        step = 0

        while step < self.max_iteration:
            # Stochastic GD: look at one randomly chosen sample per pass.
            index = random.randint(0, len(labels) - 1)
            x = list(features[index])
            x.append(1.0)  # bias feature
            y = labels[index]

            if y == self.predict_(x):
                correct_count += 1
                if correct_count > self.max_iteration:
                    break  # long streak of correct answers: converged
                continue

            step += 1
            correct_count = 0

            wx = sum(self.w[i] * x[i] for i in range(len(self.w)))
            # sigmoid(wx) computed in a numerically stable branch; the
            # naive exp(wx)/(1+exp(wx)) overflows once wx > ~709.
            if wx >= 0:
                sig = 1.0 / (1.0 + math.exp(-wx))
            else:
                exp_wx = math.exp(wx)
                sig = exp_wx / (1.0 + exp_wx)

            # Gradient step on the negative log-likelihood of this sample.
            for i in range(len(self.w)):
                self.w[i] -= self.learning_step * (-y * x[i] + x[i] * sig)

    def predict(self, features):
        """Return a list with the predicted 0/1 label for each sample."""
        labels = []
        for feature in features:
            x = list(feature)  # copy so the caller's row is not mutated
            x.append(1.0)      # bias feature
            labels.append(self.predict_(x))
        return labels
def _run():
    """Load the binary-label csv, train LogisticRegression, report accuracy."""
    print('Start read data')
    time_1 = time.time()

    # First column is the 0/1 label; the remaining columns are features.
    raw_data = pd.read_csv('../data/train_binary.csv', header=0)
    data = raw_data.values
    imgs = data[:, 1:]
    labels = data[:, 0]

    # Hold out 1/3 of the data as the test set.
    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)

    time_2 = time.time()
    print('read data cost %s second' % (time_2 - time_1))

    print('Start training')
    lr = LogisticRegression()
    lr.train(train_features, train_labels)
    time_3 = time.time()
    print('training cost %s second' % (time_3 - time_2))

    print('Start predicting')
    test_predict = lr.predict(test_features)
    time_4 = time.time()
    print('predicting cost %s second' % (time_4 - time_3))

    score = accuracy_score(test_labels, test_predict)
    print('The accuracy score is %s' % score)


if __name__ == "__main__":
    _run()