|
5 | 5 | # @Last modified by: WenDesi |
6 | 6 | # @Last modified time: 08-11-16 |
7 | 7 |
|
| 8 | +import time |
8 | 9 | import math |
9 | 10 | import random |
10 | 11 |
|
| 12 | +import pandas as pd |
| 13 | +from sklearn.cross_validation import train_test_split |
| 14 | +from sklearn.metrics import accuracy_score |
11 | 15 |
|
class LogisticRegression(object):
    """Binary logistic regression classifier trained with stochastic
    gradient descent.

    The weight vector ``self.w`` (created by ``train``) has one extra
    trailing component acting as the bias term; every sample is
    augmented with a constant 1.0 feature before it is used.
    """

    def __init__(self):
        # SGD step size and iteration budget.  max_iteration also doubles
        # as the "consecutive correct predictions" threshold used as the
        # convergence heuristic in train().
        self.learning_step = 0.00001
        self.max_iteration = 5000

    def predict_(self, x):
        """Predict the label (0 or 1) of one bias-augmented sample *x*.

        P(y=1|x) = exp(w.x) / (1 + exp(w.x)) exceeds
        P(y=0|x) = 1 / (1 + exp(w.x)) exactly when w.x > 0, so the dot
        product is compared against zero directly.  This avoids the
        OverflowError that math.exp(wx) raises for large |wx| in the
        original formulation, while returning identical labels.
        """
        wx = sum(self.w[j] * x[j] for j in range(len(self.w)))
        return 1 if wx > 0 else 0

    def train(self, features, labels):
        """Fit ``self.w`` on (features, labels) with stochastic gradient
        descent on the logistic negative log-likelihood.

        features -- sequence of equal-length numeric feature rows
        labels   -- sequence of 0/1 labels, parallel to features
        Stops after max_iteration mis-classified updates, or early once
        more than max_iteration consecutive samples are classified
        correctly.
        """
        # One weight per feature plus a trailing bias weight.
        self.w = [0.0] * (len(features[0]) + 1)

        correct_count = 0
        iteration = 0  # renamed from `time` to stop shadowing the time module

        while iteration < self.max_iteration:
            # Pick one random training sample and augment it with the
            # constant bias feature.
            index = random.randint(0, len(labels) - 1)
            x = list(features[index])
            x.append(1.0)
            y = labels[index]

            if y == self.predict_(x):
                correct_count += 1
                # Convergence heuristic: a long streak of correct
                # predictions ends training early.
                if correct_count > self.max_iteration:
                    break
                continue

            # Mis-classified: count the update and reset the streak.
            iteration += 1
            correct_count = 0

            wx = sum(self.w[i] * x[i] for i in range(len(self.w)))
            # Numerically stable sigmoid(wx) = exp(wx) / (1 + exp(wx));
            # the direct form overflows math.exp for large positive wx.
            if wx >= 0:
                sigmoid = 1.0 / (1.0 + math.exp(-wx))
            else:
                e = math.exp(wx)
                sigmoid = e / (1.0 + e)

            # Per-sample gradient of the negative log-likelihood:
            # d/dw_i = -y * x_i + x_i * sigmoid(wx)
            for i in range(len(self.w)):
                self.w[i] -= self.learning_step * (-y * x[i] + x[i] * sigmoid)

    def predict(self, features):
        """Return a list with the predicted 0/1 label for every row of
        *features* (each row is bias-augmented before classification)."""
        labels = []
        for feature in features:
            x = list(feature)
            x.append(1)
            labels.append(self.predict_(x))
        return labels
if __name__ == "__main__":
    # End-to-end run: load a binary-label CSV (first column = label,
    # remaining columns = features), train on 2/3 of the rows and report
    # accuracy on the remaining 1/3.
    # print statements were rewritten as single-argument print() calls so
    # the script parses under both Python 2 and Python 3.
    print('Start read data')

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train_binary.csv', header=0)
    data = raw_data.values

    # Column 0 is the label; everything after it is a feature.
    imgs = data[:, 1:]
    labels = data[:, 0]

    # 2/3 of the data for training, 1/3 for testing (fixed seed so the
    # split is reproducible).
    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)

    time_2 = time.time()
    print('read data cost %s second\n' % (time_2 - time_1))

    print('Start training')
    lr = LogisticRegression()
    lr.train(train_features, train_labels)

    time_3 = time.time()
    print('training cost %s second\n' % (time_3 - time_2))

    print('Start predicting')
    test_predict = lr.predict(test_features)
    time_4 = time.time()
    print('predicting cost %s second\n' % (time_4 - time_3))

    score = accuracy_score(test_labels, test_predict)
    print('The accuracy score is %s' % score)