|  | 
| 5 | 5 | # @Last modified by:   WenDesi | 
| 6 | 6 | # @Last modified time: 08-11-16 | 
| 7 | 7 | 
 | 
import time
import math
import random

import pandas as pd
try:
    # scikit-learn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # old module path, removed in modern scikit-learn
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
| 11 | 15 | 
 | 
| 12 |  | -def predict_(x, w): | 
| 13 |  | -    wx = sum([w[j] * x[j] for j in xrange(len(w))]) | 
| 14 |  | -    exp_wx = math.exp(wx) | 
| 15 | 16 | 
 | 
| 16 |  | -    predict1 = exp_wx / (1 + exp_wx) | 
| 17 |  | -    predict0 = 1 / (1 + exp_wx) | 
class LogisticRegression(object):
    """Binary (0/1) logistic-regression classifier trained with SGD.

    Weights ``self.w`` are created by :meth:`train`; the last weight is a
    bias term, matched by a constant 1 appended to every sample.
    """

    def __init__(self):
        # Step size for each stochastic gradient update.
        self.learning_step = 0.00001
        # Hard cap on gradient updates; the same number of *consecutive*
        # correct predictions also triggers early stopping.
        self.max_iteration = 5000

    def _sigmoid(self, wx):
        """Numerically stable sigmoid 1 / (1 + exp(-wx)).

        Branching on the sign keeps the argument of math.exp non-positive,
        avoiding the OverflowError the naive exp(wx) / (1 + exp(wx)) hits
        for large wx.
        """
        if wx >= 0:
            return 1.0 / (1.0 + math.exp(-wx))
        exp_wx = math.exp(wx)
        return exp_wx / (1.0 + exp_wx)

    def predict_(self, x):
        """Predict the label (0 or 1) for one bias-augmented sample ``x``.

        P(y=1|x) > P(y=0|x)  <=>  exp(w.x) > 1  <=>  w.x > 0, so only the
        sign of w.x is needed — no exponentiation, hence no overflow.
        """
        wx = sum(self.w[j] * x[j] for j in range(len(self.w)))
        return 1 if wx > 0 else 0

    def train(self, features, labels):
        """Fit ``self.w`` by stochastic gradient descent.

        features -- sequence of equal-length numeric feature vectors
        labels   -- sequence of 0/1 labels, parallel to ``features``
        """
        # One extra weight for the bias input appended to every sample.
        self.w = [0.0] * (len(features[0]) + 1)

        correct_count = 0
        iteration = 0  # renamed from `time`, which shadowed the time module

        while iteration < self.max_iteration:
            index = random.randint(0, len(labels) - 1)
            x = list(features[index])  # copy so callers' data is untouched
            x.append(1.0)  # bias input
            y = labels[index]

            if y == self.predict_(x):
                correct_count += 1
                # Early stop after a long streak of correct predictions.
                if correct_count > self.max_iteration:
                    break
                continue

            iteration += 1
            correct_count = 0

            wx = sum(self.w[i] * x[i] for i in range(len(self.w)))
            # dL/dw_i = x_i * (sigmoid(w.x) - y); the stable sigmoid avoids
            # the math.exp overflow of the original exp(wx)/(1+exp(wx)).
            p1 = self._sigmoid(wx)
            for i in range(len(self.w)):
                self.w[i] -= self.learning_step * x[i] * (p1 - y)

    def predict(self, features):
        """Return a list of predicted 0/1 labels, one per sample."""
        labels = []
        for feature in features:
            x = list(feature)
            x.append(1)  # bias input, matching train()
            labels.append(self.predict_(x))
        return labels
| 73 | 75 | 
 | 
| 74 |  | -    for i in xrange(size): | 
| 75 |  | -        dataset = [label] | 
| 76 |  | -        for j in xrange(dim): | 
| 77 |  | -            point = random.randint(0, 2 * radius) - radius + original_posins[j] | 
| 78 |  | -            dataset.append(point) | 
| 79 |  | -        datasets.append(dataset) | 
if __name__ == "__main__":
    # Single-string print(...) calls are valid in both Python 2 and 3;
    # the original multi-argument `print a, b` statements are py2-only.
    print('Start read data')

    time_1 = time.time()

    # train_binary.csv: first column is the 0/1 label, the rest are pixels.
    raw_data = pd.read_csv('../data/train_binary.csv', header=0)
    data = raw_data.values

    imgs = data[:, 1:]
    labels = data[:, 0]

    time_2 = time.time()
    print('read data cost %f second\n' % (time_2 - time_1))

    # 2/3 of the data for training, 1/3 for testing (fixed seed).
    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)

    print('Start training')
    lr = LogisticRegression()
    lr.train(train_features, train_labels)

    time_3 = time.time()
    print('training cost %f second\n' % (time_3 - time_2))

    print('Start predicting')
    test_predict = lr.predict(test_features)
    time_4 = time.time()
    print('predicting cost %f second\n' % (time_4 - time_3))

    score = accuracy_score(test_labels, test_predict)
    # Typo fix: "accruacy socre" -> "accuracy score".
    print('The accuracy score is %f' % score)
0 commit comments