diff --git a/AdaBoost/adaboost.py b/AdaBoost/adaboost.py
index 1851f3b..645973f 100644
--- a/AdaBoost/adaboost.py
+++ b/AdaBoost/adaboost.py
@@ -12,7 +12,7 @@
 import numpy as np
 import pandas as pd
 
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
 
 sign_time_count = 0
@@ -48,7 +48,7 @@ def _train_less_than_(self):
         for i in self.indexes:
             score = 0
-            for j in xrange(self.N):
+            for j in range(self.N):
                 val = -1
                 if self.X[j] < i:
                     val = 1
diff --git a/logistic_regression/binary_perceptron.py b/logistic_regression/binary_perceptron.py
--- a/logistic_regression/binary_perceptron.py
+++ b/logistic_regression/binary_perceptron.py
@@ -23,7 +23,7 @@ def __init__(self):
         self.max_iteration = 5000
 
     def predict_(self, x):
-        wx = sum([self.w[j] * x[j] for j in xrange(len(self.w))])
+        wx = sum([self.w[j] * x[j] for j in range(len(self.w))])
         return int(wx > 0)
 
     def train(self, features, labels):
@@ -37,7 +37,7 @@ def train(self, features, labels):
             x = list(features[index])
             x.append(1.0)
             y = 2 * labels[index] - 1
-            wx = sum([self.w[j] * x[j] for j in xrange(len(self.w))])
+            wx = sum([self.w[j] * x[j] for j in range(len(self.w))])
 
             if wx * y > 0:
                 correct_count += 1
@@ -45,7 +45,7 @@ def train(self, features, labels):
                     break
                 continue
 
-            for i in xrange(len(self.w)):
+            for i in range(len(self.w)):
                 self.w[i] += self.learning_step * (y * x[i])
 
     def predict(self,features):
@@ -59,7 +59,7 @@ def predict(self,features):
 
 if __name__ == '__main__':
 
-    print 'Start read data'
+    print('Start read data')
 
     time_1 = time.time()
@@ -76,19 +76,19 @@ def predict(self,features):
     # print train_features.shape
     time_2 = time.time()
-    print 'read data cost ', time_2 - time_1, ' second', '\n'
+    print('read data cost ', time_2 - time_1, ' second', '\n')
 
-    print 'Start training'
+    print('Start training')
     p = Perceptron()
     p.train(train_features, train_labels)
 
     time_3 = time.time()
-    print 'training cost ', time_3 - time_2, ' second', '\n'
+    print('training cost ', time_3 - time_2, ' second', '\n')
 
-    print 'Start predicting'
+    print('Start predicting')
     test_predict = p.predict(test_features)
     time_4 = time.time()
-    print 'predicting cost ', time_4 - time_3, ' second', '\n'
+    print('predicting cost ', time_4 - time_3, ' second', '\n')
 
     score = accuracy_score(test_labels, test_predict)
-    print "The accruacy socre is ", score
+    print("The accuracy score is ", score)
diff --git a/logistic_regression/competation.py b/logistic_regression/competation.py
index cba8f01..7386ef1 100644
--- a/logistic_regression/competation.py
+++ b/logistic_regression/competation.py
@@ -11,7 +11,7 @@
 from binary_perceptron import Perceptron
 from logistic_regression import LogisticRegression
 
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
 
 if __name__ == '__main__':
@@ -29,8 +29,8 @@
     writer = csv.writer(file('result.csv', 'wb'))
 
-    for time in xrange(test_time):
-        print 'iterater time %d' % time
+    for time in range(test_time):
+        print('iteration %d' % time)
         train_features, test_features, train_labels, test_labels = train_test_split(
             imgs, labels, test_size=0.33, random_state=23323)
@@ -44,7 +44,7 @@
         p_score = accuracy_score(test_labels, p_predict)
         lr_score = accuracy_score(test_labels, lr_predict)
 
-        print 'perceptron accruacy score ', p_score
-        print 'logistic Regression accruacy score ', lr_score
+        print('perceptron accuracy score ', p_score)
+        print('logistic regression accuracy score ', lr_score)
 
         writer.writerow([time,p_score,lr_score])
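Note on the competation.py hunk above: the untouched context line `writer = csv.writer(file('result.csv', 'wb'))` still uses the Python 2 `file` builtin and binary mode, so the ported script raises NameError under Python 3. A minimal follow-up sketch, assuming the same result.csv output:

```python
import csv

# Python 3: the `file` builtin is gone, and the csv module wants a
# text-mode handle opened with newline='' instead of the old 'wb'.
with open('result.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([0, 0.97, 0.98])  # illustrative row: [time, p_score, lr_score]
```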
diff --git a/logistic_regression/logistic_regression.py b/logistic_regression/logistic_regression.py
index 84e8233..5da8901 100644
--- a/logistic_regression/logistic_regression.py
+++ b/logistic_regression/logistic_regression.py
@@ -10,7 +10,7 @@
 import random
 import pandas as pd
 
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
@@ -21,7 +21,7 @@ def __init__(self):
         self.max_iteration = 5000
 
     def predict_(self,x):
-        wx = sum([self.w[j] * x[j] for j in xrange(len(self.w))])
+        wx = sum([self.w[j] * x[j] for j in range(len(self.w))])
         exp_wx = math.exp(wx)
 
         predict1 = exp_wx / (1 + exp_wx)
@@ -55,10 +55,10 @@ def train(self,features, labels):
             time += 1
             correct_count = 0
 
-            wx = sum([self.w[i] * x[i] for i in xrange(len(self.w))])
+            wx = sum([self.w[i] * x[i] for i in range(len(self.w))])
             exp_wx = math.exp(wx)
 
-            for i in xrange(len(self.w)):
+            for i in range(len(self.w)):
                 self.w[i] -= self.learning_step * \
                     (-y * x[i] + float(x[i] * exp_wx) / float(1 + exp_wx))
@@ -74,7 +74,7 @@ def predict(self,features):
         return labels
 
 if __name__ == "__main__":
-    print 'Start read data'
+    print('Start read data')
 
     time_1 = time.time()
@@ -89,19 +89,19 @@ def predict(self,features):
     train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33, random_state=23323)
     time_2 = time.time()
-    print 'read data cost ',time_2 - time_1,' second','\n'
+    print('read data cost ',time_2 - time_1,' second','\n')
 
-    print 'Start training'
+    print('Start training')
     lr = LogisticRegression()
     lr.train(train_features, train_labels)
 
     time_3 = time.time()
-    print 'training cost ',time_3 - time_2,' second','\n'
+    print('training cost ',time_3 - time_2,' second','\n')
 
-    print 'Start predicting'
+    print('Start predicting')
     test_predict = lr.predict(test_features)
     time_4 = time.time()
-    print 'predicting cost ',time_4 - time_3,' second','\n'
+    print('predicting cost ',time_4 - time_3,' second','\n')
 
     score = accuracy_score(test_labels,test_predict)
-    print "The accruacy socre is ", score
+    print("The accuracy score is ", score)
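The logistic regression update above still evaluates `math.exp(wx)` on a raw dot product, which raises OverflowError once wx exceeds roughly 709. A numerically stable sigmoid sketch (not part of this patch; the function name is illustrative):

```python
import math

def stable_sigmoid(wx):
    # For wx >= 0 rewrite exp(wx) / (1 + exp(wx)) as 1 / (1 + exp(-wx)),
    # so exp() is only ever evaluated on a non-positive argument.
    if wx >= 0:
        return 1.0 / (1.0 + math.exp(-wx))
    e = math.exp(wx)
    return e / (1.0 + e)
```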
diff --git a/maxENT/maxENT.py b/maxENT/maxENT.py
index 3acad95..e5ba71e 100644
--- a/maxENT/maxENT.py
+++ b/maxENT/maxENT.py
@@ -15,7 +15,7 @@
 from collections import defaultdict
 
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
@@ -47,7 +47,7 @@ def cal_Pxy_Px(self, X, Y):
         self.Pxy = defaultdict(int)
         self.Px = defaultdict(int)
 
-        for i in xrange(len(X)):
+        for i in range(len(X)):
             x_, y = X[i], Y[i]
             self.Y_.add(y)
@@ -60,7 +60,7 @@ def cal_EPxy(self):
         计算书中82页最下面那个期望
         '''
         self.EPxy = defaultdict(float)
-        for id in xrange(self.n):
+        for id in range(self.n):
             (x, y) = self.id2xy[id]
             self.EPxy[id] = float(self.Pxy[(x, y)]) / float(self.N)
@@ -84,7 +84,7 @@ def cal_EPx(self):
         '''
         计算书83页最上面那个期望
         '''
-        self.EPx = [0.0 for i in xrange(self.n)]
+        self.EPx = [0.0 for i in range(self.n)]
 
         for i, X in enumerate(self.X_):
             Pyxs = self.cal_probality(X)
@@ -104,19 +104,19 @@ def train(self, X, Y):
         self.w = [0.0 for i in range(self.n)]
         max_iteration = 1000
 
-        for times in xrange(max_iteration):
-            print 'iterater times %d' % times
+        for times in range(max_iteration):
+            print('iteration %d' % times)
             sigmas = []
             self.cal_EPx()
 
-            for i in xrange(self.n):
+            for i in range(self.n):
                 sigma = 1 / self.M * math.log(self.EPxy[i] / self.EPx[i])
                 sigmas.append(sigma)
 
             # if len(filter(lambda x: abs(x) >= 0.01, sigmas)) == 0:
             #     break
 
-            self.w = [self.w[i] + sigmas[i] for i in xrange(self.n)]
+            self.w = [self.w[i] + sigmas[i] for i in range(self.n)]
 
     def predict(self, testset):
         results = []
@@ -142,7 +142,7 @@ def rebuild_features(features):
 
 if __name__ == "__main__":
-    print 'Start read data'
+    print('Start read data')
 
     time_1 = time.time()
@@ -160,19 +160,19 @@ def rebuild_features(features):
     test_features = rebuild_features(test_features)
     time_2 = time.time()
-    print 'read data cost ', time_2 - time_1, ' second', '\n'
+    print('read data cost ', time_2 - time_1, ' second', '\n')
 
-    print 'Start training'
+    print('Start training')
     met = MaxEnt()
     met.train(train_features, train_labels)
 
     time_3 = time.time()
-    print 'training cost ', time_3 - time_2, ' second', '\n'
+    print('training cost ', time_3 - time_2, ' second', '\n')
 
-    print 'Start predicting'
+    print('Start predicting')
     test_predict = met.predict(test_features)
     time_4 = time.time()
-    print 'predicting cost ', time_4 - time_3, ' second', '\n'
+    print('predicting cost ', time_4 - time_3, ' second', '\n')
 
     score = accuracy_score(test_labels, test_predict)
-    print "The accruacy socre is ", score
+    print("The accuracy score is ", score)
diff --git a/naive_bayes/naive_bayes.py b/naive_bayes/naive_bayes.py
index 07c2091..099e54d 100644
--- a/naive_bayes/naive_bayes.py
+++ b/naive_bayes/naive_bayes.py
@@ -6,13 +6,13 @@
 import random
 import time
 
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
 
 # 二值化
 def binaryzation(img):
     cv_img = img.astype(np.uint8)
-    cv2.threshold(cv_img,50,1,cv2.cv.CV_THRESH_BINARY_INV,cv_img)
+    cv2.threshold(cv_img,50,1,cv2.THRESH_BINARY_INV,cv_img)
     return cv_img
 
 def Train(trainset,train_labels):
@@ -83,7 +83,7 @@ def Predict(testset,prior_probability,conditional_probability):
 
 if __name__ == '__main__':
 
-    print 'Start read data'
+    print('Start read data')
 
     time_1 = time.time()
@@ -99,17 +99,17 @@ def Predict(testset,prior_probability,conditional_probability):
     # print train_features.shape
     time_2 = time.time()
-    print 'read data cost ',time_2 - time_1,' second','\n'
+    print('read data cost ',time_2 - time_1,' second','\n')
 
-    print 'Start training'
+    print('Start training')
     prior_probability,conditional_probability = Train(train_features,train_labels)
 
     time_3 = time.time()
-    print 'training cost ',time_3 - time_2,' second','\n'
+    print('training cost ',time_3 - time_2,' second','\n')
 
-    print 'Start predicting'
+    print('Start predicting')
     test_predict = Predict(test_features,prior_probability,conditional_probability)
 
     time_4 = time.time()
-    print 'predicting cost ',time_4 - time_3,' second','\n'
+    print('predicting cost ',time_4 - time_3,' second','\n')
 
     score = accuracy_score(test_labels,test_predict)
-    print "The accruacy socre is ", score
\ No newline at end of file
+    print("The accuracy score is ", score)
\ No newline at end of file
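The binaryzation hunk above only swaps the constant name, but in modern OpenCV `cv2.threshold` also returns a `(retval, dst)` tuple, so the in-place dst argument can be dropped. A hedged sketch of the more idiomatic form:

```python
import cv2
import numpy as np

def binaryzation(img):
    cv_img = img.astype(np.uint8)
    # cv2.threshold returns (threshold_value, output_image); taking the
    # returned array avoids mutating the input buffer in place.
    _, cv_img = cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV)
    return cv_img
```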
diff --git a/notebooks/1-perceptron.ipynb b/notebooks/1-perceptron.ipynb
new file mode 100644
index 0000000..ec5083d
--- /dev/null
+++ b/notebooks/1-perceptron.ipynb
@@ -0,0 +1,101 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This uses the sigmoid function and handles a binary classification problem"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import cv2\n",
+    "import random\n",
+    "import time\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "class Perceptron(object):\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        self.learning_step = 0.00001\n",
+    "        self.max_iteration = 5000\n",
+    "    \n",
+    "    def train(self, features, labels):\n",
+    "        self.w = [0.0] * (len(features[0]) + 1)\n",
+    "        correct_count = 0\n",
+    "        time = 0\n",
+    "        \n",
+    "        while time < self.max_iteration:\n",
+    "            index = random.randint(0, len(labels) - 1)\n",
+    "            x = list(features[index])\n",
+    "            x.append(1.0)\n",
+    "            y = 2 * labels[index] - 1\n",
+    "            wx = sum([self.w[j] * x[j] for j in range(len(self.w))])\n",
+    "            \n",
+    "            if wx * y > 0:\n",
+    "                correct_count += 1\n",
+    "                if correct_count > self.max_iteration:\n",
+    "                    break\n",
+    "                continue\n",
+    "\n",
+    "            for i in range(len(self.w)):\n",
+    "                self.w[i] += self.learning_step * (y * x[i])\n",
+    "\n",
+    "    def predict_(self, x):\n",
+    "        wx = sum([self.w[j] * x[j] for j in range(len(self.w))])\n",
+    "        return int(wx > 0)\n",
+    "\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [py35]",
+   "language": "python",
+   "name": "Python [py35]"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
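The notebook's train loop computes `wx` with a per-sample Python list comprehension. A sketch of the same prediction step vectorized with NumPy, assuming the bias-as-last-weight convention from `x.append(1.0)` (the function name is illustrative, not from the notebook):

```python
import numpy as np

def predict_batch(w, features):
    # append the constant bias feature, then threshold the margins of
    # every sample in one matrix-vector product
    X = np.hstack([np.asarray(features, dtype=float), np.ones((len(features), 1))])
    return (X @ np.asarray(w) > 0).astype(int)
```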
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
labelpixel0pixel1pixel2pixel3pixel4pixel5pixel6pixel7pixel8...pixel774pixel775pixel776pixel777pixel778pixel779pixel780pixel781pixel782pixel783
01000000000...0000000000
10000000000...0000000000
21000000000...0000000000
34000000000...0000000000
40000000000...0000000000
\n", + "

5 rows × 785 columns

\n", + "
" + ], + "text/plain": [ + " label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 \\\n", + "0 1 0 0 0 0 0 0 0 0 \n", + "1 0 0 0 0 0 0 0 0 0 \n", + "2 1 0 0 0 0 0 0 0 0 \n", + "3 4 0 0 0 0 0 0 0 0 \n", + "4 0 0 0 0 0 0 0 0 0 \n", + "\n", + " pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 \\\n", + "0 0 ... 0 0 0 0 0 \n", + "1 0 ... 0 0 0 0 0 \n", + "2 0 ... 0 0 0 0 0 \n", + "3 0 ... 0 0 0 0 0 \n", + "4 0 ... 0 0 0 0 0 \n", + "\n", + " pixel779 pixel780 pixel781 pixel782 pixel783 \n", + "0 0 0 0 0 0 \n", + "1 0 0 0 0 0 \n", + "2 0 0 0 0 0 \n", + "3 0 0 0 0 0 \n", + "4 0 0 0 0 0 \n", + "\n", + "[5 rows x 785 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(42000, 785)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_data.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "两个冒号的语法:\n", + " seq[start:end:step]\n", + "原来是\n", + " imgs = data[0::,1::]\n", + " labels = data[::,0]\n", + "没必要这样写" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "data = raw_data.values\n", + "imgs = data[:, 1:]\n", + "labels = data[:, 0]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(42000, 784)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "imgs.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.unique(labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# 利用opencv获取图像hog特征\n", + "def get_hog_features(trainset):\n", + " features = []\n", + "\n", + " hog = cv2.HOGDescriptor('../hog.xml')\n", + "\n", + " for img in trainset:\n", + " img = np.reshape(img,(28,28))\n", + " cv_img = img.astype(np.uint8)\n", + "\n", + " hog_feature = hog.compute(cv_img)\n", + " # hog_feature = np.transpose(hog_feature)\n", + " features.append(hog_feature)\n", + "\n", + " features = np.array(features)\n", + " features = np.reshape(features,(-1,324))\n", + "\n", + " return features" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "features = get_hog_features(imgs)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(42000, 324)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(42000,)" + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels.shape" + ] + }, + { + "cell_type": "code", + 
"execution_count": 22, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, random_state=23323)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 预测\n", + "\n", + "因为knn不需要训练,我们可以直接进行预测。不过因为4万个数据即使是预测也非常花时间,这里只取前100个样本做训练集,去30个样本做测试集:" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "testset, trainset, train_labels = test_features[:30], train_features[:100], train_labels[:100]" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "k = 10 # 最近的10个点\n", + "\n", + "predict = []\n", + "count = 0" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "5.0" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 计算两个点的欧氏距离\n", + "np.linalg.norm(np.array([0, 3]) - np.array([4, 0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "time_1 = time.time()\n", + "\n", + "for test_vec in testset:\n", + " # 输出当前运行的测试用例坐标,用于测试\n", + " count += 1\n", + " if count % 5000 == 0:\n", + " print(count)\n", + " \n", + " knn_list = np.zeros((1, 2)) # 初始化,存放当前k个最近邻居\n", + " \n", + " # 先将前k个点放入k个最近邻居中,填充满knn_list\n", + " for i in range(k):\n", + " label = train_labels[i]\n", + " train_vec = trainset[i]\n", + "\n", + " dist = np.linalg.norm(train_vec - test_vec) # 计算两个点的欧氏距离\n", + " knn_list = np.append(knn_list, [[dist, label]], axis=0)\n", + " \n", + " # 剩下的点\n", + " for i in range(k, len(train_labels)):\n", + " label = train_labels[i]\n", + " train_vec = trainset[i]\n", + "\n", + " dist = np.linalg.norm(train_vec - test_vec) # 计算两个点的欧氏距离\n", + "\n", + " # 寻找10个邻近点中距离最远的点\n", + " max_index = np.argmax(knn_list[:, 0])\n", + " max_dist = np.max(knn_list[:, 0])\n", + "\n", + " # 如果当前k个最近邻居中存在点距离比当前点距离远,则替换\n", + " if dist < max_dist:\n", + " knn_list[max_index] = [dist, label]\n", + " \n", + " \n", + " # 上面代码计算全部运算完之后,即说明已经找到了离当前test_vec最近的10个train_vec\n", + " # 统计选票\n", + " class_total = 10\n", + " class_count = [0 for i in range(class_total)]\n", + " for dist, label in knn_list:\n", + " class_count[int(label)] += 1\n", + "\n", + " # 找出最大选票数\n", + " label_max = max(class_count)\n", + "\n", + " # 最大选票数对应的class\n", + " predict.append(class_count.index(label_max))\n", + "\n", + "time_2 = time.time()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train time is 0.07612895965576172\n" + ] + } + ], + "source": [ + "print('train time is %s' % (time_2 - time_1))" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train time is 3\n" + ] + } + ], + "source": [ + "print('train time is %s' % (5-2))" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0. , 0. ],\n", + " [ 1.10036302, 3. ],\n", + " [ 1.09803486, 3. ],\n", + " [ 1.09235775, 3. ],\n", + " [ 1.03992426, 3. 
],\n", + " [ 1.04467952, 3. ],\n", + " [ 1.06501627, 3. ],\n", + " [ 0.93764162, 3. ],\n", + " [ 1.05351973, 3. ],\n", + " [ 1.04691565, 3. ],\n", + " [ 0.9816038 , 3. ]])" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn_list" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([], dtype=float64)" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn_list = np.array([]) # 当前k个最近邻居\n", + " \n", + "# 先将前k个点放入k个最近邻居中,填充满knn_list\n", + "for i in range(k):\n", + " label = train_labels[i]\n", + " train_vec = trainset[i]\n", + "\n", + " dist = np.linalg.norm(train_vec - test_vec) # 计算两个点的欧氏距离\n", + " knn_list_test = np.append(knn_list_test, [[8.5, 9]], axis=0)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 测试用\n", + "\n", + "下面自己写一个寻找10个领近点中距离最远的点:" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0., 0.]])" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn_list = np.zeros((1, 2)) # 当前k个最近邻居\n", + "knn_list" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ValueError", + "evalue": "all the input array dimensions except for the concatenation axis must match exactly", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mknn_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m8.5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m9\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/Users/xu/anaconda/envs/py35/lib/python3.5/site-packages/numpy/lib/function_base.py\u001b[0m in \u001b[0;36mappend\u001b[0;34m(arr, values, axis)\u001b[0m\n\u001b[1;32m 5145\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5146\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5147\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mValueError\u001b[0m: all the input array dimensions except for the concatenation axis must match exactly" + ] + } + ], + "source": [ + "np.append(knn_list, [[8.5, 9]], axis=0)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 78, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 2.3, 1. ],\n", + " [ 3.5, 1. ],\n", + " [ 1.5, 4. ],\n", + " [ 6.5, 2. ],\n", + " [ 5.5, 8. ]])" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn_list_test = np.array([[2.3, 1], [3.5, 1], [1.5, 4], [6.5, 2], [5.5, 8]])\n", + "# 每个元组里,第一个是距离,第二个是对应标签\n", + "knn_list_test" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 2.3, 3.5, 1.5, 6.5, 5.5])" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn_list_test[:, 0]" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "knn_list_test[2] = [9.5, 5]" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 2.3, 1. ],\n", + " [ 3.5, 1. ],\n", + " [ 9.5, 5. ],\n", + " [ 6.5, 2. ],\n", + " [ 5.5, 8. ]])" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn_list_test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "要想给一个ndarray添加一个元素,必须是同样的格式,即必须是`[[8.5, 9]]`,不能使`[8.5, 9]`,而且必须要用axis指定才行。" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 2.3, 1. ],\n", + " [ 3.5, 1. ],\n", + " [ 9.5, 5. ],\n", + " [ 6.5, 2. ],\n", + " [ 5.5, 8. ],\n", + " [ 8.5, 9. ],\n", + " [ 8.5, 9. ]])" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.append(knn_list_test, [[8.5, 9]], axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 2.3, 1. ],\n", + " [ 3.5, 1. ],\n", + " [ 9.5, 5. ],\n", + " [ 6.5, 2. ],\n", + " [ 5.5, 8. ],\n", + " [ 8.5, 9. 
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "knn_list_test[:, 0].argmax()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([], dtype=float64)"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.array([])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Output the score\n",
+    "\n",
+    "After the tally finishes, we have predict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 125,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "30"
+      ]
+     },
+     "execution_count": 125,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(predict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 127,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "test_predict = np.array(predict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 128,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "score = accuracy_score(test_labels[:30], test_predict)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 129,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.6333333333333333"
+      ]
+     },
+     "execution_count": 129,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "score"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [py35]",
+   "language": "python",
+   "name": "Python [py35]"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
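The k-NN loop above computes one Euclidean distance at a time and keeps the k best by rescanning with argmax. A hedged NumPy sketch of the same vote done with broadcasting, assuming the trainset/train_labels/testset arrays defined in the notebook and integer labels 0 to 9:

```python
import numpy as np

def knn_predict(trainset, train_labels, testset, k=10):
    predictions = []
    for test_vec in testset:
        # distances to every training vector in one vectorized call
        dists = np.linalg.norm(trainset - test_vec, axis=1)
        # indices of the k smallest distances (unordered, which a vote ignores)
        nearest = np.argpartition(dists, k)[:k]
        votes = np.bincount(train_labels[nearest].astype(int), minlength=10)
        predictions.append(int(votes.argmax()))
    return np.array(predictions)
```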
diff --git a/notebooks/3-naive_bayes.ipynb b/notebooks/3-naive_bayes.ipynb
new file mode 100644
index 0000000..d821bea
--- /dev/null
+++ b/notebooks/3-naive_bayes.ipynb
@@ -0,0 +1,1027 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import cv2\n",
+    "import random\n",
+    "import time\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(28140, 784)\n",
+      "(13860, 784)\n"
+     ]
+    }
+   ],
+   "source": [
+    "raw_data = pd.read_csv('../data/train.csv',header=0)\n",
+    "data = raw_data.values\n",
+    "imgs = data[:, 1:]\n",
+    "labels = data[:, 0]\n",
+    "# use 2/3 of the data for training and 1/3 for testing\n",
+    "train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33, random_state=23323)\n",
+    "\n",
+    "print(train_features.shape)\n",
+    "print(test_features.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# binarization\n",
+    "def binaryzation(img):\n",
+    "    cv_img = img.astype(np.uint8)\n",
+    "    cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img)\n",
+    "    return cv_img"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "    cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img)\n",
+    "In this call, cv_img is the input of 784 pixel values (0~255), 50 is the threshold, 1 is the maximum value, and cv2.THRESH_BINARY_INV is the thresholding type. With the inverse binary type, pixels whose value exceeds 50 become 0 and all other pixels become 1.\n",
+    "\n",
+    "Let's look at the effect of binarization:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  63,\n",
+       "       255, 253, 253, 244, 120,  22,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  12,\n",
+       "       100, 209, 253, 252, 252, 252, 252, 187,   6,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "       144, 217, 252, 161, 253, 183, 153, 106, 218, 252,  70,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "        87, 180, 242, 243, 202,  68,  10,   3,   0,   0,  60, 194,  31,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   5, 184, 252, 226,  93,  23,   0,   0,   0,   0,   0,  32,\n",
+       "       142, 179,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0, 195, 252, 183,  29,   0,   0,   0,   0,   0,   0,\n",
+       "         0, 141, 252,  45,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,  48, 247, 173,  38,   0,   0,   0,   0,   0,\n",
+       "         0,   0,  26, 245, 252,  74,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0, 100, 229,  72,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0, 132, 252, 252, 131,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,  26, 153,  27,   0,   0,\n",
+       "         0,   0,   0,   0,   0,  34, 132, 252, 252,  98,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 242, 159,\n",
+       "       111,  58,  68,  77,   0,  15,  34,  14, 180, 252, 252,  21,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "       181, 253, 253, 253, 253, 114,   0,   0,   0, 100, 253, 253, 141,\n",
+       "        10,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,  50, 229, 252, 249, 120,  20,   0,   0,   0, 176, 252,\n",
+       "       252,  55,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,  29,  44,  42,   0,   0,   0,   0,   0,\n",
+       "       209, 252, 206,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         3, 128, 251, 252,  92,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,  58, 252, 252, 238,  31,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,  39, 230, 252, 252, 143,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0, 116, 253, 252, 252,  20,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,  14, 226, 253, 252, 172,   4,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0, 159, 252, 253, 232,  30,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,  20, 216, 252, 253,\n",
+       "       186,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
+       "         0,   0,   0,   0])"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "trainset[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,\n",
+       "       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,\n",
+       "       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,\n",
+       "       0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,\n",
+       "       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,\n",
+       "       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 1], dtype=uint8)"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "binaryzation(trainset[0]) # binarize the image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "trainset, train_labels = train_features, train_labels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "class_num = 10\n",
+    "feature_len = 784"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(10,)\n",
+      "(10, 784, 2)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# holds the prior probabilities\n",
+    "prior_probability = np.zeros(class_num) \n",
+    "print(prior_probability.shape)\n",
+    "# holds the conditional probabilities\n",
+    "conditional_probability = np.zeros((class_num, feature_len, 2)) \n",
+    "print(conditional_probability.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "See this article for details: [机器学习通俗入门-朴素贝叶斯分类器](http://blog.csdn.net/TaiJi1985/article/details/73657994)\n",
+    "\n",
+    "$x^{(i)}$ is the vector of the i-th sample (28\\*28 = 784 dimensions) and $y^{(i)}$ its labelled class. The target we are solving for is\n",
+    "\n",
+    "$$f = \\underset{j}{\\arg\\max}\\, P(y^{(i)} = j \\mid x^{(i)})$$\n",
+    "\n",
+    "Simply put: compute $P(y^{(i)} = 0 \\mid x^{(i)})$, $P(y^{(i)} = 1 \\mid x^{(i)})$ ... $P(y^{(i)} = 9 \\mid x^{(i)})$ and find the largest; if the probability of belonging to class j is largest, the image is assigned to class j.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# compute the prior and conditional probabilities\n",
+    "for i in range(len(train_labels)):\n",
+    "    img = binaryzation(trainset[i]) # binarize the image\n",
+    "    label = train_labels[i]\n",
+    "\n",
+    "    prior_probability[label] += 1 # how many images each label has\n",
+    "\n",
+    "    for j in range(feature_len):\n",
+    "        conditional_probability[label][j][img[j]] += 1 \n",
+    "        # img[j] is the value at pixel j: a 0 increments the first slot, a 1 the second.\n",
+    "        # e.g. conditional_probability[0][0] below is [0, 2711]: among the images\n",
+    "        # labelled 0, how many have 0 and how many have 1 at that pixel position"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 2711.,  3197.,  2828.,  2897.,  2751.,  2565.,  2769.,  2964.,\n",
+       "        2654.,  2804.])"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "prior_probability"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's unpack the loop above, taking the first training sample:\n"
+   ]
+  },
"code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 199. 2512.]\n" + ] + } + ], + "source": [ + "print(conditional_probability[0][500]) # " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "下面之所以将概率归到[1.10001],是因为上面所有关于概率的部分都是直接用样本数量,而不是实际的概率来记录的。这么做应该是为了在工程上解决内存,但是这种工程上的优化,对于理解书中的公式造成了影响。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "而且下面计算概率的时候有点问题:\n", + " probalility_0 = (float(pix_0)/float(pix_0+pix_1))*1000000 + 1\n", + "分母部分是,属于i类(0~9)的图像中,像素j的数量……对啊,这个像素j的数量其实就是pix_0和pix_1的和,即属于i类的图像的数量。看来这里没问题,是我想多了。" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# 将概率归到[1.10001]\n", + "for i in range(class_num):\n", + " for j in range(feature_len):\n", + "\n", + " # 经过二值化后图像只有0,1两种取值\n", + " pix_0 = conditional_probability[i][j][0] # 属于i类(0~9)的图像中,像素j(0~783)为0的数量\n", + " pix_1 = conditional_probability[i][j][1] # 属于i类(0~9)的图像中,像素j(0~783)为1的数量\n", + "\n", + " # 计算0,1像素点对应的条件概率\n", + " probalility_0 = (float(pix_0)/float(pix_0+pix_1))*1000000 + 1\n", + " probalility_1 = (float(pix_1)/float(pix_0+pix_1))*1000000 + 1\n", + "\n", + " conditional_probability[i][j][0] = probalility_0\n", + " conditional_probability[i][j][1] = probalility_1" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1.00000000e+00, 1.00000100e+06])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conditional_probability[0][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "得到了prior_probability和conditional_probability,这就算是训练结束了。\n", + "\n", + "# test (predict)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(100, 784)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 为了加快预测速度,这里直接取100个测试样本\n", + "\n", + "testset = test_features[:100]\n", + "testset.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "$$p (y^{(i)} = j \\mid x_{k}^{(i)}) = \\frac{p (x_{k}^{(i)} \\mid y^{(i)} = j) \\cdot p(y^{(i)} = j)}{p(x_{k}^{(i)})}$$\n", + "\n", + "$p (y^{(i)} = j \\mid x_{k}^{(i)}) $中,$y^{(i)} = j$表示从属于哪一类,$x_{k}^{(i)}$表示哪一个像素点。\n", + "\n", + "下面calculate_probability函数就是在计算分子部分。\n", + "\n", + "`probability *= int(conditional_probability[label][i][img[i]])`\n", + "\n", + "这行代码中:\n", + "- probability表示先验概率 $p(y^{(i)} = j)$\n", + "- `conditional_probability[label][i][img[i]]`表示 $p (x_{k}^{(i)} \\mid y^{(i)} = j) $\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# 计算不同标签下,testdata的概率\n", + "def calculate_probability(img, label):\n", + " probability = int(prior_probability[label])\n", + "\n", + " for i in range(len(img)):\n", + " probability *= int(conditional_probability[label][i][img[i]])\n", + "\n", + " return probability" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "predict = []\n", + "\n", + "for img in testset:\n", + "\n", + " # 图像二值化\n", + " img = binaryzation(img)\n", + "\n", + " max_label = 0\n", + 
" max_probability = calculate_probability(img, 0)\n", + "\n", + " for j in range(1, 10):\n", + " probability = calculate_probability(img, j)\n", + "\n", + " if max_probability < probability:\n", + " max_label = j\n", + " max_probability = probability\n", + "\n", + " predict.append(max_label)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "test_predict = np.array(predict)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.76000000000000001" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "score = accuracy_score(test_labels[:100], test_predict)\n", + "score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 重构朴素贝叶斯算法\n", + "\n", + "![](https://pic1.zhimg.com/v2-e17426fd0627560f1fc82118dd1d5d14_r.jpg)\n", + "\n", + "朴素贝叶斯认为所有特征都是独立的,然后得出一个样本出现的概率使其所有特征出现概率的联乘。\n", + "\n", + "首先求每一个标签的先验概率:" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(10,)\n", + "(10, 784, 2)\n" + ] + } + ], + "source": [ + "class_num = 10\n", + "feature_len = 784\n", + "\n", + "# 存放每个label的数量\n", + "class_number = np.zeros(class_num) \n", + "\n", + "# 存放先验概率\n", + "prior_probability = np.zeros(class_num) \n", + "print(prior_probability.shape)\n", + "# 存放条件概率\n", + "conditional_probability = np.zeros((class_num, feature_len, 2)) \n", + "print(conditional_probability.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# 计算先验概率\n", + "for i in range(len(train_labels)):\n", + " img = binaryzation(trainset[i]) # 图片二值化\n", + " label = train_labels[i]\n", + "\n", + " class_number[label] += 1 # 每个label的图片各有多少个\n" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0.09633973, 0.11361052, 0.10049751, 0.10294954, 0.09776119,\n", + " 0.09115139, 0.09840085, 0.10533049, 0.09431414, 0.09964463])" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "class_number/len(train_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 2711., 3197., 2828., 2897., 2751., 2565., 2769., 2964.,\n", + " 2654., 2804.])" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "class_number" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "prior_probability = class_number / len(train_labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "计算条件概率: \n", + "\n", + "$$p (X^{(i)} = a_{jl} \\mid Y = c_k)$$\n", + "\n", + "在标签为$c_k$的前提下,样本x的第$j$个特征(像素点)的第$l$个值(经过二值化处理,这里的$l$只有0或1两种可能)。conditional_probability的维度是`(10, 784, 2)`,最后的那个2,指的就是每个特征可以取的值。如果不做二值化处理,那么每个像素点应该有256种取值。" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# 条件概率\n", + "conditional_probability = np.zeros((class_num, feature_len, 2)) \n", + "\n", + "for i 
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# conditional probabilities\n",
+    "conditional_probability = np.zeros((class_num, feature_len, 2)) \n",
+    "\n",
+    "for i in range(len(train_labels)):\n",
+    "    img = binaryzation(trainset[i]) # binarize the image\n",
+    "    label = train_labels[i]\n",
+    "    for j in range(feature_len):\n",
+    "        conditional_probability[label][j][img[j]] += 1 # this only accumulates the counts of a_jl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([  199.,  2512.])"
+      ]
+     },
+     "execution_count": 59,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "conditional_probability[0][500] "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2711.0"
+      ]
+     },
+     "execution_count": 65,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "class_number[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 0.07340465,  0.92659535])"
+      ]
+     },
+     "execution_count": 60,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "conditional_probability[0][500] / class_number[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "conditional_probability_fraction = np.zeros((class_num, feature_len, 2)) \n",
+    "\n",
+    "for i in range(len(train_labels)):\n",
+    "    label = train_labels[i]\n",
+    "    for j in range(feature_len):\n",
+    "        conditional_probability_fraction[label][j] = conditional_probability[label][j] / class_number[label]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 0.07340465,  0.92659535])"
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "conditional_probability_fraction[0][500]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Written as two separate loops, the above gets long-winded; better to put them together:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# compute the prior and conditional probabilities\n",
+    "for i in range(len(train_labels)):\n",
+    "    img = binaryzation(trainset[i]) # binarize the image\n",
+    "    label = train_labels[i]\n",
+    "\n",
+    "    class_number[label] += 1 # how many images each label has\n",
+    "    prior_probability = class_number / len(train_labels)\n",
+    "\n",
+    "    for j in range(feature_len):\n",
+    "        conditional_probability[label][j][img[j]] += 1 \n",
+    "        # among all training samples of a given label, how many have 0 and how many have 1\n",
+    "        # at each pixel position\n",
+    "    \n",
+    "# normalize the counts into probabilities \n",
+    "for i in range(class_num):\n",
+    "    for j in range(feature_len):\n",
+    "        conditional_probability[i][j] = conditional_probability[i][j] / class_number[i]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "That completes step one, computing the prior and conditional probabilities. Next, step two: predict on the test set:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(100, 784)"
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "testset.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# a function computing, under each label, the corresponding probability\n",
+    "def calculate_probability(img, label):\n",
+    "    probability = prior_probability[label] # the prior probability\n",
+    "\n",
+    "    # iterate over the pixels, take each pixel's probability with the label fixed,\n",
+    "    # and multiply them all together\n",
+    "    for i in range(len(img)):\n",
+    "        probability *= conditional_probability[label][i][img[i]] \n",
+    "        # [i] is the i-th pixel of the test sample\n",
+    "        # img[i] is whether that pixel is 0 or 1\n",
+    "\n",
+    "    return probability"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/xu/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel/__main__.py:7: RuntimeWarning: overflow encountered in double_scalars\n",
+      "/Users/xu/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel/__main__.py:7: RuntimeWarning: invalid value encountered in double_scalars\n"
+     ]
+    }
+   ],
+   "source": [
+    "predict = []\n",
+    "\n",
+    "for img in testset:\n",
+    "    img = binaryzation(img)\n",
+    "    \n",
+    "    max_label = 0\n",
+    "    max_probability = calculate_probability(img, 0)\n",
+    "    \n",
+    "    for j in range(1, 10):\n",
+    "        probability = calculate_probability(img, j)\n",
+    "    \n",
+    "        if max_probability < probability:\n",
+    "            max_label = j\n",
+    "            max_probability = probability\n",
+    "\n",
+    "    predict.append(max_label)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "So it really is the case: the source kept raw counts all along precisely to guard against overflow.\n",
+    "\n",
+    "No changes needed from me, then; the original code already accounts for the overflow problem."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [py35]",
+   "language": "python",
+   "name": "Python [py35]"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
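The RuntimeWarning in the notebook above comes from multiplying 784 factors, which overflows or underflows a double; the source's count trick sidesteps that, but the textbook alternative is to sum log probabilities. A sketch under the notebook's names (prior_probability and the normalized conditional_probability), with an assumed small epsilon to guard log(0):

```python
import math

def calculate_log_probability(img, label):
    # log P(y) + sum_i log P(x_i | y): a sum of logs cannot underflow the
    # way a product of hundreds of small factors does.
    log_p = math.log(prior_probability[label])
    for i in range(len(img)):
        log_p += math.log(conditional_probability[label][i][img[i]] + 1e-12)  # epsilon is an assumption
    return log_p
```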
diff --git a/perceptron/binary_perceptron.py b/perceptron/binary_perceptron.py
index e6ff6e0..187ba8c 100644
--- a/perceptron/binary_perceptron.py
+++ b/perceptron/binary_perceptron.py
@@ -12,7 +12,7 @@
 import random
 import time
 
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
@@ -23,7 +23,7 @@ def __init__(self):
         self.max_iteration = 5000
 
     def predict_(self, x):
-        wx = sum([self.w[j] * x[j] for j in xrange(len(self.w))])
+        wx = sum([self.w[j] * x[j] for j in range(len(self.w))])
         return int(wx > 0)
 
     def train(self, features, labels):
@@ -37,7 +37,7 @@ def train(self, features, labels):
         x = list(features[index])
         x.append(1.0)
         y = 2 * labels[index] - 1
-        wx = sum([self.w[j] * x[j] for j in xrange(len(self.w))])
+        wx = sum([self.w[j] * x[j] for j in range(len(self.w))])
 
         if wx * y > 0:
             correct_count += 1
@@ -45,7 +45,7 @@ def train(self, features, labels):
             break
             continue
 
-        for i in xrange(len(self.w)):
+        for i in range(len(self.w)):
             self.w[i] += self.learning_step * (y * x[i])
 
     def predict(self,features):
@@ -59,7 +59,7 @@ def predict(self,features):
 
 if __name__ == '__main__':
 
-    print 'Start read data'
+    print('Start read data')
 
     time_1 = time.time()
@@ -76,19 +76,19 @@ def predict(self,features):
     # print train_features.shape
     time_2 = time.time()
-    print 'read data cost ', time_2 - time_1, ' second', '\n'
+    print('read data cost ', time_2 - time_1, ' second', '\n')
 
-    print 'Start training'
+    print('Start training')
     p = Perceptron()
     p.train(train_features, train_labels)
 
     time_3 = time.time()
-    print 'training cost ', time_3 - time_2, ' second', '\n'
+    print('training cost ', time_3 - time_2, ' second', '\n')
 
-    print 'Start predicting'
+    print('Start predicting')
     test_predict = p.predict(test_features)
     time_4 = time.time()
-    print 'predicting cost ', time_4 - time_3, ' second', '\n'
+    print('predicting cost ', time_4 - time_3, ' second', '\n')
 
     score = accuracy_score(test_labels, test_predict)
-    print "The accruacy socre is ", score
+    print("The accuracy score is ", score)
diff --git a/svm/__pycache__/generate_dataset.cpython-35.pyc b/svm/__pycache__/generate_dataset.cpython-35.pyc
new file mode 100644
index 0000000..62efb2a
Binary files /dev/null and b/svm/__pycache__/generate_dataset.cpython-35.pyc differ
diff --git a/svm/generate_dataset.py b/svm/generate_dataset.py
index b71102e..cdf8262 100644
--- a/svm/generate_dataset.py
+++ b/svm/generate_dataset.py
@@ -49,7 +49,7 @@ def data_visualization(X,y,title):
 
     size = len(y)
 
-    for i in xrange(size):
+    for i in range(size):
         X_1 = X[0][i]
         X_2 = X[1][i]
@@ -76,7 +76,7 @@ def rebuild_features(features):
     size = len(features[0])
 
     new_features = []
-    for i in xrange(size):
+    for i in range(size):
         new_features.append([features[0][i],features[1][i]])
 
     return new_features
@@ -92,7 +92,7 @@ def generate_dataset(size, noisy = False, visualization = True):
 
     testset_size = int(len(y)*0.333)
 
-    indexes = [i for i in xrange(len(y))]
+    indexes = [i for i in range(len(y))]
     test_indexes = random.sample(indexes,testset_size)
     train_indexes = list(set(indexes)-set(test_indexes))
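The svm.py hunk below wraps `filter(...)` in `list(...)`. In Python 3 both `filter()` and `range()` return lazy objects without list methods, which is exactly the AttributeError the patch's added comment records. A two-line illustration (the values are made up):

```python
alpha, C = [0.0, 0.5, 1200.0], 1000
index_list = list(range(len(alpha)))

lazy = filter(lambda i: 0 < alpha[i] < C, index_list)   # a <filter object>: no .extend()
i1_list_1 = list(lazy)                                  # materialized, as in the hunk below
```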
diff --git a/svm/svm.py b/svm/svm.py
index cb56839..d145a4e 100644
--- a/svm/svm.py
+++ b/svm/svm.py
@@ -12,7 +12,7 @@
 import logging
 import pandas as pd
 
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
 
 from generate_dataset import *
@@ -36,7 +36,7 @@ def _init_parameters(self, features, labels):
         self.n = len(features[0])
         self.N = len(features)
         self.alpha = [0.0] * self.N
-        self.E = [self._E_(i) for i in xrange(self.N)]
+        self.E = [self._E_(i) for i in range(self.N)]
         self.C = 1000
         self.Max_Interation = 5000
@@ -63,14 +63,17 @@ def _select_two_parameters(self):
         '''
         按照书上7.4.2选择两个变量
         '''
-        index_list = [i for i in xrange(self.N)]
+        index_list = [i for i in range(self.N)]
 
-        i1_list_1 = filter(lambda i: self.alpha[i] > 0 and self.alpha[i] < self.C, index_list)
+        i1_list_1 = list(filter(lambda i: self.alpha[i] > 0 and self.alpha[i] < self.C, index_list))
         i1_list_2 = list(set(index_list) - set(i1_list_1))
 
         i1_list = i1_list_1
         i1_list.extend(i1_list_2)
-
+        '''
+        Python 3 raises AttributeError: 'range' object has no attribute 'extend';
+        fix: listtemp = list(range(...))
+        '''
         for i in i1_list:
             if self._satisfy_KKT(i):
                 continue
@@ -94,13 +97,13 @@ def _K_(self, x1, x2):
         '''
 
         if self.kernel == 'linear':
-            return sum([x1[k] * x2[k] for k in xrange(self.n)])
+            return sum([x1[k] * x2[k] for k in range(self.n)])
 
         if self.kernel == 'poly':
-            return (sum([x1[k] * x2[k] for k in xrange(self.n)])+1)**3
+            return (sum([x1[k] * x2[k] for k in range(self.n)])+1)**3
 
-        print '没有定义核函数'
+        print('no kernel function defined')
         return 0
 
     def _g_(self, i):
@@ -109,7 +112,7 @@ def _g_(self, i):
         '''
         result = self.b
 
-        for j in xrange(self.N):
+        for j in range(self.N):
             result += self.alpha[j] * self.Y[j] * self._K_(self.X[i], self.X[j])
 
         return result
@@ -122,7 +125,7 @@ def _E_(self, i):
 
     def try_E(self,i):
         result = self.b-self.Y[i]
-        for j in xrange(self.N):
+        for j in range(self.N):
             if self.alpha[j]<0 or self.alpha[j]>self.C:
                 continue
             result += self.Y[j]*self.alpha[j]*self._K_(self.X[i],self.X[j])
@@ -133,7 +136,7 @@ def train(self, features, labels):
 
         self._init_parameters(features, labels)
 
-        for times in xrange(self.Max_Interation):
+        for times in range(self.Max_Interation):
 
             # if self.is_stop():
             #     return
@@ -190,7 +193,7 @@ def train(self, features, labels):
     def _predict_(self,feature):
         result = self.b
 
-        for i in xrange(self.N):
+        for i in range(self.N):
             result += self.alpha[i]*self.Y[i]*self._K_(feature,self.X[i])
 
         if result > 0:
@@ -210,7 +213,7 @@ def predict(self,features):
     logger = logging.getLogger()
     logger.setLevel(logging.DEBUG)
 
-    print 'Start read data'
+    print('Start read data')
 
     time_1 = time.time()
@@ -218,20 +221,20 @@ def predict(self,features):
     train_features, train_labels, test_features, test_labels = generate_dataset(2000,visualization=False)
     time_2 = time.time()
-    print 'read data cost ',time_2 - time_1,' second','\n'
+    print('read data cost ',time_2 - time_1,' second','\n')
 
-    print 'Start training'
+    print('Start training')
     svm = SVM()
     svm.train(train_features, train_labels)
 
     time_3 = time.time()
-    print 'training cost ',time_3 - time_2,' second','\n'
+    print('training cost ',time_3 - time_2,' second','\n')
 
-    print 'Start predicting'
+    print('Start predicting')
     test_predict = svm.predict(test_features)
 
     time_4 = time.time()
-    print 'predicting cost ',time_4 - time_3,' second','\n'
+    print('predicting cost ',time_4 - time_3,' second','\n')
 
     score = accuracy_score(test_labels,test_predict)
-    print "svm1 the accruacy socre is ", score
+    print("svm1 the accuracy score is ", score)
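One further suggestion, not part of this patch: the ported `_K_` still reports an unknown kernel with a print and `return 0`, which silently zeroes every Gram-matrix entry. A hedged sketch of a louder failure mode:

```python
def _K_(self, x1, x2):
    if self.kernel == 'linear':
        return sum(x1[k] * x2[k] for k in range(self.n))
    if self.kernel == 'poly':
        return (sum(x1[k] * x2[k] for k in range(self.n)) + 1) ** 3
    # raising beats returning 0, which corrupts the whole kernel matrix
    raise ValueError('undefined kernel: %r' % self.kernel)
```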