目录
一、线性回归
算法实现:正规方程法、梯度下降法
linearRegression.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author : Cabbage
# @project : ML
# @FileName: linearRegression.py
# @Blog : https://blog.csdn.net/lzbmc
'''
线性回归:单变量(一元)线性回归、多变量(多元)线性回归
下面代码是单变量线性回归。
多变量只需要在读取数据的时候x对应列增加,注意数据归一化,绘图需要降维
'''
import numpy as np
import matplotlib.pyplot as plt
from numpy import *
import pandas as pd
def loadDataSet():
data = pd.read_csv('ex0.txt', header=None, delimiter='\t') # 没有表头
# print(data.head()) # 默认查看前五行,可以用来查看数据
# # 查看数据特征之间的相关系数
# r = data.corr() # 0-0.3:弱相关;0.3-0.6:中等程度相关;0.6-1:强相关
# print(r)
xArr = data.values[:, 0:2] # 数据第一列全为1,即X0,对应的W0是b (200,2)
yArr = data.values[:, -1] # (200,1)
return xArr, yArr
def analyzeData(x, y):
x = x[:, 1] # (200,)
plt.scatter(x, y, s=15)
return plt
def plotLrEquation(weights, xArr, yArr):
plt = analyzeData(xArr, yArr)
xCopy = xArr.copy()
xCopy.sort(0) # 按列排序
y = xCopy * weights
plt.plot(xCopy[:, 1], y, c='r')
plt.show()
# 回归任务常用性能度量:均方误差。
def lr_normalEquation(x, y):
'''
方法一、正规方程法:求导为零时,得到w
'''
print('====', y.shape)
xTx = x.T * x # (n,m)*(m,n)=(n,n)
if np.linalg.det(xTx) == 0: # 计算行列式,判断是否可逆
print('This matrix is singular, cannot do inverse')
return
ws = xTx.I * x.T * y.T # (n,n)*(n,m)=(n,m) * (m,1) = (n,1)
return ws
def lr_gradientDecent(x, y, epoch):
'''
方法二、梯度下降法:设置学习率和迭代次数,更新w
'''
alpha = 0.01
m, n = x.shape
weights = ones((n, 1)) # (n,1)
cost = [] # 统计每次迭代之后的cost
for i in range(epoch):
error = x * weights - y.T # (m,1)
weights = weights - alpha * (1 / m) * x.T * error
currCost = cost_function(x, y, weights)
cost.append(currCost)
return weights, cost
# 代价函数
def cost_function(x, y, weights):
m = x.shape[0] # 数据的个数
cost = 1 / (2 * m) * sum(np.power((x * weights - y.T), 2))
return cost
# 迭代次数与代价函数的关系
def plot_epochCost(cost, epoch=1000): # 默认值参数必须放在后面
x = range(epoch)
y = cost
plt.plot(x, y)
plt.xlabel('epoch')
plt.ylabel('cost')
plt.show()
if __name__ == "__main__":
dataArr, labels = loadDataSet()
xMat = np.mat(dataArr) # (m,n)
yMat = np.mat(labels) # (1,m)
# plt = analyzeData(dataArr, labels)
# plt.show()
# # 最小二乘正规方程
w = lr_normalEquation(xMat, yMat)
print(w)
plotLrEquation(w, dataArr, labels)
# # 梯度下降法
# epoch = 1000
# w, cost = lr_gradientDecent(xMat, yMat, epoch)
# plotLrEquation(w, dataArr, labels)
# plot_epochCost(cost, epoch)
sklearn实现
linearRegression_sklearn.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author : Cabbage
# @project : ML
# @FileName: linearRegression_sklearn.py
# @Blog : https://blog.csdn.net/lzbmc
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
def loadDataSet():
data = pd.read_csv('ex0.txt', header=None, delimiter='\t') # 没有表头
# print(data.head()) # 默认查看前五行,可以用来查看数据
# # 查看数据特征之间的相关系数
# r = data.corr() # 0-0.3:弱相关;0.3-0.6:中等程度相关;0.6-1:强相关
# print(r)
trainSet = np.array(data) # 转化为数组
xArr = trainSet[:, 0:2] # 数据第一列全为1,即X0,对应的W0是b (200,2)
yArr = trainSet[:, -1] # (200,1)
return xArr, yArr
def lr_sklearn(xArr, yArr):
model = LinearRegression()
model.fit(xArr, yArr)
weights = model.coef_ # 回归系数
bias = model.intercept_ # 截距
return weights, bias
if __name__ == '__main__':
dataArr, labels = loadDataSet()
# xMat = np.mat(dataArr) # (m,n)
# yMat = np.mat(labels) # (1,m)
weights, bias = lr_sklearn(dataArr, labels)
print(weights, bias)
二、对数几率回归(逻辑回归)3.3
算法实现
- 3.3 编程实现对率回归,并给出西瓜数据集3.0α上的结果
logisticRegression.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author : Cabbage
# @project : ML
# @FileName: logisticRegression.py
# @Blog : https://blog.csdn.net/lzbmc
# 机器学习实战
'''
编程实现对率回归,并给出西瓜数据集3.0α上的结果
'''
import pandas as pd
import numpy as np
from numpy import *
# 加载数据,结构化数据格式
def loadDataSet0():
raw_data = pd.read_csv('../watermelon3.0a')
dataSet = raw_data.values[0:, 1:-1]
m = dataSet.shape[0]
x0 = ones(m)
dataArr = np.insert(dataSet, 0, values=x0, axis=1) # 第一列插入1,b当做w0,对应x0为1
labelsArr = raw_data.values[0:, -1] # <class 'numpy.ndarray'>
return dataArr, labelsArr
def loadDataSet1():
dataMat = []
labelMat = []
fr = open('../watermelon3.0a', encoding='utf-8')
for line in fr.readlines():
if '编号' in line:
continue
lineArr = line.strip().split(',')
dataMat.append([1.0, float(lineArr[1]), float(lineArr[2])]) # b看做w0,对应x0=1
labelMat.append(int(lineArr[3])) # <class 'list'>
return dataMat, labelMat
# matrix的优势就是相对简单的运算符号,比如两个矩阵相乘,就是用符号*,但是array相乘不能这么用,得用方法.dot()
# array的优势就是不仅仅表示二维,还能表示3、4、5…维,而且在大部分Python程序里,array也是更常用的。
# 激活函数
def sigmoid(inputX):
return 1.0 / (1 + exp(-inputX))
# 一、标准梯度上升:计算整个数据集的梯度,计算复杂度太高。梯度上升求函数最大值,梯度下降求最小值
def gradAscent(dataArr, classLabels):
dataMat = mat(dataArr) # <class 'numpy.matrixlib.defmatrix.matrix'> 数组array转换成matrix
labelMat = mat(classLabels) # ndarray转置很麻烦,并且*是对应位置元素相乘。所以转换成矩阵
# 初始化权重矩阵
m, n = shape(dataMat)
weights = ones((n, 1)) # 数组
# print('standard:', type(weights)) # <class 'numpy.ndarray'>
# 设置学习率(目标移动的步长)和迭代次数
alpha = 0.001
maxCycles = 500
# 参数更新
for i in range(maxCycles):
predict = sigmoid(dataMat * weights) # (m,1)
error = labelMat.T - predict
weights = weights + alpha * dataMat.T * error # (n,m)*(m,1)
# print('standard:', type(weights)) # <class 'numpy.matrixlib.defmatrix.matrix'>
return weights # 矩阵
import matplotlib.pyplot as plt
# 分析数据,画出决策边界
def plotBestFit(weights):
dataSet, labels = loadDataSet0() # dataSet每一行是一条数据,第一列对应b的参数,其余两列是特征,画图对应x,y。
# dataArr = array(dataSet) # <class 'numpy.ndarray'> loadDataSet1时将列表转换成数组
m = shape(dataSet)[0]
xcord1 = []; ycord1 = []
xcord2 = []; ycord2 = []
for i in range(m):
if int(labels[i]) == 1:
xcord1.append(dataSet[i, 1])
ycord1.append(dataSet[i, 2])
else:
xcord2.append(dataSet[i, 1])
ycord2.append(dataSet[i, 2])
fig = plt.figure() # 创建图像
ax = fig.add_subplot(111) # 子图
ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
ax.scatter(xcord2, ycord2, s=30, c='green')
x = arange(-0.5, 1.0, 0.1)
y = (-weights[0] - weights[1] * x) / weights[2] # 0 = w0x0 + w1x + w2y
ax.plot(x, y)
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()
# 随机梯度上升:对数据中的每个样本,计算梯度,更新参数
def stocGradAscent0(dataArr, classLabels):
m, n = shape(dataArr)
alpha = 0.001
weights = ones(n) # <class 'numpy.ndarray'> (1, n)
for iter in range(200): # 在数据集上运行200次
for i in range(m):
predict = sigmoid(sum(dataArr[i] * weights)) # 一个数值
error = classLabels[i] - predict
weights = weights + alpha * dataArr[i] * error
# print('stochastic:', type(weights)) # <class 'numpy.ndarray'>
return weights # 数组
# 改进的随机梯度上升:随着训练次数增加学习步长减小,随机选取样本,选完之后删除此样本
def stocGradAscent1(dataArr, classLabels, numIter=150): # 增加参数迭代次数
m, n = shape(dataArr)
weights = ones(n)
for iter in range(numIter):
dataIndex = list(range(m)) # python3要加list,因为python3中range不返回数组对象,而是返回range对象
for i in range(m):
randIndex = int(random.uniform(0, len(dataIndex))) # len(dataIndex)是变化的
predict = sigmoid(sum(dataArr[randIndex] * weights))
error = classLabels[randIndex] - predict
alpha = 4 / (1 + iter + i) + 0.001
weights = weights + alpha * dataArr[randIndex] * error
del(dataIndex[randIndex])
return weights # 数组
def classifyVector(inputX, weights):
prob = sigmoid(sum(inputX * weights)) # 矩阵的*:矩阵乘法; 数组的*:对应位置元素相乘。所以要sum
# print(prob)
if prob > 0.5:
return 1.0
else:
return 0.0
if __name__ == '__main__':
dataArr, labels = loadDataSet1() # 列表
# dataArr0, labels0 = loadDataSet0() # 数组
weights = gradAscent(dataArr, labels) # 矩阵
print(weights)
plotBestFit(weights.getA()) # 注意!!!a.getA()是将numpy矩阵转换为数组,与mat(a)功能相反
# weights2 = stocGradAscent0(array(dataArr), labels) # 数组
# # weights2 = stocGradAscent0(dataArr, labels)
# plotBestFit(weights2)
# weights = stocGradAscent1(array(dataArr), labels) # 数组
# print(weights)
# plotBestFit(weights)
testData = [1, 0.697, 0.460]
result = classifyVector(testData, weights)
print(result)
- 机器学习实战:从疝气病症预测病马的死亡率
colic.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author : Cabbage
# @project : ML
# @FileName: colic.py
# @Blog : https://blog.csdn.net/lzbmc
# 机器学习实战:从疝气病症预测病马的死亡率
from logisticRegression import stocGradAscent1, classifyVector
from numpy import *
def colicTest():
frTrain = open('../horseColic/horseColicTraining.txt')
frTest = open('../horseColic/horseColicTest.txt')
trainingSet = []
trainingLabels = []
for line in frTrain.readlines():
currLine = line.strip().split('\t')
lineArr = [] # 可以把b当做哑结点w0, lineArr=[1.0]
lastIndex = len(currLine) - 1 # 最后一列的索引 21
for i in range(lastIndex):
lineArr.append(float(currLine[i]))
trainingSet.append(lineArr)
trainingLabels.append(float(currLine[-1]))
# 最优参数
trainingWeights = stocGradAscent1(array(trainingSet), trainingLabels, numIter=500)
# print(shape(trainingWeights))
# 测试集处理
errorCount= 0
numTestSet = 0.0 # 统计测试集数目
for line in frTest.readlines():
numTestSet += 1.0
currLine = line.strip().split('\t')
lineArr = []
for i in range(21):
lineArr.append(float(currLine[i]))
if int(classifyVector(array(lineArr), trainingWeights)) != int(currLine[-1]):
errorCount += 1
errorRate = float(errorCount) / numTestSet
print('the error rate of this test is: %f' % errorRate)
return errorRate
# 多次运算求平均错误率
def multiTest(numTests = 10):
errorRateSum = 0.0
for k in range(numTests):
errorRateSum += colicTest()
aveErrorRate = errorRateSum / float(numTests)
print('after %d iterations the average error rate is: %f' % (k+1, aveErrorRate))
if __name__ == '__main__':
# colicTest()
multiTest()
sklearn实现
logisticRegression_sklearn.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author : Cabbage
# @project : ML
# @FileName: logisticRegression_sklearn.py
# @Blog : https://blog.csdn.net/lzbmc
'''
solver参数决定了我们对逻辑回归损失函数的优化方法,有五种算法可以选择,分别是:
a) liblinear:使用了开源的liblinear库实现,内部使用了坐标轴下降法来迭代优化损失函数。小数数据集
b) lbfgs:拟牛顿法的一种,利用损失函数二阶导数矩阵即海森矩阵来迭代优化损失函数。
c) newton-cg:也是牛顿法家族的一种,利用损失函数二阶导数矩阵即海森矩阵来迭代优化损失函数。
d) sag:即随机平均梯度下降,是梯度下降法的变种,和普通梯度下降法的区别是每次迭代仅仅
用一部分的样本来计算梯度,适合于样本数据多的时候。大数据集。
e) sage:大数据集
https://scikit-learn.org/dev/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
'''
from sklearn.linear_model import LogisticRegression
import pandas as pd
raw_train = pd.read_csv('../horseColic/horseColicTraining.txt', header=None, delimiter='\t') # 没有表头
raw_test = pd.read_csv('../horseColic/horseColicTest.txt', header=None, delimiter='\t')
# print(type(raw_train)) # <class 'pandas.core.frame.DataFrame'>
# print(type(raw_train.values)) # <class 'numpy.ndarray'>
train = raw_train.values[:, 1:2]
trainLabels = raw_train.values[:, -1]
test = raw_test.values[:, 0:-1]
testLabels = raw_test.values[:, -1]
LR = LogisticRegression()
LR.fit(train, trainLabels)
score = LR.score(test, testLabels)
print(score) # 0.7313432835820896
- 补充:NumPy数组 和 Numpy矩阵 的区别
array_matrix.py
import numpy as np
''' ndarray '''
x = np.array([[1, 2], [3, 4]]) # <class 'numpy.ndarray'>
y = np.array([[5, 6], [7, 8]])
print(type(x))
print('array *:', '\n', x * y) # 对应位置元素相乘
print('array dot:', '\n', np.dot(x, y)) # 矩阵乘法
print('array multiply:', '\n', np.multiply(x, y)) # 对应位置元素相乘
a = np.array([1, 2])
print(a ** 2) # 对应位置元素平方
print(np.power(a, 2)) # 对应位置元素平方
# 一维转置:用reshape
data = np.arange(5)
print(data, data.T) # 不会转置
print(data.transpose()) # 不会转置
print(data.reshape((5,1)))
print('==============我是华丽分割线==============')
''' matrix '''
a = np.mat([[1, 2], [3, 4]]) # <class 'numpy.matrixlib.defmatrix.matrix'>
b = np.mat([[5, 6], [7, 8]])
print(type(a))
print('mat *:', '\n', a * b) # 矩阵乘法
print('mat dot:', '\n', np.dot(a, b)) # 矩阵乘法
print('mat multiply:', '\n', np.multiply(a, b)) # 对应位置元素相乘
a = np.mat([[1, 2], [2, 3]])
print(a ** 2) # 矩阵**平方必须是方阵
a = np.mat([1, 2])
print(np.power(a, 2)) # 对应位置元素平方
a = [1, 2, 3] # 列表和numpy数组相乘:对应位置元素平方
b = np.array([1, 1, 1])
print(a * b)
二、线性判别分析LDA(3.5)
算法实现 & sklearn实现
- 3.5 编程实现线性判别分析,并给出西瓜数据集3.0a上的结果
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author : Cabbage
# @project : ML
# @FileName: LDA.py
# @Blog : https://blog.csdn.net/lzbmc
# 参考:https://cloud.tencent.com/developer/article/1099252
'''
编程实现线性判别分析,并给出西瓜数据集3.0a上的结果
'''
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
def processData():
raw_data = pd.read_csv('../watermelon3.0a')
x = raw_data.values[:, 1:-1]
y = raw_data.values[:, -1]
# 分类
x0 = x[y == 0] # (numX1, 2)
x1 = x[y == 1]
# print(x[y == 0, 0]) # y=0对应的索引的第一列 [0.666 0.243 0.245 0.343 0.639 0.657 0.36 0.593 0.719]
return x0, x1
def LDA(x0, x1):
# 计算各个属性的均值
u0 = np.array([x0[:, 0].mean(), x0[:, 1].mean()]) # (1,n) # u0 = x0.mean(0) 同等作用,按列求平均
u1 = x1.mean(0)
# 类内散度矩阵
sw = np.dot((x0 - u0).T, x0 - u0) + np.dot((x1 - u1).T, x1 - u1)
W = np.dot(np.linalg.inv(sw), (u0 - u1).T)
print(W)
# 解决中文乱码
plt.rcParams['font.sans-serif'] = [u'SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.scatter(x0[:, 0], x0[:, 1], s=15, color='g', label='坏瓜')
plt.scatter(x1[:, 0], x1[:, 1], s=15, color='r', label='好瓜')
plt.legend(loc='upper left')
plt.xlabel('密度')
plt.ylabel('含糖率')
# plt.show()
pl = -(0.2 * W[0] - 0.01) / W[1]
pr = -(0.8 * W[0] - 0.01) / W[1]
plt.plot([0.2, 0.8], [pl, pr])
plt.show()
def LDA_sklearn():
raw_data = pd.read_csv('iris.data2', header=None)
X = raw_data.values[:, 0:4]
y = raw_data.values[:, -1]
# 设置压缩到1维
lda = LinearDiscriminantAnalysis(n_components=1)
'''利用线性判别函数将四维的样本数据压缩到一条直线上'''
lda.fit(X, y)
print(lda.score(X, y))
'''展示LDA的降维功能'''
X_r2 = lda.fit(X, y).transform(X)
X_Zero = np.zeros(X_r2.shape)
print(X_Zero.shape)
'''绘制降维效果图'''
for c, i in zip('ryb', [0, 1, 2]):
plt.scatter(X_r2[y == i], X_Zero[y == i], c=c, s=5)
plt.legend()
plt.grid()
plt.show()
'''划分数据集,进行训练和预测'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print(y_test)
lda = LinearDiscriminantAnalysis(n_components=1)
lda.fit(X_train, y_train)
# y_pred = lda.predict(X_test)
# score = accuracy_score(y_pred, y_test)
# !!!上面两句等价于
score = lda.score(X_test, y_test)
print('准确率:', score)
if __name__ == '__main__':
x0, x1 = processData()
# LDA(x0, x1)
LDA_sklearn()
1234

被折叠的 条评论
为什么被折叠?



