Skip to content

Commit b46def4

Browse files
committed
乐高实例
1 parent 1dd516f commit b46def4

File tree

1 file changed

+231
-0
lines changed

1 file changed

+231
-0
lines changed

Regression/lego.py

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
# -*-coding:utf-8 -*-
2+
import numpy as np
3+
from bs4 import BeautifulSoup
4+
import random
5+
6+
def scrapePage(retX, retY, inFile, yr, numPce, origPrc):
    """
    Parse one saved listing page and append its sold items to retX / retY.

    Listing rows are the <table> elements whose ``r`` attribute is "1", "2",
    ...; scanning stops at the first index for which no table is found.

    Parameters:
        retX - list extended with one [yr, numPce, newFlag, origPrc] row per kept item
        retY - list extended with the selling price of each kept item
        inFile - path of the HTML file (read as UTF-8)
        yr - release year of the set
        numPce - number of pieces in the set
        origPrc - original retail price
    Returns:
        None; retX and retY are modified in place.
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-12-03
    """
    # NOTE(review): no parser is named, so bs4 chooses whichever one is installed.
    with open(inFile, encoding='utf-8') as handle:
        soup = BeautifulSoup(handle.read())

    rowNum = 1
    while True:
        tables = soup.find_all('table', r="%d" % rowNum)
        if len(tables) == 0:
            break
        row = tables[0]

        # The title is taken from the second link of the row.
        lowered = row.find_all('a')[1].text.lower()
        # A title containing "new" or "nisb" marks the item as brand new.
        newFlag = 1.0 if ('new' in lowered or 'nisb' in lowered) else 0.0

        cells = row.find_all('td')
        # Only rows carrying a sold marker (a <span> in the 4th cell) are collected.
        soldMarkers = cells[3].find_all('span')
        if len(soldMarkers) == 0:
            print("商品 #%d 没有出售" % rowNum)
        else:
            priceCell = cells[4]
            priceStr = priceCell.text.replace('$', '').replace(',', '')
            if len(priceCell) > 1:
                priceStr = priceStr.replace('Free shipping', '')
            sellingPrice = float(priceStr)

            # Prices at or below half the original price are treated as
            # incomplete sets and dropped.
            if sellingPrice > origPrc * 0.5:
                print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)
        rowNum += 1
63+
64+
#
65+
def setDataCollect(retX, retY):
    """
    Scrape the six saved LEGO listing pages and accumulate their data.

    Parameters:
        retX - list that scrapePage extends with feature rows
        retY - list that scrapePage extends with selling prices
    Returns:
        None; retX and retY are modified in place.
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-12-03
    """
    # (HTML file, release year, piece count, original price) for each set
    legoSets = (
        ('./lego/lego8288.html', 2006, 800, 49.99),
        ('./lego/lego10030.html', 2002, 3096, 269.99),
        ('./lego/lego10179.html', 2007, 5195, 499.99),
        ('./lego/lego10181.html', 2007, 3428, 199.99),
        ('./lego/lego10189.html', 2008, 5922, 299.99),
        ('./lego/lego10196.html', 2009, 3263, 249.99),
    )
    for htmlFile, year, pieces, price in legoSets:
        scrapePage(retX, retY, htmlFile, year, pieces, price)
83+
84+
def regularize(xMat, yMat):
    """
    Center y and center/scale x along axis 0.

    Each column of xMat has its mean subtracted and is then divided by its
    variance (the variance, not the standard deviation, is what this file
    uses everywhere). yMat only has its mean subtracted.

    A column with zero variance is left at zero after centering instead of
    turning into NaN through 0/0.

    Parameters:
        xMat - x data set
        yMat - y data set
    Returns:
        inxMat - centered and scaled x data set (xMat itself is not modified)
        inyMat - centered y data set
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-12-03
    """
    inyMat = yMat - np.mean(yMat, 0)
    inMeans = np.mean(xMat, 0)
    inVar = np.var(xMat, 0)
    # Constant columns have variance 0; divide those by 1 so they stay at 0.
    safeVar = np.where(inVar == 0, 1.0, inVar)
    inxMat = (xMat - inMeans) / safeVar
    return inxMat, inyMat
109+
110+
def rssError(yArr, yHatArr):
    """
    Return the sum of squared differences between two arrays.

    The result is symmetric in its arguments, so it does not matter which
    one holds the observed values and which one the predictions.

    Parameters:
        yArr - first array of values
        yHatArr - second array of values
    Returns:
        Sum over all elements of (yArr - yHatArr) ** 2
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-12-03
    """
    residuals = yArr - yHatArr
    squared = residuals ** 2
    return squared.sum()
124+
125+
def standRegres(xArr, yArr):
    """
    Compute ordinary least-squares regression coefficients.

    Solves the normal equations ws = (X^T X)^-1 X^T y.

    Parameters:
        xArr - x data set, one sample per row
        yArr - y data set, one value per sample
    Returns:
        ws - regression coefficients as a column matrix, or None when
             X^T X has a determinant of exactly 0 (a message is printed)
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-11-12
    """
    # np.asmatrix is used because the np.mat alias no longer exists in NumPy 2.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    xTx = xMat.T * xMat
    if np.linalg.det(xTx) == 0.0:
        # A singular matrix cannot be inverted.
        print("矩阵为奇异矩阵,不能求逆")
        return None
    ws = xTx.I * (xMat.T * yMat)
    return ws
145+
146+
def crossValidation(xArr, yArr, numVal = 10):
    """
    Pick the ridge-regression lambda by repeated random 90/10 splits and
    print the resulting model expressed in the original feature units.

    For each of the numVal rounds the data is shuffled, split into a
    training part and a test part, ridgeTest is fitted on the training
    part, and the squared error of every lambda's weights is measured on
    the test part. The lambda with the lowest mean error is selected.

    Parameters:
        xArr - x data set, one sample per row
        yArr - y data set, one value per sample
        numVal - number of shuffle/split rounds
    Returns:
        None; the chosen weights and the constant term are printed.
    Raises:
        ValueError - if numVal < 1 or there are fewer than 10 samples
                     (the 10% test split would be empty)
    """
    m = len(yArr)
    if numVal < 1:
        raise ValueError("numVal must be at least 1, got %d" % numVal)
    # Positions pos < m * 0.9 go to the training split; the last position
    # must fall outside it or there is nothing to test on.
    if m == 0 or (m - 1) < m * 0.9:
        raise ValueError("crossValidation needs at least 10 samples, got %d" % m)
    numTrain = sum(1 for pos in range(m) if pos < m * 0.9)

    # random.shuffle needs a mutable sequence, so materialise the range.
    indexList = list(range(m))
    errorRows = []
    for _ in range(numVal):
        random.shuffle(indexList)
        trainIdx = indexList[:numTrain]
        testIdx = indexList[numTrain:]
        trainX = [xArr[idx] for idx in trainIdx]
        trainY = [yArr[idx] for idx in trainIdx]
        testX = [xArr[idx] for idx in testIdx]
        testY = [yArr[idx] for idx in testIdx]

        # One row of weights per tested lambda.
        wMat = ridgeTest(trainX, trainY)

        # Scale the test set with the training statistics, the same way
        # ridgeTest scales its input (mean removed, divided by variance).
        matTrainX = np.asmatrix(trainX)
        matTestX = (np.asmatrix(testX) - np.mean(matTrainX, 0)) / np.var(matTrainX, 0)
        testYArr = np.array(testY)
        trainYMean = np.mean(trainY)

        foldErrors = []
        for k in range(wMat.shape[0]):
            yEst = matTestX * np.asmatrix(wMat[k, :]).T + trainYMean
            foldErrors.append(rssError(yEst.T.A, testYArr))
        errorRows.append(foldErrors)

    meanErrors = np.mean(np.array(errorRows), 0)
    bestIdx = int(np.argmin(meanErrors))
    # NOTE(review): the weights are taken from the last round's fit, as the
    # original code did; refitting on the full data may be what is wanted.
    bestWeights = wMat[bestIdx:bestIdx + 1, :]

    # Undo the scaling so the model applies to unscaled features.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    meanX = np.mean(xMat, 0)
    varX = np.var(xMat, 0)
    unReg = bestWeights / varX
    print("the best model from Ridge Regression is:\n", unReg)
    print("with constant term: ", -1 * np.sum(np.multiply(meanX, unReg)) + np.mean(yMat))
179+
180+
def ridgeTest(xArr, yArr):
    """
    Fit ridge regression for 30 values of lambda.

    y is centered; every column of x is centered and divided by its
    variance. Lambda takes the values exp(i - 10) for i = 0..29, so it
    grows exponentially from a very small number.

    Parameters:
        xArr - x data set, one sample per row
        yArr - y data set, one value per sample
    Returns:
        wMat - array of shape (30, number of features); row i holds the
               coefficients for lambda = exp(i - 10)
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-11-20
    """
    # np.asmatrix is used because the np.mat alias no longer exists in NumPy 2.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    yMat = yMat - np.mean(yMat, axis=0)
    xMeans = np.mean(xMat, axis=0)
    xVar = np.var(xMat, axis=0)
    xMat = (xMat - xMeans) / xVar
    numTestPts = 30
    wMat = np.zeros((numTestPts, np.shape(xMat)[1]))
    for i in range(numTestPts):
        ws = _ridgeRegres(xMat, yMat, np.exp(i - 10))
        wMat[i, :] = np.asarray(ws).ravel()
    return wMat


def _ridgeRegres(xMat, yMat, lam=0.2):
    """Solve (X^T X + lam * I) ws = X^T y for matrix inputs and return ws."""
    xTx = xMat.T * xMat
    denom = xTx + np.eye(np.shape(xMat)[1]) * lam
    # solve() raises numpy.linalg.LinAlgError if denom is singular.
    return np.linalg.solve(denom, xMat.T * yMat)
206+
207+
208+
209+
def useStandRegres():
    """
    Fit plain linear regression on the scraped LEGO data and print the model.

    A leading column of ones is added to the features so that the first
    coefficient is the intercept.

    Parameters:
        None
    Returns:
        None; the fitted formula is printed. Nothing further is printed
        when standRegres reports a singular matrix.
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-11-12
    """
    lgX = []
    lgY = []
    setDataCollect(lgX, lgY)
    data_num, features_num = np.shape(lgX)
    # np.asmatrix is used because the np.mat alias no longer exists in NumPy 2.
    lgX1 = np.asmatrix(np.ones((data_num, features_num + 1)))
    # Column 0 stays all ones (intercept); the features fill the rest.
    lgX1[:, 1:] = np.asmatrix(lgX)
    ws = standRegres(lgX1, lgY)
    if ws is None:
        return
    coeffs = tuple(float(w) for w in np.asarray(ws).ravel())
    # %+f prints an explicit sign so consecutive terms do not run together.
    print('%f%+f*年份%+f*部件数量%+f*是否为全新%+f*原价' % coeffs)
229+
230+
# Script entry point: fit and print the plain linear-regression model.
if __name__ == "__main__":
    useStandRegres()

0 commit comments

Comments
 (0)