1+ # -*-coding:utf-8 -*-
2+ import numpy as np
3+ from bs4 import BeautifulSoup
4+ import random
5+
def scrapePage(retX, retY, inFile, yr, numPce, origPrc):
    """
    Parse a saved listing page and append its sold items to retX / retY.

    Rows are looked up as <table r="1">, <table r="2">, ... until a number
    has no matching table. A row counts as sold when its fourth <td>
    contains a <span>; its price is then read from the fifth <td>. Prices
    that are not above half of origPrc are skipped (treated as incomplete
    sets).

    Parameters:
        retX - list, extended in place with [yr, numPce, newFlag, origPrc]
        retY - list, extended in place with the parsed selling price
        inFile - path of the UTF-8 encoded HTML file
        yr - release year of the set
        numPce - number of pieces in the set
        origPrc - original retail price
    Returns:
        None
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-12-03
    """
    with open(inFile, encoding='utf-8') as f:
        html = f.read()
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so results can vary
    # between machines.
    soup = BeautifulSoup(html, 'html.parser')

    i = 1
    currentRow = soup.find_all('table', r="%d" % i)
    while currentRow:
        row = currentRow[0]
        lwrTitle = row.find_all('a')[1].text.lower()
        # "new" / "nisb" in the title marks the item as brand new.
        newFlag = 1.0 if ('new' in lwrTitle or 'nisb' in lwrTitle) else 0.0

        cells = row.find_all('td')
        if not cells[3].find_all('span'):
            print("商品 #%d 没有出售" % i)
        else:
            # Strip currency formatting and any shipping note before
            # converting. str.replace is a no-op when the text is absent,
            # so no structural check on the cell is needed.
            priceStr = cells[4].text
            for noise in ('$', ',', 'Free shipping'):
                priceStr = priceStr.replace(noise, '')
            sellingPrice = float(priceStr)

            # Drop listings priced too low to be a complete set.
            if sellingPrice > origPrc * 0.5:
                print("%d\t %d\t %d\t %f\t %f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)
        i += 1
        currentRow = soup.find_all('table', r="%d" % i)
63+
64+ #
def setDataCollect(retX, retY):
    """
    Scrape the six saved LEGO listing pages in turn and accumulate samples.

    Parameters:
        retX - list handed to scrapePage to receive the feature rows
        retY - list handed to scrapePage to receive the selling prices
    Returns:
        None
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-12-03
    """
    # One entry per set: (HTML file, release year, piece count, original price)
    legoSets = (
        ('./lego/lego8288.html', 2006, 800, 49.99),
        ('./lego/lego10030.html', 2002, 3096, 269.99),
        ('./lego/lego10179.html', 2007, 5195, 499.99),
        ('./lego/lego10181.html', 2007, 3428, 199.99),
        ('./lego/lego10189.html', 2008, 5922, 299.99),
        ('./lego/lego10196.html', 2009, 3263, 249.99),
    )
    for htmlFile, year, pieces, price in legoSets:
        scrapePage(retX, retY, htmlFile, year, pieces, price)
83+
def regularize(xMat, yMat):
    """
    Centre yMat and standardise the columns of xMat.

    Each column of xMat has its mean subtracted and is then divided by its
    variance (np.var, not the standard deviation). The column means of
    xMat are printed as a side effect. The inputs are not modified.

    Parameters:
        xMat - x data set (must provide .copy())
        yMat - y data set (must provide .copy())
    Returns:
        inxMat - standardised x data set
        inyMat - y data set with its mean (taken along axis 0) removed
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-12-03
    """
    centredY = yMat.copy() - np.mean(yMat, 0)

    features = xMat.copy()
    columnMeans = np.mean(features, 0)
    columnVars = np.var(features, 0)
    print(columnMeans)

    scaledX = (features - columnMeans) / columnVars
    return scaledX, centredY
109+
def rssError(yArr, yHatArr):
    """
    Return the sum of squared differences between two arrays.

    The result is the same whichever argument holds the predictions.

    Parameters:
        yArr - first array of values
        yHatArr - second array of values, broadcast-compatible with yArr
    Returns:
        sum over all elements of (yArr - yHatArr) ** 2
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-12-03
    """
    residual = yArr - yHatArr
    squared = residual ** 2
    return squared.sum()
124+
def standRegres(xArr, yArr):
    """
    Compute ordinary least-squares regression coefficients.

    Solves the normal equations ws = (X^T X)^-1 X^T y.

    Parameters:
        xArr - x data set, one sample per row
        yArr - y data set, one value per sample
    Returns:
        ws - regression coefficients as an (n_features, 1) matrix, or
             None (after printing a message) when X^T X has a determinant
             of exactly zero and so cannot be inverted
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-11-12
    """
    # np.asmatrix rather than np.mat: the np.mat alias was removed in
    # NumPy 2.0, while asmatrix is available in every NumPy version.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    xTx = xMat.T * xMat
    if np.linalg.det(xTx) == 0.0:
        # A singular matrix cannot be inverted.
        print("矩阵为奇异矩阵,不能求逆")
        return None
    return xTx.I * (xMat.T * yMat)
145+
def crossValidation(xArr, yArr, numVal=10):
    """
    Cross-validate ridge regression and print the best un-standardised model.

    For each of numVal rounds the samples are shuffled and split (indices
    j with j < m * 0.9 train, the rest test). ridgeTest is run on the
    training part and every returned weight row is scored on the test part
    with rssError. The weight row with the lowest mean error is rescaled
    to the original feature scale and printed with its constant term.

    Parameters:
        xArr - x data set, indexable by sample number
        yArr - y data set, indexable by sample number
        numVal - number of cross-validation rounds
    Returns:
        None
    Raises:
        ValueError - if numVal < 1 or the split leaves no test samples
    """
    if numVal < 1:
        raise ValueError("numVal must be at least 1")
    m = len(yArr)
    # The first numTrain shuffled indices train, the remainder test.
    numTrain = sum(1 for j in range(m) if j < m * 0.9)
    if numTrain == m:
        raise ValueError("not enough samples to hold out a test split")

    # Must be a real list: random.shuffle mutates in place, and a range
    # object is immutable on Python 3.
    indexList = list(range(m))
    foldErrors = []
    for _ in range(numVal):
        random.shuffle(indexList)
        trainIdx = indexList[:numTrain]
        testIdx = indexList[numTrain:]
        trainX = [xArr[idx] for idx in trainIdx]
        trainY = [yArr[idx] for idx in trainIdx]
        testX = [xArr[idx] for idx in testIdx]
        testY = [yArr[idx] for idx in testIdx]

        wMat = ridgeTest(trainX, trainY)

        # Standardise the test data with the *training* statistics. These
        # do not depend on the weight row, so compute them once per round.
        matTrainX = np.asmatrix(trainX)
        meanTrain = np.mean(matTrainX, 0)
        varTrain = np.var(matTrainX, 0)
        matTestX = (np.asmatrix(testX) - meanTrain) / varTrain
        trainYMean = np.mean(trainY)
        testYArr = np.array(testY)

        roundErrors = []
        for k in range(wMat.shape[0]):
            yEst = matTestX * np.asmatrix(wMat[k, :]).T + trainYMean
            roundErrors.append(rssError(yEst.T.A, testYArr))
        foldErrors.append(roundErrors)

    errorMat = np.array(foldErrors)
    meanErrors = np.mean(errorMat, 0)
    # As in the original, the winning row is taken from the weights fitted
    # in the final round.
    bestWeights = wMat[np.argmin(meanErrors)]

    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    meanX = np.mean(xMat, 0)
    varX = np.var(xMat, 0)
    # Undo the divide-by-variance scaling applied inside ridgeTest.
    unReg = bestWeights / varX
    print("the best model from Ridge Regression is:\n ", unReg)
    print("with constant term: ", -1 * np.sum(np.multiply(meanX, unReg)) + np.mean(yMat))
179+
def ridgeTest(xArr, yArr):
    """
    Fit ridge regression for 30 values of lambda.

    y is centred and each x column is centred and divided by its variance
    before fitting. Lambda takes the values exp(i - 10) for i = 0..29.

    Parameters:
        xArr - x data set, one sample per row
        yArr - y data set, one value per sample
    Returns:
        wMat - array of shape (30, n_features); row i holds the
               coefficients obtained with lambda = exp(i - 10)
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-11-20
    """
    # np.asmatrix rather than np.mat: the np.mat alias was removed in
    # NumPy 2.0, while asmatrix is available in every NumPy version.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T

    # Standardise: centre y; centre x and divide by the column variance.
    yMat = yMat - np.mean(yMat, axis=0)
    xMeans = np.mean(xMat, axis=0)
    xVar = np.var(xMat, axis=0)
    xMat = (xMat - xMeans) / xVar

    numTestPts = 30
    wMat = np.zeros((numTestPts, np.shape(xMat)[1]))
    for i in range(numTestPts):
        # NOTE(review): ridgeRegres is not defined in the visible part of
        # this file - confirm it is in scope before calling ridgeTest.
        ws = ridgeRegres(xMat, yMat, np.exp(i - 10))
        wMat[i, :] = ws.T
    return wMat
206+
207+
208+
def useStandRegres():
    """
    Fit a plain linear regression to the scraped LEGO data and print it.

    Collects the samples with setDataCollect, prepends a column of ones
    for the intercept, fits with standRegres and prints the model.

    Parameters:
        None
    Returns:
        None
    Website:
        http://www.cuijiahua.com/
    Modify:
        2017-11-12
    """
    lgX = []
    lgY = []
    setDataCollect(lgX, lgY)
    data_num, features_num = np.shape(lgX)
    # Column 0 stays at 1.0 (intercept); the features fill the rest,
    # however many there are.
    lgX1 = np.asmatrix(np.ones((data_num, features_num + 1)))
    lgX1[:, 1:] = np.asmatrix(lgX)
    ws = standRegres(lgX1, lgY)
    if ws is None:
        # standRegres has already reported the singular matrix.
        return
    # Flatten to plain floats: formatting 1x1 matrices with %f relies on a
    # deprecated array-to-scalar conversion.
    coeffs = tuple(float(w) for w in ws.A1)
    # %+f keeps the sign on every term so positive coefficients do not run
    # into the previous term.
    print('%f%+f*年份%+f*部件数量%+f*是否为全新%+f*原价' % coeffs)
229+
# Script entry point: run the plain linear-regression demo only when this
# file is executed directly, not when it is imported.
if __name__ == '__main__':
    useStandRegres()
0 commit comments