#-*-coding:utf8-*-
'''
Created on 2016-5-15

@author: thinkgamer
'''
7+ from  numpy  import  * 
8+ 
def loadDataSet(filename, delim="\t"):
    """Load a delimited text file of numbers into a NumPy matrix.

    Args:
        filename: Path of the text file to read; one sample per line.
        delim: Field separator used inside each line (a single tab by
            default).

    Returns:
        A ``numpy.matrix`` of floats with one row per non-blank line.

    Raises:
        IOError / OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # ``with`` guarantees the handle is closed even if parsing fails.
    with open(filename) as fr:
        # Blank lines are skipped; float('') would otherwise abort the load.
        stringArr = [line.strip().split(delim) for line in fr if line.strip()]
    # Build real lists (not lazy ``map`` objects) so the result is numeric on
    # both Python 2 and Python 3.
    datArr = [[float(value) for value in fields] for fields in stringArr]
    # ``asmatrix`` is the spelling available in every NumPy release.
    return asmatrix(datArr)
14+ 
# dataMat is the data set (one sample per row); topNfeat is the number of
# leading principal components to keep.
def pca(dataMat, topNfeat=9999999):
    """Project a data set onto its leading principal components.

    Args:
        dataMat: 2-D data, one sample per row and one feature per column.
            Anything ``asmatrix`` accepts (matrix, ndarray, nested lists).
        topNfeat: Number of components to keep. The default is large enough
            to keep every component for any realistic feature count.

    Returns:
        A tuple ``(lowDDataMat, reconMat)`` of ``numpy.matrix`` objects:
        the mean-centred data expressed in the reduced basis, and the data
        reconstructed in the original feature space from that reduced
        representation.
    """
    dataMat = asmatrix(dataMat)
    meanVals = mean(dataMat, axis=0)        # per-feature mean
    meanRemoved = dataMat - meanVals        # centre the data
    covMat = cov(meanRemoved, rowvar=0)     # covariance matrix of the features
    # A covariance matrix is symmetric, so use ``eigh``: it always returns
    # real eigenpairs, whereas ``eig`` may return complex values that are
    # pure rounding noise. ``asmatrix`` also lifts the 0-d result that
    # ``cov`` gives for a single feature to a 1x1 matrix.
    eigVals, eigVects = linalg.eigh(asmatrix(covMat))
    # argsort is ascending; the reversed slice walks back from the largest
    # eigenvalue and keeps (at most) topNfeat indices.
    eigValInd = argsort(eigVals)[:-(topNfeat + 1):-1]
    redEigVects = asmatrix(eigVects)[:, eigValInd]
    # Transform the data into the new (reduced) space.
    lowDDataMat = meanRemoved * redEigVects
    # Map back to the original space and restore the mean.
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat
29+ 
# Test: reduce testSet.txt to its first principal component.
dataMat = loadDataSet("testSet.txt")
lowDMat, reconMat = pca(dataMat, 1)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(shape(lowDMat))

# Optional visualisation (needs matplotlib): uncomment to plot the original
# samples (triangles) together with their reconstruction (red circles).
# import matplotlib
# import matplotlib.pyplot as plt
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0], marker='^',  s = 90 )
# ax.scatter(reconMat[:,0].flatten().A[0], reconMat[:,1].flatten().A[0],marker='o', s = 50 , c ='red' )
# plt.show()
45+ 
# Replace every NaN entry with the mean of its column.
def replaceNanWithMean(filename='secom.data', delim=' '):
    """Load a data file and fill NaN entries with their column mean.

    Args:
        filename: Path of the data file handed to ``loadDataSet``.
        delim: Field separator handed to ``loadDataSet``.

    Returns:
        The loaded matrix, where each NaN has been replaced by the mean of
        the non-NaN values in the same column. A column made up entirely of
        NaN has no defined mean and is left unchanged.
    """
    datMat = loadDataSet(filename, delim)
    numFeat = shape(datMat)[1]
    for i in range(numFeat):
        column = datMat[:, i].A.ravel()     # 1-D view of feature i
        missing = isnan(column)
        # Nothing to fill, or no valid value to average: leave the column.
        if not missing.any() or missing.all():
            continue
        # Set the NaN rows to the mean of the values that are numbers.
        datMat[nonzero(missing)[0], i] = column[~missing].mean()
    return datMat
54+ 
# Load the data, with NaN entries already replaced by column means.
dataMat = replaceNanWithMean()
# Remove the per-feature mean.
meanVals = mean(dataMat, axis=0)
meanRemoved = dataMat - meanVals
# Compute the covariance matrix.
covMat = cov(meanRemoved, rowvar=0)

# Eigenvalue analysis. The covariance matrix is symmetric, so ``eigh`` is
# used: it returns real eigenvalues (in ascending order), whereas ``eig``
# may return complex values that are only rounding noise.
eigVals, eigVects = linalg.eigh(asmatrix(covMat))
# print() with a single argument behaves the same on Python 2 and Python 3.
print(eigVals)
0 commit comments