Skip to content

Commit 234e15c

Browse files
author
=
committed
PCA降维技术代码实现
1 parent 5416293 commit 234e15c

File tree

3 files changed

+2632
-0
lines changed

3 files changed

+2632
-0
lines changed

PCA/PCA.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#-*-coding:utf8-*-
2+
'''
3+
Created on 2016-5-15
4+
5+
@author: thinkgamer
6+
'''
7+
from numpy import *
8+
9+
def loadDataSet(filename, delim="\t"):
    """Load a delimited text file of numbers into a NumPy matrix.

    Every non-blank line is stripped, split on ``delim`` and each field is
    converted with ``float``.

    Args:
        filename: Path of the text file to read.
        delim: Field separator; defaults to a tab character.

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Real lists (not lazy ``map`` objects) so the result is numeric on
        # Python 3 too; blank lines are skipped instead of crashing float('').
        datArr = [[float(field) for field in line.strip().split(delim)]
                  for line in fr if line.strip()]
    # ``asmatrix`` is the function the old ``mat`` alias pointed to; the alias
    # itself no longer exists in NumPy 2.0.
    return asmatrix(datArr)
14+
15+
# dataMat is the data set (one column per feature); topNfeat is the number of top components to keep
16+
def pca(dataMat, topNfeat=9999999):
    """Project a data set onto its leading principal components.

    Args:
        dataMat: Data with one sample per row and one feature per column.
            Anything ``asmatrix`` accepts (matrix, ndarray, nested lists).
        topNfeat: Number of principal components to keep. Values larger
            than the number of features keep all of them.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the mean-centred data expressed
        in the retained components, and the data reconstructed from those
        components in the original feature space (mean added back).

    Note:
        The sign of each component is arbitrary, so columns of
        ``lowDDataMat`` are defined only up to a factor of -1.
    """
    # Coerce to a matrix so that ``*`` below is a matrix product even when
    # the caller passes a plain ndarray or a list.
    dataMat = asmatrix(dataMat)
    meanVals = mean(dataMat, axis=0)      # per-feature mean
    meanRemoved = dataMat - meanVals      # centre the data
    covMat = cov(meanRemoved, rowvar=0)   # covariance matrix, columns are variables
    # The covariance matrix is symmetric, so use ``eigh``: it returns real
    # eigenvalues/eigenvectors, whereas ``eig`` may yield complex values
    # caused purely by round-off.
    eigVals, eigVects = linalg.eigh(asmatrix(covMat))
    # argsort is ascending; the reversed slice picks the indices of the
    # topNfeat largest eigenvalues, largest first.
    eigValInd = argsort(eigVals)[: -(topNfeat + 1) : -1]
    redEigVects = eigVects[:, eigValInd]
    # Transform the data into the reduced space, then map it back.
    lowDDataMat = meanRemoved * redEigVects
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat
29+
30+
# Smoke test: project testSet.txt onto its first principal component.
dataMat = loadDataSet("testSet.txt")
lowDMat, reconMat = pca(dataMat, 1)
# Parenthesised so the statement is valid on Python 3 as well as Python 2;
# the printed text is the same shape tuple either way.
print(shape(lowDMat))
34+
35+
'''
36+
#show
37+
import matplotlib
38+
import matplotlib.pyplot as plt
39+
fig = plt.figure()
40+
ax = fig.add_subplot(111)
41+
ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0], marker='^', s = 90 )
42+
ax.scatter(reconMat[:,0].flatten().A[0], reconMat[:,1].flatten().A[0],marker='o', s = 50 , c ='red' )
43+
plt.show()
44+
'''
45+
46+
# Replace NaN entries with the mean of their column
47+
def replaceNanWithMean(filename='secom.data', delim=' '):
    """Load a data file and replace NaN entries with their column mean.

    Args:
        filename: Path of the data file, read with ``loadDataSet``.
            Defaults to ``'secom.data'``.
        delim: Field separator passed to ``loadDataSet``; defaults to a
            single space.

    Returns:
        The loaded matrix, in which every NaN has been overwritten with the
        mean of the non-NaN values of the same column. A column consisting
        only of NaN has no mean to use and is left unchanged.
    """
    datMat = loadDataSet(filename, delim)
    numFeat = shape(datMat)[1]
    for i in range(numFeat):
        column = datMat[:, i].A.ravel()
        nanMask = isnan(column)
        # Nothing to fill, or nothing to average: skip the column (this also
        # avoids the empty-slice RuntimeWarning from taking an empty mean).
        if not nanMask.any() or nanMask.all():
            continue
        meanVal = column[~nanMask].mean()           # mean of the valid entries
        datMat[nonzero(nanMask)[0], i] = meanVal    # overwrite the NaN rows
    return datMat
54+
55+
# Load the data with NaN entries replaced by column means.
dataMat = replaceNanWithMean()
# Remove the per-feature mean.
meanVals = mean(dataMat, axis=0)
meanRemoved = dataMat - meanVals
# Covariance matrix (columns are the variables).
covMat = cov(meanRemoved, rowvar=0)

# Eigenvalue analysis. ``asmatrix`` replaces the ``mat`` alias, which no
# longer exists in NumPy 2.0.
eigVals, eigVects = linalg.eig(asmatrix(covMat))
# Parenthesised so the statement is valid on Python 3 as well as Python 2.
print(eigVals)

0 commit comments

Comments
 (0)