7-1 What is PCA
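PCA (Principal Component Analysis) reduces dimensionality by finding the directions along which the data varies most. After demeaning the samples, the first principal component is the unit vector w that maximizes the variance of the projected data; this is exactly the objective that the function f(w, X) in section 7-3 below computes:

$$f(w) = \frac{1}{m}\sum_{i=1}^{m}\left(X^{(i)} \cdot w\right)^{2}, \qquad \|w\| = 1$$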
7-2 Solving PCA with Gradient Ascent
7-3 Finding the Principal Component of the Data
Notebook example
Notebook source
Solving for the Principal Component with Gradient Ascent
[1]
import numpy as np
import matplotlib.pyplot as plt
[2]
X = np.empty((100,2))
X[:,0] = np.random.uniform(0,100,size=100)
X[:,1] = 0.75 * X[:,0] + 3. +np.random.normal(0,10,size=100)
[3]
plt.scatter(X[:,0],X[:,1])
<matplotlib.collections.PathCollection at 0x20e24cb3880>
[4]
def demean(X):
    return X - np.mean(X,axis=0)
[5]
(np.mean(X,axis=0)).shape
(2,)
[6]
X_dmean = demean(X)
[7]
plt.scatter(X_dmean[:,0],X_dmean[:,1])
<matplotlib.collections.PathCollection at 0x20e24dc96a0>
[8]
np.mean(X_dmean[:,0])
1.4281908988778012e-14
[9]
np.mean(X_dmean[:,1])
-2.7142732506035826e-14
Gradient ascent
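Differentiating the objective gives the gradient that df_math below implements; gradient ascent repeatedly steps in this direction and renormalizes w to unit length:

$$\nabla f(w) = \frac{2}{m}\, X^{T}(X w)$$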
[10]
def f(w,X):
    return np.sum((X.dot(w) ** 2)) / len(X)
[11]
def df_math(w,X):
    return X.T.dot(X.dot(w)) * 2.0 / len(X)
[12]
def df_debug(w, X, epsilon=0.0001):
    # numerical gradient: central difference in each coordinate of w
    res = np.empty(len(w))
    for i in range(len(w)):
        w_1 = w.copy()
        w_1[i] += epsilon
        w_2 = w.copy()
        w_2[i] -= epsilon
        res[i] = (f(w_1,X) - f(w_2,X)) / (2 * epsilon)
    return res
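A sanity check worth running at this point (an added cell, not in the original notebook): the central-difference gradient from df_debug should agree with the analytical gradient from df_math at any test point.

w_test = np.random.random(X_dmean.shape[1])
np.allclose(df_math(w_test, X_dmean), df_debug(w_test, X_dmean))   # expected: True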
[13]
def direction(w):
    return w / np.linalg.norm(w)

def gradient_ascent(df, X, initial_w, eta, n_iters=1e5, epsilon=1e-8):
    w = direction(initial_w)
    i_iters = 0
    while i_iters < n_iters:
        gradient = df(w, X)
        last_w = w
        w = w + eta * gradient
        w = direction(w) # Note 1: renormalize to a unit direction at every step
        if abs(f(w, X) - f(last_w, X)) < epsilon:
            break
        i_iters += 1
    return w
[14]
initial_w = np.random.random(X.shape[1]) # Note 2: do not start from the zero vector (the gradient there is 0)
initial_w
array([0.72672574, 0.65631127])
[15]
eta = 0.001
[16]
# Note 3: do not standardize the data with StandardScaler; forcing unit variance in every feature would erase the variance differences PCA looks for
[17]
gradient_ascent(df_debug, X_dmean,initial_w,eta) # with gradient descent this would come out negated: array([-0.78243378, -0.6227338 ])
array([0.7703671 , 0.63760061])
[18]
w = gradient_ascent(df_math, X_dmean,initial_w,eta)
[19]
plt.scatter(X_dmean[:,0],X_dmean[:,1])
plt.plot([0,w[0]*30],[0,w[1]*30], color='r') # w is a unit vector with x and y components; scaled by 30 so the direction is visible
[<matplotlib.lines.Line2D at 0x20e254ec850>]
[20]
(w[0]*30).shape
()
[21]
X2 = np.empty((100,2))
X2[:,0] = np.random.uniform(0,100,size=100)
X2[:,1] = 0.75 * X2[:,0] + 3.
[22]
plt.scatter(X2[:,0],X2[:,1])
<matplotlib.collections.PathCollection at 0x20e25556160>
[23]
X2_dmean = demean(X2)
[24]
w2 = gradient_ascent(df_math, X2_dmean,initial_w,eta)
[25]
plt.scatter(X2_dmean[:,0],X2_dmean[:,1])
plt.plot([0,w2[0]*30],[0,w2[1]*30], color='r')
[<matplotlib.lines.Line2D at 0x20e255bbbe0>]
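Because X2 was generated without noise, the recovered direction can be checked against the generating line y = 0.75x: normalizing (1, 0.75) by its norm 1.25 gives (0.8, 0.6), so w2 should come out very close to that (a check added here, not an original cell):

w2   # expected ≈ array([0.8, 0.6])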
7-4 Finding the First n Principal Components
Notebook example
Notebook source
[1]
import numpy as np
import matplotlib.pyplot as plt
[2]
X = np.empty((100,2))
X[:,0] = np.random.uniform(0,100,size=100)
X[:,1] = 0.75 * X[:,0] + 3. +np.random.normal(0,10,size=100)
[4]
def demean(X):
    return X - np.mean(X,axis=0)

X = demean(X)
[5]
plt.scatter(X[:,0],X[:,1])
<matplotlib.collections.PathCollection at 0x21b0ea71eb0>
[6]
def f(w,X):
    return np.sum((X.dot(w) ** 2)) / len(X)

def df(w,X):
    return X.T.dot(X.dot(w)) * 2.0 / len(X)

def direction(w):
    return w / np.linalg.norm(w)

def first_component(X, initial_w, eta, n_iters=1e5, epsilon=1e-8):
    w = direction(initial_w)
    i_iters = 0
    while i_iters < n_iters:
        gradient = df(w, X)
        last_w = w
        w = w + eta * gradient
        w = direction(w) # Note 1: renormalize to a unit direction at every step
        if abs(f(w, X) - f(last_w, X)) < epsilon:
            break
        i_iters += 1
    return w
[7]
initial_w = np.random.random(X.shape[1])
eta = 0.01
w = first_component(X,initial_w,eta)
w
array([0.77709976, 0.62937744])
[8]
X2 = np.empty(X.shape)
for i in range(len(X)):
    X2[i] = X[i] - X[i].dot(w) * w   # remove the component of each sample along w
[14]
X2 = X - X.dot(w).reshape(-1,1) * w
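A quick confirmation that the projection removal worked (an added check, not an original cell): since w is a unit vector, every row of X2 should now be numerically orthogonal to it.

np.max(np.abs(X2.dot(w)))   # expected to be within floating-point noise of 0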
[15]
plt.scatter(X2[:,0], X2[:,1])
<matplotlib.collections.PathCollection at 0x21b11a22be0>
[12]
w2 = first_component(X2,initial_w,eta)
w2
array([-0.62937344, 0.777103 ])
[13]
w.dot(w2)
5.13826667680739e-06
[17]
def first_n_components(n, X, eta=0.01, n_iters=1e4, epsilon=1e-8):
    X_pca = X.copy()
    X_pca = demean(X_pca)
    res = []
    for i in range(n):
        initial_w = np.random.random(X_pca.shape[1])
        w = first_component(X_pca,initial_w,eta)
        res.append(w)
        X_pca = X_pca - X_pca.dot(w).reshape(-1,1) * w   # project out the component just found
    return res
[18]
first_n_components(2,X)
[array([0.77709976, 0.62937744]), array([-0.62937373, 0.77710277])]
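As a cross-check that gradient ascent found the right directions (added here; not in the original notebook), the components should match the top eigenvectors of the covariance matrix X.T.dot(X) / len(X) up to sign, which NumPy computes directly (X is already demeaned above):

eig_vals, eig_vecs = np.linalg.eigh(X.T.dot(X) / len(X))
eig_vecs[:, ::-1].T   # rows are eigenvectors in descending eigenvalue order; compare with the result above up to sign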
7-5 Mapping High-Dimensional Data to Low Dimensions
Notebook example
Notebook source
Mapping from high-dimensional data to low-dimensional data
[1]
import numpy as np
import matplotlib.pyplot as plt
[2]
X = np.empty((100,2))
X[:,0] = np.random.uniform(0,100,size=100)
X[:,1] = 0.75 * X[:,0] + 3. +np.random.normal(0,10,size=100)
[3]
from playML.PCA import PCA
pca = PCA(n_components=2)
pca.fit(X)
PCA(n_components=2)
[4]
pca.components_
array([[ 0.78080313, 0.62477713],
[ 0.62478081, -0.7808002 ]])
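The notebook imports PCA from the author's playML package without showing its source. What follows is a minimal sketch of such a class, consistent with the gradient-ascent first_component from the previous section; the method and attribute names (fit, transform, inverse_transform, components_) follow how the notebook uses them, but the details are an assumption, not the actual playML code:

import numpy as np

class PCA:

    def __init__(self, n_components):
        self.n_components = n_components
        self.components_ = None

    def fit(self, X, eta=0.01, n_iters=1e4, epsilon=1e-8):
        """Find the first n_components directions by repeated gradient ascent."""

        def demean(X):
            return X - np.mean(X, axis=0)

        def f(w, X):
            return np.sum(X.dot(w) ** 2) / len(X)

        def df(w, X):
            return X.T.dot(X.dot(w)) * 2. / len(X)

        def direction(w):
            return w / np.linalg.norm(w)

        def first_component(X, initial_w, eta, n_iters, epsilon):
            w = direction(initial_w)
            cur_iter = 0
            while cur_iter < n_iters:
                last_w = w
                w = direction(w + eta * df(w, X))   # step, then renormalize
                if abs(f(w, X) - f(last_w, X)) < epsilon:
                    break
                cur_iter += 1
            return w

        X_pca = demean(X)
        self.components_ = np.empty((self.n_components, X.shape[1]))
        for i in range(self.n_components):
            initial_w = np.random.random(X_pca.shape[1])
            w = first_component(X_pca, initial_w, eta, n_iters, epsilon)
            self.components_[i] = w
            X_pca = X_pca - X_pca.dot(w).reshape(-1, 1) * w   # remove this component
        return self

    def transform(self, X):
        # (m, n) . (n, k) -> (m, k): coordinates of each sample along the components
        return X.dot(self.components_.T)

    def inverse_transform(self, X):
        # (m, k) . (k, n) -> (m, n): map the low-dimensional points back
        return X.dot(self.components_)

Here transform projects the (m, n) data onto the k stored directions, and inverse_transform maps the k-dimensional points back into the original space, which is why X_restore below has shape (100, 2) again.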
[5]
pca =PCA(n_components=1)
pca.fit(X)
PCA(n_components=1)
[6]
X_reduction = pca.transform(X)
[7]
X_reduction.shape
(100, 1)
[9]
X_restore = pca.inverse_transform(X_reduction)
[10]
X_restore.shape
(100, 2)
[12]
plt.scatter(X[:,0],X[:,1],color='b',alpha=0.5)
plt.scatter(X_restore[:,0],X_restore[:,1],color='r',alpha=0.5)
<matplotlib.collections.PathCollection at 0x1e67905a550>
PCA in scikit-learn
[13]
from sklearn.decomposition import PCA
[14]
pca = PCA(n_components=1)
pca.fit(X)
PCA(n_components=1)
[15]
pca.components_
array([[0.78080305, 0.62477724]])
[16]
X_reduction = pca.transform(X)
[17]
X_reduction.shape
(100, 1)
[18]
X_restore = pca.inverse_transform(X_reduction)
[19]
X_restore.shape
(100, 2)
[20]
plt.scatter(X[:,0],X[:,1],color='b',alpha=0.5)
plt.scatter(X_restore[:,0],X_restore[:,1],color='r',alpha=0.5)
<matplotlib.collections.PathCollection at 0x1e67a427ee0>
7-6 PCA in scikit-learn
Notebook example
Notebook source
PCA in scikit-learn
[1]
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
[2]
digits = datasets.load_digits()
X = digits.data
y = digits.target
[3]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=666)
[4]
X_train.shape
(1347, 64)
[5]
%%time
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train,y_train)
CPU times: total: 78.1 ms
Wall time: 288 ms
KNeighborsClassifier()
[6]
knn_clf.score(X_test,y_test)
0.9866666666666667
[7]
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X_train)
X_train_reduction = pca.transform(X_train)
X_test_reduction = pca.transform(X_test)
[8]
%%time
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train_reduction,y_train)
CPU times: total: 15.6 ms
Wall time: 1.99 ms
KNeighborsClassifier()
[9]
knn_clf.score(X_test_reduction,y_test)
0.6066666666666667
[10]
pca.explained_variance_ratio_
array([0.14566817, 0.13735469])
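These two ratios explain the accuracy drop from 0.9867 to 0.6067: projecting 64 dimensions down to 2 keeps only about 28% of the variance.

np.sum(pca.explained_variance_ratio_)   # 0.14566817 + 0.13735469 ≈ 0.283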
[11]
pca = PCA(n_components=X_train.shape[1])
pca.fit(X_train)
pca.explained_variance_ratio_
array([1.45668166e-01, 1.37354688e-01, 1.17777287e-01, 8.49968861e-02,
5.86018996e-02, 5.11542945e-02, 4.26605279e-02, 3.60119663e-02,
3.41105814e-02, 3.05407804e-02, 2.42337671e-02, 2.28700570e-02,
1.80304649e-02, 1.79346003e-02, 1.45798298e-02, 1.42044841e-02,
1.29961033e-02, 1.26617002e-02, 1.01728635e-02, 9.09314698e-03,
8.85220461e-03, 7.73828332e-03, 7.60516219e-03, 7.11864860e-03,
6.85977267e-03, 5.76411920e-03, 5.71688020e-03, 5.08255707e-03,
4.89020776e-03, 4.34888085e-03, 3.72917505e-03, 3.57755036e-03,
3.26989470e-03, 3.14917937e-03, 3.09269839e-03, 2.87619649e-03,
2.50362666e-03, 2.25417403e-03, 2.20030857e-03, 1.98028746e-03,
1.88195578e-03, 1.52769283e-03, 1.42823692e-03, 1.38003340e-03,
1.17572392e-03, 1.07377463e-03, 9.55152460e-04, 9.00017642e-04,
5.79162563e-04, 3.82793717e-04, 2.38328586e-04, 8.40132221e-05,
5.60545588e-05, 5.48538930e-05, 1.08077650e-05, 4.01354717e-06,
1.23186515e-06, 1.05783059e-06, 6.06659094e-07, 5.86686040e-07,
1.71368535e-33, 7.44075955e-34, 7.44075955e-34, 7.15189459e-34])
[12]
plt.plot([i for i in range(X_train.shape[1])],
         [np.sum(pca.explained_variance_ratio_[:i+1]) for i in range(X_train.shape[1])])
[<matplotlib.lines.Line2D at 0x1d015480880>]
[13]
pca = PCA(0.95)
pca.fit(X_train)
PCA(n_components=0.95)
[14]
pca.n_components_
28
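Passing a float in (0, 1) to PCA makes scikit-learn keep the smallest number of components whose cumulative explained-variance ratio reaches that threshold. The same 28 can be recovered by hand from the full decomposition (a cross-check added here, not an original cell):

pca_full = PCA(n_components=X_train.shape[1]).fit(X_train)
np.argmax(np.cumsum(pca_full.explained_variance_ratio_) >= 0.95) + 1   # expected: 28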
[15]
X_train_reduction = pca.transform(X_train)
X_test_reduction = pca.transform(X_test)
[16]
%%time
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train_reduction,y_train)
CPU times: total: 0 ns
Wall time: 1.99 ms
KNeighborsClassifier()
[17]
knn_clf.score(X_test_reduction,y_test)
0.98
[19]
pca = PCA(n_components=2)
pca.fit(X)
X_reduction = pca.transform(X)
[20]
X_reduction.shape
(1797, 2)
[22]
for i in range(10):
    plt.scatter(X_reduction[y==i,0],X_reduction[y==i,1],alpha=0.8)
7-7 Trying Out the MNIST Dataset
Notebook example
Notebook source
MNIST
[1]
import numpy as np
from sklearn.datasets import fetch_openml
[2]
%time mnist = fetch_openml('mnist_784')
CPU times: total: 3min 41s
Wall time: 5min 5s
[3]
# mnist
[4]
X, y = mnist['data'],mnist['target']
[5]
X.shape
(70000, 784)
[6]
X_train = np.array(X[:60000],dtype=float)
y_train = np.array(y[:60000],dtype=float)
X_test = np.array(X[60000:],dtype=float)
y_test = np.array(y[60000:],dtype=float)
[7]
X_train.shape
(60000, 784)
[8]
y_train.shape
(60000,)
[9]
X_test.shape
(10000, 784)
[10]
y_test.shape
(10000,)
[11]
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
%time knn_clf.fit(X_train,y_train)
CPU times: total: 93.8 ms
Wall time: 131 ms
KNeighborsClassifier()
[12]
%time knn_clf.score(X_test,y_test)
CPU times: total: 2min 56s
Wall time: 2min 12s
0.9688
Dimensionality reduction with PCA
[13]
from sklearn.decomposition import PCA
pca = PCA(0.9)
pca.fit(X_train)
X_train_reduction = pca.transform(X_train)
[14]
X_train_reduction.shape
(60000, 87)
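The 87 columns correspond to pca.n_components_: with PCA(0.9), scikit-learn chose the smallest number of components explaining at least 90% of the variance (check added here):

pca.n_components_   # expected: 87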
[15]
knn_clf = KNeighborsClassifier()
%time knn_clf.fit(X_train_reduction,y_train)
CPU times: total: 31.2 ms
Wall time: 21.9 ms
KNeighborsClassifier()
[16]
X_test_reduction = pca.transform(X_test)
[17]
%time knn_clf.score(X_test_reduction,y_test)
CPU times: total: 40.1 s
Wall time: 47 s
0.9728
7-8 Using PCA to Denoise Data
Notebook example
Notebook source
Recall our earlier example
[1]
import numpy as np
import matplotlib.pyplot as plt
[2]
X = np.empty((100,2))
X[:,0] = np.random.uniform(0,100,size=100)
X[:,1] = 0.75 * X[:,0] + 3. + np.random.normal(0,10,size=100)
[3]
plt.scatter(X[:,0],X[:,1])
<matplotlib.collections.PathCollection at 0x18a08e43820>
[4]
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca.fit(X)
X_reduction = pca.transform(X)
X_restore = pca.inverse_transform(X_reduction)
[5]
plt.scatter(X_restore[:,0],X_restore[:,1])
<matplotlib.collections.PathCollection at 0x18a0b8a63a0>
The handwritten-digit example
[6]
from sklearn import datasets
digits = datasets.load_digits()
X = digits.data
y = digits.target
[7]
noisy_digits = X + np.random.normal(0,4,size=X.shape)
[8]
X.shape
(1797, 64)
[9]
example_digits = noisy_digits[y==0,:][:10]
for num in range(1,10):
    X_num = noisy_digits[y==num,:][:10]
    example_digits = np.vstack([example_digits,X_num])
[10]
example_digits.shape
(100, 64)
[11]
def plot_digits(data):
    fig,axes = plt.subplots(10,10,figsize=(10,10),
                            subplot_kw={'xticks':[],'yticks':[]},
                            gridspec_kw=dict(hspace=0.1,wspace=0.1))
    for i,ax in enumerate(axes.flat):
        ax.imshow(data[i].reshape(8,8),
                  cmap='binary',interpolation='nearest',
                  clim=(0,16))

plot_digits(example_digits)
[12]
pca = PCA(0.5)
pca.fit(noisy_digits)
PCA(n_components=0.5)
[13]
pca.n_components_
12
[14]
components = pca.transform(example_digits)
filtered_digits = pca.inverse_transform(components)
plot_digits(filtered_digits)
7-9 Face Recognition and Eigenfaces
Notebook example
Notebook source
Eigenfaces
[1]
import numpy as np
import matplotlib.pyplot as plt
[2]
from sklearn.datasets import fetch_lfw_people
[3]
faces = fetch_lfw_people()
[5]
faces.keys()
dict_keys(['data', 'images', 'target', 'target_names', 'DESCR'])
[7]
faces.data.shape
(13233, 2914)
[8]
faces.images.shape
(13233, 62, 47)
[10]
random_indexes = np.random.permutation(len(faces.data))
X = faces.data[random_indexes]
[12]
example_faces = X[:36,:]
example_faces.shape
(36, 2914)
[13]
def plot_faces(faces):
    fig,axes = plt.subplots(6,6,figsize=(10,10),
                            subplot_kw={'xticks':[],'yticks':[]},
                            gridspec_kw=dict(hspace=0.1,wspace=0.1))
    for i,ax in enumerate(axes.flat):
        ax.imshow(faces[i].reshape(62,47),
                  cmap='bone')

plot_faces(example_faces)
[14]
faces.target_names
array(['AJ Cook', 'AJ Lamas', 'Aaron Eckhart', ..., 'Zumrati Juma',
'Zurab Tsereteli', 'Zydrunas Ilgauskas'], dtype='<U35')
[15]
len(faces.target_names)
5749
Eigenfaces
[16]
from sklearn.decomposition import PCA
pca = PCA(svd_solver='randomized')
pca.fit(X)
PCA(svd_solver='randomized')
[17]
pca.components_.shape
(2914, 2914)
[18]
plot_faces(pca.components_[:36,:])
[20]
faces2 = fetch_lfw_people(min_faces_per_person=60)
[21]
faces2.data.shape
(1348, 2914)
[22]
faces2.target_names
array(['Ariel Sharon', 'Colin Powell', 'Donald Rumsfeld', 'George W Bush',
'Gerhard Schroeder', 'Hugo Chavez', 'Junichiro Koizumi',
'Tony Blair'], dtype='<U17')
[23]
len(faces2.target_names)
8