SVC在非线性数据上的推广
为了能够找出非线性数据的线性决策边界,我们需要将数据从原始的空间 投射到新空间 中
非线性SVM的损失函数的初始形态为:
非线性SVM的拉格朗日函数和拉格朗日对偶函数也可得:
最终的决策函数
重要参数kernel
而解决这些问题的数学方式,叫做“核技巧”(Kernel Trick),是一种能够使用数据原始空间中的向量计算来表示 升维后的空间中的点积结果的数学方式。具体表现为, 。而这个原始空间中的点积 函数 ——核函数
案例:如何选取最佳核函数
探索核函数在不同数据集上的表现
#导入所需的库和模块
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import svm
from sklearn.datasets import make_circles, make_moons, make_blobs,make_classification
#创建数据集,定义核函数的选择
n_samples = 100
datasets = [
make_moons(n_samples=n_samples, noise=0.2, random_state=0),
make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1),
make_blobs(n_samples=n_samples, centers=2, random_state=5),
make_classification(n_samples=n_samples,n_features =
2,n_informative=2,n_redundant=0, random_state=5)
]
Kernel = ["linear","poly","rbf","sigmoid"]
#四个数据集分别是什么样子呢?
for X,Y in datasets:
plt.figure(figsize=(5,4))
plt.scatter(X[:,0],X[:,1],c=Y,s=50,cmap="rainbow")
#构建子图
nrows=len(datasets)
ncols=len(Kernel) + 1
fig, axes = plt.subplots(nrows, ncols,figsize=(20,16))
#开始进行子图循环
#第一层循环:在不同的数据集中循环
for ds_cnt, (X,Y) in enumerate(datasets):
#在图像中的第一列,放置原数据的分布
ax = axes[ds_cnt, 0]
if ds_cnt == 0:
ax.set_title("Input data")
ax.scatter(X[:, 0], X[:, 1], c=Y, zorder=10, cmap=plt.cm.Paired,edgecolors='k')
ax.set_xticks(())
ax.set_yticks(())
#第二层循环:在不同的核函数中循环
#从图像的第二列开始,一个个填充分类结果
for est_idx, kernel in enumerate(Kernel):
#定义子图位置
ax = axes[ds_cnt, est_idx + 1]
#建模
clf = svm.SVC(kernel=kernel, gamma=2).fit(X, Y)
score = clf.score(X, Y)
#绘制图像本身分布的散点图
ax.scatter(X[:, 0], X[:, 1], c=Y
,zorder=10
,cmap=plt.cm.Paired,edgecolors='k')
#绘制支持向量
ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=50,
facecolors='none', zorder=10, edgecolors='k')
#绘制决策边界
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
#np.mgrid,合并了我们之前使用的np.linspace和np.meshgrid的用法
#一次性使用最大值和最小值来生成网格
#表示为[起始值:结束值:步长]
#如果步长是复数,则其整数部分就是起始值和结束值之间创建的点的数量,并且结束值被包含在内
XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
#np.c_,类似于np.vstack的功能
Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()]).reshape(XX.shape)
#填充等高线不同区域的颜色
ax.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
#绘制等高线
ax.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'],
levels=[-1, 0, 1])
#设定坐标轴为不显示
ax.set_xticks(())
ax.set_yticks(())
#将标题放在第一行的顶上
if ds_cnt == 0:
ax.set_title(kernel)
#为每张图添加分类的分数
ax.text(0.95, 0.06, ('%.2f' % score).lstrip('0')
, size=15
, bbox=dict(boxstyle='round', alpha=0.8, facecolor='white')
#为分数添加一个白色的格子作为底色
, transform=ax.transAxes #确定文字所对应的坐标轴,就是ax子图的坐标轴本身
, horizontalalignment='right' #位于坐标轴的什么方向
)
plt.tight_layout()
plt.show()
探索核函数的优势和缺陷
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from time import time
import datetime
data = load_breast_cancer()
X = data.data
y = data.target
X.shape
np.unique(y)
plt.scatter(X[:,0],X[:,1],c=y)
plt.show()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,y,test_size=0.3,random_state=420)
Kernel = ["linear","poly","rbf","sigmoid"]
for kernel in Kernel:
time0 = time()
clf= SVC(kernel = kernel
, gamma="auto"
# , degree = 1
, cache_size=5000
).fit(Xtrain,Ytrain)
print("The accuracy under kernel %s is %f" % (kernel,clf.score(Xtest,Ytest)))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
跑不出来,模型一直停留在线性核函数之后,就没有再打印结果了。这证明,多项式核函 数此时此刻要消耗大量的时间,运算非常的缓慢。让我们在循环中去掉多项式核函数,再试试看能否跑出结果:
Kernel = ["linear","rbf","sigmoid"]
for kernel in Kernel:
time0 = time()
clf= SVC(kernel = kernel
, gamma="auto"
# , degree = 1
, cache_size=5000
).fit(Xtrain,Ytrain)
print("The accuracy under kernel %s is %f" % (kernel,clf.score(Xtest,Ytest)))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
如果数据是线性的,那如果我们把degree参数调整为1,多项式核函数应该也可以得到不错的结果:
Kernel = ["linear","poly","rbf","sigmoid"]
for kernel in Kernel:
time0 = time()
clf= SVC(kernel = kernel
, gamma="auto"
, degree = 1
, cache_size=5000
).fit(Xtrain,Ytrain)
print("The accuracy under kernel %s is %f" % (kernel,clf.score(Xtest,Ytest)))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
真正的问题是数据的量纲问题。回忆一下我们如何求解决策边界,如何判断点是否在决策边界的一边? 是靠计算”距离“,虽然我们不能说SVM是完全的距离类模型,但是它严重受到数据量纲的影响。让我们来探索一下 乳腺癌数据集的量纲:
import pandas as pd
data = pd.DataFrame(X)
data.describe([0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.99]).T
我们来使用数据预处理中的标准化的类,对数据进行标准化:
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)
data = pd.DataFrame(X)
data.describe([0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.99]).T
让SVC在核函数中遍历,此时我们把degree的数值设定为1,观察各个核函数在去量纲后的数 据上的表现:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,y,test_size=0.3,random_state=420)
Kernel = ["linear","poly","rbf","sigmoid"]
for kernel in Kernel:
time0 = time()
clf= SVC(kernel = kernel
, gamma="auto"
, degree = 1
, cache_size=5000
).fit(Xtrain,Ytrain)
print("The accuracy under kernel %s is %f" % (kernel,clf.score(Xtest,Ytest)))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
我们来试试看高斯径向基核函数 rbf的参数gamma在乳腺癌数据集上的表现:
score = []
gamma_range = np.logspace(-10, 1, 50) #返回在对数刻度上均匀间隔的数字
for i in gamma_range:
clf = SVC(kernel="rbf",gamma = i,cache_size=5000).fit(Xtrain,Ytrain)
score.append(clf.score(Xtest,Ytest))
print(max(score), gamma_range[score.index(max(score))])
plt.plot(gamma_range,score)
plt.show()
对于多项式核函数来说,一切就没有那么容易了,因为三个参数共同作用在一个数学公式上影响它的效果,因此 我们往往使用网格搜索来共同调整三个对多项式核函数有影响的参数。依然使用乳腺癌数据集
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
time0 = time()
gamma_range = np.logspace(-10,1,20)
coef0_range = np.linspace(0,5,10)
param_grid = dict(gamma = gamma_range
,coef0 = coef0_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=420)
grid = GridSearchCV(SVC(kernel = "poly",degree=1,cache_size=5000),
param_grid=param_grid, cv=cv)
grid.fit(X, y)
print("The best parameters are %s with a score of %0.5f" % (grid.best_params_,
grid.best_score_))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
重要参数C
使用网格搜索或者学习曲线来调整C的值
#调线性核函数
score = []
C_range = np.linspace(0.01,30,50)
for i in C_range:
clf = SVC(kernel="linear",C=i,cache_size=5000).fit(Xtrain,Ytrain)
score.append(clf.score(Xtest,Ytest))
print(max(score), C_range[score.index(max(score))])
plt.plot(C_range,score)
plt.show()
#换rbf
score = []
C_range = np.linspace(0.01,30,50)
for i in C_range:
clf = SVC(kernel="rbf",C=i,gamma =
0.012742749857031322,cache_size=5000).fit(Xtrain,Ytrain)
score.append(clf.score(Xtest,Ytest))
print(max(score), C_range[score.index(max(score))])
plt.plot(C_range,score)
plt.show()
#进一步细化
score = []
C_range = np.linspace(5,7,50)
for i in C_range:
clf = SVC(kernel="rbf",C=i,gamma =
0.012742749857031322,cache_size=5000).fit(Xtrain,Ytrain)
score.append(clf.score(Xtest,Ytest))
print(max(score), C_range[score.index(max(score))])
plt.plot(C_range,score)
plt.show()