【机器学习】乳腺癌——决策树方法的实现

发布于:2024-05-16 ⋅ 阅读:(32) ⋅ 点赞:(0)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

# Load the dataset; the CSV is read relative to the current working directory.
data = pd.read_csv("breast cancer.csv")

# Feature columns used as model inputs.
feature_columns = [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    'radius_se', 'texture_se', 'perimeter_se', 'area_se',
    'smoothness_se', 'compactness_se', 'concavity_se',
    'concave points_se', 'symmetry_se', 'fractal_dimension_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst',
]
x = data[feature_columns]
# Target column.
y = data['diagnosis']
# NOTE: no missing-value imputation is performed in this script.

# Hold out 25% of the rows for evaluation. No random_state is given, so the
# split (and therefore the reported accuracy and the tree) varies per run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# Convert each row to a dict and vectorize it. With its default sort=True,
# DictVectorizer orders the output columns by sorted feature name, NOT by the
# DataFrame's column order.
dic = DictVectorizer(sparse=False)
x_train = dic.fit_transform(x_train.to_dict(orient="records"))
x_test = dic.transform(x_test.to_dict(orient="records"))

dec = DecisionTreeClassifier()
dec.fit(x_train, y_train)

print("预测的准确率", dec.score(x_test, y_test))

# Label the tree nodes with the vectorizer's own column order
# (dic.feature_names_). Passing the original DataFrame column order here would
# attach the wrong feature name to every split, because the columns were
# reordered by DictVectorizer above.
export_graphviz(
    dec,
    out_file="E:/lu_learning_demo/tree.dot",
    feature_names=dic.feature_names_,
)
#lu_learning_demo

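输出的tree.dot为(注意:这份输出中各节点的特征名与实际用于划分的列并不对应——DictVectorizer 会按特征名的字母顺序重排列,而这里的标签是按原始列顺序给出的;例如根节点标注为 perimeter_mean <= 865.7,实际对应的列是 area_worst):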

digraph Tree {
node [shape=box, fontname="helvetica"] ;
edge [fontname="helvetica"] ;
0 [label="perimeter_mean <= 865.7\ngini = 0.475\nsamples = 426\nvalue = [261, 165]"] ;
1 [label="symmetry_mean <= 0.151\ngini = 0.167\nsamples = 283\nvalue = [257, 26]"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="symmetry_mean <= 0.136\ngini = 0.073\nsamples = 263\nvalue = [253, 10]"] ;
1 -> 2 ;
3 [label="smoothness_se <= 0.056\ngini = 0.04\nsamples = 248\nvalue = [243, 5]"] ;
2 -> 3 ;
4 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
3 -> 4 ;
5 [label="texture_mean <= 38.605\ngini = 0.032\nsamples = 247\nvalue = [243, 4]"] ;
3 -> 5 ;
6 [label="perimeter_worst <= 0.003\ngini = 0.017\nsamples = 237\nvalue = [235, 2]"] ;
5 -> 6 ;
7 [label="texture_se <= 0.195\ngini = 0.32\nsamples = 5\nvalue = [4, 1]"] ;
6 -> 7 ;
8 [label="gini = 0.0\nsamples = 4\nvalue = [4, 0]"] ;
7 -> 8 ;
9 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
7 -> 9 ;
10 [label="fractal_dimension_worst <= 33.27\ngini = 0.009\nsamples = 232\nvalue = [231, 1]"] ;
6 -> 10 ;
11 [label="gini = 0.0\nsamples = 216\nvalue = [216, 0]"] ;
10 -> 11 ;
12 [label="fractal_dimension_worst <= 33.56\ngini = 0.117\nsamples = 16\nvalue = [15, 1]"] ;
10 -> 12 ;
13 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
12 -> 13 ;
14 [label="gini = 0.0\nsamples = 15\nvalue = [15, 0]"] ;
12 -> 14 ;
15 [label="compactness_worst <= 0.025\ngini = 0.32\nsamples = 10\nvalue = [8, 2]"] ;
5 -> 15 ;
16 [label="symmetry_worst <= 2.431\ngini = 0.444\nsamples = 3\nvalue = [1, 2]"] ;
15 -> 16 ;
17 [label="gini = 0.0\nsamples = 2\nvalue = [0, 2]"] ;
16 -> 17 ;
18 [label="gini = 0.0\nsamples = 1\nvalue = [1, 0]"] ;
16 -> 18 ;
19 [label="gini = 0.0\nsamples = 7\nvalue = [7, 0]"] ;
15 -> 19 ;
20 [label="fractal_dimension_worst <= 28.78\ngini = 0.444\nsamples = 15\nvalue = [10, 5]"] ;
2 -> 20 ;
21 [label="symmetry_mean <= 0.139\ngini = 0.165\nsamples = 11\nvalue = [10, 1]"] ;
20 -> 21 ;
22 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
21 -> 22 ;
23 [label="gini = 0.0\nsamples = 10\nvalue = [10, 0]"] ;
21 -> 23 ;
24 [label="gini = 0.0\nsamples = 4\nvalue = [0, 4]"] ;
20 -> 24 ;
25 [label="fractal_dimension_worst <= 24.605\ngini = 0.32\nsamples = 20\nvalue = [4, 16]"] ;
1 -> 25 ;
26 [label="smoothness_mean <= 0.032\ngini = 0.32\nsamples = 5\nvalue = [4, 1]"] ;
25 -> 26 ;
27 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
26 -> 27 ;
28 [label="gini = 0.0\nsamples = 4\nvalue = [4, 0]"] ;
26 -> 28 ;
29 [label="gini = 0.0\nsamples = 15\nvalue = [0, 15]"] ;
25 -> 29 ;
30 [label="texture_se <= 0.182\ngini = 0.054\nsamples = 143\nvalue = [4, 139]"] ;
0 -> 30 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
31 [label="concavity_mean <= 0.038\ngini = 0.48\nsamples = 5\nvalue = [3, 2]"] ;
30 -> 31 ;
32 [label="gini = 0.0\nsamples = 2\nvalue = [0, 2]"] ;
31 -> 32 ;
33 [label="gini = 0.0\nsamples = 3\nvalue = [3, 0]"] ;
31 -> 33 ;
34 [label="compactness_worst <= 0.01\ngini = 0.014\nsamples = 138\nvalue = [1, 137]"] ;
30 -> 34 ;
35 [label="gini = 0.0\nsamples = 1\nvalue = [1, 0]"] ;
34 -> 35 ;
36 [label="gini = 0.0\nsamples = 137\nvalue = [0, 137]"] ;
34 -> 36 ;
}

数据集为:Breast Cancer Wisconsin | Kaggle