1. Project Introduction
This project explores board-and-card game data with Python, covering data processing, visualization, and finally predictive modeling. The project is divided into five parts: defining the project goal, describing the data source, preprocessing the data, visual analysis, and modeling.
2. Project Workflow
Project Goal
Identify the reasons for user churn in a board-and-card game, and build an XGBoost model to predict churn.
Data Source
The data comes from the Heywhale (和鲸) community; the fields are shown in the figure below. There are 1,309 records and 13 fields, covering one week of user behavior in the game.
Data Preprocessing
Basic overview of the data
Check for missing values
Check for duplicate records
Label encoding
Since only the gender and churn-label columns need encoding, the mapping is simple enough to write by hand rather than importing sklearn's LabelEncoder.
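A minimal sketch of these preprocessing steps, assuming the data sits in a local CSV and that the raw values are '男'/'女' and '是'/'否' (the file name and raw values are hypothetical; the original post does not show them):

import pandas as pd

# Hypothetical load step; the original does not show the file name.
data = pd.read_csv('game_churn.csv')

# Basic overview: shape, dtypes, summary statistics.
print(data.shape)
print(data.info())
print(data.describe())

# Missing values and duplicate rows.
print(data.isnull().sum())      # missing values per column
print(data.duplicated().sum())  # fully duplicated rows

# Hand-written label encoding instead of sklearn's LabelEncoder.
data['gender'] = data['性别'].map({'男': 1, '女': 0})    # assumed raw values
data['label'] = data['是否流失'].map({'是': 1, '否': 0})  # assumed raw values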
Check the churn ratio
import matplotlib.pyplot as plt
import seaborn as sns

label_gp = data.groupby('是否流失')['用户id'].count()
print('Sample counts by churn label:\n', label_gp)
_, axe = plt.subplots(1, 2, figsize=(12, 6))
data['是否流失'].value_counts().plot(kind='pie', autopct='%1.1f%%', shadow=True, explode=[0, 0.1], ax=axe[0])
sns.countplot(x='是否流失', data=data, ax=axe[1])  # pass x= explicitly; positional use is deprecated
Visual Analysis (pyecharts)
As the gender-vs-churn chart below shows, gender has little effect on churn: the churn ratios for male and female users are nearly identical.
from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.commons.utils import JsCode

# Churn ratio (as a percentage) within the male and female groups.
# Note: groupby sorts by label value, so make sure the resulting order
# matches the ["是", "否"] names used below.
a = (round(data[data['gender'] == 1].groupby(['label'])['用户id'].count()
           / data[data['gender'] == 1].shape[0] * 100, 2)).tolist()
b = (round(data[data['gender'] == 0].groupby(['label'])['用户id'].count()
           / data[data['gender'] == 0].shape[0] * 100, 2)).tolist()

fn = """
    function(params) {
        if(params.name == '是')
            return '\\n\\n\\n' + params.name + ' : ' + params.value + '%';
        return params.name + ' : ' + params.value + '%';
    }
    """

def new_label_opts():
    return opts.LabelOpts(formatter=JsCode(fn), position="center")

c = (
    Pie()
    .add(
        "男",
        [list(z) for z in zip(["是", "否"], a)],
        center=["20%", "30%"],
        radius=[60, 80],
        label_opts=new_label_opts(),
    )
    .add(
        "女",
        [list(z) for z in zip(["是", "否"], b)],
        center=["55%", "30%"],
        radius=[60, 80],
        label_opts=new_label_opts(),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="性别与是否流失关系"),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_top="20%", pos_left="80%", orient="vertical"
        ),
    )
)
c.render_notebook()
From the chart of total logins vs. churn below, most churned users logged in only twice during the week. Possible reasons: the game interface is not engaging enough, the new-player tutorial is overly complicated, the game pushes purchase prompts too early, or the in-game ads are poorly placed.
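The bar chart below reads from two dicts, d1 and d2, which the original builds off-screen. A possible reconstruction, assuming a column named '登录总次数' (inferred from the chart title) and login counts from 2 to 11:

# Hypothetical: users per total-login value, split by churn label.
d1 = {i: int((data[data['label'] == 1]['登录总次数'] == i).sum()) for i in range(2, 12)}  # churned
d2 = {i: int((data[data['label'] == 0]['登录总次数'] == i).sum()) for i in range(2, 12)}  # retained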
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType

c = (
    Bar({"theme": ThemeType.MACARONS})
    .add_xaxis([str(i) for i in range(2, 12)])
    .add_yaxis("是", list(d1.values()))
    .add_yaxis("否", list(d2.values()), gap=0)
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position="right"))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="登录总次数与是否流失的关系"),
    )
)
c.render_notebook()
The charts below relate in-game friend count, level, and points to churn. As seen above, most churned users logged in only twice during the week, which also explains their low friend counts, slow level progression, and low point totals. To grow friend counts, the game could build a community forum or run referral campaigns to encourage viral growth; for levels and points, it could lower the difficulty of leveling up and of earning points.
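The three charts below all reuse the names d, d1, and d2: d holds the sorted distinct values of the feature being plotted, and d1/d2 map each value to churned/retained user counts. The original rebuilds them by hand before each chart; a possible helper (hypothetical, with column names such as '站内好友数' inferred from the chart titles):

def churn_counts(col):
    # Sorted distinct values of `col`, plus churned/retained counts per value.
    values = sorted(data[col].unique())
    d1 = {v: int(((data['label'] == 1) & (data[col] == v)).sum()) for v in values}
    d2 = {v: int(((data['label'] == 0) & (data[col] == v)).sum()) for v in values}
    return values, d1, d2

d, d1, d2 = churn_counts('站内好友数')  # assumed column name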
c = (
    Bar()
    .add_xaxis([str(i) for i in d])
    .add_yaxis("是", list(d1.values()), bar_width=10)
    .add_yaxis("否", list(d2.values()), bar_width=10, gap=0)
    .set_global_opts(title_opts=opts.TitleOpts(title="站内好友数与是否流失的关系"))
)
c.render_notebook()
from pyecharts.charts import Line

# Rebuild d, d1, d2 for the 等级 (level) column before plotting.
c = (
    Line()
    .add_xaxis([str(i) for i in d])
    .add_yaxis("是", list(d1.values()))
    .add_yaxis("否", list(d2.values()))
    .set_global_opts(title_opts=opts.TitleOpts(title="等级与是否流失的关系"))
)
c.render_notebook()
# Rebuild d, d1, d2 for the 积分 (points) column before plotting.
c = (
    Line()
    .add_xaxis([str(i) for i in d])
    .add_yaxis(
        "是",
        list(d1.values()),
        markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max")]),
    )
    .add_yaxis(
        "否",
        list(d2.values()),
        markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max")]),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="积分与是否流失的关系"))
)
c.render_notebook()
Modeling Analysis
The histograms of each variable are shown below. Because the dataset is small and XGBoost is a tree-based model, normalization has little effect on the result, so it is skipped.
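A minimal way to produce these histograms, assuming data is the preprocessed DataFrame from above:

# One histogram per numeric column.
data.hist(figsize=(15, 10), bins=20)
plt.tight_layout()
plt.show()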
Modeling on the raw data
The model fits the training data well, but the learning curve reveals overfitting.
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, log_loss
from xgboost import XGBClassifier

features_columns = [col for col in data.columns if col not in ['用户id', '是否流失', '性别', 'label']]
train_data = data[features_columns]
target = data['label']

def model(train_1, target_1):
    X_train, X_val, y_train, y_val = train_test_split(train_1, target_1, test_size=0.3, random_state=42)
    clf = XGBClassifier()
    clf.fit(X_train, y_train)
    y_train_pred_pos = clf.predict_proba(X_train)[:, 1]  # probability of the positive class
    y_val_pred_pos = clf.predict_proba(X_val)[:, 1]
    auc_train = roc_auc_score(y_train, y_train_pred_pos)
    auc_val = roc_auc_score(y_val, y_val_pred_pos)
    print(f"Train AUC Score {auc_train}")
    print(f"Validation AUC Score {auc_val}")
    fpr, tpr, _ = roc_curve(y_val, y_val_pred_pos)
    return fpr, tpr, clf, auc_val
def plot_learning_curve(model, X, Y, num):  # 'cuve' typo in the original name fixed
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=11)
    train_loss, test_loss = [], []
    # Refit on ever-larger prefixes of the training set, in steps of `num` rows.
    for m in range(num, len(x_train), num):
        model.fit(x_train.iloc[:m, :], y_train[:m])
        y_train_prob_pred = model.predict_proba(x_train.iloc[:m, :])
        train_loss.append(log_loss(y_train[:m], y_train_prob_pred))
        y_test_prob_pred = model.predict_proba(x_test)
        test_loss.append(log_loss(y_test, y_test_prob_pred))
    plt.figure(figsize=(15, 8))
    plt.plot(train_loss, 'r-+', label='Training Loss')
    plt.plot(test_loss, 'b-', label='Test Loss')
    plt.xlabel('Number Of Batches')
    plt.ylabel('Log-Loss')
    plt.legend(loc='best')
    plt.show()
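A possible invocation of the two helpers above; the batch size num=50 is an arbitrary illustrative choice:

# Fit on the raw features, then inspect the ROC curve and the learning curve.
fpr, tpr, clf, auc_val = model(train_data, target)

plt.plot(fpr, tpr, label='XGBoost (AUC = %.3f)' % auc_val)
plt.plot([0, 1], [0, 1], 'k--')  # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')
plt.show()

plot_learning_curve(XGBClassifier(), train_data, target, num=50)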
Balance the classes with oversampling (SMOTE); the learning curve after balancing is shown below.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=402)
X_smote, Y_smote = smote.fit_resample(train_data, target)
sns.countplot(x=Y_smote, edgecolor='black')
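Re-running the learning curve on the balanced data (a sketch; num=50 is again arbitrary):

plot_learning_curve(XGBClassifier(), X_smote, Y_smote, num=50)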
Drawing more data with another round of sampling visibly reduces the degree of overfitting. Further rounds of resampling are possible, but this experiment adds only one extra round.
smote = SMOTE(random_state=446)
X_smote1, Y_smote1 = smote.fit_resample(train_data, target)
X_final = pd.concat([X_smote, X_smote1], axis=0).reset_index(drop=True)
Y_final = pd.concat([Y_smote, Y_smote1], axis=0).reset_index(drop=True)
sns.countplot(x=Y_final, edgecolor='black')
Feature selection for the model: as the plot below shows, using all the features works best.
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif

def feature_selection(train, train_sel, target):
    clf = XGBClassifier()
    scores = cross_val_score(clf, train, target, cv=5)
    scores_sel = cross_val_score(clf, train_sel, target, cv=5)
    print("No Select Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print("Features Select Accuracy: %0.2f (+/- %0.2f)" % (scores_sel.mean(), scores_sel.std() * 2))
    return scores.mean(), scores_sel.mean()

def select(train, goal):
    a_score = []
    b_score = []
    for i in range(2, train.shape[1] + 1):
        sel = SelectKBest(mutual_info_classif, k=i)
        sel = sel.fit(train, goal)
        train_sel = sel.transform(train)
        print('Feature dimension before selection:', train.shape)
        print('Feature dimension after selection:', train_sel.shape)
        mean_all, mean_sel = feature_selection(train, train_sel, goal)
        a_score.append(mean_all)
        b_score.append(mean_sel)
    x = list(range(2, train.shape[1] + 1))
    plt.plot(x, a_score, marker='o', markersize=3)  # accuracy with all features
    plt.plot(x, b_score, marker='o', markersize=3)  # accuracy with the k selected features
    plt.xticks(x)
    plt.show()
    return a_score, b_score
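A possible call on the balanced data from the previous step:

a_score, b_score = select(X_final, Y_final)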
Hyperparameter Tuning
Number of trees (n_estimators): from the output below, the number of trees is set to 275.
import xgboost as xgb
from sklearn import metrics

def tune_parameters(train_x, train_y):  # determine the number of trees with this function
    xgb1 = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0,
                         subsample=0.8, colsample_bytree=0.8, objective='binary:logistic',
                         scale_pos_weight=1, seed=27)
    modelfit(xgb1, train_x, train_y)

def modelfit(alg, X, y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()   # current parameters
        xgtrain = xgb.DMatrix(X, label=y)  # training data
        # num_boost_round: max number of trees; nfold: CV folds;
        # early_stopping_rounds: stop when the CV metric stops improving.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])  # keep the optimal number of trees

    # Fit the algorithm on the data
    alg.fit(X, y, eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(X)
    dtrain_predprob = alg.predict_proba(X)[:, 1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y, dtrain_predprob))

    # booster() was renamed get_booster() in newer xgboost versions.
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print('n_estimators=', cvresult.shape[0])
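A possible invocation on the balanced data (per the text above, this settles on 275 trees):

tune_parameters(X_final, Y_final)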
max_depth (maximum tree depth) and min_child_weight (default 1, the minimum sum of instance weights required in a leaf node): from the grid search below, max_depth = 9 and min_child_weight = 1.
from sklearn.model_selection import GridSearchCV

# Tune maximum tree depth and minimum leaf sample weight.
param_test1 = {
    'max_depth': range(3, 10, 1),
    'min_child_weight': range(1, 6, 1)
}
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=275, max_depth=5, min_child_weight=1,
                            gamma=0, subsample=0.8, colsample_bytree=0.8, objective='binary:logistic',
                            nthread=8, scale_pos_weight=1, seed=27),
    param_grid=param_test1, scoring='roc_auc', n_jobs=-1, cv=5)
gsearch1.fit(X_final, Y_final)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_
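cv_results_ is a plain dict; wrapping it in a DataFrame makes the grid easier to scan (a sketch):

results = pd.DataFrame(gsearch1.cv_results_)
print(results[['param_max_depth', 'param_min_child_weight', 'mean_test_score']]
      .sort_values('mean_test_score', ascending=False)
      .head())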
gamma (the minimum loss reduction required to make a split): from the results below, gamma = 0.0.
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=275, max_depth=9, min_child_weight=1, gamma=0,
                            subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=8,
                            scale_pos_weight=1, seed=27),
    param_grid=param_test3, scoring='roc_auc', n_jobs=-1, cv=5)
gsearch3.fit(X_final, Y_final)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_
colsample_bytree (the fraction of features used per tree) and subsample (the row sampling ratio): the grid search below gives colsample_bytree = 0.6 and subsample = 0.9.
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)]
}
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=275, max_depth=9, min_child_weight=1, gamma=0.0,
                            subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=8,
                            scale_pos_weight=1, seed=27),
    param_grid=param_test4, scoring='roc_auc', n_jobs=-1, cv=5)
gsearch4.fit(X_final, Y_final)
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_
reg_alpha (the L1 regularization weight): the grid search below gives reg_alpha = 0.0001.
param_test6 = {
    'reg_alpha': [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=275, max_depth=9, min_child_weight=1,
                            gamma=0.0, subsample=0.9, colsample_bytree=0.6, objective='binary:logistic',
                            nthread=8, scale_pos_weight=1, seed=27),
    param_grid=param_test6, scoring='roc_auc', n_jobs=-1, cv=5)
gsearch6.fit(X_final, Y_final)
gsearch6.cv_results_, gsearch6.best_params_, gsearch6.best_score_
With the tuned parameters, 5-fold cross-validation gives a final AUC of 0.9998.
from sklearn.model_selection import KFold

train_score = []
test_score = []
kf = KFold(n_splits=5)
for k, (train_index, test_index) in enumerate(kf.split(X_final)):
    train_data1, test_data1 = X_final.iloc[train_index], X_final.iloc[test_index]
    train_target1, test_target1 = Y_final[train_index], Y_final[test_index]
    clf = XGBClassifier(
        learning_rate=0.1,
        n_estimators=275, max_depth=9, min_child_weight=1,
        gamma=0.0, subsample=0.9, colsample_bytree=0.6, objective='binary:logistic', nthread=8,
        scale_pos_weight=1, seed=27, reg_alpha=0.0001
    )
    clf.fit(train_data1, train_target1)
    train_pred = clf.predict_proba(train_data1)[:, 1]
    test_pred = clf.predict_proba(test_data1)[:, 1]
    score_train = roc_auc_score(train_target1, train_pred)
    score_test = roc_auc_score(test_target1, test_pred)
    train_score.append(score_train)
    test_score.append(score_test)
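Averaging the per-fold scores summarizes the run (a sketch; the text above reports 0.9998):

import numpy as np

print('Mean train AUC: %.4f' % np.mean(train_score))
print('Mean test AUC: %.4f' % np.mean(test_score))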
References
XGBoost调参笔记 (XGBoost tuning notes), 浅笑古今, CSDN blog.
XGBoost 核心数据结构和API(速查表) (XGBoost core data structures and API cheat sheet), maerdym, CSDN blog.