1. 函数库导入
import os, sys, pickle
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import matplotlib. dates as mdates
import seaborn as sns
import datetime as dt
from datetime import date
from sklearn. linear_model import SGDClassifier, LogisticRegression
from sklearn. metrics import log_loss, roc_auc_score, auc, roc_curve
% matplotlib inline
% config InlineBackend. figure_format = 'retina'
2. 读取文件数据
# Load the three competition CSV files from ./data into DataFrames:
# the offline training set, the revised offline test set, and the online
# training set (names taken from the file paths below).
dfoff = pd. read_csv( './data/ccf_offline_stage1_train.csv' )
dftest = pd. read_csv( './data/ccf_offline_stage1_test_revised.csv' )
dfon = pd. read_csv( './data/ccf_online_stage1_train.csv' )
# Preview the first rows of the offline training frame.
dfoff. head( )
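|   | User_id | Merchant_id | Coupon_id | Discount_rate | Distance | Date_received | Date       |
|---|---------|-------------|-----------|---------------|----------|---------------|------------|
| 0 | 1439408 | 2632        | NaN       | NaN           | 0.0      | NaN           | 20160217.0 |
| 1 | 1439408 | 4663        | 11002.0   | 150:20        | 1.0      | 20160528.0    | NaN        |
| 2 | 1439408 | 2632        | 8591.0    | 20:1          | 0.0      | 20160217.0    | NaN        |
| 3 | 1439408 | 2632        | 1078.0    | 20:1          | 0.0      | 20160319.0    | NaN        |
| 4 | 1439408 | 2632        | 8591.0    | 20:1          | 0.0      | 20160613.0    | NaN        |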
3. 数据处理
def getDiscountType(row):
    """Classify a raw ``Discount_rate`` value by its textual form.

    Args:
        row: A single value from the ``Discount_rate`` column.

    Returns:
        ``np.nan`` when the value is null, ``1`` when its text contains a
        ``':'`` separator (e.g. ``"150:20"``), and ``0`` otherwise.
    """
    if pd.isnull(row):
        return np.nan
    # Test against str(row), as the other discount helpers in this file do:
    # a bare ``':' in row`` raises TypeError for non-string values such as a
    # float rate.
    return 1 if ':' in str(row) else 0
def convertRate(row):
    """Turn a raw discount value into a numeric rate.

    Null values map to ``1.0``. A value whose text contains ``':'`` is split
    on the colon and converted to ``1 - second / first``. Anything else is
    passed through ``float``.
    """
    if pd.isnull(row):
        return 1.0
    if ':' not in str(row):
        return float(row)
    parts = row.split(':')
    threshold = float(parts[0])
    reduction = float(parts[1])
    return 1.0 - reduction / threshold
def getDiscountMan(row):
    """Return the integer before the ``':'`` in a discount value, or 0.

    When the text of ``row`` contains no ``':'`` the result is ``0``.
    """
    if ':' not in str(row):
        return 0
    before_colon = row.split(':')[0]
    return int(before_colon)
def getDiscountJian(row):
    """Return the integer after the first ``':'`` in a discount value, or 0.

    When the text of ``row`` contains no ``':'`` the result is ``0``.
    """
    if ':' not in str(row):
        return 0
    after_colon = row.split(':')[1]
    return int(after_colon)
def processData(df):
    """Add derived discount and distance columns to ``df`` and return it.

    The frame is modified in place. Four columns are computed from
    ``Discount_rate`` (``discount_rate``, ``discount_man``,
    ``discount_jian``, ``discount_type``), and ``distance`` is ``Distance``
    with nulls replaced by -1 and cast to int.
    """
    raw_discount = df['Discount_rate']
    # Order matters: it fixes the order in which new columns are appended.
    derived_columns = (
        ('discount_rate', convertRate),
        ('discount_man', getDiscountMan),
        ('discount_jian', getDiscountJian),
        ('discount_type', getDiscountType),
    )
    for column_name, parser in derived_columns:
        df[column_name] = raw_discount.apply(parser)

    filled_distance = df['Distance'].fillna(-1)
    df['distance'] = filled_distance.astype(int)
    return df
# Add the derived discount/distance columns to both the training and the
# test frame (processData also mutates the frame it is given).
dfoff = processData( dfoff)
dftest = processData( dftest)
# Preview the processed frames. NOTE(review): in a notebook cell only the
# last expression is rendered, so presumably only dftest.head() is shown.
dfoff. head( )
dftest. head( )
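|   | User_id | Merchant_id | Coupon_id | Discount_rate | Distance | Date_received | discount_rate | discount_man | discount_jian | discount_type | distance |
|---|---------|-------------|-----------|---------------|----------|---------------|---------------|--------------|---------------|---------------|----------|
| 0 | 4129537 | 450         | 9983      | 30:5          | 1.0      | 20160712      | 0.833333      | 30           | 5             | 1             | 1        |
| 1 | 6949378 | 1300        | 3429      | 30:5          | NaN      | 20160706      | 0.833333      | 30           | 5             | 1             | -1       |
| 2 | 2166529 | 7113        | 6928      | 200:20        | 5.0      | 20160727      | 0.900000      | 200          | 20            | 1             | 5        |
| 3 | 2166529 | 7113        | 1808      | 100:10        | 5.0      | 20160727      | 0.900000      | 100          | 10            | 1             | 5        |
| 4 | 6172162 | 7605        | 6500      | 30:1          | 2.0      | 20160708      | 0.966667      | 30           | 1             | 1             | 2        |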
# Distinct non-null values of the Date_received column, sorted ascending.
date_received = dfoff[ 'Date_received' ] . unique( )
date_received = sorted ( date_received[ pd. notnull( date_received) ] )
# Distinct non-null values of the Date column, sorted ascending.
date_buy = dfoff[ 'Date' ] . unique( )
date_buy = sorted ( date_buy[ pd. notnull( date_buy) ] )
# Bare expression: displays the sorted list when run as a notebook cell.
date_buy