05_机器学习赛事_优惠券使用预测

发布于:2024-05-09 ⋅ 阅读:(32) ⋅ 点赞:(0)

在这里插入图片描述

1. 函数库导入

# import libraries necessary for this project
import os, sys, pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import seaborn as sns
import datetime as dt

from datetime import date

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve

# display for this notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

2. 读取文件数据

# Load the competition CSV files from the local ./data directory (paths are
# relative to the notebook's working directory).
# Judging by the file names: dfoff = offline training set, dftest = offline
# test set (revised), dfon = online training set.
dfoff = pd.read_csv('./data/ccf_offline_stage1_train.csv')
dftest = pd.read_csv('./data/ccf_offline_stage1_test_revised.csv')
dfon = pd.read_csv('./data/ccf_online_stage1_train.csv')

# Preview the first rows of the offline frame to inspect its raw columns.
dfoff.head()
User_id Merchant_id Coupon_id Discount_rate Distance Date_received Date
0 1439408 2632 NaN NaN 0.0 NaN 20160217.0
1 1439408 4663 11002.0 150:20 1.0 20160528.0 NaN
2 1439408 2632 8591.0 20:1 0.0 20160217.0 NaN
3 1439408 2632 1078.0 20:1 0.0 20160319.0 NaN
4 1439408 2632 8591.0 20:1 0.0 20160613.0 NaN

3. 数据处理

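# 1. Convert "spend xx, save yy" coupons (`xx:yy`) into a discount rate `1 - yy/xx`, and build the coupon-related features `discount_rate, discount_man, discount_jian, discount_type`
# 2. Convert the distance from float (missing values filled with -1) to `int`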
# convert Discount_rate and Distance
def getDiscountType(row):
    """Classify one raw ``Discount_rate`` cell by the kind of coupon it encodes.

    Args:
        row: A single cell value; may be missing (NaN/None), a string such
            as ``'150:20'`` or ``'0.95'``, or a plain number.

    Returns:
        ``np.nan`` when the cell is missing, ``1`` when its text contains a
        ``':'`` (the ``xx:yy`` form), and ``0`` otherwise.
    """
    if pd.isnull(row):
        return np.nan
    # Go through str() like the sibling helpers do, so that a numeric cell
    # (e.g. a float rate) is classified instead of raising TypeError on `in`.
    return 1 if ':' in str(row) else 0

    
def convertRate(row):
    """Turn one raw ``Discount_rate`` cell into a numeric rate.

    A missing cell yields ``1.0``. A cell whose text contains ``':'`` is
    split on ``':'`` and yields ``1 - second / first``. Anything else is
    passed through ``float()``.
    """
    if pd.isnull(row):
        return 1.0
    if ':' not in str(row):
        return float(row)
    parts = row.split(':')
    # Parse the subtracted amount before the threshold, then take the ratio.
    reduction = float(parts[1])
    threshold = float(parts[0])
    return 1.0 - reduction / threshold

def getDiscountMan(row):
    """Return the integer before the ``':'`` of an ``xx:yy`` cell, else ``0``.

    Any cell whose string form has no ``':'`` (including missing values)
    yields ``0``.
    """
    if ':' not in str(row):
        return 0
    return int(row.split(':')[0])

def getDiscountJian(row):
    """Return the integer after the ``':'`` of an ``xx:yy`` cell, else ``0``.

    Any cell whose string form has no ``':'`` (including missing values)
    yields ``0``.
    """
    if ':' not in str(row):
        return 0
    return int(row.split(':')[1])
    
def processData(df):
    """Add derived coupon and distance columns to ``df`` and return it.

    The frame is modified in place. New columns, in order of creation:
    ``discount_rate``, ``discount_man``, ``discount_jian``, ``discount_type``
    (each computed cell by cell from ``Discount_rate``) and ``distance``
    (``Distance`` with missing values replaced by -1, cast to int).
    """
    # Derive the discount features from the raw Discount_rate column.
    discount_features = (
        ('discount_rate', convertRate),
        ('discount_man', getDiscountMan),
        ('discount_jian', getDiscountJian),
        ('discount_type', getDiscountType),
    )
    for column, converter in discount_features:
        df[column] = df['Discount_rate'].apply(converter)

    # Convert the distance: fill gaps with -1 so the column can be cast to int.
    filled_distance = df['Distance'].fillna(-1)
    df['distance'] = filled_distance.astype(int)
    return df

# Add the derived discount/distance columns to both the offline training
# frame and the test frame.
dfoff = processData(dfoff)
dftest = processData(dftest)

# Preview the processed frames.
# NOTE(review): in a notebook cell only the last expression is normally
# rendered, so only the dftest preview would be shown — confirm.
dfoff.head()
dftest.head()
User_id Merchant_id Coupon_id Discount_rate Distance Date_received discount_rate discount_man discount_jian discount_type distance
0 4129537 450 9983 30:5 1.0 20160712 0.833333 30 5 1 1
1 6949378 1300 3429 30:5 NaN 20160706 0.833333 30 5 1 -1
2 2166529 7113 6928 200:20 5.0 20160727 0.900000 200 20 1 5
3 2166529 7113 1808 100:10 5.0 20160727 0.900000 100 10 1 5
4 6172162 7605 6500 30:1 2.0 20160708 0.966667 30 1 1 2
# Sorted distinct non-missing values of the Date_received column.
date_received = sorted(dfoff['Date_received'].dropna().unique())

# Sorted distinct non-missing values of the Date column.
date_buy = sorted(dfoff['Date'].dropna().unique())
date_buy 

网站公告

今日签到

点亮在社区的每一天
去签到