python数据挖掘案例

发布于:2025-07-17 ⋅ 阅读:(17) ⋅ 点赞:(0)
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import chi2_contingency, normaltest
from scipy.signal import savgol_filter
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
import warnings
warnings.filterwarnings('ignore')

# ================================
# 1. 数据生成和加载(模拟真实客户数据)
# ================================

def create_sample_data():
    """Build a reproducible synthetic customer-churn dataset.

    Seeds NumPy's global random generator with 42, samples 10,000 customers
    and derives a binary ``churn`` label whose probability rises with short
    tenure, high monthly charges, many support calls, many late payments and
    a month-to-month contract.

    Returns:
        pandas.DataFrame: 10,000 rows and 17 columns (``customer_id``,
        15 raw attributes and the ``churn`` target).
    """
    np.random.seed(42)
    n_customers = 10000

    data = {
        # Basic customer information
        'customer_id': range(1, n_customers + 1),
        # A normal draw with sd=15 reaches below zero and above 100; clamp it
        # to a plausible adult range so later age binning and age ratios are
        # well-defined. Clamping happens after sampling, so the random stream
        # consumed by the following columns is unchanged.
        'age': np.clip(np.random.normal(45, 15, n_customers), 18, 100).astype(int),
        'gender': np.random.choice(['M', 'F'], n_customers),
        'income': np.random.lognormal(10, 0.5, n_customers),
        'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'],
                                      n_customers, p=[0.4, 0.35, 0.2, 0.05]),
        'tenure_months': np.random.exponential(24, n_customers),

        # Spending behaviour
        # Charges cannot be negative; the clamp keeps sqrt() downstream valid.
        'monthly_charges': np.clip(np.random.normal(70, 20, n_customers), 0, None),
        'total_charges': np.random.exponential(1000, n_customers),
        'payment_method': np.random.choice(
            ['Credit Card', 'Bank Transfer', 'Electronic Check', 'Mailed Check'],
            n_customers, p=[0.3, 0.25, 0.25, 0.2]),

        # Service usage
        'phone_service': np.random.choice([0, 1], n_customers, p=[0.1, 0.9]),
        'internet_service': np.random.choice(['No', 'DSL', 'Fiber'], n_customers, p=[0.2, 0.4, 0.4]),
        'online_security': np.random.choice([0, 1], n_customers, p=[0.5, 0.5]),
        'tech_support': np.random.choice([0, 1], n_customers, p=[0.6, 0.4]),

        # Satisfaction / behaviour indicators
        'support_calls': np.random.poisson(2, n_customers),
        'late_payments': np.random.poisson(1, n_customers),
        'contract_type': np.random.choice(['Month-to-month', 'One year', 'Two year'],
                                          n_customers, p=[0.5, 0.3, 0.2]),
    }

    df = pd.DataFrame(data)

    # Churn probability: a base rate plus additive risk factors.
    churn_prob = (
        0.1 +                                             # base churn rate
        0.3 * (df['tenure_months'] < 12) +                # new customers
        0.2 * (df['monthly_charges'] > 80) +              # expensive plans
        0.1 * (df['support_calls'] > 3) +                 # frequent support calls
        0.15 * (df['late_payments'] > 2) +                # payment problems
        0.2 * (df['contract_type'] == 'Month-to-month')   # no long-term contract
    )

    # The factors can add up to 1.05, so clip into [0, 1] before sampling.
    df['churn'] = np.random.binomial(1, np.clip(churn_prob, 0, 1), n_customers)

    return df

# ================================
# 2. 基础特征工程(pandas操作)
# ================================

def basic_feature_engineering(df):
    """Add basic derived columns to the customer frame.

    The new columns are written onto ``df`` itself, and the same object is
    returned.

    Args:
        df: DataFrame containing ``age``, ``income``, ``total_charges``,
            ``tenure_months``, ``monthly_charges``, ``phone_service``,
            ``online_security``, ``tech_support``, ``support_calls`` and
            ``late_payments``.

    Returns:
        The input DataFrame with ``age_group``, ``income_quartile``,
        ``avg_monthly_spend``, ``charge_per_month_ratio``,
        ``customer_value_score``, ``service_usage_score`` and
        ``problematic_customer`` added.
    """
    print("=== 基础特征工程 ===")

    # Age buckets. The outer edges are open-ended so that ages of 0, below 0
    # or above 100 still land in the first/last bucket instead of becoming NaN.
    df['age_group'] = pd.cut(
        df['age'],
        bins=[-np.inf, 25, 35, 45, 55, np.inf],
        labels=['Young', 'Adult', 'Middle', 'Senior', 'Elder'],
    )

    # Income quartiles
    df['income_quartile'] = pd.qcut(
        df['income'], 4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']
    )

    # Spending indicators; +1 keeps the tenure denominator away from zero.
    df['avg_monthly_spend'] = df['total_charges'] / (df['tenure_months'] + 1)
    # A customer with zero total charges has zero average spend; use NaN as
    # the denominator there so the ratio is missing rather than +/-inf.
    df['charge_per_month_ratio'] = (
        df['monthly_charges'] / df['avg_monthly_spend'].replace(0, np.nan)
    )

    # Customer value score: weighted blend of spend and tenure terms.
    df['customer_value_score'] = (
        df['total_charges'] * 0.4 +
        df['tenure_months'] * 10 * 0.3 +
        (df['monthly_charges'] * df['tenure_months']) * 0.3
    )

    # Number of the three 0/1 service flags that are set.
    service_cols = ['phone_service', 'online_security', 'tech_support']
    df['service_usage_score'] = df[service_cols].sum(axis=1)

    # Flag customers above the 80th percentile in support calls or late payments.
    df['problematic_customer'] = (
        (df['support_calls'] > df['support_calls'].quantile(0.8)) |
        (df['late_payments'] > df['late_payments'].quantile(0.8))
    ).astype(int)

    return df

# ================================
# 3. 高级特征构造(numpy数值计算)
# ================================

def advanced_numerical_features(df):
    """Add numeric transforms, scaled copies, interactions, ratios and outlier flags.

    The new columns are written onto ``df`` itself, and the same object is
    returned.

    Args:
        df: DataFrame containing ``age``, ``income``, ``tenure_months``,
            ``monthly_charges`` and ``total_charges``.

    Returns:
        The input DataFrame with the derived numeric columns added.
    """
    print("=== 高级数值特征构造 ===")

    # Log transforms for the long-tailed columns.
    df['log_income'] = np.log1p(df['income'])
    df['log_total_charges'] = np.log1p(df['total_charges'])

    # Square-root transforms. Negative charges are clamped to zero first so
    # the result is 0 rather than NaN.
    df['sqrt_tenure'] = np.sqrt(df['tenure_months'])
    df['sqrt_monthly_charges'] = np.sqrt(df['monthly_charges'].clip(lower=0))

    # Box-Cox-like signed power transform with exponent 0.3.
    df['boxcox_income'] = np.sign(df['income']) * np.power(np.abs(df['income']), 0.3)

    # Standardised copies of the core numeric columns.
    numerical_cols = ['age', 'income', 'tenure_months', 'monthly_charges', 'total_charges']
    scaled_features = StandardScaler().fit_transform(df[numerical_cols])
    for i, col in enumerate(numerical_cols):
        df[f'{col}_scaled'] = scaled_features[:, i]

    # Interaction terms; the age*income product is divided by 1e6 to keep it small.
    df['age_income_interaction'] = df['age'] * df['income'] / 1000000
    df['tenure_charges_interaction'] = df['tenure_months'] * df['monthly_charges']

    # Ratio features. income/12 is the monthly income; +1 avoids a zero denominator.
    df['charges_to_income_ratio'] = df['monthly_charges'] / (df['income'] / 12 + 1)
    # Share of the customer's life (in months) spent as a customer. Ages <= 0
    # are treated as missing so the ratio is NaN instead of +/-inf or negative.
    valid_age = df['age'].where(df['age'] > 0)
    df['tenure_to_age_ratio'] = df['tenure_months'] / (valid_age * 12)

    # Outlier flags: more than 3 standard deviations from the column mean.
    df['income_outlier'] = (np.abs(stats.zscore(df['income'])) > 3).astype(int)
    df['charges_outlier'] = (np.abs(stats.zscore(df['monthly_charges'])) > 3).astype(int)

    return df

# ================================
# 4. 统计特征(scipy统计计算)
# ================================

def statistical_features(df):
    """Add group-level statistics, deviations, percentiles and rolling statistics.

    The input frame is not modified; merging produces a new DataFrame, which
    is returned sorted by ``customer_id``.

    NOTE(review): ``income_quartile_churn`` is the mean of the target per
    income quartile computed over the whole frame, so it carries label
    information into the features. Confirm this is acceptable before using
    it in a model evaluated on the same rows.

    Args:
        df: DataFrame containing ``customer_id``, ``age_group``,
            ``income_quartile``, ``income``, ``monthly_charges``,
            ``total_charges``, ``tenure_months``, ``support_calls`` and
            ``churn``.

    Returns:
        A new DataFrame with the statistical feature columns appended.
    """
    print("=== 统计特征构造 ===")

    # Per-age-group statistics. observed=True skips categories with no rows;
    # those rows could never match in the left merge below, so the merged
    # result is the same while the behaviour no longer depends on the
    # pandas-version default for categorical groupers.
    age_group_stats = df.groupby('age_group', observed=True).agg({
        'monthly_charges': ['mean', 'std', 'median'],
        'total_charges': ['mean', 'std'],
        'support_calls': 'mean'
    }).round(2)

    # Flatten the two-level column index into "<column>_<stat>" names.
    age_group_stats.columns = ['_'.join(col) for col in age_group_stats.columns]
    age_group_stats = age_group_stats.add_prefix('age_group_')

    # Broadcast the group statistics back to each customer row.
    df = df.merge(age_group_stats, left_on='age_group', right_index=True, how='left')

    # Per-income-quartile statistics (the 'churn' mean is the quartile churn rate).
    income_quartile_stats = df.groupby('income_quartile', observed=True).agg({
        'monthly_charges': 'mean',
        'tenure_months': 'mean',
        'churn': 'mean'
    }).round(3)

    income_quartile_stats.columns = [
        f'income_quartile_{col}' for col in income_quartile_stats.columns
    ]
    df = df.merge(income_quartile_stats, left_on='income_quartile', right_index=True, how='left')

    # The rolling windows below run in customer_id order; sort once up front
    # instead of on every loop iteration.
    df = df.sort_values('customer_id')

    numerical_features = ['monthly_charges', 'total_charges', 'tenure_months', 'support_calls']

    for col in numerical_features:
        # Deviation from the age-group mean, only where that mean was computed
        # above (there is none for tenure_months).
        group_mean_col = f'age_group_{col}_mean'
        if group_mean_col in df.columns:
            df[f'{col}_deviation_from_age_group'] = df[col] - df[group_mean_col]

        # Percentile rank within the whole frame.
        df[f'{col}_percentile'] = df[col].rank(pct=True)

        # Rolling statistics over the previous 100 rows (pseudo time series).
        df[f'{col}_rolling_mean'] = df[col].rolling(window=100, min_periods=1).mean()
        df[f'{col}_rolling_std'] = df[col].rolling(window=100, min_periods=1).std()

    # Distribution-shape descriptors. Each is one scalar per source column,
    # so every row receives the same value.
    for col in ['income', 'monthly_charges', 'total_charges']:
        df[f'{col}_skewness'] = stats.skew(df[col])
        df[f'{col}_kurtosis'] = stats.kurtosis(df[col])

    return df

# ================================
# 5. 高级特征工程技术
# ================================

def _calculate_woe(frame, col, target):
    """Return a ``{category: WOE}`` mapping for ``col`` against a 0/1 ``target``.

    WOE is ``ln(bad_dist / good_dist)``, where ``bad_dist`` is the category's
    share of all target==1 rows and ``good_dist`` its share of all target==0
    rows. Zero shares are replaced by 0.0001 so the logarithm stays finite.
    """
    counts = pd.crosstab(frame[col], frame[target])

    # A class that never occurs has no crosstab column; treat it as all-zero
    # counts instead of raising KeyError.
    zeros = pd.Series(0, index=counts.index)
    bad_counts = counts[1] if 1 in counts.columns else zeros
    good_counts = counts[0] if 0 in counts.columns else zeros

    total_bad = frame[target].sum()
    total_good = len(frame) - total_bad

    # max(..., 1) only matters when a class is entirely absent: the counts are
    # then all zero and the share becomes 0 (-> 0.0001) rather than 0/0 = NaN.
    bad_dist = (bad_counts / max(total_bad, 1)).replace(0, 0.0001)
    good_dist = (good_counts / max(total_good, 1)).replace(0, 0.0001)

    return np.log(bad_dist / good_dist).to_dict()


def advanced_feature_engineering(df):
    """Add encoded, cluster-based and crossed features.

    The new columns are written onto ``df`` itself, and the same object is
    returned.

    NOTE(review): the target and WOE encodings are fitted on the ``churn``
    column of the same rows they are applied to, so they carry label
    information. Confirm this is acceptable for the intended evaluation.

    Args:
        df: DataFrame containing ``gender``, ``education``,
            ``payment_method``, ``internet_service``, ``contract_type``,
            ``age``, ``income``, ``tenure_months``, ``monthly_charges``,
            ``total_charges`` and ``churn``.

    Returns:
        The input DataFrame with the new feature columns added.
    """
    print("=== 高级特征工程 ===")

    categorical_cols = ['gender', 'education', 'payment_method', 'internet_service', 'contract_type']

    # 1. Target encoding: each category is replaced by its mean churn rate.
    for col in categorical_cols:
        target_mean = df.groupby(col)['churn'].mean()
        df[f'{col}_target_encoded'] = df[col].map(target_mean)

    # 2. Frequency encoding: each category is replaced by its relative frequency.
    for col in categorical_cols:
        freq_map = df[col].value_counts(normalize=True)
        df[f'{col}_frequency'] = df[col].map(freq_map)

    # 3. Weight-of-Evidence encoding.
    for col in ['education', 'contract_type']:
        df[f'{col}_woe'] = df[col].map(_calculate_woe(df, col, 'churn'))

    # 4. Cluster features.
    from sklearn.cluster import KMeans

    cluster_features = ['age', 'income', 'tenure_months', 'monthly_charges', 'total_charges']

    # K-means is distance-based, so standardise the inputs first.
    scaled_data = StandardScaler().fit_transform(df[cluster_features])

    # n_init is set explicitly because its default differs between
    # scikit-learn releases; pinning it keeps the clustering consistent
    # across versions.
    kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
    labels = kmeans.fit_predict(scaled_data)
    df['customer_cluster'] = labels

    # Euclidean distance from each customer to its assigned cluster centre.
    assigned_centers = kmeans.cluster_centers_[labels]
    df['cluster_distance'] = np.sqrt(((scaled_data - assigned_centers) ** 2).sum(axis=1))

    # 5. Feature crosses.
    # Numeric crosses
    df['age_tenure_cross'] = df['age'] * df['tenure_months']
    df['income_charges_cross'] = df['income'] * df['monthly_charges']

    # Categorical crosses
    df['gender_education'] = df['gender'] + '_' + df['education']
    df['contract_payment'] = df['contract_type'] + '_' + df['payment_method']

    return df

# ================================
# 6. 特征选择和重要性分析
# ================================

def feature_selection_analysis(df):
    """Rank features by mutual information and absolute correlation with ``churn``.

    Works on a copy of ``df``: non-numeric feature columns are label-encoded,
    infinite values are treated as missing, and missing values are filled
    with the column median (then 0). Prints the top features by each
    criterion.

    Args:
        df: DataFrame containing a ``churn`` column plus feature columns.
            ``customer_id``, ``churn``, ``age_group`` and ``income_quartile``
            are not used as features.

    Returns:
        tuple: ``(feature_importance, df_encoded)`` where
        ``feature_importance`` has columns ``feature``, ``mutual_info``,
        ``correlation`` and ``combined_score`` (sorted descending by
        ``combined_score``), and ``df_encoded`` is the encoded/cleaned copy.
    """
    print("=== 特征选择和重要性分析 ===")

    excluded = {'customer_id', 'churn', 'age_group', 'income_quartile'}
    feature_cols = [col for col in df.columns if col not in excluded]

    df_encoded = df.copy()

    # Label-encode every feature column that is neither numeric nor boolean.
    # Selecting by exclusion also catches string-typed columns that are not
    # stored with the 'object' dtype.
    categorical_cols = df_encoded[feature_cols].select_dtypes(
        exclude=['number', 'bool']
    ).columns
    for col in categorical_cols:
        df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col].astype(str))

    # Clean only the columns that are actually modelled. Ratio features can
    # contain +/-inf, which mutual_info_classif rejects, so treat infinities
    # as missing before imputing. Leaving the excluded columns alone avoids
    # filling a categorical column with a value outside its categories.
    model_cols = feature_cols + ['churn']
    model_data = df_encoded[model_cols].replace([np.inf, -np.inf], np.nan)
    model_data = model_data.fillna(model_data.median(numeric_only=True))
    model_data = model_data.fillna(0)  # columns whose median is itself NaN
    df_encoded[model_cols] = model_data

    X = df_encoded[feature_cols]
    y = df_encoded['churn']

    # Mutual information between each feature and the target.
    mi_scores = mutual_info_classif(X, y, random_state=42)
    mi_df = pd.DataFrame({
        'feature': feature_cols,
        'mutual_info_score': mi_scores
    }).sort_values('mutual_info_score', ascending=False)

    print("Top 20 features by mutual information:")
    print(mi_df.head(20))

    # Absolute correlation with the target. The target's own entry (always
    # 1.0) is dropped so it does not occupy the first slot of the ranking.
    corr_matrix = df_encoded[model_cols].corr()
    target_corr = corr_matrix['churn'].drop('churn').abs().sort_values(ascending=False)

    print("\nTop 15 features by correlation with target:")
    print(target_corr.head(15))

    # Combined ranking: 60% mutual information, 40% absolute correlation.
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'mutual_info': mi_scores,
        'correlation': corr_matrix.loc[feature_cols, 'churn'].abs().to_numpy()
    })

    feature_importance['combined_score'] = (
        feature_importance['mutual_info'] * 0.6 +
        feature_importance['correlation'] * 0.4
    )

    feature_importance = feature_importance.sort_values('combined_score', ascending=False)

    return feature_importance, df_encoded

# ================================
# 7. 主函数执行
# ================================

def main():
    """Run the full feature-engineering walkthrough and print a summary.

    Generates the sample data, applies each feature-engineering stage in
    order, ranks the resulting features and prints progress, a summary and
    basic data-quality counts.

    Returns:
        tuple: ``(df, feature_importance)`` - the fully engineered DataFrame
        and the feature-importance table.
    """
    print("开始数据挖掘特征工程实战案例\n")

    # 1. Create the sample data
    print("1. 创建样本数据...")
    df = create_sample_data()
    # Every generated column except customer_id and churn is a raw feature.
    # Computing the count here keeps the summary correct if the generator changes.
    n_original_features = df.shape[1] - 2
    print(f"数据集大小: {df.shape}")
    print(f"流失率: {df['churn'].mean():.3f}")
    print(f"数据预览:\n{df.head()}\n")

    # 2. Basic feature engineering
    df = basic_feature_engineering(df)
    print(f"基础特征工程后列数: {df.shape[1]}\n")

    # 3. Advanced numeric features
    df = advanced_numerical_features(df)
    print(f"数值特征工程后列数: {df.shape[1]}\n")

    # 4. Statistical features
    df = statistical_features(df)
    print(f"统计特征工程后列数: {df.shape[1]}\n")

    # 5. Advanced feature engineering
    df = advanced_feature_engineering(df)
    print(f"高级特征工程后列数: {df.shape[1]}\n")

    # 6. Feature selection
    feature_importance, df_encoded = feature_selection_analysis(df)
    print(f"\n最终特征数量: {len(feature_importance)}")

    # 7. Summary
    n_final_features = df.shape[1] - 2  # minus customer_id and churn
    print("\n=== 特征工程总结 ===")
    print(f"原始特征数: {n_original_features}")
    print(f"最终特征数: {n_final_features}")
    print(f"特征工程增加特征数: {n_final_features - n_original_features}")

    print("\n特征工程技术应用:")
    print("✓ pandas: 数据处理、分组统计、分箱、编码")
    print("✓ numpy: 数学变换、多项式特征、异常值检测")
    print("✓ scipy: 统计检验、分布分析、正态性检验")
    print("✓ sklearn: 聚类、标准化、特征选择")

    print("\n数据质量检查:")
    print(f"缺失值: {df.isnull().sum().sum()}")
    print(f"重复行: {df.duplicated().sum()}")

    return df, feature_importance

# Script entry point: run the walkthrough, then report a few key features.
if __name__ == "__main__":
    df_final, importance_df = main()

    print("\n=== 关键特征统计 ===")
    key_features = [
        'customer_value_score',
        'charges_to_income_ratio',
        'problematic_customer',
        'cluster_distance',
    ]

    for feature in key_features:
        # Skip any feature that did not make it into the final frame.
        if feature not in df_final.columns:
            continue
        # One print call with a newline separator emits the same four lines
        # (blank line, header, mean, std, correlation) as separate prints.
        print(
            f"\n{feature}:",
            f"  均值: {df_final[feature].mean():.3f}",
            f"  标准差: {df_final[feature].std():.3f}",
            f"  与流失的相关性: {df_final[feature].corr(df_final['churn']):.3f}",
            sep="\n",
        )

网站公告

今日签到

点亮在社区的每一天
去签到