import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

# Configure fonts so that Chinese labels render in matplotlib figures.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
# Render the minus sign correctly when a CJK font is active.
plt.rcParams['axes.unicode_minus'] = False


# 1. Data loading and preprocessing (adapted to the train_data.csv columns)
def data_preprocessing(data_path='train_data.csv', *, reference_year=2025,
                       road_width=12.5):
    """Load the road-segment data set and derive the modelling features.

    Steps visible in this function:

    1. Read the CSV and normalise column names (surrounding whitespace is
       stripped, inner spaces become underscores).
    2. Replace ``AADT`` values whose |z-score| exceeds 3 with the column's
       95th percentile.
    3. Derive '维护间隔年', '交通荷载强度' and '降雨侵蚀指数' when their source
       columns exist; otherwise print a warning and skip the feature.
    4. Split features ``X`` from the target ``y`` ('Needs_Maintenance') and
       collect the numeric / categorical feature names actually present.

    Args:
        data_path: Path of the CSV file to read.
        reference_year: Year from which 'Last_Maintenance' is subtracted to
            obtain the maintenance interval.
        road_width: Divisor used for the traffic-load feature (the original
            code assumes a 12.5 m wide road).

    Returns:
        A 6-tuple of ``None`` when the file or the target column is missing.
        NOTE(review): the success-path return value is not visible because
        the tail of this function is elided in the source.
    """
    # Keep the try body limited to the single call that can raise.
    try:
        data = pd.read_csv(data_path)
    except FileNotFoundError:
        print(f"错误:未找到'{data_path}'文件,请确保该文件与代码在同一目录下")
        return None, None, None, None, None, None

    # Normalise column names. Stripping first prevents a leading/trailing
    # space from turning into an underscore (e.g. ' AADT' -> '_AADT'), which
    # would make every column lookup below miss.
    data.columns = data.columns.str.strip().str.replace(' ', '_', regex=False)
    print(f"成功读取数据,原始数据形状:{data.shape}")
    print("数据列名:", data.columns.tolist())
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print().
    data.info()

    # Outlier handling (AADT)
    def detect_outliers(df, col, z_threshold=3):
        """Return the index labels of rows whose |z-score| in *col* exceeds
        *z_threshold*.

        A constant column (std == 0) yields NaN z-scores, which never compare
        greater than the threshold, so no row is reported.
        """
        mean = df[col].mean()
        std = df[col].std()
        z_scores = (df[col] - mean) / std
        outliers = df[z_scores.abs() > z_threshold]
        return outliers.index

    if 'AADT' in data.columns:
        outlier_indices = detect_outliers(data, 'AADT')
        print(f"AADT异常值索引:{outlier_indices.tolist()}")
        # Replace outliers with the 95th percentile.
        # NOTE(review): low-side outliers (z < -3) are also set to the 95th
        # percentile; confirm this is intended.
        if len(outlier_indices) > 0:
            aadthigh = data['AADT'].quantile(0.95)
            # The quantile is a float; cast explicitly so the assignment does
            # not depend on implicit upcasting of an integer column.
            data['AADT'] = data['AADT'].astype(float)
            data.loc[outlier_indices, 'AADT'] = aadthigh
    else:
        print("警告:数据中未找到'AADT'列,跳过异常值处理")

    # Feature derivation (adapted to the column names in the data).
    # Years since last maintenance.
    if 'Last_Maintenance' in data.columns:
        data['维护间隔年'] = reference_year - data['Last_Maintenance']
    else:
        print("警告:数据中未找到'Last_Maintenance'列,无法生成'维护间隔年'特征")

    # Traffic load intensity: AADT scaled by 365 and divided by road width.
    if 'AADT' in data.columns:
        data['交通荷载强度'] = data['AADT'] * 365 / road_width
    else:
        print("警告:数据中未找到'AADT'列,无法生成'交通荷载强度'特征")

    # Rainfall erosion index: average rainfall times the maintenance interval.
    if 'Average_Rainfall' in data.columns and '维护间隔年' in data.columns:
        data['降雨侵蚀指数'] = data['Average_Rainfall'] * data['维护间隔年']
    else:
        print("警告:缺少'Average_Rainfall'或'维护间隔年',无法生成'降雨侵蚀指数'特征")

    # Split features and target (target column is 'Needs_Maintenance').
    if 'Needs_Maintenance' not in data.columns:
        print("错误:数据中未找到'Needs_Maintenance'目标列,无法继续")
        return None, None, None, None, None, None

    # Features: everything except the ID column and the target;
    # errors='ignore' tolerates a missing 'Segment_ID'.
    X = data.drop(['Segment_ID', 'Needs_Maintenance'], axis=1, errors='ignore')
    y = data['Needs_Maintenance']

    # Candidate numeric features, filtered to those actually present.
    numeric_features = [
        'PCI', 'AADT', 'Average_Rainfall', 'Rutting', 'IRI',
        '维护间隔年', '交通荷载强度', '降雨侵蚀指数'
    ]
    numeric_features = [f for f in numeric_features if f in X.columns]

    # Candidate categorical features, filtered to those actually present.
    categorical_features = ['Road_Type', 'Asphalt_Type']
    categorical_features = [f for f in categorical_features if f in X.columns]

    print(f"实际使用的数值特征:{numeric_features}")
    print(f"实际使用的分类特征:{categorical_features}")

    # ......................... (the remainder of this function is elided in
    # the source at this point; the following steps and the success-path
    # return statement are not visible)
通过网盘分享的文件:2025钉钉杯(创新杯)资料
链接: https://pan.baidu.com/s/1IPfcDObvPTCtjVx4UbTl3Q
提取码: yutf