Predicting Optimal Fertilizers
题意:
给出土壤的特性,预测出3种最佳的肥料
数据处理:
1.特征分为数值型和类别型;类别型不能随意映射成数字,这里采用独热编码。CatBoost可以直接处理category类型。
2.构造一些相关土壤特性特征
3.由于label是类别型(肥料名称字符串),而XGBoost需要整数编码的标签,因此先用LabelEncoder编码,预测出结果之后再解码回肥料名称。
建立模型:
1.分别用CatBoost、XGBoost做分层K折交叉验证(代码中还可选LightGBM)。
代码:
import os
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
# 忽略警告信息
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
def init():
    """Raise pandas' console display limits to 1000.

    Sets the display width, the maximum column width, and the maximum
    number of rows and columns shown when a frame is printed.
    """
    display_limit = 1000
    for option_name in (
        'display.width',
        'display.max_colwidth',
        'display.max_rows',
        'display.max_columns',
    ):
        pd.set_option(option_name, display_limit)
def load_data(path_train, path_test):
    """Read the training and test CSV files and print their shapes.

    Args:
        path_train: Path of the training CSV.
        path_test: Path of the test CSV.

    Returns:
        A ``(df_train, df_test)`` tuple of DataFrames.
    """
    # The generator is consumed left to right, so the training file is
    # read first, then the test file.
    df_train, df_test = (
        pd.read_csv(csv_path) for csv_path in (path_train, path_test)
    )
    print(f"Train shape: {df_train.shape}, Test shape: {df_test.shape}")
    return df_train, df_test
def feature_engineering(df_all):
    """Derive soil-nutrient features and encode the categorical columns.

    Reads the ``Nitrogen``, ``Phosphorous``, ``Potassium``, ``Soil Type``
    and ``Crop Type`` columns. The input frame is left untouched; all new
    columns are written to a copy.

    Args:
        df_all: Feature frame (train and test rows stacked by the caller).

    Returns:
        A new DataFrame with ``Fertility_Index``, ``N_P_ratio``,
        ``K_deficit`` and ``Crop_Type_Code`` added, and with ``Soil Type``
        and ``Crop Type`` replaced by their one-hot dummy columns.
    """
    # Copy first: the original assigned columns directly on the argument,
    # silently modifying the caller's DataFrame.
    df_all = df_all.copy()

    nitrogen = df_all['Nitrogen']
    phosphorous = df_all['Phosphorous']
    potassium = df_all['Potassium']

    # Composite fertility index: weighted sum of the three nutrients, each
    # divided by a fixed scaling constant.
    df_all['Fertility_Index'] = (0.4 * nitrogen / 100 +
                                 0.3 * phosphorous / 50 +
                                 0.3 * potassium / 150)
    # Nitrogen-to-phosphorous ratio; the epsilon avoids division by zero.
    df_all['N_P_ratio'] = nitrogen / (phosphorous + 1e-6)
    # Potassium surplus/deficit relative to the mean of N and P.
    df_all['K_deficit'] = potassium - (nitrogen + phosphorous) / 2

    # Integer code for the crop type, kept alongside the one-hot columns.
    df_all['Crop_Type_Code'] = LabelEncoder().fit_transform(df_all['Crop Type'])

    # One-hot encode both categorical columns and drop the raw versions.
    category_data = pd.get_dummies(df_all[['Soil Type', 'Crop Type']])
    df_all = pd.concat(
        [df_all.drop(['Soil Type', 'Crop Type'], axis=1), category_data],
        axis=1,
    )
    return df_all
def prepare_data(df_train, df_test):
    """Stack train and test features, engineer them together, and split back.

    Args:
        df_train: Training frame containing ``id`` and ``Fertilizer Name``.
        df_test: Test frame containing ``id``.

    Returns:
        ``(X_train, Y_train, X_test)`` where ``Y_train`` holds the
        label-encoded ``Fertilizer Name`` values of ``df_train``.
    """
    n_train = len(df_train)

    train_features = df_train.drop(['id', 'Fertilizer Name'], axis=1)
    test_features = df_test.drop(['id'], axis=1)

    # Engineer features on the stacked frame so both parts receive the
    # same set of columns.
    stacked = pd.concat([train_features, test_features], axis=0)
    stacked = feature_engineering(stacked.reset_index(drop=True))

    Y_train = LabelEncoder().fit_transform(df_train['Fertilizer Name'])
    X_train, X_test = stacked[:n_train], stacked[n_train:]
    return X_train, Y_train, X_test
_SUPPORTED_MODEL_TYPES = ('xgb', 'cat', 'lgb')


def _build_model(model_type, use_gpu):
    """Return an unfitted classifier for *model_type* with fixed hyper-parameters."""
    if model_type == 'xgb':
        return XGBClassifier(
            max_depth=12,
            colsample_bytree=0.467,
            subsample=0.86,
            n_estimators=8000,
            learning_rate=0.03,
            gamma=0.26,
            max_delta_step=4,
            reg_alpha=2.7,
            reg_lambda=1.4,
            early_stopping_rounds=500,
            objective='multi:softprob',
            random_state=13,
            enable_categorical=True,
            tree_method='hist',
            device='cuda' if use_gpu else 'cpu',
        )
    if model_type == 'cat':
        return CatBoostClassifier(
            iterations=8000,
            learning_rate=0.03,
            depth=10,
            loss_function='MultiClass',
            eval_metric='MultiClass',
            random_seed=42,
            od_type='Iter',
            od_wait=500,
            verbose=100,
            task_type="GPU" if use_gpu else "CPU",
        )
    if model_type == 'lgb':
        # The LightGBM configuration never requested a GPU, so use_gpu is
        # not applied here.
        return LGBMClassifier(
            n_estimators=8000,
            learning_rate=0.03,
            num_leaves=255,
            max_depth=10,
            subsample=0.8,
            colsample_bytree=0.7,
            class_weight='balanced',
            metric='multi_logloss',
            early_stopping_rounds=500,
            random_state=42,
            verbosity=-1,
        )
    raise ValueError(
        f"Unsupported model_type {model_type!r}; "
        f"expected one of {_SUPPORTED_MODEL_TYPES}"
    )


def train_model(X_train, Y_train, model_type='xgb', n_splits=5, *, use_gpu=True):
    """Train one classifier per stratified fold and report validation accuracy.

    Args:
        X_train: Feature DataFrame (rows are selected with ``.iloc``).
        Y_train: Integer-encoded labels, one per row of ``X_train``.
        model_type: ``'xgb'``, ``'cat'`` or ``'lgb'``.
        n_splits: Number of stratified folds.
        use_gpu: When true (the default, matching the previous hard-coded
            behaviour) XGBoost and CatBoost are configured for GPU
            training; pass ``False`` to train on CPU.

    Returns:
        ``(models, scores)``: the fitted model and the validation accuracy
        of every fold, in fold order.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    # Fail fast: previously an unknown model_type left `model` unbound and
    # crashed at the first fit() call.
    if model_type not in _SUPPORTED_MODEL_TYPES:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; "
            f"expected one of {_SUPPORTED_MODEL_TYPES}"
        )

    # Positional indexing below needs an array; a pandas Series would be
    # indexed by label instead.
    Y_train = np.asarray(Y_train)

    # Periodic evaluation logging: XGBoost and CatBoost take `verbose` in
    # fit(); newer LightGBM releases no longer accept it there, so LightGBM
    # gets the equivalent callback.
    if model_type == 'lgb':
        from lightgbm import log_evaluation
        log_kwargs = {'callbacks': [log_evaluation(period=100)]}
    else:
        log_kwargs = {'verbose': 100}

    models = []
    scores = []
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, Y_train)):
        print(f"\nFold {fold + 1}/{n_splits}")
        x_tr, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = Y_train[train_idx], Y_train[val_idx]

        model = _build_model(model_type, use_gpu)
        model.fit(x_tr, y_tr, eval_set=[(x_val, y_val)], **log_kwargs)

        # Flatten: CatBoost may return class predictions as a column
        # vector. The unused out-of-fold buffer from the original was
        # dropped; assigning a column vector into it raised a shape error.
        val_pred = np.asarray(model.predict(x_val)).ravel()
        score = accuracy_score(y_val, val_pred)
        print(f"Validation Accuracy: {score:.4f}")

        models.append(model)
        scores.append(score)

    print(f"\nAverage CV Accuracy: {np.mean(scores):.4f} ± {np.std(scores):.4f}")
    return models, scores
def predict_test(models, X_test):
    """Average the class-probability predictions of all models on the test set.

    Args:
        models: Non-empty sequence of fitted models exposing
            ``predict_proba``.
        X_test: Test features passed unchanged to each model.

    Returns:
        Array of shape ``(n_samples, n_classes)`` holding the mean
        predicted probability per class.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("predict_test requires at least one fitted model")

    # The output shape comes from the models themselves. The original sized
    # the buffer from a global `Y_train`, which only exists when the file
    # is executed as a script.
    proba_sum = None
    for model in models:
        fold_proba = np.asarray(model.predict_proba(X_test), dtype=float)
        # `+` allocates a new array, so no model's output is modified.
        proba_sum = fold_proba if proba_sum is None else proba_sum + fold_proba
    return proba_sum / len(models)
def generate_submission(df_test, pred_proba, le, output_path='submission.csv'):
    """Write the three most probable labels per test row to a CSV file.

    Args:
        df_test: Test frame; its ``id`` column is copied to the output.
        pred_proba: Per-row class probabilities, one column per class.
        le: Fitted encoder whose ``classes_`` array maps column index to
            label.
        output_path: Destination CSV path.
    """
    # argsort is ascending, so the last three columns are the three most
    # probable classes; reverse them so the most probable comes first.
    ascending_order = np.argsort(pred_proba, axis=1)
    best_three = ascending_order[:, -3:][:, ::-1]

    # Labels within a row are separated by single spaces.
    joined_labels = [' '.join(le.classes_[class_ids]) for class_ids in best_three]

    submission = pd.DataFrame({
        'id': df_test['id'],
        'Fertilizer Name': joined_labels,
    })
    submission.to_csv(output_path, index=False)
    print(f"Submission saved to {output_path}")
if __name__ == '__main__':
    init()

    # Step 1: load the raw CSV files.
    df_train, df_test = load_data(path_train='train.csv', path_test='test.csv')

    # Step 2: build the feature matrices and the encoded labels.
    X_train, Y_train, X_test = prepare_data(df_train, df_test)

    # Step 3: train with cross-validation (model_type may be xgb/cat/lgb).
    models, scores = train_model(X_train, Y_train, model_type='xgb', n_splits=5)

    # Step 4: average the fold models' probabilities on the test set.
    pred_proba = predict_test(models, X_test)

    # Step 5: write the submission; the encoder is re-fitted on the training
    # labels to map class indices back to fertilizer names.
    le = LabelEncoder().fit(df_train['Fertilizer Name'])
    generate_submission(df_test, pred_proba, le)
#AI生成版本0.34190