DAY 37 早停策略和模型权重的保存

发布于:2025-08-18 ⋅ 阅读:(17) ⋅ 点赞:(0)

@浙大疏锦行

知识点回顾:
1.过拟合的判断:测试集和训练集同步打印指标
2.模型的保存和加载
a.仅保存权重
b.保存权重和模型
c.保存全部信息checkpoint,还包含训练状态
3.早停策略
作业

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from tqdm import tqdm
import time
import matplotlib.pyplot as plt

# Select GPU when available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

# Load the credit-default dataset; 'Credit Default' is the binary target column.
# NOTE(review): schema is assumed from the column names used below — confirm against data.csv.
data = pd.read_csv('data.csv')  # read data
y = data['Credit Default']
X = data.drop(['Credit Default'], axis=1)

# Split into train / validation / test (the validation set is required by the
# early-stopping logic). Final proportions: 60% train, 20% val, 20% test.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25,
                                                  random_state=42)  # 0.25*0.8=0.2

# Columns with dtype 'object' are treated as categorical.
object_cols = X.select_dtypes(include=['object']).columns.tolist()

# Ordered (ordinal) categorical features, with an explicit category order per column.
ordinal_features = ['Home Ownership', 'Years in current job', 'Term']
ordinal_categories = [
    # NOTE(review): this ordering ('Own Home' < 'Rent' < 'Have Mortgage' < 'Home Mortgage')
    # encodes an assumed ranking — confirm it is intentional.
    ['Own Home', 'Rent', 'Have Mortgage', 'Home Mortgage'],
    ['< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years', '6 years', '7 years', '8 years', '9 years',
     '10+ years'],
    ['Short Term', 'Long Term']
]

# Ordinal pipeline: impute missing values with the mode, then integer-encode
# using the category order above; categories unseen at fit time map to -1.
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1))
])

# Unordered (nominal) categorical features: impute with the mode, then one-hot
# encode (dense output); unseen categories produce an all-zero row.
nominal_features = ['Purpose']
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Continuous (numeric) features: every column that is not an object column.
continuous_features = X.columns.difference(object_cols).tolist()
continuous_transformer = Pipeline(steps=[
    # NOTE(review): mode imputation is unusual for continuous data — median is
    # the more common choice; confirm this is intentional.
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

# Combine the three pipelines into one ColumnTransformer; any column not listed
# in a transformer is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_transformer, ordinal_features),
        ('nominal', nominal_transformer, nominal_features),
        ('continuous', continuous_transformer, continuous_features)
    ],
    remainder='passthrough'
)

# Fit the preprocessor on the training split only, then apply the *fitted*
# transform to validation/test — this avoids leaking val/test statistics.
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

# Feature count after encoding — used as the network's input width.
print(f"预处理后训练集特征数:{X_train_processed.shape[1]}")
input_feature_num = X_train_processed.shape[1]

# Convert to PyTorch tensors and move to the selected device.
# Labels are LongTensor because nn.CrossEntropyLoss expects integer class ids.
X_train = torch.FloatTensor(X_train_processed).to(device)
y_train = torch.LongTensor(y_train.values).to(device)
X_val = torch.FloatTensor(X_val_processed).to(device)
y_val = torch.LongTensor(y_val.values).to(device)
X_test = torch.FloatTensor(X_test_processed).to(device)
y_test = torch.LongTensor(y_test.values).to(device)


# 定义模型
class MLP(nn.Module):
    """Two-layer MLP classifier with dropout regularization.

    Architecture: Linear(input_dim -> hidden_dim) -> ReLU -> Dropout(p) ->
    Linear(hidden_dim -> num_classes).

    The previously hard-coded sizes are now keyword parameters with defaults
    matching the original behavior, so existing ``MLP()`` call sites keep
    working. Attribute names (fc1/fc2/relu/dropout) are unchanged, so saved
    state_dicts from earlier runs still load.

    Args:
        input_dim: Number of input features. Defaults to the module-level
            ``input_feature_num`` computed after preprocessing.
        hidden_dim: Width of the hidden layer (default 20).
        num_classes: Number of output classes (default 2).
        dropout: Dropout probability applied after the hidden ReLU (default 0.3).
    """

    def __init__(self, input_dim=None, hidden_dim=20, num_classes=2, dropout=0.3):
        super(MLP, self).__init__()
        if input_dim is None:
            # Fall back to the global feature count for backward compatibility.
            input_dim = input_feature_num
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)  # dropout to mitigate overfitting
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return raw class logits of shape (batch, num_classes)."""
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out


# Phase 1: train from scratch, saving the best weights (by validation loss).
print("===== 第一阶段:初始训练 =====")
model = MLP().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

num_epochs_initial = 20000
train_losses_initial = []
val_losses_initial = []
epochs_initial = []

# Early-stopping state. NOTE(review): `counter_initial` ticks once per
# *evaluation* (every 100 epochs), so patience=100 means up to 10,000 epochs
# without improvement — confirm that unit is intended.
best_val_loss_initial = float('inf')
best_epoch_initial = 0
patience_initial = 100  # credit data may need a larger patience value
counter_initial = 0
early_stopped_initial = False

start_time_initial = time.time()

with tqdm(total=num_epochs_initial, desc="初始训练进度") as pbar:
    for epoch in range(num_epochs_initial):
        # Full-batch training step (the entire training set in one forward pass).
        model.train()
        outputs = model(X_train)
        train_loss = criterion(outputs, y_train)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Evaluate on the validation set every 100 epochs.
        if (epoch + 1) % 100 == 0:
            # eval() disables dropout; no_grad() skips autograd bookkeeping.
            model.eval()
            with torch.no_grad():
                val_outputs = model(X_val)
                val_loss = criterion(val_outputs, y_val)

            train_losses_initial.append(train_loss.item())
            val_losses_initial.append(val_loss.item())
            epochs_initial.append(epoch + 1)

            pbar.set_postfix({
                'Train Loss': f'{train_loss.item():.4f}',
                'Val Loss': f'{val_loss.item():.4f}'
            })

            # Early-stopping logic: checkpoint on improvement, otherwise count
            # stagnant evaluations and stop once patience is exhausted.
            if val_loss.item() < best_val_loss_initial:
                best_val_loss_initial = val_loss.item()
                best_epoch_initial = epoch + 1
                counter_initial = 0
                # Save the best-so-far weights.
                torch.save(model.state_dict(), 'credit_best_initial_model.pth')
            else:
                counter_initial += 1
                if counter_initial >= patience_initial:
                    print(f"\n初始训练早停触发!在第{epoch + 1}轮,验证集损失已有{patience_initial}轮未改善。")
                    print(f"初始训练最佳验证集损失出现在第{best_epoch_initial}轮,损失值为{best_val_loss_initial:.4f}")
                    early_stopped_initial = True
                    break

        pbar.update(1)

time_initial = time.time() - start_time_initial
print(f'初始训练时间: {time_initial:.2f} seconds')

# Reload the best checkpoint found during initial training.
# weights_only=True restricts torch.load to tensor/primitive data, which is
# sufficient for a state_dict and avoids arbitrary code execution through
# pickle (and silences the FutureWarning in recent PyTorch versions).
model.load_state_dict(torch.load('credit_best_initial_model.pth', weights_only=True))

# Evaluate the initial-phase model on the held-out test set.
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    _, predicted = torch.max(outputs, 1)  # predicted class = argmax over logits
    correct = (predicted == y_test).sum().item()
    accuracy = correct / y_test.size(0)
    print(f'初始模型测试集准确率: {accuracy * 100:.2f}%')

# Phase 2: continue training from the loaded weights for up to 50 epochs,
# again with early stopping.
print("\n===== 第二阶段:加载权重后继续训练 =====")
# Fresh optimizer with a smaller learning rate for fine-tuning.
optimizer = optim.SGD(model.parameters(), lr=0.005)

num_epochs_resume = 50
train_losses_resume = []
val_losses_resume = []
epochs_resume = []

# Early-stopping state for the resume phase. Here the counter ticks every
# epoch (validation runs each epoch), so patience=10 means 10 epochs.
best_val_loss_resume = float('inf')
best_epoch_resume = 0
patience_resume = 10  # a smaller patience is fine for the short resume phase
counter_resume = 0
early_stopped_resume = False

start_time_resume = time.time()

with tqdm(total=num_epochs_resume, desc="续训进度") as pbar:
    for epoch in range(num_epochs_resume):
        # Full-batch training step.
        model.train()
        outputs = model(X_train)
        train_loss = criterion(outputs, y_train)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Validate every epoch (only 50 epochs total, so this is cheap).
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val)
            val_loss = criterion(val_outputs, y_val)

        # Epoch number on the combined timeline, offset by the epoch at which
        # phase 1 found its best model (that is the checkpoint we resumed from).
        absolute_epoch = best_epoch_initial + epoch + 1

        train_losses_resume.append(train_loss.item())
        val_losses_resume.append(val_loss.item())
        epochs_resume.append(absolute_epoch)

        pbar.set_postfix({
            'Train Loss': f'{train_loss.item():.4f}',
            'Val Loss': f'{val_loss.item():.4f}'
        })

        # Early-stopping logic: checkpoint on improvement, stop after
        # `patience_resume` consecutive epochs without improvement.
        if val_loss.item() < best_val_loss_resume:
            best_val_loss_resume = val_loss.item()
            best_epoch_resume = absolute_epoch
            counter_resume = 0
            torch.save(model.state_dict(), 'credit_best_resumed_model.pth')
        else:
            counter_resume += 1
            if counter_resume >= patience_resume:
                print(f"\n续训早停触发!在第{absolute_epoch}轮,验证集损失已有{patience_resume}轮未改善。")
                print(f"续训阶段最佳验证集损失出现在第{best_epoch_resume}轮,损失值为{best_val_loss_resume:.4f}")
                early_stopped_resume = True
                break

        pbar.update(1)

time_resume = time.time() - start_time_resume
print(f'续训时间: {time_resume:.2f} seconds')

# Reload the best checkpoint from the resume phase.
# weights_only=True restricts torch.load to tensor/primitive data, which is
# sufficient for a state_dict and avoids arbitrary code execution through
# pickle (and silences the FutureWarning in recent PyTorch versions).
model.load_state_dict(torch.load('credit_best_resumed_model.pth', weights_only=True))

# Evaluate the final model on the held-out test set.
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    _, predicted = torch.max(outputs, 1)  # predicted class = argmax over logits
    correct = (predicted == y_test).sum().item()
    accuracy = correct / y_test.size(0)
    print(f'最终模型测试集准确率: {accuracy * 100:.2f}%')

# Plot train/validation loss for both phases on one shared epoch axis.
plt.figure(figsize=(12, 6))
# Initial training phase (solid lines, sampled every 100 epochs).
plt.plot(epochs_initial, train_losses_initial, 'b-', label='Initial Train Loss')
plt.plot(epochs_initial, val_losses_initial, 'r-', label='Initial Val Loss')
# Resume phase (dashed lines, sampled every epoch).
plt.plot(epochs_resume, train_losses_resume, 'b--', label='Resumed Train Loss')
plt.plot(epochs_resume, val_losses_resume, 'r--', label='Resumed Val Loss')

# Vertical markers: where phase 1 found its best model, and the best resumed epoch.
plt.axvline(x=best_epoch_initial, color='g', linestyle=':', label='Initial Training End')
plt.axvline(x=best_epoch_resume, color='purple', linestyle=':', label='Best Resumed Point')

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss (Initial + Resumed Training)')
plt.legend()
plt.grid(True)
plt.show()


网站公告

今日签到

点亮在社区的每一天
去签到