pytorch--模型训练的一般流程

发布于:2025-07-04 ⋅ 阅读:(14) ⋅ 点赞:(0)

前言

在pytorch中模型训练一般分为以下几个步骤:
0、数据集准备
1、数据集读取(dataset模块)
2、数据集转换为tensor(dataloader模块)
3、定义模型model(编写模型代码,主要是前向传播)
4、定义损失函数loss
5、定义优化器optimizer
6、最后一步是模型训练阶段train:这一步会利用循环把dataset->dataloader->model->loss->optimizer串联起来。
相比于普通的函数,神经网络并没有特别神奇的地方,我们不妨把训练过程看成普通函数的参数求解过程,也就是通过最优化来求解参数。下面以AlexNet模型为例,进行分类训练。

0、数据集准备

分类数据不需要进行标注,只需要给出类别就可以了;而分割、检测任务则需要借助labelme或者labelimg进行标注。将数据分为训练集、验证集和测试集:训练集用于模型训练,验证集用于在训练过程中检验模型的表现,测试集用于在模型训练完成之后评估模型的最终表现。

1、数据集

这里下载数据集The TU Darmstadt Database (formerly the ETHZ Database),它包含三种类别,共115 motorbikes + 50 x 2 cars + 112 cows = 327张照片。把数据分为训练集train和验证集val:

在这里插入图片描述

并对train和val文件夹形成对应的标签文件,每一行为照片的名称和对应的类别编号(从0开始):
在这里插入图片描述

2、dataset

现在写一个名为dataset.py的文件,在其中写一个VOCDataset类,用来读取训练集和验证集。VOCDataset继承了torch.utils.data.Dataset,并重写父类的两个函数:__getitem__返回每个图像及其对应的标签,__len__返回数据集的数量:


import torch  
from torch.utils.data import Dataset
from torchvision import datasets, transforms
from PIL import Image
import os

class VOCDataset(Dataset):
    """Image-classification dataset: a folder of ``.png`` files plus a label file.

    The label file is plain text with one ``img_name class_id`` pair per line,
    where ``img_name`` is the bare file name (no directory) and ``class_id``
    is an integer.

    Args:
        img_dir: Directory that contains the ``.png`` images.
        label_root: Path of the label text file.
        transform: Optional callable applied to the PIL image. When omitted,
            the image is converted with ``transforms.ToTensor()``.

    Raises:
        ValueError: If a non-blank label line is not ``img_name class_id``,
            or (from ``__getitem__``) if an image has no entry in the label
            file.
    """

    def __init__(self, img_dir, label_root, transform=None):
        self.img_root = img_dir
        self.label_root = label_root
        self.transform = transform
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # index -> sample mapping reproducible across runs and machines.
        self.img_paths = sorted(
            os.path.join(self.img_root, f)
            for f in os.listdir(self.img_root)
            if f.endswith('.png')
        )
        # Map image file name -> integer class id.
        self.label_classes = {}
        with open(label_root, 'r') as f:
            for line_no, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no label.
                    continue
                # Split on the LAST whitespace run so that file names which
                # themselves contain spaces are still parsed correctly.
                parts = line.rsplit(None, 1)
                if len(parts) != 2:
                    raise ValueError(
                        f"Malformed label line {line_no} in {label_root}: {line!r}"
                    )
                img_name, class_id = parts
                self.label_classes[img_name] = int(class_id)

    def __len__(self):
        """Return the number of ``.png`` images found in the image directory."""
        return len(self.img_paths)

    def __getitem__(self, idx):
        """Return ``(image, class_id)`` for the image at position ``idx``."""
        img_path = self.img_paths[idx]
        img_name = os.path.basename(img_path)
        # Look the label up before decoding the image so that a missing label
        # fails without paying for the image load.
        try:
            target = self.label_classes[img_name]
        except KeyError:
            raise ValueError(f"Image {img_name} not found in label file.") from None
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied into the converted RGB image.
        with Image.open(img_path) as raw:
            img = raw.convert('RGB')
        if self.transform:
            img = self.transform(img)
        else:
            img = transforms.ToTensor()(img)
        return img, target

3、model

新建一个model.py的文件,写一个AlexNet的类(参考动手学深度学习7.1),继承torch.nn.Module,重写forward函数:

from torch import nn
from torchvision import models

class AlexNet(nn.Module):
    """AlexNet-style convolutional classifier.

    Five convolution layers interleaved with three max-pooling layers feed a
    three-layer fully connected head. ``fc1`` takes ``256 * 5 * 5`` flattened
    features, which is what a ``3 x 224 x 224`` input produces.

    Every layer is reachable both as a named attribute (``conv2d1`` ...
    ``fc3``) and through ``self.sequential``, which defines the forward order.

    Args:
        num_class: Size of the output layer (number of classes).
    """

    def __init__(self, num_class=3):
        super().__init__()

        # --- convolutional feature extractor ---
        self.conv2d1 = nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=1)
        self.pool1 = nn.MaxPool2d(3, stride=2)
        self.conv2d2 = nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2)
        self.pool2 = nn.MaxPool2d(3, stride=2)
        self.conv2d3 = nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1)
        self.conv2d4 = nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1)
        self.conv2d5 = nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1)
        self.pool3 = nn.MaxPool2d(3, stride=2)

        # --- fully connected head (hidden width 4096) ---
        hidden = 4096
        self.fc1 = nn.Linear(256 * 5 * 5, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, num_class)

        feature_layers = [
            self.conv2d1, nn.ReLU(),
            self.pool1,
            self.conv2d2, nn.ReLU(),
            self.pool2,
            self.conv2d3, nn.ReLU(),
            self.conv2d4, nn.ReLU(),
            self.conv2d5, nn.ReLU(),
            self.pool3,
        ]
        head_layers = [
            nn.Flatten(),
            self.fc1, nn.ReLU(), nn.Dropout(0.5),
            self.fc2, nn.ReLU(), nn.Dropout(0.5),
            self.fc3,
        ]
        self.sequential = nn.Sequential(*feature_layers, *head_layers)

        self._init_weights()

    def _init_weights(self):
        """Kaiming-normal init for conv weights; N(0, 0.01) weights and zero bias for linear layers."""
        for layer in self.modules():
            if isinstance(layer, nn.Linear):
                nn.init.normal_(layer.weight, mean=0, std=0.01)
                nn.init.constant_(layer.bias, 0)
            elif isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')

    def forward(self, x):
        """Run ``x`` through the full layer stack and return the class logits."""
        return self.sequential(x)

4、训练模型

首先定义损失函数和优化器:

  # Cross-entropy loss for multi-class classification.
  criterion = torch.nn.CrossEntropyLoss()
  # SGD with momentum 0.9 and a weight decay of 1e-4.
  optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-4)

新建一个train.py的文件:

import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from dataset import VOCDataset
from model import AlexNet, ResnetPretrained
from torchvision import models
from torchvision.datasets import CIFAR10
from dataset import VOCDataset
import tensorboard


def _evaluate(model, loader, criterion, device):
    """Return ``(mean_loss, accuracy_percent)`` of ``model`` over ``loader``."""
    model.eval()
    correct = 0
    seen = 0
    loss_sum = 0.0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            batch = labels.size(0)
            # criterion returns the batch mean; re-weight by batch size so the
            # final average is per sample even when the last batch is smaller.
            loss_sum += criterion(outputs, labels).item() * batch
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            seen += batch
    if seen == 0:
        # Nothing to evaluate: avoid dividing by zero.
        return float('nan'), 0.0
    return loss_sum / seen, 100 * correct / seen


def train(model, train_dataset, val_dataset, num_epochs=20, batch_size=32, learning_rate=0.001,
          num_workers=4, save_path='best_alexnet_cifar10.pth', use_scheduler=False):
    """Train ``model`` with SGD and validate it after every epoch.

    Batches are moved to whatever device the model's parameters live on, so
    the same code runs on CPU and GPU; the caller places the model.

    Args:
        model: ``torch.nn.Module`` to train, already moved to its device.
        train_dataset: Map-style dataset yielding ``(input, label)`` pairs.
        val_dataset: Dataset used for the per-epoch validation pass.
        num_epochs: Number of passes over ``train_dataset``.
        batch_size: Batch size for both loaders.
        learning_rate: Initial SGD learning rate.
        num_workers: Worker processes per ``DataLoader`` (0 loads in-process).
        save_path: File that receives the ``state_dict`` each time the
            validation accuracy improves.
        use_scheduler: When True, halve the learning rate after the validation
            accuracy has stalled (``ReduceLROnPlateau``, patience 2). Off by
            default, which keeps the learning rate constant.

    Returns:
        The best validation accuracy seen, in percent.
    """
    # 1. Data loaders: shuffle only the training data.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # 2. Loss and optimizer.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-4)

    # 3. Optional LR scheduler, created once outside the epoch loop.
    scheduler = None
    if use_scheduler:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', factor=0.5, patience=2
        )

    # Follow the model's device instead of assuming CUDA is available.
    device = next(model.parameters()).device

    # 4. Training loop.
    best_acc = 0.0
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        total = 0

        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            total += inputs.size(0)

            if i % 100 == 0:
                avg_loss = running_loss / total
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {avg_loss:.4f}')

        train_loss = running_loss / total if total else float('nan')

        # Validate at the end of every epoch.
        avg_val_loss, epoch_acc = _evaluate(model, val_loader, criterion, device)

        print(f'Epoch {epoch+1}/{num_epochs} | '
              f'Train Loss: {train_loss:.4f} | '
              f'Val Loss: {avg_val_loss:.4f} | '
              f'Val Acc: {epoch_acc:.2f}%')

        # Adapt the learning rate to the validation accuracy if requested.
        if scheduler is not None:
            scheduler.step(epoch_acc)

        # Keep the weights of the best-performing epoch.
        if epoch_acc > best_acc:
            best_acc = epoch_acc
            torch.save(model.state_dict(), save_path)

    print(f'Best Validation Accuracy: {best_acc:.2f}%')
    return best_acc

if __name__ == "__main__":
    import os  # local import: only this script entry point needs it

    # 1. Dataset locations.
    train_img_dir = r'F:\dataset\tud\TUDarmstadt\PNGImages\train'
    val_img_dir = r'F:\dataset\tud\TUDarmstadt\PNGImages\val'

    train_label_file = r'F:\dataset\tud\TUDarmstadt\PNGImages/train_set.txt'
    val_label_file = r'F:\dataset\tud\TUDarmstadt\PNGImages/val_set.txt'

    # Number of classes in the dataset loaded below (motorbikes, cars, cows).
    # Set this to 10 when switching to the CIFAR-10 alternative further down.
    num_classes = 3

    # 2. Preprocessing pipelines.
    # Training: resize slightly larger than the network input, then augment
    # with a random crop, flip, small rotation and colour jitter.
    transform_train = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Validation: no augmentation, but the same size and normalization.
    transform_val = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Build the training and validation datasets.
    train_dataset = VOCDataset(train_img_dir, train_label_file, transform=transform_train)
    val_dataset = VOCDataset(val_img_dir, val_label_file, transform=transform_val)
    print(f'Train dataset size: {len(train_dataset)}')
    print(f'Validation dataset size: {len(val_dataset)}')

    # Alternative: train on CIFAR-10 instead (also set num_classes = 10).
    #
    # transform_train = transforms.Compose([
    #     transforms.Resize((224, 224)),
    #     transforms.RandomHorizontalFlip(),
    #     transforms.RandomCrop(224, padding=4),
    #     transforms.ToTensor(),
    #     transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
    #                          std=[0.2470, 0.2435, 0.2616])
    # ])
    #
    # transform_val = transforms.Compose([
    #     transforms.Resize((224, 224)),
    #     transforms.ToTensor(),
    #     transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
    #                          std=[0.2470, 0.2435, 0.2616])
    # ])
    # train_dataset = CIFAR10(root='data', train=True, download=True, transform=transform_train)
    # val_dataset = CIFAR10(root='data', train=False, download=True, transform=transform_val)
    # print(f'Train dataset size: {len(train_dataset)}')
    # print(f'Validation dataset size: {len(val_dataset)}')

    # 3. Build the model with an output layer that matches the dataset.
    model = AlexNet(num_class=num_classes)
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Create the output directory up front so a missing folder is caught
    # before training starts, not after it has finished.
    final_model_path = 'output/alexnet.pth'
    os.makedirs(os.path.dirname(final_model_path), exist_ok=True)

    # 4. Train.
    train(model, train_dataset, val_dataset, num_epochs=20, batch_size=32, learning_rate=0.001)
    print('Finished Training')
    # 5. Save the final weights.
    torch.save(model.state_dict(), final_model_path)
    print('Model saved as alexnet.pth')
   

运行train.py(即执行其中 if __name__ == "__main__" 下的入口代码)就可以进行训练了,后面会讲一些如何改进这个模型和一些训练技巧。

参考:
1
2
3


网站公告

今日签到

点亮在社区的每一天
去签到