A Clear Guide to PyTorch Tensors, the Chain Rule, Forward Propagation, Backpropagation, and Computation Graphs

Published: 2025-08-15

A Detailed Guide to PyTorch Deep Learning Fundamentals

Contents

  1. Basic Mathematical Concepts
  2. PyTorch Tensor Basics
  3. Computation Graphs and Automatic Differentiation
  4. The Chain Rule and Backpropagation
  5. The Jacobian Matrix
  6. GPU and CPU Compatibility
  7. Complete Example: Building a Simple Neural Network

1. Basic Mathematical Concepts

1.1 Scalars, Vectors, Matrices, and Tensors

import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from mpl_toolkits.mplot3d import Axes3D

# 设置随机种子以确保可重复性
torch.manual_seed(42)
np.random.seed(42)

# 检查CUDA是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")
#使用设备: cuda


# 1. 标量 (Scalar) - 0维张量
scalar = torch.tensor(3.14)
print(f"标量: {scalar}")
#标量: 3.140000104904175
print(f"标量形状: {scalar.shape}")
#标量形状: torch.Size([])
print(f"标量维度: {scalar.dim()}")
#标量维度: 0

# 2. 向量 (Vector) - 1维张量
vector = torch.tensor([1.0, 2.0, 3.0, 4.0])
print(f"\n向量: {vector}")
#向量: tensor([1., 2., 3., 4.])
print(f"向量形状: {vector.shape}")
#向量形状: torch.Size([4])
print(f"向量维度: {vector.dim()}")
#向量维度: 1

# 3. 矩阵 (Matrix) - 2维张量
matrix = torch.tensor([[1, 2, 3],
                       [4, 5, 6],
                       [7, 8, 9]])
print(f"\n矩阵:\n{matrix}")
#矩阵:
#tensor([[1, 2, 3],
#       [4, 5, 6],
#       [7, 8, 9]])
print(f"矩阵形状: {matrix.shape}")
#矩阵形状: torch.Size([3, 3])
print(f"矩阵维度: {matrix.dim()}")
#矩阵维度: 2

# 4. 张量 (Tensor) - 多维数组
tensor_3d = torch.randn(2, 3, 4)  # 2个3x4的矩阵
print(f"\n3维张量形状: {tensor_3d.shape}")
#3维张量形状: torch.Size([2, 3, 4])
print(f"3维张量维度: {tensor_3d.dim()}")
#3维张量维度: 3

1.2 Tensor Visualization

# 可视化不同维度的张量
fig = plt.figure(figsize=(15, 4))

# 标量可视化
ax1 = fig.add_subplot(141)
ax1.text(0.5, 0.5, '3.14', fontsize=20, ha='center', va='center')
ax1.set_xlim(0, 1)
ax1.set_ylim(0, 1)
ax1.set_title('标量 (0D)')
ax1.axis('off')

# 向量可视化
ax2 = fig.add_subplot(142)
vector_data = [1, 2, 3, 4]
ax2.bar(range(len(vector_data)), vector_data, color='blue', alpha=0.7)
ax2.set_title('向量 (1D)')
ax2.set_xlabel('索引')
ax2.set_ylabel('值')

# 矩阵可视化
ax3 = fig.add_subplot(143)
matrix_data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
im = ax3.imshow(matrix_data, cmap='viridis', aspect='auto')
ax3.set_title('矩阵 (2D)')
for i in range(3):
    for j in range(3):
        ax3.text(j, i, str(matrix_data[i, j]), ha='center', va='center', color='white')
plt.colorbar(im, ax=ax3)

# 3D张量可视化(显示为多个2D切片)
ax4 = fig.add_subplot(144, projection='3d')
tensor_3d_data = np.random.randn(3, 3, 3)
x, y, z = np.meshgrid(range(3), range(3), range(3))
ax4.scatter(x, y, z, c=tensor_3d_data.flatten(), cmap='coolwarm', s=100, alpha=0.6)
ax4.set_title('3D张量')
ax4.set_xlabel('X')
ax4.set_ylabel('Y')
ax4.set_zlabel('Z')

plt.tight_layout()
plt.show()


2. PyTorch Tensor Basics

2.1 Creating and Manipulating Tensors

# 创建张量的多种方式
# 1. 从Python列表创建
tensor_from_list = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)

# 2. 创建特定形状的张量
zeros_tensor = torch.zeros(3, 4)  # 3x4的零张量
ones_tensor = torch.ones(2, 3)    # 2x3的全1张量
random_tensor = torch.randn(3, 3) # 3x3的随机张量(标准正态分布)

# 3. 创建等差数列
arange_tensor = torch.arange(0, 10, 2)  # [0, 2, 4, 6, 8]
linspace_tensor = torch.linspace(0, 1, 5)  # 5个均匀分布的点

print("从列表创建的张量:")
#从列表创建的张量:

print(tensor_from_list)
#tensor([[1., 2.],
#       [3., 4.]])
print(f"数据类型: {tensor_from_list.dtype}")
#数据类型: torch.float32
print(f"设备: {tensor_from_list.device}")
#设备: cpu
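
Tensors also interoperate with NumPy, and dtypes can be converted with .to(). A minimal sketch (torch.from_numpy shares memory with the source array, and .numpy() only works on CPU tensors):

# NumPy interop and dtype conversion (sketch)
np_array = np.array([[1.0, 2.0], [3.0, 4.0]])
t_from_np = torch.from_numpy(np_array)    # shares memory with np_array
back_to_np = t_from_np.numpy()            # zero-copy view back to NumPy (CPU tensors only)
t_int = tensor_from_list.to(torch.int64)  # dtype conversion
print(t_from_np.dtype, t_int.dtype)
#torch.float64 torch.int64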

2.2 Basic Tensor Operations

# 创建两个张量
a = torch.tensor([[1.0, 2.0], [3.0, 4.0]], device=device)
print(f"矩阵a:\n{a}")
#矩阵a:
#tensor([[1., 2.],
#        [3., 4.]], device='cuda:0')

b = torch.tensor([[5.0, 6.0], [7.0, 8.0]], device=device)
print(f"矩阵b:\n{b}")
#矩阵b:
#tensor([[5., 6.],
#      [7., 8.]], device='cuda:0')



# 基本算术运算
add_result = a + b  # 逐元素加法
sub_result = a - b  # 逐元素减法
mul_result = a * b  # 逐元素乘法
div_result = a / b  # 逐元素除法


#print(f"逐元素乘法 a * b:\n{mul_result}")
#逐元素乘法 a * b:
#tensor([[ 5., 12.],
#        [21., 32.]], device='cuda:0')


# 矩阵运算
matmul_result = torch.matmul(a, b)  # 矩阵乘法
# 或者使用 @ 操作符
matmul_result2 = a @ b

print(f"矩阵乘法 a @ b:\n{matmul_result}")
#矩阵乘法 a @ b:
#tensor([[19., 22.],
#        [43., 50.]], device='cuda:0')


# 转置
transpose_result = a.T  # 或 a.transpose(0, 1)

# 聚合操作
sum_result = a.sum()  # 所有元素求和
mean_result = a.mean()  # 平均值
max_result = a.max()  # 最大值
min_result = a.min()  # 最小值
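
The aggregations above reduce over the whole tensor; passing dim= reduces along a single axis, and keepdim=True keeps that axis so the result still broadcasts against the original. A small sketch using the matrix a from above:

col_sum = a.sum(dim=0)                   # sum over rows -> shape (2,)
row_mean = a.mean(dim=1, keepdim=True)   # mean over columns -> shape (2, 1)
centered = a - row_mean                  # keepdim makes this broadcast cleanly
print(col_sum.shape, row_mean.shape, centered.shape)
#torch.Size([2]) torch.Size([2, 1]) torch.Size([2, 2])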


2.3 Tensor Shape Operations

# 创建一个张量
x = torch.randn(4, 3, 2)
print(f"原始形状: {x.shape}")
#原始形状: torch.Size([4, 3, 2])

# reshape操作
x_reshaped = x.reshape(6, 4)  # 必须保证元素总数不变
print(f"reshape后: {x_reshaped.shape}")
#reshape后: torch.Size([6, 4])
    
    
# view操作(与reshape类似,但要求张量在内存中连续)
x_viewed = x.view(12, 2)
print(f"view后: {x_viewed.shape}")
#view后: torch.Size([12, 2])


# squeeze和unsqueeze
y = torch.randn(1, 3, 1, 4)
y_squeezed = y.squeeze()  # 移除所有维度为1的维度
print(f"squeeze前: {y.shape}, squeeze后: {y_squeezed.shape}")
#squeeze前: torch.Size([1, 3, 1, 4]), squeeze后: torch.Size([3, 4])
           
y_unsqueezed = y_squeezed.unsqueeze(0)  # 在第0维添加一个维度
print(f"unsqueeze后: {y_unsqueezed.shape}")
#unsqueeze后: torch.Size([1, 3, 4])
    
    
# 广播机制示例
a = torch.randn(3, 1)
b = torch.randn(1, 4)
c = a + b  # 自动广播到 (3, 4)
print(f"广播结果形状: {c.shape}")
#广播结果形状: torch.Size([3, 4])
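
As noted above, view only works on tensors that are contiguous in memory. A transpose shares storage with the original and is not contiguous, so view fails while reshape (or .contiguous() followed by view) still works. A small sketch:

t = torch.randn(3, 4)
t_t = t.T                           # transposed view: same storage, not contiguous
print(t_t.is_contiguous())
#False
# t_t.view(12)                      # would raise a RuntimeError
flat1 = t_t.reshape(12)             # reshape copies when it has to
flat2 = t_t.contiguous().view(12)   # or make a contiguous copy first
print(flat1.shape, flat2.shape)
#torch.Size([12]) torch.Size([12])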

3. Computation Graphs and Automatic Differentiation

3.1 The Computation Graph Concept

A computation graph is a directed acyclic graph (DAG) that represents the operations in a computation and the flow of data between them.

# 创建需要梯度的张量
x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)

# 构建计算图
z = x * y
w = z ** 2
loss = w + 5

print(f"x: {x}, requires_grad={x.requires_grad}")
#x: 2.0, requires_grad=True
    
print(f"y: {y}, requires_grad={y.requires_grad}")
#y: 3.0, requires_grad=True
    
print(f"z = x * y: {z}")
#z = x * y: 6.0
    
print(f"w = z^2: {w}")
#w = z^2: 36.0
    
print(f"loss = w + 5: {loss}")
#loss = w + 5: 41.0

# 查看计算图信息
print(f"\nloss的grad_fn: {loss.grad_fn}")
#loss的grad_fn: <AddBackward0 object at 0x0000024315868310>
print(f"w的grad_fn: {w.grad_fn}")
#w的grad_fn: <PowBackward0 object at 0x00000243136D70D0>
print(f"z的grad_fn: {z.grad_fn}")
#z的grad_fn: <MulBackward0 object at 0x0000024315868310>
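
Each grad_fn node also exposes a next_functions attribute pointing to the grad_fn of its inputs (leaf tensors appear as AccumulateGrad nodes), so the backward graph built above can be walked explicitly. A small sketch, continuing from the loss defined above (exact node names may vary across PyTorch versions):

def walk(fn, depth=0):
    """Recursively print the backward graph starting from a grad_fn node."""
    if fn is None:          # constants (like the 5 above) contribute no grad_fn
        return
    print("  " * depth + type(fn).__name__)
    for next_fn, _ in fn.next_functions:
        walk(next_fn, depth + 1)

walk(loss.grad_fn)
#AddBackward0
#  PowBackward0
#    MulBackward0
#      AccumulateGrad
#      AccumulateGrad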

3.2 Visualizing the Computation Graph

import networkx as nx

def visualize_computation_graph():
    """可视化简单的计算图"""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    
    # 前向传播图
    G_forward = nx.DiGraph()
    G_forward.add_edges_from([
        ('x', 'z = x * y'),
        ('y', 'z = x * y'),
        ('z = x * y', 'w = z²'),
        ('w = z²', 'loss = w + 5'),
        ('5', 'loss = w + 5')
    ])
    
    pos_forward = {
        'x': (0, 2),
        'y': (0, 0),
        'z = x * y': (2, 1),
        'w = z²': (4, 1),
        '5': (4, 0),
        'loss = w + 5': (6, 1)
    }
    
    nx.draw(G_forward, pos_forward, ax=ax1, with_labels=True, 
            node_color='lightblue', node_size=2000, font_size=10,
            arrows=True, arrowsize=20, edge_color='gray')
    ax1.set_title('前向传播', fontsize=14)
    
    # 反向传播图
    G_backward = nx.DiGraph()
    G_backward.add_edges_from([
        ('∂loss/∂loss = 1', '∂loss/∂w'),
        ('∂loss/∂w', '∂loss/∂z'),
        ('∂loss/∂z', '∂loss/∂x'),
        ('∂loss/∂z', '∂loss/∂y')
    ])
    
    pos_backward = {
        '∂loss/∂loss = 1': (6, 1),
        '∂loss/∂w': (4, 1),
        '∂loss/∂z': (2, 1),
        '∂loss/∂x': (0, 2),
        '∂loss/∂y': (0, 0)
    }
    
    nx.draw(G_backward, pos_backward, ax=ax2, with_labels=True,
            node_color='lightcoral', node_size=2000, font_size=10,
            arrows=True, arrowsize=20, edge_color='gray')
    ax2.set_title('反向传播', fontsize=14)
    
    plt.tight_layout()
    plt.show()

visualize_computation_graph()


3.3 The Autograd Mechanism

# PyTorch的autograd自动求导
x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)

# 前向传播
z = x * y
w = z ** 2
loss = w + 5

# 反向传播
loss.backward()

# 查看梯度
print(f"∂loss/∂x = {x.grad}")
print(f"∂loss/∂y = {y.grad}")

# 手动验证梯度计算
# loss = (x*y)^2 + 5
# ∂loss/∂x = 2*x*y^2 = 2*2*9 = 36
# ∂loss/∂y = 2*x^2*y = 2*4*3 = 24
print(f"\n手动计算验证:")
print(f"∂loss/∂x = 2*x*y^2 = 2*{x.item()}*{y.item()}^2 = {2*x.item()*y.item()**2}")
print(f"∂loss/∂y = 2*x^2*y = 2*{x.item()}^2*{y.item()} = {2*x.item()**2*y.item()}")
∂loss/∂x = 36.0
∂loss/∂y = 24.0

手动计算验证:
∂loss/∂x = 2*x*y^2 = 2*2.0*3.0^2 = 36.0
∂loss/∂y = 2*x^2*y = 2*2.0^2*3.0 = 24.0

3.4 Gradient Accumulation and Zeroing

# 梯度累积示例
x = torch.tensor(2.0, requires_grad=True)

# 第一次计算
y1 = x ** 2
y1.backward()
print(f"第一次backward后,x.grad = {x.grad}")

# 第二次计算(梯度会累积)
y2 = x ** 3
y2.backward()
print(f"第二次backward后(累积),x.grad = {x.grad}")

# 清零梯度
x.grad.zero_()
y3 = x ** 4
y3.backward()
print(f"清零后再次backward,x.grad = {x.grad}")

import torch

# 演示梯度累积的详细过程
x = torch.tensor(2.0, requires_grad=True)

print("=== 第一次backward ===")
y1 = x ** 2  # y1 = x²
print(f"y1 = {y1}")
print(f"dy1/dx = 2x = 2*{x} = {2*x}")

y1.backward()
print(f"第一次backward后,x.grad = {x.grad}")

print("\n=== 第二次backward(累积)===")
y2 = x ** 2  # y2 = x²  
print(f"y2 = {y2}")
print(f"dy2/dx = 2x = 2*{x} = {2*x}")

y2.backward()
print(f"新梯度 = {2*x}")
print(f"累积梯度 = 原梯度 + 新梯度 = 4.0 + 4.0 = {x.grad}")

print("\n=== 清零后重新计算 ===")
x.grad.zero_()  # 手动清零
y3 = x ** 4  # y3 = x⁴
print(f"y3 = {y3}")
print(f"dy3/dx = 4x³ = 4*{x}³ = 4*8 = {4 * (x**3)}")

y3.backward()
print(f"清零后的梯度 = {x.grad}")
=== 第一次backward ===
y1 = 4.0
dy1/dx = 2x = 2*2.0 = 4.0
第一次backward后,x.grad = 4.0

=== 第二次backward(累积)===
y2 = 4.0
dy2/dx = 2x = 2*2.0 = 4.0
新梯度 = 4.0
累积梯度 = 原梯度 + 新梯度 = 4.0 + 4.0 = 8.0

=== 清零后重新计算 ===
y3 = 16.0
dy3/dx = 4x³ = 4*2.0³ = 4*8 = 32.0
清零后的梯度 = 32.0
import torch
import torch.nn as nn

# 场景1: 模拟大批量训练(内存限制)
def simulate_large_batch_training():
    """模拟大批量训练:将大批量分解为小批量累积"""
    model = nn.Linear(10, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    
    # 模拟数据:假设我们想要批量大小为16,但内存只能处理4个样本
    total_batch_size = 16
    mini_batch_size = 4
    accumulation_steps = total_batch_size // mini_batch_size
    
    print(f"总批量大小: {total_batch_size}")
    print(f"小批量大小: {mini_batch_size}")
    print(f"累积步数: {accumulation_steps}")
    
    # 生成模拟数据
    all_data = torch.randn(total_batch_size, 10)
    all_targets = torch.randn(total_batch_size, 1)
    
    optimizer.zero_grad()  # 清零梯度
    
    total_loss = 0
    for step in range(accumulation_steps):
        # 获取小批量数据
        start_idx = step * mini_batch_size
        end_idx = start_idx + mini_batch_size
        
        mini_batch_data = all_data[start_idx:end_idx]
        mini_batch_targets = all_targets[start_idx:end_idx]
        
        # 前向传播
        outputs = model(mini_batch_data)
        loss = nn.MSELoss()(outputs, mini_batch_targets)
        
        # 反向传播(梯度累积)
        loss.backward()  # 梯度会自动累积
        
        total_loss += loss.item()
        print(f"步骤 {step+1}: loss = {loss.item():.4f}")
        
        # 查看参数的梯度(累积过程)
        if hasattr(model.weight, 'grad') and model.weight.grad is not None:
            grad_norm = model.weight.grad.norm().item()
            print(f"  累积梯度范数: {grad_norm:.4f}")
    
    # 平均梯度(模拟大批量的效果)
    for param in model.parameters():
        if param.grad is not None:
            param.grad /= accumulation_steps
    
    # 更新参数
    optimizer.step()
    
    avg_loss = total_loss / accumulation_steps
    print(f"\n平均损失: {avg_loss:.4f}")
    print("参数更新完成")
    
    return model

# 场景2: 多任务学习中的梯度累积
def multi_task_gradient_accumulation():
    """多任务学习:来自不同任务的梯度累积"""
    shared_model = nn.Linear(10, 5)
    task1_head = nn.Linear(5, 1)
    task2_head = nn.Linear(5, 2)
    
    optimizer = torch.optim.Adam(
        list(shared_model.parameters()) + 
        list(task1_head.parameters()) + 
        list(task2_head.parameters()), 
        lr=0.001
    )
    
    # 模拟数据
    x = torch.randn(8, 10)
    y1 = torch.randn(8, 1)  # 任务1目标
    y2 = torch.randn(8, 2)  # 任务2目标
    
    optimizer.zero_grad()
    
    # 共享特征提取
    shared_features = shared_model(x)
    
    # 任务1的损失和梯度
    print("=== 任务1 ===")
    output1 = task1_head(shared_features)
    loss1 = nn.MSELoss()(output1, y1)
    print(f"任务1损失: {loss1.item():.4f}")
    
    loss1.backward(retain_graph=True)  # 保留计算图用于任务2
    
    # 检查共享层的梯度
    if shared_model.weight.grad is not None:
        grad_norm_after_task1 = shared_model.weight.grad.norm().item()
        print(f"任务1后共享层梯度范数: {grad_norm_after_task1:.4f}")
    
    # 任务2的损失和梯度(累积到任务1的梯度上)
    print("\n=== 任务2 ===")
    output2 = task2_head(shared_features)
    loss2 = nn.MSELoss()(output2, y2)
    print(f"任务2损失: {loss2.item():.4f}")
    
    loss2.backward()  # 梯度会累积到共享层
    
    # 检查累积后的梯度
    if shared_model.weight.grad is not None:
        grad_norm_after_both = shared_model.weight.grad.norm().item()
        print(f"两个任务后共享层梯度范数: {grad_norm_after_both:.4f}")
        print(f"梯度增加了: {grad_norm_after_both - grad_norm_after_task1:.4f}")
    
    # 更新所有参数
    optimizer.step()
    print("\n多任务梯度累积完成,参数已更新")

# 场景3: 演示为什么需要手动清零梯度
def why_zero_grad_needed():
    """演示不清零梯度的问题"""
    x = torch.tensor([1.0, 2.0], requires_grad=True)
    
    print("=== 不清零梯度的问题 ===")
    for epoch in range(3):
        print(f"\nEpoch {epoch + 1}:")
        
        # 计算损失
        y = (x ** 2).sum()
        print(f"损失: {y.item()}")
        print(f"理论梯度应该是: {2 * x}")
        
        # 反向传播
        y.backward()
        print(f"实际累积梯度: {x.grad}")
        
        if epoch == 0:
            print("第一次正确 ✓")
        else:
            print("梯度被错误累积了! ✗")
        
        # 注意:这里故意不清零梯度来演示问题
    
    print("\n=== 正确做法:每次清零梯度 ===")
    x.grad.zero_()  # 重置
    
    for epoch in range(3):
        print(f"\nEpoch {epoch + 1}:")
        
        # 清零梯度(正确做法)
        if x.grad is not None:
            x.grad.zero_()
        
        # 计算损失
        y = (x ** 2).sum()
        print(f"损失: {y.item()}")
        print(f"理论梯度应该是: {2 * x}")
        
        # 反向传播
        y.backward()
        print(f"实际梯度: {x.grad}")
        print("正确! ✓")

if __name__ == "__main__":
    print("1. 模拟大批量训练")
    print("=" * 50)
    simulate_large_batch_training()
    
    print("\n\n2. 多任务学习梯度累积")
    print("=" * 50)
    multi_task_gradient_accumulation()
    
    print("\n\n3. 为什么需要手动清零梯度")
    print("=" * 50)
    why_zero_grad_needed()
1. 模拟大批量训练
==================================================
总批量大小: 16
小批量大小: 4
累积步数: 4
步骤 1: loss = 1.8221
  累积梯度范数: 4.5243
步骤 2: loss = 2.0437
  累积梯度范数: 6.4268
步骤 3: loss = 2.1350
  累积梯度范数: 7.6952
步骤 4: loss = 1.1126
  累积梯度范数: 7.5846

平均损失: 1.7784
参数更新完成


2. 多任务学习梯度累积
==================================================
=== 任务1 ===
任务1损失: 0.9983
任务1后共享层梯度范数: 1.3653

=== 任务2 ===
任务2损失: 1.3457
两个任务后共享层梯度范数: 1.7382
梯度增加了: 0.3728

多任务梯度累积完成,参数已更新


3. 为什么需要手动清零梯度
==================================================
=== 不清零梯度的问题 ===

Epoch 1:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际累积梯度: tensor([2., 4.])
第一次正确 ✓

Epoch 2:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际累积梯度: tensor([4., 8.])
梯度被错误累积了! ✗

Epoch 3:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际累积梯度: tensor([ 6., 12.])
梯度被错误累积了! ✗

=== 正确做法:每次清零梯度 ===

Epoch 1:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际梯度: tensor([2., 4.])
正确! ✓

Epoch 2:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际梯度: tensor([2., 4.])
正确! ✓

Epoch 3:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际梯度: tensor([2., 4.])
正确! ✓

4. The Chain Rule and Backpropagation

4.1 The Chain Rule

def chain_rule_example():
    """演示链式法则"""
    # 定义函数: f(g(h(x)))
    # h(x) = x^2
    # g(h) = sin(h)
    # f(g) = exp(g)
    
    x = torch.tensor(0.5, requires_grad=True)
    
    # 前向传播
    h = x ** 2
    g = torch.sin(h)
    f = torch.exp(g)
    
    # 反向传播
    f.backward()
    
    print("链式法则示例:")
    print(f"x = {x.item():.4f}")
    print(f"h = x^2 = {h.item():.4f}")
    print(f"g = sin(h) = {g.item():.4f}")
    print(f"f = exp(g) = {f.item():.4f}")
    print(f"\n自动求导结果: df/dx = {x.grad.item():.4f}")
    
    # 手动计算验证
    df_dg = torch.exp(g)
    dg_dh = torch.cos(h)
    dh_dx = 2 * x
    df_dx_manual = df_dg * dg_dh * dh_dx
    print(f"手动计算结果: df/dx = {df_dx_manual.item():.4f}")
    
    # 可视化链式法则
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.text(0.1, 0.5, 'x', fontsize=20, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightblue"))
    ax.arrow(0.15, 0.5, 0.1, 0, head_width=0.02, head_length=0.02, fc='black', ec='black')
    
    ax.text(0.3, 0.5, 'h=x²', fontsize=16, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgreen"))
    ax.arrow(0.38, 0.5, 0.1, 0, head_width=0.02, head_length=0.02, fc='black', ec='black')
    
    ax.text(0.53, 0.5, 'g=sin(h)', fontsize=16, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightyellow"))
    ax.arrow(0.63, 0.5, 0.1, 0, head_width=0.02, head_length=0.02, fc='black', ec='black')
    
    ax.text(0.78, 0.5, 'f=exp(g)', fontsize=16, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightcoral"))
    
    # 反向传播箭头
    ax.arrow(0.78, 0.3, -0.15, 0, head_width=0.02, head_length=0.02, 
            fc='red', ec='red', linestyle='--', alpha=0.7)
    ax.text(0.7, 0.25, 'df/dg', fontsize=12, color='red')
    
    ax.arrow(0.53, 0.3, -0.15, 0, head_width=0.02, head_length=0.02,
            fc='red', ec='red', linestyle='--', alpha=0.7)
    ax.text(0.45, 0.25, 'dg/dh', fontsize=12, color='red')
    
    ax.arrow(0.3, 0.3, -0.12, 0, head_width=0.02, head_length=0.02,
            fc='red', ec='red', linestyle='--', alpha=0.7)
    ax.text(0.22, 0.25, 'dh/dx', fontsize=12, color='red')
    
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')
    ax.set_title('链式法则: df/dx = (df/dg) × (dg/dh) × (dh/dx)', fontsize=14)
    plt.show()

chain_rule_example()


4.2 The Backpropagation Algorithm

class SimpleNN:
    """手动实现简单神经网络的前向和反向传播"""
    
    def __init__(self, input_size, hidden_size, output_size):
        # 初始化权重和偏置
        self.W1 = torch.randn(input_size, hidden_size, requires_grad=True)
        self.b1 = torch.zeros(hidden_size, requires_grad=True)
        self.W2 = torch.randn(hidden_size, output_size, requires_grad=True)
        self.b2 = torch.zeros(output_size, requires_grad=True)
    
    def forward(self, x):
        """前向传播"""
        # 第一层
        self.z1 = x @ self.W1 + self.b1
        self.a1 = torch.relu(self.z1)
        
        # 第二层
        self.z2 = self.a1 @ self.W2 + self.b2
        self.output = torch.sigmoid(self.z2)
        
        return self.output
    
    def backward_manual(self, x, y, output):
        """手动实现反向传播(用于教学)"""
        m = x.shape[0]
        
        # 输出层梯度
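        # Note: dz2 = output - y is the exact output-layer gradient only for a
        # sigmoid output paired with binary cross-entropy loss (a common textbook
        # simplification); with the MSE loss used in the test below, autograd's
        # gradients will differ from these hand-derived ones.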
        dz2 = output - y
        dW2 = (self.a1.T @ dz2) / m
        db2 = torch.sum(dz2, dim=0) / m
        
        # 隐藏层梯度
        da1 = dz2 @ self.W2.T
        dz1 = da1 * (self.z1 > 0).float()  # ReLU的导数
        dW1 = (x.T @ dz1) / m
        db1 = torch.sum(dz1, dim=0) / m
        
        return dW1, db1, dW2, db2

# 测试反向传播
torch.manual_seed(42)
model = SimpleNN(2, 3, 1)

# 生成样本数据
x = torch.randn(4, 2)
y = torch.tensor([[1.], [0.], [1.], [0.]])

# 前向传播
output = model.forward(x)

# 计算损失
loss = torch.mean((output - y) ** 2)

# PyTorch自动反向传播
loss.backward()

print("神经网络结构:")
print(f"输入层: 2个神经元")
print(f"隐藏层: 3个神经元 (ReLU激活)")
print(f"输出层: 1个神经元 (Sigmoid激活)")
print(f"\n损失值: {loss.item():.4f}")
print(f"\nW1的梯度形状: {model.W1.grad.shape}")
print(f"W2的梯度形状: {model.W2.grad.shape}")
神经网络结构:
输入层: 2个神经元
隐藏层: 3个神经元 (ReLU激活)
输出层: 1个神经元 (Sigmoid激活)

损失值: 0.4442

W1的梯度形状: torch.Size([2, 3])
W2的梯度形状: torch.Size([3, 1])

4.3 Vanishing and Exploding Gradients

def gradient_issues_demo():
    """演示梯度消失和梯度爆炸问题"""
    
    # 梯度消失示例(深层网络with sigmoid)
    x = torch.randn(1, 10, requires_grad=True)
    
    # 模拟深层网络
    h = x
    activations = [x.detach().numpy()]
    
    for i in range(10):
        h = torch.sigmoid(h)
        activations.append(h.detach().numpy())
    
    loss = h.sum()
    loss.backward()
    
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    
    # 绘制激活值变化
    for i, act in enumerate(activations):
        ax1.plot(act.flatten(), label=f'Layer {i}', alpha=0.7)
    ax1.set_title('梯度消失:Sigmoid激活值逐层递减')
    ax1.set_xlabel('神经元索引')
    ax1.set_ylabel('激活值')
    ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    
    # 梯度爆炸示例
    x2 = torch.randn(1, 10, requires_grad=True)
    h2 = x2
    
    for i in range(5):
        h2 = h2 * 2.0  # 每层权重都大于1
    
    loss2 = h2.sum()
    loss2.backward()
    
    # 绘制梯度大小
    gradient_magnitudes = [x.grad.abs().mean().item(), x2.grad.abs().mean().item()]
    ax2.bar(['梯度消失\n(深层Sigmoid)', '梯度爆炸\n(权重>1)'], 
            gradient_magnitudes, color=['blue', 'red'], alpha=0.7)
    ax2.set_ylabel('梯度绝对值均值')
    ax2.set_title('梯度问题对比')
    ax2.set_yscale('log')
    
    plt.tight_layout()
    plt.show()
    
    print(f"梯度消失情况下的梯度均值: {gradient_magnitudes[0]:.2e}")
    print(f"梯度爆炸情况下的梯度均值: {gradient_magnitudes[1]:.2e}")

gradient_issues_demo()


梯度消失情况下的梯度均值: 3.36e-07
梯度爆炸情况下的梯度均值: 3.20e+01
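
Both numbers match what the chain rule predicts: the sigmoid derivative σ'(x) = σ(x)(1 - σ(x)) never exceeds 0.25, while each ×2 layer doubles the gradient, so roughly

vanishing:  |∂loss/∂x| ≲ 0.25^10 ≈ 9.5e-7
exploding:  |∂loss/∂x| = 2^5 = 32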

5. The Jacobian Matrix

5.1 Definition of the Jacobian Matrix

The Jacobian matrix describes the derivative of a vector-valued function: it collects the partial derivatives of every output with respect to every input into a single matrix.
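
For a function f: R^n -> R^m, the Jacobian is the m×n matrix J with entries J[i, j] = ∂f_i/∂x_j (one row per output component, one column per input component). Note that autograd's backward() computes vector-Jacobian products rather than the full matrix, which is why the example below calls backward once per output row to assemble J.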

def jacobian_example():
    """雅可比矩阵示例"""
    
    # 定义输入向量
    x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    
    # 定义向量函数 f: R^3 -> R^2
    # f1 = x1^2 + x2*x3
    # f2 = x1*x2 - x3^2
    f1 = x[0]**2 + x[1]*x[2]
    f2 = x[0]*x[1] - x[2]**2
    
    # 输出向量
    f = torch.stack([f1, f2])
    
    # 计算雅可比矩阵
    jacobian = torch.zeros(2, 3)
    
    for i in range(2):
        # 重置梯度
        if x.grad is not None:
            x.grad.zero_()
        
        # 对第i个输出计算梯度
        f[i].backward(retain_graph=True)
        jacobian[i] = x.grad.clone()
    
    print("雅可比矩阵:")
    print("J = [∂f/∂x] =")
    print(jacobian)
    
    # 手动验证
    print("\n手动计算验证:")
    print(f"∂f1/∂x1 = 2*x1 = {2*x[0].item()}")
    print(f"∂f1/∂x2 = x3 = {x[2].item()}")
    print(f"∂f1/∂x3 = x2 = {x[1].item()}")
    print(f"∂f2/∂x1 = x2 = {x[1].item()}")
    print(f"∂f2/∂x2 = x1 = {x[0].item()}")
    print(f"∂f2/∂x3 = -2*x3 = {-2*x[2].item()}")
    
    # 可视化雅可比矩阵
    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.imshow(jacobian.detach().numpy(), cmap='RdBu', aspect='auto', vmin=-10, vmax=10)
    
    # 添加数值标签
    for i in range(2):
        for j in range(3):
            text = ax.text(j, i, f'{jacobian[i, j].item():.1f}',
                         ha="center", va="center", color="black", fontsize=14)
    
    ax.set_xticks([0, 1, 2])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(['∂/∂x₁', '∂/∂x₂', '∂/∂x₃'])
    ax.set_yticklabels(['f₁', 'f₂'])
    ax.set_title('雅可比矩阵 J', fontsize=16)
    
    plt.colorbar(im, ax=ax)
    plt.tight_layout()
    plt.show()

jacobian_example()
雅可比矩阵:
J = [∂f/∂x] =
tensor([[ 2.,  3.,  2.],
        [ 2.,  1., -6.]])

手动计算验证:
∂f1/∂x1 = 2*x1 = 2.0
∂f1/∂x2 = x3 = 3.0
∂f1/∂x3 = x2 = 2.0
∂f2/∂x1 = x2 = 2.0
∂f2/∂x2 = x1 = 1.0
∂f2/∂x3 = -2*x3 = -6.0


5.2 The Jacobian Matrix in Neural Networks

# 使用torch.autograd.functional计算雅可比矩阵
from torch.autograd.functional import jacobian

def network_jacobian():
    """神经网络层的雅可比矩阵"""
    
    # 定义一个简单的网络层
    def layer(x):
        W = torch.tensor([[1.0, -0.5], 
                         [0.5, 2.0], 
                         [-1.0, 1.0]])
        b = torch.tensor([0.1, 0.2, -0.1])
        return torch.relu(x @ W.T + b)
    
    # 输入
    x = torch.tensor([1.0, 2.0])
    
    # 计算雅可比矩阵
    J = jacobian(layer, x)
    
    print("网络层函数: f(x) = ReLU(Wx + b)")
    print(f"输入维度: {x.shape}")
    print(f"输出维度: {layer(x).shape}")
    print(f"\n雅可比矩阵形状: {J.shape}")
    print("雅可比矩阵:")
    print(J)
    
    # 可视化输入输出关系
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    
    # 绘制网络结构
    ax1.set_xlim(-0.5, 2.5)
    ax1.set_ylim(-0.5, 3.5)
    
    # 输入节点
    for i in range(2):
        circle = plt.Circle((0, i+0.5), 0.2, color='lightblue', ec='black')
        ax1.add_patch(circle)
        ax1.text(0, i+0.5, f'x{i+1}', ha='center', va='center')
    
    # 输出节点
    for i in range(3):
        circle = plt.Circle((2, i), 0.2, color='lightgreen', ec='black')
        ax1.add_patch(circle)
        ax1.text(2, i, f'y{i+1}', ha='center', va='center')
    
    # 连接线
    for i in range(2):
        for j in range(3):
            ax1.arrow(0.2, i+0.5, 1.6, j-i-0.5, 
                     head_width=0.05, head_length=0.05, 
                     fc='gray', ec='gray', alpha=0.5)
    
    ax1.set_title('网络层结构', fontsize=14)
    ax1.axis('off')
    
    # 绘制雅可比矩阵热图
    im = ax2.imshow(J.detach().numpy(), cmap='coolwarm', aspect='auto')
    ax2.set_xlabel('输入维度')
    ax2.set_ylabel('输出维度')
    ax2.set_title('雅可比矩阵热图', fontsize=14)
    plt.colorbar(im, ax=ax2)
    
    plt.tight_layout()
    plt.show()

network_jacobian()

网络层函数: f(x) = ReLU(Wx + b)
输入维度: torch.Size([2])
输出维度: torch.Size([3])

雅可比矩阵形状: torch.Size([3, 2])
雅可比矩阵:
tensor([[ 1.0000, -0.5000],
        [ 0.5000,  2.0000],
        [-1.0000,  1.0000]])
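
Here the Jacobian is exactly the weight matrix W, because every pre-activation Wx + b is positive at this input, so ReLU acts locally as the identity. In general, for f(x) = ReLU(Wx + b) the Jacobian is W with the rows of inactive units zeroed out:

J = diag(1[Wx + b > 0]) · W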


6. GPU and CPU Compatibility

6.1 Device Management

# 检查CUDA可用性
print(f"CUDA是否可用: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA设备数量: {torch.cuda.device_count()}")
    print(f"当前CUDA设备: {torch.cuda.current_device()}")
    print(f"CUDA设备名称: {torch.cuda.get_device_name(0)}")

# 设备选择最佳实践
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\n使用设备: {device}")

# 创建张量时指定设备
tensor_cpu = torch.randn(3, 3)  # 默认在CPU上
tensor_gpu = torch.randn(3, 3, device=device)  # 在指定设备上

print(f"\nCPU张量设备: {tensor_cpu.device}")
print(f"GPU/CPU张量设备: {tensor_gpu.device}")
CUDA是否可用: True
CUDA设备数量: 1
当前CUDA设备: 0
CUDA设备名称: NVIDIA GeForce RTX 3080

使用设备: cuda

CPU张量设备: cpu
GPU/CPU张量设备: cuda:0

6.2 Moving Data Between Devices

# CPU到GPU
if torch.cuda.is_available():
    # 方法1: 使用.to()
    tensor_cpu = torch.randn(2, 3)
    tensor_gpu = tensor_cpu.to('cuda')
    
    # 方法2: 使用.cuda()
    tensor_gpu2 = tensor_cpu.cuda()
    
    # GPU到CPU
    tensor_back_to_cpu = tensor_gpu.cpu()
    
    print(f"原始张量设备: {tensor_cpu.device}")
    print(f"转移到GPU后: {tensor_gpu.device}")
    print(f"转回CPU后: {tensor_back_to_cpu.device}")
else:
    print("CUDA不可用,使用CPU进行演示")
    tensor = torch.randn(2, 3)
    print(f"张量设备: {tensor.device}")
原始张量设备: cpu
转移到GPU后: cuda:0
转回CPU后: cpu
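
Host-to-device copies can also be made asynchronous by pinning the CPU tensor's memory and passing non_blocking=True; this is the same mechanism behind DataLoader's pin_memory=True. A minimal sketch, assuming CUDA is available:

if torch.cuda.is_available():
    host_tensor = torch.randn(1024, 1024).pin_memory()      # page-locked host memory
    gpu_tensor = host_tensor.to('cuda', non_blocking=True)   # asynchronous host-to-device copy
    torch.cuda.synchronize()                                  # wait for the copy before relying on the result
    print(gpu_tensor.device)
#cuda:0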

6.3 Writing Device-Agnostic Code

class DeviceAgnosticModel(torch.nn.Module):
    """设备无关的模型示例"""
    
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

def train_on_device(model, data, target, device, epochs=100):
    """设备无关的训练函数"""
    # 将模型移到指定设备
    model = model.to(device)
    data = data.to(device)
    target = target.to(device)
    
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    criterion = torch.nn.MSELoss()
    
    losses = []
    for epoch in range(epochs):
        # 前向传播
        output = model(data)
        loss = criterion(output, target)
        
        # 反向传播
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        losses.append(loss.item())
        
        if (epoch + 1) % 20 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')
    
    return losses

# 使用示例
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"训练设备: {device}")

# 创建模型和数据
model = DeviceAgnosticModel(10, 20, 1)
data = torch.randn(100, 10)
target = torch.randn(100, 1)

# 训练
losses = train_on_device(model, data, target, device, epochs=100)

# 绘制损失曲线
plt.figure(figsize=(10, 4))
plt.plot(losses)
plt.title(f'训练损失曲线 (设备: {device})')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)
plt.show()
Epoch [20/100], Loss: 0.8893
Epoch [40/100], Loss: 0.8566
Epoch [60/100], Loss: 0.8442
Epoch [80/100], Loss: 0.8342
Epoch [100/100], Loss: 0.8252


6.4 Performance Optimization Tips

def performance_tips():
    """GPU/CPU性能优化技巧"""
    
    print("性能优化技巧:")
    print("-" * 50)
    
    # 1. 批量操作
    print("1. 使用批量操作而非循环:")
    
    # 不推荐
    start_time = torch.cuda.Event(enable_timing=True) if torch.cuda.is_available() else None
    end_time = torch.cuda.Event(enable_timing=True) if torch.cuda.is_available() else None
    
    x = torch.randn(1000, 100, device=device)
    result_slow = torch.zeros(1000, device=device)
    
    if torch.cuda.is_available():
        start_time.record()
    
    for i in range(1000):
        result_slow[i] = x[i].sum()
    
    if torch.cuda.is_available():
        end_time.record()
        torch.cuda.synchronize()
        print(f"   循环方式时间: {start_time.elapsed_time(end_time):.2f} ms")
    
    # 推荐
    if torch.cuda.is_available():
        start_time.record()
    
    result_fast = x.sum(dim=1)
    
    if torch.cuda.is_available():
        end_time.record()
        torch.cuda.synchronize()
        print(f"   批量操作时间: {start_time.elapsed_time(end_time):.2f} ms")
    
    # 2. 避免频繁的设备间数据传输
    print("\n2. 最小化CPU-GPU数据传输:")
    print("   - 在GPU上完成所有计算")
    print("   - 只在必要时传回CPU")
    
    # 3. 使用适当的数据类型
    print("\n3. 使用合适的数据类型:")
    tensor_float32 = torch.randn(1000, 1000, device=device)
    tensor_float16 = tensor_float32.half()  # 转换为半精度
    
    print(f"   Float32内存: {tensor_float32.element_size() * tensor_float32.nelement() / 1024**2:.2f} MB")
    print(f"   Float16内存: {tensor_float16.element_size() * tensor_float16.nelement() / 1024**2:.2f} MB")
    
    # 4. 固定内存
    print("\n4. 使用pin_memory加速数据加载:")
    if torch.cuda.is_available():
        # 创建DataLoader时使用pin_memory=True
        print("   DataLoader(..., pin_memory=True)")

performance_tips()

性能优化技巧:
--------------------------------------------------
1. 使用批量操作而非循环:
   循环方式时间: 32.97 ms
   批量操作时间: 0.03 ms

2. 最小化CPU-GPU数据传输:
   - 在GPU上完成所有计算
   - 只在必要时传回CPU

3. 使用合适的数据类型:
   Float32内存: 3.81 MB
   Float16内存: 1.91 MB

4. 使用pin_memory加速数据加载:
   DataLoader(..., pin_memory=True)

7. Complete Example: Building a Simple Neural Network

7.1 The Complete Training Pipeline

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt

# 设置随机种子
torch.manual_seed(42)
np.random.seed(42)

# 设备配置
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")

# 自定义数据集
class SpiralDataset(Dataset):
    """生成螺旋形分类数据"""
    
    def __init__(self, n_points=1000, n_classes=3, noise=0.2):
        self.n_points = n_points
        self.n_classes = n_classes
        
        X = []
        y = []
        
        for class_idx in range(n_classes):
            # 生成螺旋数据
            theta = np.linspace(class_idx * 4, (class_idx + 1) * 4, n_points // n_classes) + np.random.randn(n_points // n_classes) * noise
            r = np.linspace(0.5, 2, n_points // n_classes) + np.random.randn(n_points // n_classes) * noise * 0.1
            
            x1 = r * np.cos(theta)
            x2 = r * np.sin(theta)
            
            X.append(np.column_stack([x1, x2]))
            y.append(np.full(n_points // n_classes, class_idx))
        
        self.X = torch.FloatTensor(np.vstack(X))
        self.y = torch.LongTensor(np.hstack(y))
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# 定义神经网络模型
class NeuralNetwork(nn.Module):
    def __init__(self, input_size=2, hidden_sizes=[64, 32], output_size=3):
        super(NeuralNetwork, self).__init__()
        
        layers = []
        prev_size = input_size
        
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(prev_size, hidden_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            prev_size = hidden_size
        
        layers.append(nn.Linear(prev_size, output_size))
        
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

# 训练函数
def train_model(model, train_loader, val_loader, epochs=100, lr=0.01):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10)
    
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    
    for epoch in range(epochs):
        # 训练阶段
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            
            # 前向传播
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            
            # 反向传播
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
        
        train_loss /= len(train_loader)
        train_acc = 100 * correct / total
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        
        # 验证阶段
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += batch_y.size(0)
                correct += (predicted == batch_y).sum().item()
        
        val_loss /= len(val_loader)
        val_acc = 100 * correct / total
        val_losses.append(val_loss)
        val_accs.append(val_acc)
        
        scheduler.step(val_loss)
        
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], '
                  f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, '
                  f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')
    
    return train_losses, val_losses, train_accs, val_accs

# 创建数据集
dataset = SpiralDataset(n_points=3000, n_classes=3, noise=0.2)

# 划分训练集和验证集
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# 创建数据加载器
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# 创建模型
model = NeuralNetwork(input_size=2, hidden_sizes=[128, 64, 32], output_size=3)
print(f"\n模型结构:\n{model}")

# 训练模型
print("\n开始训练...")
train_losses, val_losses, train_accs, val_accs = train_model(
    model, train_loader, val_loader, epochs=100, lr=0.001
)

# 可视化结果
fig, axes = plt.subplots(2, 3, figsize=(15, 8))

# 绘制数据分布
ax = axes[0, 0]
colors = ['red', 'blue', 'green']
for i in range(3):
    mask = dataset.y == i
    ax.scatter(dataset.X[mask, 0], dataset.X[mask, 1], 
              c=colors[i], alpha=0.6, label=f'Class {i}')
ax.set_title('原始数据分布')
ax.set_xlabel('X1')
ax.set_ylabel('X2')
ax.legend()

# 绘制决策边界
ax = axes[0, 1]
model.eval()
h = 0.02
x_min, x_max = dataset.X[:, 0].min() - 0.5, dataset.X[:, 0].max() + 0.5
y_min, y_max = dataset.X[:, 1].min() - 0.5, dataset.X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

with torch.no_grad():
    Z = model(torch.FloatTensor(np.c_[xx.ravel(), yy.ravel()]).to(device))
    _, Z = torch.max(Z, 1)
    Z = Z.cpu().numpy().reshape(xx.shape)

ax.contourf(xx, yy, Z, alpha=0.4, cmap='viridis')
for i in range(3):
    mask = dataset.y == i
    ax.scatter(dataset.X[mask, 0], dataset.X[mask, 1], 
              c=colors[i], alpha=0.6, label=f'Class {i}')
ax.set_title('学习到的决策边界')
ax.set_xlabel('X1')
ax.set_ylabel('X2')
ax.legend()

# 绘制训练损失
ax = axes[0, 2]
ax.plot(train_losses, label='训练损失', alpha=0.8)
ax.plot(val_losses, label='验证损失', alpha=0.8)
ax.set_title('损失曲线')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.grid(True, alpha=0.3)

# 绘制准确率
ax = axes[1, 0]
ax.plot(train_accs, label='训练准确率', alpha=0.8)
ax.plot(val_accs, label='验证准确率', alpha=0.8)
ax.set_title('准确率曲线')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy (%)')
ax.legend()
ax.grid(True, alpha=0.3)

# 绘制梯度流
ax = axes[1, 1]
gradients = []
for name, param in model.named_parameters():
    if param.grad is not None:
        gradients.append(param.grad.abs().mean().item())

ax.bar(range(len(gradients)), gradients, color='skyblue', alpha=0.7)
ax.set_title('各层梯度大小')
ax.set_xlabel('层索引')
ax.set_ylabel('梯度绝对值均值')
ax.grid(True, alpha=0.3)

# 绘制权重分布
ax = axes[1, 2]
weights = []
for param in model.parameters():
    if len(param.shape) > 1:  # 只考虑权重矩阵,不考虑偏置
        weights.extend(param.cpu().detach().numpy().flatten())

ax.hist(weights, bins=50, color='purple', alpha=0.7, edgecolor='black')
ax.set_title('权重分布')
ax.set_xlabel('权重值')
ax.set_ylabel('频数')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
使用设备: cuda

模型结构:
NeuralNetwork(
  (network): Sequential(
    (0): Linear(in_features=2, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=32, out_features=3, bias=True)
  )
)

开始训练...
Epoch [10/100], Train Loss: 0.0451, Train Acc: 98.88%, Val Loss: 0.0076, Val Acc: 99.83%
Epoch [20/100], Train Loss: 0.0203, Train Acc: 99.42%, Val Loss: 0.0011, Val Acc: 100.00%
Epoch [30/100], Train Loss: 0.0142, Train Acc: 99.50%, Val Loss: 0.0036, Val Acc: 99.83%
Epoch [40/100], Train Loss: 0.0097, Train Acc: 99.71%, Val Loss: 0.0002, Val Acc: 100.00%
Epoch [50/100], Train Loss: 0.0061, Train Acc: 99.83%, Val Loss: 0.0000, Val Acc: 100.00%
Epoch [60/100], Train Loss: 0.0088, Train Acc: 99.54%, Val Loss: 0.0002, Val Acc: 100.00%
Epoch [70/100], Train Loss: 0.0037, Train Acc: 99.83%, Val Loss: 0.0001, Val Acc: 100.00%
Epoch [80/100], Train Loss: 0.0063, Train Acc: 99.83%, Val Loss: 0.0002, Val Acc: 100.00%
Epoch [90/100], Train Loss: 0.0111, Train Acc: 99.88%, Val Loss: 0.0002, Val Acc: 100.00%
Epoch [100/100], Train Loss: 0.0048, Train Acc: 99.83%, Val Loss: 0.0002, Val Acc: 100.00%


7.2 Computation Graph Visualization Tool

def visualize_computation_graph_detailed(model, input_size=(1, 2)):
    """详细可视化模型的计算图"""
    
    # 创建示例输入
    x = torch.randn(input_size, requires_grad=True)
    
    # 前向传播
    y = model(x)
    
    # 创建一个简单的损失
    loss = y.sum()
    
    # 执行反向传播以构建完整的计算图
    loss.backward()
    
    print("计算图构建完成")
    print(f"输入形状: {x.shape}")
    print(f"输出形状: {y.shape}")
    print(f"损失值: {loss.item():.4f}")
    
    # 打印梯度信息
    print("\n各层参数的梯度:")
    for name, param in model.named_parameters():
        if param.grad is not None:
            print(f"{name}: 梯度形状={param.grad.shape}, "
                  f"梯度均值={param.grad.mean().item():.6f}, "
                  f"梯度标准差={param.grad.std().item():.6f}")

# 使用示例
small_model = NeuralNetwork(input_size=2, hidden_sizes=[4, 3], output_size=2)
visualize_computation_graph_detailed(small_model)
计算图构建完成
输入形状: torch.Size([1, 2])
输出形状: torch.Size([1, 2])
损失值: 0.8725

各层参数的梯度:
network.0.weight: 梯度形状=torch.Size([4, 2]), 梯度均值=0.000000, 梯度标准差=0.000000
network.0.bias: 梯度形状=torch.Size([4]), 梯度均值=0.000000, 梯度标准差=0.000000
network.3.weight: 梯度形状=torch.Size([3, 4]), 梯度均值=0.000000, 梯度标准差=0.000000
network.3.bias: 梯度形状=torch.Size([3]), 梯度均值=0.000000, 梯度标准差=0.000000
network.6.weight: 梯度形状=torch.Size([2, 3]), 梯度均值=0.000000, 梯度标准差=0.000000
network.6.bias: 梯度形状=torch.Size([2]), 梯度均值=1.000000, 梯度标准差=0.000000

Key Takeaways

  1. Tensor operations: PyTorch's fundamental data structure, with support for both CPU and GPU computation
  2. Automatic differentiation: gradients are computed automatically through the computation graph, the core mechanism of deep learning
  3. Backpropagation: gradient computation based on the chain rule
  4. Device management: flexible switching between GPU and CPU for better efficiency
  5. Practice: a complete deep-learning workflow from theory to application

Best Practices

  • Always write device-agnostic code by using a device variable
  • Use requires_grad deliberately to control which tensors need gradients
  • Remember that gradients accumulate; zero them when necessary
  • Replace Python loops with batched operations for better performance
  • Monitor gradient magnitudes to catch vanishing or exploding gradients (a sketch follows this list)
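
A small sketch of the last point: computing per-parameter gradient norms and clipping the global norm during a training step (the tiny model and random data here are just placeholders):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
x, y = torch.randn(64, 10), torch.randn(64, 1)

optimizer.zero_grad()
loss = nn.MSELoss()(model(x), y)
loss.backward()

# clip_grad_norm_ returns the total norm measured before clipping
total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
print(f"total gradient norm: {total_norm:.4f}")
for name, p in model.named_parameters():
    print(f"{name}: grad norm = {p.grad.norm():.4f}")

optimizer.step()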

Next Steps

  1. Study how optimizers work in depth
  2. Explore more complex network architectures (CNNs, RNNs, Transformers)
  3. Learn distributed training and mixed-precision training
  4. Master model deployment and optimization techniques
