A Clear Guide to PyTorch Tensors, the Chain Rule, Forward Propagation, Backpropagation, and Computation Graphs

Published: 2025-08-15

A Detailed Guide to PyTorch Deep Learning Fundamentals

Contents

  1. Basic Mathematical Concepts
  2. PyTorch Tensor Basics
  3. Computation Graphs and Automatic Differentiation
  4. The Chain Rule and Backpropagation
  5. The Jacobian Matrix
  6. GPU and CPU Compatibility
  7. Complete Example: Building a Simple Neural Network

1. Basic Mathematical Concepts

1.1 Scalars, Vectors, Matrices, and Tensors

import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from mpl_toolkits.mplot3d import Axes3D

# 设置随机种子以确保可重复性
torch.manual_seed(42)
np.random.seed(42)

# 检查CUDA是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")
#使用设备: cuda


# 1. 标量 (Scalar) - 0维张量
scalar = torch.tensor(3.14)
print(f"标量: {scalar}")
#标量: 3.140000104904175
print(f"标量形状: {scalar.shape}")
#标量形状: torch.Size([])
print(f"标量维度: {scalar.dim()}")
#标量维度: 0

# 2. 向量 (Vector) - 1维张量
vector = torch.tensor([1.0, 2.0, 3.0, 4.0])
print(f"\n向量: {vector}")
#向量: tensor([1., 2., 3., 4.])
print(f"向量形状: {vector.shape}")
#向量形状: torch.Size([4])
print(f"向量维度: {vector.dim()}")
#向量维度: 1

# 3. 矩阵 (Matrix) - 2维张量
matrix = torch.tensor([[1, 2, 3],
                       [4, 5, 6],
                       [7, 8, 9]])
print(f"\n矩阵:\n{matrix}")
#矩阵:
#tensor([[1, 2, 3],
#       [4, 5, 6],
#       [7, 8, 9]])
print(f"矩阵形状: {matrix.shape}")
#矩阵形状: torch.Size([3, 3])
print(f"矩阵维度: {matrix.dim()}")
#矩阵维度: 2

# 4. 张量 (Tensor) - 多维数组
tensor_3d = torch.randn(2, 3, 4)  # 2个3x4的矩阵
print(f"\n3维张量形状: {tensor_3d.shape}")
#3维张量形状: torch.Size([2, 3, 4])
print(f"3维张量维度: {tensor_3d.dim()}")
#3维张量维度: 3

1.2 Tensor Visualization

# 可视化不同维度的张量
fig = plt.figure(figsize=(15, 4))

# 标量可视化
ax1 = fig.add_subplot(141)
ax1.text(0.5, 0.5, '3.14', fontsize=20, ha='center', va='center')
ax1.set_xlim(0, 1)
ax1.set_ylim(0, 1)
ax1.set_title('标量 (0D)')
ax1.axis('off')

# 向量可视化
ax2 = fig.add_subplot(142)
vector_data = [1, 2, 3, 4]
ax2.bar(range(len(vector_data)), vector_data, color='blue', alpha=0.7)
ax2.set_title('向量 (1D)')
ax2.set_xlabel('索引')
ax2.set_ylabel('值')

# 矩阵可视化
ax3 = fig.add_subplot(143)
matrix_data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
im = ax3.imshow(matrix_data, cmap='viridis', aspect='auto')
ax3.set_title('矩阵 (2D)')
for i in range(3):
    for j in range(3):
        ax3.text(j, i, str(matrix_data[i, j]), ha='center', va='center', color='white')
plt.colorbar(im, ax=ax3)

# 3D张量可视化(显示为多个2D切片)
ax4 = fig.add_subplot(144, projection='3d')
tensor_3d_data = np.random.randn(3, 3, 3)
x, y, z = np.meshgrid(range(3), range(3), range(3))
ax4.scatter(x, y, z, c=tensor_3d_data.flatten(), cmap='coolwarm', s=100, alpha=0.6)
ax4.set_title('3D张量')
ax4.set_xlabel('X')
ax4.set_ylabel('Y')
ax4.set_zlabel('Z')

plt.tight_layout()
plt.show()


2. PyTorch Tensor Basics

2.1 Creating and Manipulating Tensors

# 创建张量的多种方式
# 1. 从Python列表创建
tensor_from_list = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)

# 2. 创建特定形状的张量
zeros_tensor = torch.zeros(3, 4)  # 3x4的零张量
ones_tensor = torch.ones(2, 3)    # 2x3的全1张量
random_tensor = torch.randn(3, 3) # 3x3的随机张量(标准正态分布)

# 3. 创建等差数列
arange_tensor = torch.arange(0, 10, 2)  # [0, 2, 4, 6, 8]
linspace_tensor = torch.linspace(0, 1, 5)  # 5个均匀分布的点

print("从列表创建的张量:")
#从列表创建的张量:

print(tensor_from_list)
#tensor([[1., 2.],
#       [3., 4.]])
print(f"数据类型: {tensor_from_list.dtype}")
#数据类型: torch.float32
print(f"设备: {tensor_from_list.device}")
#设备: cpu
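
Tensors also interoperate with NumPy, and dtypes can be converted with .to(). A minimal sketch (torch.from_numpy shares memory with the source array, and .numpy() only works on CPU tensors):

# NumPy interop and dtype conversion (sketch)
np_array = np.array([[1.0, 2.0], [3.0, 4.0]])
t_from_np = torch.from_numpy(np_array)    # shares memory with np_array
back_to_np = t_from_np.numpy()            # zero-copy view back to NumPy (CPU tensors only)
t_int = tensor_from_list.to(torch.int64)  # dtype conversion
print(t_from_np.dtype, t_int.dtype)
#torch.float64 torch.int64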

2.2 Basic Tensor Operations

# 创建两个张量
a = torch.tensor([[1.0, 2.0], [3.0, 4.0]], device=device)
print(f"矩阵a:\n{a}")
#矩阵a:
#tensor([[1., 2.],
#        [3., 4.]], device='cuda:0')

b = torch.tensor([[5.0, 6.0], [7.0, 8.0]], device=device)
print(f"矩阵b:\n{b}")
#矩阵b:
#tensor([[5., 6.],
#      [7., 8.]], device='cuda:0')



# 基本算术运算
add_result = a + b  # 逐元素加法
sub_result = a - b  # 逐元素减法
mul_result = a * b  # 逐元素乘法
div_result = a / b  # 逐元素除法


#print(f"逐元素乘法 a * b:\n{mul_result}")
#逐元素乘法 a * b:
#tensor([[ 5., 12.],
#        [21., 32.]], device='cuda:0')


# 矩阵运算
matmul_result = torch.matmul(a, b)  # 矩阵乘法
# 或者使用 @ 操作符
matmul_result2 = a @ b

print(f"矩阵乘法 a @ b:\n{matmul_result}")
#矩阵乘法 a @ b:
#tensor([[19., 22.],
#        [43., 50.]], device='cuda:0')


# 转置
transpose_result = a.T  # 或 a.transpose(0, 1)

# 聚合操作
sum_result = a.sum()  # 所有元素求和
mean_result = a.mean()  # 平均值
max_result = a.max()  # 最大值
min_result = a.min()  # 最小值
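
The aggregations above reduce over the whole tensor; passing dim= reduces along a single axis, and keepdim=True keeps that axis so the result still broadcasts against the original. A small sketch using the matrix a from above:

col_sum = a.sum(dim=0)                   # sum over rows -> shape (2,)
row_mean = a.mean(dim=1, keepdim=True)   # mean over columns -> shape (2, 1)
centered = a - row_mean                  # keepdim makes this broadcast cleanly
print(col_sum.shape, row_mean.shape, centered.shape)
#torch.Size([2]) torch.Size([2, 1]) torch.Size([2, 2])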


2.3 Tensor Shape Operations

# 创建一个张量
x = torch.randn(4, 3, 2)
print(f"原始形状: {x.shape}")
#原始形状: torch.Size([4, 3, 2])

# reshape操作
x_reshaped = x.reshape(6, 4)  # 必须保证元素总数不变
print(f"reshape后: {x_reshaped.shape}")
#reshape后: torch.Size([6, 4])
    
    
# view操作(与reshape类似,但要求张量在内存中连续)
x_viewed = x.view(12, 2)
print(f"view后: {x_viewed.shape}")
#view后: torch.Size([12, 2])


# squeeze和unsqueeze
y = torch.randn(1, 3, 1, 4)
y_squeezed = y.squeeze()  # 移除所有维度为1的维度
print(f"squeeze前: {y.shape}, squeeze后: {y_squeezed.shape}")
#squeeze前: torch.Size([1, 3, 1, 4]), squeeze后: torch.Size([3, 4])
           
y_unsqueezed = y_squeezed.unsqueeze(0)  # 在第0维添加一个维度
print(f"unsqueeze后: {y_unsqueezed.shape}")
#unsqueeze后: torch.Size([1, 3, 4])
    
    
# 广播机制示例
a = torch.randn(3, 1)
b = torch.randn(1, 4)
c = a + b  # 自动广播到 (3, 4)
print(f"广播结果形状: {c.shape}")
#广播结果形状: torch.Size([3, 4])
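
As noted above, view only works on tensors that are contiguous in memory. A transpose shares storage with the original and is not contiguous, so view fails while reshape (or .contiguous() followed by view) still works. A small sketch:

t = torch.randn(3, 4)
t_t = t.T                           # transposed view: same storage, not contiguous
print(t_t.is_contiguous())
#False
# t_t.view(12)                      # would raise a RuntimeError
flat1 = t_t.reshape(12)             # reshape copies when it has to
flat2 = t_t.contiguous().view(12)   # or make a contiguous copy first
print(flat1.shape, flat2.shape)
#torch.Size([12]) torch.Size([12])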

3. Computation Graphs and Automatic Differentiation

3.1 The Computation Graph Concept

A computation graph is a directed acyclic graph (DAG) that represents the operations in a computation and the flow of data between them.

# 创建需要梯度的张量
x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)

# 构建计算图
z = x * y
w = z ** 2
loss = w + 5

print(f"x: {x}, requires_grad={x.requires_grad}")
#x: 2.0, requires_grad=True
    
print(f"y: {y}, requires_grad={y.requires_grad}")
#y: 3.0, requires_grad=True
    
print(f"z = x * y: {z}")
#z = x * y: 6.0
    
print(f"w = z^2: {w}")
#w = z^2: 36.0
    
print(f"loss = w + 5: {loss}")
#loss = w + 5: 41.0

# 查看计算图信息
print(f"\nloss的grad_fn: {loss.grad_fn}")
#loss的grad_fn: <AddBackward0 object at 0x0000024315868310>
print(f"w的grad_fn: {w.grad_fn}")
#w的grad_fn: <PowBackward0 object at 0x00000243136D70D0>
print(f"z的grad_fn: {z.grad_fn}")
#z的grad_fn: <MulBackward0 object at 0x0000024315868310>
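
Each grad_fn node also exposes a next_functions attribute pointing to the grad_fn of its inputs (leaf tensors appear as AccumulateGrad nodes), so the backward graph built above can be walked explicitly. A small sketch, continuing from the loss defined above (exact node names may vary across PyTorch versions):

def walk(fn, depth=0):
    """Recursively print the backward graph starting from a grad_fn node."""
    if fn is None:          # constants (like the 5 above) contribute no grad_fn
        return
    print("  " * depth + type(fn).__name__)
    for next_fn, _ in fn.next_functions:
        walk(next_fn, depth + 1)

walk(loss.grad_fn)
#AddBackward0
#  PowBackward0
#    MulBackward0
#      AccumulateGrad
#      AccumulateGrad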

3.2 Visualizing the Computation Graph

import networkx as nx

def visualize_computation_graph():
    """可视化简单的计算图"""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    
    # 前向传播图
    G_forward = nx.DiGraph()
    G_forward.add_edges_from([
        ('x', 'z = x * y'),
        ('y', 'z = x * y'),
        ('z = x * y', 'w = z²'),
        ('w = z²', 'loss = w + 5'),
        ('5', 'loss = w + 5')
    ])
    
    pos_forward = {
        'x': (0, 2),
        'y': (0, 0),
        'z = x * y': (2, 1),
        'w = z²': (4, 1),
        '5': (4, 0),
        'loss = w + 5': (6, 1)
    }
    
    nx.draw(G_forward, pos_forward, ax=ax1, with_labels=True, 
            node_color='lightblue', node_size=2000, font_size=10,
            arrows=True, arrowsize=20, edge_color='gray')
    ax1.set_title('前向传播', fontsize=14)
    
    # 反向传播图
    G_backward = nx.DiGraph()
    G_backward.add_edges_from([
        ('∂loss/∂loss = 1', '∂loss/∂w'),
        ('∂loss/∂w', '∂loss/∂z'),
        ('∂loss/∂z', '∂loss/∂x'),
        ('∂loss/∂z', '∂loss/∂y')
    ])
    
    pos_backward = {
        '∂loss/∂loss = 1': (6, 1),
        '∂loss/∂w': (4, 1),
        '∂loss/∂z': (2, 1),
        '∂loss/∂x': (0, 2),
        '∂loss/∂y': (0, 0)
    }
    
    nx.draw(G_backward, pos_backward, ax=ax2, with_labels=True,
            node_color='lightcoral', node_size=2000, font_size=10,
            arrows=True, arrowsize=20, edge_color='gray')
    ax2.set_title('反向传播', fontsize=14)
    
    plt.tight_layout()
    plt.show()

visualize_computation_graph()


3.3 The Autograd Mechanism

# PyTorch的autograd自动求导
x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)

# 前向传播
z = x * y
w = z ** 2
loss = w + 5

# 反向传播
loss.backward()

# 查看梯度
print(f"∂loss/∂x = {x.grad}")
print(f"∂loss/∂y = {y.grad}")

# 手动验证梯度计算
# loss = (x*y)^2 + 5
# ∂loss/∂x = 2*x*y^2 = 2*2*9 = 36
# ∂loss/∂y = 2*x^2*y = 2*4*3 = 24
print(f"\n手动计算验证:")
print(f"∂loss/∂x = 2*x*y^2 = 2*{x.item()}*{y.item()}^2 = {2*x.item()*y.item()**2}")
print(f"∂loss/∂y = 2*x^2*y = 2*{x.item()}^2*{y.item()} = {2*x.item()**2*y.item()}")
∂loss/∂x = 36.0
∂loss/∂y = 24.0

手动计算验证:
∂loss/∂x = 2*x*y^2 = 2*2.0*3.0^2 = 36.0
∂loss/∂y = 2*x^2*y = 2*2.0^2*3.0 = 24.0

3.4 Gradient Accumulation and Zeroing

# 梯度累积示例
x = torch.tensor(2.0, requires_grad=True)

# 第一次计算
y1 = x ** 2
y1.backward()
print(f"第一次backward后,x.grad = {x.grad}")

# 第二次计算(梯度会累积)
y2 = x ** 3
y2.backward()
print(f"第二次backward后(累积),x.grad = {x.grad}")

# 清零梯度
x.grad.zero_()
y3 = x ** 4
y3.backward()
print(f"清零后再次backward,x.grad = {x.grad}")

import torch

# 演示梯度累积的详细过程
x = torch.tensor(2.0, requires_grad=True)

print("=== 第一次backward ===")
y1 = x ** 2  # y1 = x²
print(f"y1 = {y1}")
print(f"dy1/dx = 2x = 2*{x} = {2*x}")

y1.backward()
print(f"第一次backward后,x.grad = {x.grad}")

print("\n=== 第二次backward(累积)===")
y2 = x ** 2  # y2 = x²  
print(f"y2 = {y2}")
print(f"dy2/dx = 2x = 2*{x} = {2*x}")

y2.backward()
print(f"新梯度 = {2*x}")
print(f"累积梯度 = 原梯度 + 新梯度 = 4.0 + 4.0 = {x.grad}")

print("\n=== 清零后重新计算 ===")
x.grad.zero_()  # 手动清零
y3 = x ** 4  # y3 = x⁴
print(f"y3 = {y3}")
print(f"dy3/dx = 4x³ = 4*{x}³ = 4*8 = {4 * (x**3)}")

y3.backward()
print(f"清零后的梯度 = {x.grad}")
=== 第一次backward ===
y1 = 4.0
dy1/dx = 2x = 2*2.0 = 4.0
第一次backward后,x.grad = 4.0

=== 第二次backward(累积)===
y2 = 4.0
dy2/dx = 2x = 2*2.0 = 4.0
新梯度 = 4.0
累积梯度 = 原梯度 + 新梯度 = 4.0 + 4.0 = 8.0

=== 清零后重新计算 ===
y3 = 16.0
dy3/dx = 4x³ = 4*2.0³ = 4*8 = 32.0
清零后的梯度 = 32.0
import torch
import torch.nn as nn

# 场景1: 模拟大批量训练(内存限制)
def simulate_large_batch_training():
    """模拟大批量训练:将大批量分解为小批量累积"""
    model = nn.Linear(10, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    
    # 模拟数据:假设我们想要批量大小为16,但内存只能处理4个样本
    total_batch_size = 16
    mini_batch_size = 4
    accumulation_steps = total_batch_size // mini_batch_size
    
    print(f"总批量大小: {total_batch_size}")
    print(f"小批量大小: {mini_batch_size}")
    print(f"累积步数: {accumulation_steps}")
    
    # 生成模拟数据
    all_data = torch.randn(total_batch_size, 10)
    all_targets = torch.randn(total_batch_size, 1)
    
    optimizer.zero_grad()  # 清零梯度
    
    total_loss = 0
    for step in range(accumulation_steps):
        # 获取小批量数据
        start_idx = step * mini_batch_size
        end_idx = start_idx + mini_batch_size
        
        mini_batch_data = all_data[start_idx:end_idx]
        mini_batch_targets = all_targets[start_idx:end_idx]
        
        # 前向传播
        outputs = model(mini_batch_data)
        loss = nn.MSELoss()(outputs, mini_batch_targets)
        
        # 反向传播(梯度累积)
        loss.backward()  # 梯度会自动累积
        
        total_loss += loss.item()
        print(f"步骤 {step+1}: loss = {loss.item():.4f}")
        
        # 查看参数的梯度(累积过程)
        if hasattr(model.weight, 'grad') and model.weight.grad is not None:
            grad_norm = model.weight.grad.norm().item()
            print(f"  累积梯度范数: {grad_norm:.4f}")
    
    # 平均梯度(模拟大批量的效果)
    for param in model.parameters():
        if param.grad is not None:
            param.grad /= accumulation_steps
    
    # 更新参数
    optimizer.step()
    
    avg_loss = total_loss / accumulation_steps
    print(f"\n平均损失: {avg_loss:.4f}")
    print("参数更新完成")
    
    return model

# 场景2: 多任务学习中的梯度累积
def multi_task_gradient_accumulation():
    """多任务学习:来自不同任务的梯度累积"""
    shared_model = nn.Linear(10, 5)
    task1_head = nn.Linear(5, 1)
    task2_head = nn.Linear(5, 2)
    
    optimizer = torch.optim.Adam(
        list(shared_model.parameters()) + 
        list(task1_head.parameters()) + 
        list(task2_head.parameters()), 
        lr=0.001
    )
    
    # 模拟数据
    x = torch.randn(8, 10)
    y1 = torch.randn(8, 1)  # 任务1目标
    y2 = torch.randn(8, 2)  # 任务2目标
    
    optimizer.zero_grad()
    
    # 共享特征提取
    shared_features = shared_model(x)
    
    # 任务1的损失和梯度
    print("=== 任务1 ===")
    output1 = task1_head(shared_features)
    loss1 = nn.MSELoss()(output1, y1)
    print(f"任务1损失: {loss1.item():.4f}")
    
    loss1.backward(retain_graph=True)  # 保留计算图用于任务2
    
    # 检查共享层的梯度
    if shared_model.weight.grad is not None:
        grad_norm_after_task1 = shared_model.weight.grad.norm().item()
        print(f"任务1后共享层梯度范数: {grad_norm_after_task1:.4f}")
    
    # 任务2的损失和梯度(累积到任务1的梯度上)
    print("\n=== 任务2 ===")
    output2 = task2_head(shared_features)
    loss2 = nn.MSELoss()(output2, y2)
    print(f"任务2损失: {loss2.item():.4f}")
    
    loss2.backward()  # 梯度会累积到共享层
    
    # 检查累积后的梯度
    if shared_model.weight.grad is not None:
        grad_norm_after_both = shared_model.weight.grad.norm().item()
        print(f"两个任务后共享层梯度范数: {grad_norm_after_both:.4f}")
        print(f"梯度增加了: {grad_norm_after_both - grad_norm_after_task1:.4f}")
    
    # 更新所有参数
    optimizer.step()
    print("\n多任务梯度累积完成,参数已更新")

# 场景3: 演示为什么需要手动清零梯度
def why_zero_grad_needed():
    """演示不清零梯度的问题"""
    x = torch.tensor([1.0, 2.0], requires_grad=True)
    
    print("=== 不清零梯度的问题 ===")
    for epoch in range(3):
        print(f"\nEpoch {epoch + 1}:")
        
        # 计算损失
        y = (x ** 2).sum()
        print(f"损失: {y.item()}")
        print(f"理论梯度应该是: {2 * x}")
        
        # 反向传播
        y.backward()
        print(f"实际累积梯度: {x.grad}")
        
        if epoch == 0:
            print("第一次正确 ✓")
        else:
            print("梯度被错误累积了! ✗")
        
        # 注意:这里故意不清零梯度来演示问题
    
    print("\n=== 正确做法:每次清零梯度 ===")
    x.grad.zero_()  # 重置
    
    for epoch in range(3):
        print(f"\nEpoch {epoch + 1}:")
        
        # 清零梯度(正确做法)
        if x.grad is not None:
            x.grad.zero_()
        
        # 计算损失
        y = (x ** 2).sum()
        print(f"损失: {y.item()}")
        print(f"理论梯度应该是: {2 * x}")
        
        # 反向传播
        y.backward()
        print(f"实际梯度: {x.grad}")
        print("正确! ✓")

if __name__ == "__main__":
    print("1. 模拟大批量训练")
    print("=" * 50)
    simulate_large_batch_training()
    
    print("\n\n2. 多任务学习梯度累积")
    print("=" * 50)
    multi_task_gradient_accumulation()
    
    print("\n\n3. 为什么需要手动清零梯度")
    print("=" * 50)
    why_zero_grad_needed()
1. 模拟大批量训练
==================================================
总批量大小: 16
小批量大小: 4
累积步数: 4
步骤 1: loss = 1.8221
  累积梯度范数: 4.5243
步骤 2: loss = 2.0437
  累积梯度范数: 6.4268
步骤 3: loss = 2.1350
  累积梯度范数: 7.6952
步骤 4: loss = 1.1126
  累积梯度范数: 7.5846

平均损失: 1.7784
参数更新完成


2. 多任务学习梯度累积
==================================================
=== 任务1 ===
任务1损失: 0.9983
任务1后共享层梯度范数: 1.3653

=== 任务2 ===
任务2损失: 1.3457
两个任务后共享层梯度范数: 1.7382
梯度增加了: 0.3728

多任务梯度累积完成,参数已更新


3. 为什么需要手动清零梯度
==================================================
=== 不清零梯度的问题 ===

Epoch 1:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际累积梯度: tensor([2., 4.])
第一次正确 ✓

Epoch 2:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际累积梯度: tensor([4., 8.])
梯度被错误累积了! ✗

Epoch 3:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际累积梯度: tensor([ 6., 12.])
梯度被错误累积了! ✗

=== 正确做法:每次清零梯度 ===

Epoch 1:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际梯度: tensor([2., 4.])
正确! ✓

Epoch 2:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际梯度: tensor([2., 4.])
正确! ✓

Epoch 3:
损失: 5.0
理论梯度应该是: tensor([2., 4.], grad_fn=<MulBackward0>)
实际梯度: tensor([2., 4.])
正确! ✓

4. The Chain Rule and Backpropagation

4.1 The Chain Rule

def chain_rule_example():
    """演示链式法则"""
    # 定义函数: f(g(h(x)))
    # h(x) = x^2
    # g(h) = sin(h)
    # f(g) = exp(g)
    
    x = torch.tensor(0.5, requires_grad=True)
    
    # 前向传播
    h = x ** 2
    g = torch.sin(h)
    f = torch.exp(g)
    
    # 反向传播
    f.backward()
    
    print("链式法则示例:")
    print(f"x = {x.item():.4f}")
    print(f"h = x^2 = {h.item():.4f}")
    print(f"g = sin(h) = {g.item():.4f}")
    print(f"f = exp(g) = {f.item():.4f}")
    print(f"\n自动求导结果: df/dx = {x.grad.item():.4f}")
    
    # 手动计算验证
    df_dg = torch.exp(g)
    dg_dh = torch.cos(h)
    dh_dx = 2 * x
    df_dx_manual = df_dg * dg_dh * dh_dx
    print(f"手动计算结果: df/dx = {df_dx_manual.item():.4f}")
    
    # 可视化链式法则
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.text(0.1, 0.5, 'x', fontsize=20, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightblue"))
    ax.arrow(0.15, 0.5, 0.1, 0, head_width=0.02, head_length=0.02, fc='black', ec='black')
    
    ax.text(0.3, 0.5, 'h=x²', fontsize=16, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgreen"))
    ax.arrow(0.38, 0.5, 0.1, 0, head_width=0.02, head_length=0.02, fc='black', ec='black')
    
    ax.text(0.53, 0.5, 'g=sin(h)', fontsize=16, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightyellow"))
    ax.arrow(0.63, 0.5, 0.1, 0, head_width=0.02, head_length=0.02, fc='black', ec='black')
    
    ax.text(0.78, 0.5, 'f=exp(g)', fontsize=16, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightcoral"))
    
    # 反向传播箭头
    ax.arrow(0.78, 0.3, -0.15, 0, head_width=0.02, head_length=0.02, 
            fc='red', ec='red', linestyle='--', alpha=0.7)
    ax.text(0.7, 0.25, 'df/dg', fontsize=12, color='red')
    
    ax.arrow(0.53, 0.3, -0.15, 0, head_width=0.02, head_length=0.02,
            fc='red', ec='red', linestyle='--', alpha=0.7)
    ax.text(0.45, 0.25, 'dg/dh', fontsize=12, color='red')
    
    ax.arrow(0.3, 0.3, -0.12, 0, head_width=0.02, head_length=0.02,
            fc='red', ec='red', linestyle='--', alpha=0.7)
    ax.text(0.22, 0.25, 'dh/dx', fontsize=12, color='red')
    
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')
    ax.set_title('链式法则: df/dx = (df/dg) × (dg/dh) × (dh/dx)', fontsize=14)
    plt.show()

chain_rule_example()


4.2 The Backpropagation Algorithm

class SimpleNN:
    """手动实现简单神经网络的前向和反向传播"""
    
    def __init__(self, input_size, hidden_size, output_size):
        # 初始化权重和偏置
        self.W1 = torch.randn(input_size, hidden_size, requires_grad=True)
        self.b1 = torch.zeros(hidden_size, requires_grad=True)
        self.W2 = torch.randn(hidden_size, output_size, requires_grad=True)
        self.b2 = torch.zeros(output_size, requires_grad=True)
    
    def forward(self, x):
        """前向传播"""
        # 第一层
        self.z1 = x @ self.W1 + self.b1
        self.a1 = torch.relu(self.z1)
        
        # 第二层
        self.z2 = self.a1 @ self.W2 + self.b2
        self.output = torch.sigmoid(self.z2)
        
        return self.output
    
    def backward_manual(self, x, y, output):
        """手动实现反向传播(用于教学)"""
        m = x.shape[0]
        
        # 输出层梯度
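        # Note: dz2 = output - y is the exact output-layer gradient only for a
        # sigmoid output paired with binary cross-entropy loss (a common textbook
        # simplification); with the MSE loss used in the test below, autograd's
        # gradients will differ from these hand-derived ones.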
        dz2 = output - y
        dW2 = (self.a1.T @ dz2) / m
        db2 = torch.sum(dz2, dim=0) / m
        
        # 隐藏层梯度
        da1 = dz2 @ self.W2.T
        dz1 = da1 * (self.z1 > 0).float()  # ReLU的导数
        dW1 = (x.T @ dz1) / m
        db1 = torch.sum(dz1, dim=0) / m
        
        return dW1, db1, dW2, db2

# 测试反向传播
torch.manual_seed(42)
model = SimpleNN(2, 3, 1)

# 生成样本数据
x = torch.randn(4, 2)
y = torch.tensor([[1.], [0.], [1.], [0.]])

# 前向传播
output = model.forward(x)

# 计算损失
loss = torch.mean((output - y) ** 2)

# PyTorch自动反向传播
loss.backward()

print("神经网络结构:")
print(f"输入层: 2个神经元")
print(f"隐藏层: 3个神经元 (ReLU激活)")
print(f"输出层: 1个神经元 (Sigmoid激活)")
print(f"\n损失值: {loss.item():.4f}")
print(f"\nW1的梯度形状: {model.W1.grad.shape}")
print(f"W2的梯度形状: {model.W2.grad.shape}")
神经网络结构:
输入层: 2个神经元
隐藏层: 3个神经元 (ReLU激活)
输出层: 1个神经元 (Sigmoid激活)

损失值: 0.4442

W1的梯度形状: torch.Size([2, 3])
W2的梯度形状: torch.Size([3, 1])

4.3 Vanishing and Exploding Gradients

def gradient_issues_demo():
    """演示梯度消失和梯度爆炸问题"""
    
    # 梯度消失示例(深层网络with sigmoid)
    x = torch.randn(1, 10, requires_grad=True)
    
    # 模拟深层网络
    h = x
    activations = [x.detach().numpy()]
    
    for i in range(10):
        h = torch.sigmoid(h)
        activations.append(h.detach().numpy())
    
    loss = h.sum()
    loss.backward()
    
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    
    # 绘制激活值变化
    for i, act in enumerate(activations):
        ax1.plot(act.flatten(), label=f'Layer {i}', alpha=0.7)
    ax1.set_title('梯度消失:Sigmoid激活值逐层递减')
    ax1.set_xlabel('神经元索引')
    ax1.set_ylabel('激活值')
    ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    
    # 梯度爆炸示例
    x2 = torch.randn(1, 10, requires_grad=True)
    h2 = x2
    
    for i in range(5):
        h2 = h2 * 2.0  # 每层权重都大于1
    
    loss2 = h2.sum()
    loss2.backward()
    
    # 绘制梯度大小
    gradient_magnitudes = [x.grad.abs().mean().item(), x2.grad.abs().mean().item()]
    ax2.bar(['梯度消失\n(深层Sigmoid)', '梯度爆炸\n(权重>1)'], 
            gradient_magnitudes, color=['blue', 'red'], alpha=0.7)
    ax2.set_ylabel('梯度绝对值均值')
    ax2.set_title('梯度问题对比')
    ax2.set_yscale('log')
    
    plt.tight_layout()
    plt.show()
    
    print(f"梯度消失情况下的梯度均值: {gradient_magnitudes[0]:.2e}")
    print(f"梯度爆炸情况下的梯度均值: {gradient_magnitudes[1]:.2e}")

gradient_issues_demo()


梯度消失情况下的梯度均值: 3.36e-07
梯度爆炸情况下的梯度均值: 3.20e+01
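
Both numbers match what the chain rule predicts: the sigmoid derivative σ'(x) = σ(x)(1 - σ(x)) never exceeds 0.25, while each ×2 layer doubles the gradient, so roughly

vanishing:  |∂loss/∂x| ≲ 0.25^10 ≈ 9.5e-7
exploding:  |∂loss/∂x| = 2^5 = 32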

5. The Jacobian Matrix

5.1 Definition of the Jacobian Matrix

The Jacobian matrix describes the derivative of a vector-valued function: it collects the partial derivatives of every output with respect to every input into a single matrix.
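
For a function f: R^n -> R^m, the Jacobian is the m×n matrix J with entries J[i, j] = ∂f_i/∂x_j (one row per output component, one column per input component). Note that autograd's backward() computes vector-Jacobian products rather than the full matrix, which is why the example below calls backward once per output row to assemble J.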

def jacobian_example():
    """雅可比矩阵示例"""
    
    # 定义输入向量
    x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    
    # 定义向量函数 f: R^3 -> R^2
    # f1 = x1^2 + x2*x3
    # f2 = x1*x2 - x3^2
    f1 = x[0]**2 + x[1]*x[2]
    f2 = x[0]*x[1] - x[2]**2
    
    # 输出向量
    f = torch.stack([f1, f2])
    
    # 计算雅可比矩阵
    jacobian = torch.zeros(2, 3)
    
    for i in range(2):
        # 重置梯度
        if x.grad is not None:
            x.grad.zero_()
        
        # 对第i个输出计算梯度
        f[i].backward(retain_graph=True)
        jacobian[i] = x.grad.clone()
    
    print("雅可比矩阵:")
    print("J = [∂f/∂x] =")
    print(jacobian)
    
    # 手动验证
    print("\n手动计算验证:")
    print(f"∂f1/∂x1 = 2*x1 = {2*x[0].item()}")
    print(f"∂f1/∂x2 = x3 = {x[2].item()}")
    print(f"∂f1/∂x3 = x2 = {x[1].item()}")
    print(f"∂f2/∂x1 = x2 = {x[1].item()}")
    print(f"∂f2/∂x2 = x1 = {x[0].item()}")
    print(f"∂f2/∂x3 = -2*x3 = {-2*x[2].item()}")
    
    # 可视化雅可比矩阵
    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.imshow(jacobian.detach().numpy(), cmap='RdBu', aspect='auto', vmin=-10, vmax=10)
    
    # 添加数值标签
    for i in range(2):
        for j in range(3):
            text = ax.text(j, i, f'{jacobian[i, j].item():.1f}',
                         ha="center", va="center", color="black", fontsize=14)
    
    ax.set_xticks([0, 1, 2])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(['∂/∂x₁', '∂/∂x₂', '∂/∂x₃'])
    ax.set_yticklabels(['f₁', 'f₂'])
    ax.set_title('雅可比矩阵 J', fontsize=16)
    
    plt.colorbar(im, ax=ax)
    plt.tight_layout()
    plt.show()

jacobian_example()
雅可比矩阵:
J = [∂f/∂x] =
tensor([[ 2.,  3.,  2.],
        [ 2.,  1., -6.]])

手动计算验证:
∂f1/∂x1 = 2*x1 = 2.0
∂f1/∂x2 = x3 = 3.0
∂f1/∂x3 = x2 = 2.0
∂f2/∂x1 = x2 = 2.0
∂f2/∂x2 = x1 = 1.0
∂f2/∂x3 = -2*x3 = -6.0


5.2 The Jacobian Matrix in Neural Networks

# 使用torch.autograd.functional计算雅可比矩阵
from torch.autograd.functional import jacobian

def network_jacobian():
    """神经网络层的雅可比矩阵"""
    
    # 定义一个简单的网络层
    def layer(x):
        W = torch.tensor([[1.0, -0.5], 
                         [0.5, 2.0], 
                         [-1.0, 1.0]])
        b = torch.tensor([0.1, 0.2, -0.1])
        return torch.relu(x @ W.T + b)
    
    # 输入
    x = torch.tensor([1.0, 2.0])
    
    # 计算雅可比矩阵
    J = jacobian(layer, x)
    
    print("网络层函数: f(x) = ReLU(Wx + b)")
    print(f"输入维度: {x.shape}")
    print(f"输出维度: {layer(x).shape}")
    print(f"\n雅可比矩阵形状: {J.shape}")
    print("雅可比矩阵:")
    print(J)
    
    # 可视化输入输出关系
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    
    # 绘制网络结构
    ax1.set_xlim(-0.5, 2.5)
    ax1.set_ylim(-0.5, 3.5)
    
    # 输入节点
    for i in range(2):
        circle = plt.Circle((0, i+0.5), 0.2, color='lightblue', ec='black')
        ax1.add_patch(circle)
        ax1.text(0, i+0.5, f'x{i+1}', ha='center', va='center')
    
    # 输出节点
    for i in range(3):
        circle = plt.Circle((2, i), 0.2, color='lightgreen', ec='black')
        ax1.add_patch(circle)
        ax1.text(2, i, f'y{i+1}', ha='center', va='center')
    
    # 连接线
    for i in range(2):
        for j in range(3):
            ax1.arrow(0.2, i+0.5, 1.6, j-i-0.5, 
                     head_width=0.05, head_length=0.05, 
                     fc='gray', ec='gray', alpha=0.5)
    
    ax1.set_title('网络层结构', fontsize=14)
    ax1.axis('off')
    
    # 绘制雅可比矩阵热图
    im = ax2.imshow(J.detach().numpy(), cmap='coolwarm', aspect='auto')
    ax2.set_xlabel('输入维度')
    ax2.set_ylabel('输出维度')
    ax2.set_title('雅可比矩阵热图', fontsize=14)
    plt.colorbar(im, ax=ax2)
    
    plt.tight_layout()
    plt.show()

network_jacobian()

网络层函数: f(x) = ReLU(Wx + b)
输入维度: torch.Size([2])
输出维度: torch.Size([3])

雅可比矩阵形状: torch.Size([3, 2])
雅可比矩阵:
tensor([[ 1.0000, -0.5000],
        [ 0.5000,  2.0000],
        [-1.0000,  1.0000]])
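
Here the Jacobian is exactly the weight matrix W, because every pre-activation Wx + b is positive at this input, so ReLU acts locally as the identity. In general, for f(x) = ReLU(Wx + b) the Jacobian is W with the rows of inactive units zeroed out:

J = diag(1[Wx + b > 0]) · W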


6. GPU and CPU Compatibility

6.1 Device Management

# 检查CUDA可用性
print(f"CUDA是否可用: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA设备数量: {torch.cuda.device_count()}")
    print(f"当前CUDA设备: {torch.cuda.current_device()}")
    print(f"CUDA设备名称: {torch.cuda.get_device_name(0)}")

# 设备选择最佳实践
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\n使用设备: {device}")

# 创建张量时指定设备
tensor_cpu = torch.randn(3, 3)  # 默认在CPU上
tensor_gpu = torch.randn(3, 3, device=device)  # 在指定设备上

print(f"\nCPU张量设备: {tensor_cpu.device}")
print(f"GPU/CPU张量设备: {tensor_gpu.device}")
CUDA是否可用: True
CUDA设备数量: 1
当前CUDA设备: 0
CUDA设备名称: NVIDIA GeForce RTX 3080

使用设备: cuda

CPU张量设备: cpu
GPU/CPU张量设备: cuda:0

6.2 Moving Data Between Devices

# CPU到GPU
if torch.cuda.is_available():
    # 方法1: 使用.to()
    tensor_cpu = torch.randn(2, 3)
    tensor_gpu = tensor_cpu.to('cuda')
    
    # 方法2: 使用.cuda()
    tensor_gpu2 = tensor_cpu.cuda()
    
    # GPU到CPU
    tensor_back_to_cpu = tensor_gpu.cpu()
    
    print(f"原始张量设备: {tensor_cpu.device}")
    print(f"转移到GPU后: {tensor_gpu.device}")
    print(f"转回CPU后: {tensor_back_to_cpu.device}")
else:
    print("CUDA不可用,使用CPU进行演示")
    tensor = torch.randn(2, 3)
    print(f"张量设备: {tensor.device}")
原始张量设备: cpu
转移到GPU后: cuda:0
转回CPU后: cpu
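
Host-to-device copies can also be made asynchronous by pinning the CPU tensor's memory and passing non_blocking=True; this is the same mechanism behind DataLoader's pin_memory=True. A minimal sketch, assuming CUDA is available:

if torch.cuda.is_available():
    host_tensor = torch.randn(1024, 1024).pin_memory()      # page-locked host memory
    gpu_tensor = host_tensor.to('cuda', non_blocking=True)   # asynchronous host-to-device copy
    torch.cuda.synchronize()                                  # wait for the copy before relying on the result
    print(gpu_tensor.device)
#cuda:0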

6.3 Writing Device-Agnostic Code

class DeviceAgnosticModel(torch.nn.Module):
    """设备无关的模型示例"""
    
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

def train_on_device(model, data, target, device, epochs=100):
    """设备无关的训练函数"""
    # 将模型移到指定设备
    model = model.to(device)
    data = data.to(device)
    target = target.to(device)
    
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    criterion = torch.nn.MSELoss()
    
    losses = []
    for epoch in range(epochs):
        # 前向传播
        output = model(data)
        loss = criterion(output, target)
        
        # 反向传播
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        losses.append(loss.item())
        
        if (epoch + 1) % 20 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')
    
    return losses

# 使用示例
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"训练设备: {device}")

# 创建模型和数据
model = DeviceAgnosticModel(10, 20, 1)
data = torch.randn(100, 10)
target = torch.randn(100, 1)

# 训练
losses = train_on_device(model, data, target, device, epochs=100)

# 绘制损失曲线
plt.figure(figsize=(10, 4))
plt.plot(losses)
plt.title(f'训练损失曲线 (设备: {device})')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)
plt.show()
Epoch [20/100], Loss: 0.8893
Epoch [40/100], Loss: 0.8566
Epoch [60/100], Loss: 0.8442
Epoch [80/100], Loss: 0.8342
Epoch [100/100], Loss: 0.8252


6.4 Performance Optimization Tips

def performance_tips():
    """GPU/CPU性能优化技巧"""
    
    print("性能优化技巧:")
    print("-" * 50)
    
    # 1. 批量操作
    print("1. 使用批量操作而非循环:")
    
    # 不推荐
    start_time = torch.cuda.Event(enable_timing=True) if torch.cuda.is_available() else None
    end_time = torch.cuda.Event(enable_timing=True) if torch.cuda.is_available() else None
    
    x = torch.randn(1000, 100, device=device)
    result_slow = torch.zeros(1000, device=device)
    
    if torch.cuda.is_available():
        start_time.record()
    
    for i in range(1000):
        result_slow[i] = x[i].sum()
    
    if torch.cuda.is_available():
        end_time.record()
        torch.cuda.synchronize()
        print(f"   循环方式时间: {start_time.elapsed_time(end_time):.2f} ms")
    
    # 推荐
    if torch.cuda.is_available():
        start_time.record()
    
    result_fast = x.sum(dim=1)
    
    if torch.cuda.is_available():
        end_time.record()
        torch.cuda.synchronize()
        print(f"   批量操作时间: {start_time.elapsed_time(end_time):.2f} ms")
    
    # 2. 避免频繁的设备间数据传输
    print("\n2. 最小化CPU-GPU数据传输:")
    print("   - 在GPU上完成所有计算")
    print("   - 只在必要时传回CPU")
    
    # 3. 使用适当的数据类型
    print("\n3. 使用合适的数据类型:")
    tensor_float32 = torch.randn(1000, 1000, device=device)
    tensor_float16 = tensor_float32.half()  # 转换为半精度
    
    print(f"   Float32内存: {tensor_float32.element_size() * tensor_float32.nelement() / 1024**2:.2f} MB")
    print(f"   Float16内存: {tensor_float16.element_size() * tensor_float16.nelement() / 1024**2:.2f} MB")
    
    # 4. 固定内存
    print("\n4. 使用pin_memory加速数据加载:")
    if torch.cuda.is_available():
        # 创建DataLoader时使用pin_memory=True
        print("   DataLoader(..., pin_memory=True)")

performance_tips()

性能优化技巧:
--------------------------------------------------
1. 使用批量操作而非循环:
   循环方式时间: 32.97 ms
   批量操作时间: 0.03 ms

2. 最小化CPU-GPU数据传输:
   - 在GPU上完成所有计算
   - 只在必要时传回CPU

3. 使用合适的数据类型:
   Float32内存: 3.81 MB
   Float16内存: 1.91 MB

4. 使用pin_memory加速数据加载:
   DataLoader(..., pin_memory=True)

7. Complete Example: Building a Simple Neural Network

7.1 The Complete Training Pipeline

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt

# 设置随机种子
torch.manual_seed(42)
np.random.seed(42)

# 设备配置
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")

# 自定义数据集
class SpiralDataset(Dataset):
    """生成螺旋形分类数据"""
    
    def __init__(self, n_points=1000, n_classes=3, noise=0.2):
        self.n_points = n_points
        self.n_classes = n_classes
        
        X = []
        y = []
        
        for class_idx in range(n_classes):
            # 生成螺旋数据
            theta = np.linspace(class_idx * 4, (class_idx + 1) * 4, n_points // n_classes) + np.random.randn(n_points // n_classes) * noise
            r = np.linspace(0.5, 2, n_points // n_classes) + np.random.randn(n_points // n_classes) * noise * 0.1
            
            x1 = r * np.cos(theta)
            x2 = r * np.sin(theta)
            
            X.append(np.column_stack([x1, x2]))
            y.append(np.full(n_points // n_classes, class_idx))
        
        self.X = torch.FloatTensor(np.vstack(X))
        self.y = torch.LongTensor(np.hstack(y))
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# 定义神经网络模型
class NeuralNetwork(nn.Module):
    def __init__(self, input_size=2, hidden_sizes=[64, 32], output_size=3):
        super(NeuralNetwork, self).__init__()
        
        layers = []
        prev_size = input_size
        
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(prev_size, hidden_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            prev_size = hidden_size
        
        layers.append(nn.Linear(prev_size, output_size))
        
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

# 训练函数
def train_model(model, train_loader, val_loader, epochs=100, lr=0.01):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10)
    
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    
    for epoch in range(epochs):
        # 训练阶段
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            
            # 前向传播
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            
            # 反向传播
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
        
        train_loss /= len(train_loader)
        train_acc = 100 * correct / total
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        
        # 验证阶段
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += batch_y.size(0)
                correct += (predicted == batch_y).sum().item()
        
        val_loss /= len(val_loader)
        val_acc = 100 * correct / total
        val_losses.append(val_loss)
        val_accs.append(val_acc)
        
        scheduler.step(val_loss)
        
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], '
                  f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, '
                  f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')
    
    return train_losses, val_losses, train_accs, val_accs

# 创建数据集
dataset = SpiralDataset(n_points=3000, n_classes=3, noise=0.2)

# 划分训练集和验证集
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# 创建数据加载器
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# 创建模型
model = NeuralNetwork(input_size=2, hidden_sizes=[128, 64, 32], output_size=3)
print(f"\n模型结构:\n{model}")

# 训练模型
print("\n开始训练...")
train_losses, val_losses, train_accs, val_accs = train_model(
    model, train_loader, val_loader, epochs=100, lr=0.001
)

# 可视化结果
fig, axes = plt.subplots(2, 3, figsize=(15, 8))

# 绘制数据分布
ax = axes[0, 0]
colors = ['red', 'blue', 'green']
for i in range(3):
    mask = dataset.y == i
    ax.scatter(dataset.X[mask, 0], dataset.X[mask, 1], 
              c=colors[i], alpha=0.6, label=f'Class {i}')
ax.set_title('原始数据分布')
ax.set_xlabel('X1')
ax.set_ylabel('X2')
ax.legend()

# 绘制决策边界
ax = axes[0, 1]
model.eval()
h = 0.02
x_min, x_max = dataset.X[:, 0].min() - 0.5, dataset.X[:, 0].max() + 0.5
y_min, y_max = dataset.X[:, 1].min() - 0.5, dataset.X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

with torch.no_grad():
    Z = model(torch.FloatTensor(np.c_[xx.ravel(), yy.ravel()]).to(device))
    _, Z = torch.max(Z, 1)
    Z = Z.cpu().numpy().reshape(xx.shape)

ax.contourf(xx, yy, Z, alpha=0.4, cmap='viridis')
for i in range(3):
    mask = dataset.y == i
    ax.scatter(dataset.X[mask, 0], dataset.X[mask, 1], 
              c=colors[i], alpha=0.6, label=f'Class {i}')
ax.set_title('学习到的决策边界')
ax.set_xlabel('X1')
ax.set_ylabel('X2')
ax.legend()

# 绘制训练损失
ax = axes[0, 2]
ax.plot(train_losses, label='训练损失', alpha=0.8)
ax.plot(val_losses, label='验证损失', alpha=0.8)
ax.set_title('损失曲线')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.grid(True, alpha=0.3)

# 绘制准确率
ax = axes[1, 0]
ax.plot(train_accs, label='训练准确率', alpha=0.8)
ax.plot(val_accs, label='验证准确率', alpha=0.8)
ax.set_title('准确率曲线')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy (%)')
ax.legend()
ax.grid(True, alpha=0.3)

# 绘制梯度流
ax = axes[1, 1]
gradients = []
for name, param in model.named_parameters():
    if param.grad is not None:
        gradients.append(param.grad.abs().mean().item())

ax.bar(range(len(gradients)), gradients, color='skyblue', alpha=0.7)
ax.set_title('各层梯度大小')
ax.set_xlabel('层索引')
ax.set_ylabel('梯度绝对值均值')
ax.grid(True, alpha=0.3)

# 绘制权重分布
ax = axes[1, 2]
weights = []
for param in model.parameters():
    if len(param.shape) > 1:  # 只考虑权重矩阵,不考虑偏置
        weights.extend(param.cpu().detach().numpy().flatten())

ax.hist(weights, bins=50, color='purple', alpha=0.7, edgecolor='black')
ax.set_title('权重分布')
ax.set_xlabel('权重值')
ax.set_ylabel('频数')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
使用设备: cuda

模型结构:
NeuralNetwork(
  (network): Sequential(
    (0): Linear(in_features=2, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=32, out_features=3, bias=True)
  )
)

开始训练...
Epoch [10/100], Train Loss: 0.0451, Train Acc: 98.88%, Val Loss: 0.0076, Val Acc: 99.83%
Epoch [20/100], Train Loss: 0.0203, Train Acc: 99.42%, Val Loss: 0.0011, Val Acc: 100.00%
Epoch [30/100], Train Loss: 0.0142, Train Acc: 99.50%, Val Loss: 0.0036, Val Acc: 99.83%
Epoch [40/100], Train Loss: 0.0097, Train Acc: 99.71%, Val Loss: 0.0002, Val Acc: 100.00%
Epoch [50/100], Train Loss: 0.0061, Train Acc: 99.83%, Val Loss: 0.0000, Val Acc: 100.00%
Epoch [60/100], Train Loss: 0.0088, Train Acc: 99.54%, Val Loss: 0.0002, Val Acc: 100.00%
Epoch [70/100], Train Loss: 0.0037, Train Acc: 99.83%, Val Loss: 0.0001, Val Acc: 100.00%
Epoch [80/100], Train Loss: 0.0063, Train Acc: 99.83%, Val Loss: 0.0002, Val Acc: 100.00%
Epoch [90/100], Train Loss: 0.0111, Train Acc: 99.88%, Val Loss: 0.0002, Val Acc: 100.00%
Epoch [100/100], Train Loss: 0.0048, Train Acc: 99.83%, Val Loss: 0.0002, Val Acc: 100.00%


7.2 Computation Graph Visualization Tool

def visualize_computation_graph_detailed(model, input_size=(1, 2)):
    """详细可视化模型的计算图"""
    
    # 创建示例输入
    x = torch.randn(input_size, requires_grad=True)
    
    # 前向传播
    y = model(x)
    
    # 创建一个简单的损失
    loss = y.sum()
    
    # 执行反向传播以构建完整的计算图
    loss.backward()
    
    print("计算图构建完成")
    print(f"输入形状: {x.shape}")
    print(f"输出形状: {y.shape}")
    print(f"损失值: {loss.item():.4f}")
    
    # 打印梯度信息
    print("\n各层参数的梯度:")
    for name, param in model.named_parameters():
        if param.grad is not None:
            print(f"{name}: 梯度形状={param.grad.shape}, "
                  f"梯度均值={param.grad.mean().item():.6f}, "
                  f"梯度标准差={param.grad.std().item():.6f}")

# 使用示例
small_model = NeuralNetwork(input_size=2, hidden_sizes=[4, 3], output_size=2)
visualize_computation_graph_detailed(small_model)
计算图构建完成
输入形状: torch.Size([1, 2])
输出形状: torch.Size([1, 2])
损失值: 0.8725

各层参数的梯度:
network.0.weight: 梯度形状=torch.Size([4, 2]), 梯度均值=0.000000, 梯度标准差=0.000000
network.0.bias: 梯度形状=torch.Size([4]), 梯度均值=0.000000, 梯度标准差=0.000000
network.3.weight: 梯度形状=torch.Size([3, 4]), 梯度均值=0.000000, 梯度标准差=0.000000
network.3.bias: 梯度形状=torch.Size([3]), 梯度均值=0.000000, 梯度标准差=0.000000
network.6.weight: 梯度形状=torch.Size([2, 3]), 梯度均值=0.000000, 梯度标准差=0.000000
network.6.bias: 梯度形状=torch.Size([2]), 梯度均值=1.000000, 梯度标准差=0.000000

Key Takeaways

  1. Tensor operations: PyTorch's fundamental data structure, with support for both CPU and GPU computation
  2. Automatic differentiation: gradients are computed automatically through the computation graph, the core mechanism of deep learning
  3. Backpropagation: gradient computation based on the chain rule
  4. Device management: flexible switching between GPU and CPU for better efficiency
  5. Practice: a complete deep-learning workflow from theory to application

Best Practices

  • Always write device-agnostic code by using a device variable
  • Use requires_grad deliberately to control which tensors need gradients
  • Remember that gradients accumulate; zero them when necessary
  • Replace Python loops with batched operations for better performance
  • Monitor gradient magnitudes to catch vanishing or exploding gradients (a sketch follows this list)
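
A small sketch of the last point: computing per-parameter gradient norms and clipping the global norm during a training step (the tiny model and random data here are just placeholders):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
x, y = torch.randn(64, 10), torch.randn(64, 1)

optimizer.zero_grad()
loss = nn.MSELoss()(model(x), y)
loss.backward()

# clip_grad_norm_ returns the total norm measured before clipping
total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
print(f"total gradient norm: {total_norm:.4f}")
for name, p in model.named_parameters():
    print(f"{name}: grad norm = {p.grad.norm():.4f}")

optimizer.step()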

Next Steps

  1. Study how optimizers work in depth
  2. Explore more complex network architectures (CNNs, RNNs, Transformers)
  3. Learn distributed training and mixed-precision training
  4. Master model deployment and optimization techniques
